Initial commit

Ned Halksworth
2026-05-04 19:31:46 +01:00
commit e0f2eedcd9
14 changed files with 3718 additions and 0 deletions
+1226
File diff suppressed because it is too large.
content_filter.py +61
@@ -0,0 +1,61 @@
"""Adult-content filtering helpers used before URLs reach the index."""
from __future__ import annotations
import re
from urllib.parse import urlsplit
from config import ADULT_DOMAINS, ADULT_KEYWORDS
EXPLICIT_HOST_MARKERS = (
"porn",
"xxx",
"xvideo",
"xnxx",
"hentai",
"camgirl",
"camsoda",
"chaturbate",
"stripchat",
"redtube",
)
EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"}
def _clean_host(url: str) -> str:
host = urlsplit(url.lower()).netloc
return host.removeprefix("www.")
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
clean_domain = domain.lower().removeprefix("www.")
return host == clean_domain or host.endswith(f".{clean_domain}")
def is_adult_url(url: str) -> bool:
"""Return True when a URL appears to point at pornographic/adult content."""
lowered = url.lower()
parsed = urlsplit(lowered)
host = _clean_host(lowered)
if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS):
return True
if any(marker in host for marker in EXPLICIT_HOST_MARKERS):
return True
host_tokens = set(re.split(r"[^a-z0-9]+", host))
if any(token in host_tokens for token in EXPLICIT_HOST_TOKENS):
return True
path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}"))
return any(keyword in path_tokens for keyword in ADULT_KEYWORDS)
def is_adult_text(text: str) -> bool:
"""Use a conservative keyword threshold so one incidental word does not block a page."""
lowered = text.lower()
hits = sum(1 for keyword in ADULT_KEYWORDS if keyword in lowered)
return hits >= 3
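A quick sanity sketch of the filter tiers. The host-marker and host-token checks are hard-coded above, so those calls do not depend on config; the is_adult_text assertion assumes the unseen ADULT_KEYWORDS list does not match three substrings of the sample string:

from content_filter import is_adult_text, is_adult_url

assert is_adult_url("https://www.redtube.com/video/1")          # hard-coded host marker
assert is_adult_url("https://sexy.example.org/")                # hard-coded host token
assert not is_adult_url("https://en.wikipedia.org/wiki/Essex")  # "essex" is one token, not "sex"
assert not is_adult_text("a single incidental keyword")         # below the 3-hit threshold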
crawler.py +309
@@ -0,0 +1,309 @@
"""Async web crawler used to build the sFetch index."""
from __future__ import annotations
import asyncio
from collections import defaultdict
from typing import Iterable
from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser
import httpx
from bs4 import BeautifulSoup
from config import (
CRAWL_DELAY_SECONDS,
DEFAULT_CRAWL_CONCURRENCY,
MAX_CRAWL_DEPTH,
MAX_PAGES_PER_DOMAIN,
USER_AGENT,
)
from content_filter import is_adult_text, is_adult_url
from indexer import index_page
class sFetchBot:
"""A polite async crawler that stays within configurable crawl limits and filters adult content."""
def __init__(
self,
max_depth: int = MAX_CRAWL_DEPTH,
same_domain_only: bool = True,
crawl_delay: float = CRAWL_DELAY_SECONDS,
max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN,
max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY,
timeout_seconds: float = 15.0,
) -> None:
self.max_depth = max_depth
self.same_domain_only = same_domain_only
self.crawl_delay = crawl_delay
self.max_pages_per_domain = max_pages_per_domain
self.max_concurrency = max(1, max_concurrency)
self.timeout_seconds = timeout_seconds
self.visited: set[str] = set()
self.domain_counts: defaultdict[str, int] = defaultdict(int)
self.robots_cache: dict[str, RobotFileParser] = {}
self.indexed_count = 0
self._state_lock = asyncio.Lock()
self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency)
self._client: httpx.AsyncClient | None = None
async def start(self, seed_urls: list[str]) -> None:
if not seed_urls:
return
timeout = httpx.Timeout(self.timeout_seconds)
headers = {"User-Agent": USER_AGENT}
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
headers=headers,
) as client:
self._client = client
tasks = []
for seed_url in seed_urls:
normalized_seed = self._normalize_url(seed_url)
if normalized_seed is None:
print(f"sFetch: skipped {seed_url} (invalid URL)")
continue
if is_adult_url(normalized_seed):
print(f"sFetch: skipped {seed_url} (adult content filtered)")
continue
root_domain = urlsplit(normalized_seed).netloc.lower()
tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0))
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
self._client = None
async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None:
try:
if depth > self.max_depth:
return
normalized_url = self._normalize_url(url)
if normalized_url is None:
return
if is_adult_url(normalized_url):
print(f"sFetch: skipped {normalized_url} (adult)")
return
parsed = urlsplit(normalized_url)
current_domain = parsed.netloc.lower()
if self.same_domain_only and current_domain != root_domain:
return
if await self._already_seen(normalized_url):
return
if await self._domain_limit_reached(current_domain):
return
if not await self._is_allowed_by_robots(normalized_url):
return
client = self._require_client()
async with self._fetch_semaphore:
await asyncio.sleep(self.crawl_delay)
response = await client.get(normalized_url)
response.raise_for_status()
content_type = response.headers.get("content-type", "").lower()
if "text/html" not in content_type:
return
title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text)
if is_adult_text(body_text):
print(f"sFetch: skipped {normalized_url} (adult text)")
return
await index_page(normalized_url, title, body_text, images, videos)
await self._increment_domain_count(current_domain)
self.indexed_count += 1
print(f"sFetch: indexed {normalized_url}")
for link in links:
await self._crawl_url(link, root_domain, depth + 1)
except httpx.HTTPError as exc:
print(f"sFetch: HTTP error {url} ({exc})")
except Exception as exc:
print(f"sFetch: error {url} ({exc})")
def _require_client(self) -> httpx.AsyncClient:
if self._client is None:
raise RuntimeError("Crawler client is not initialized.")
return self._client
async def _already_seen(self, url: str) -> bool:
async with self._state_lock:
if url in self.visited:
return True
self.visited.add(url)
return False
async def _domain_limit_reached(self, domain: str) -> bool:
async with self._state_lock:
return self.domain_counts[domain] >= self.max_pages_per_domain
async def _increment_domain_count(self, domain: str) -> None:
async with self._state_lock:
self.domain_counts[domain] += 1
async def _is_allowed_by_robots(self, url: str) -> bool:
parsed = urlsplit(url)
robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}"
parser = self.robots_cache.get(robots_key)
if parser is None:
parser = await self._fetch_robots_parser(robots_key)
self.robots_cache[robots_key] = parser
return parser.can_fetch(USER_AGENT, url)
async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser:
parser = RobotFileParser()
robots_url = f"{domain_base}/robots.txt"
parser.set_url(robots_url)
try:
client = self._require_client()
response = await client.get(robots_url)
if response.status_code == 200:
parser.parse(response.text.splitlines())
else:
parser.parse([])
except Exception:
parser.parse([])
return parser
def _extract_page_content(
self,
url: str,
html_text: str,
) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]:
soup = BeautifulSoup(html_text, "html.parser")
images = self._extract_images(url, soup)
videos = self._extract_videos(url, soup)
for element in soup(["script", "style", "noscript"]):
element.decompose()
title = ""
if soup.title and soup.title.string:
title = soup.title.string.strip()
if not title:
title = url
body_text = soup.get_text(separator=" ", strip=True)
links = self._extract_links(url, soup)
return title, body_text, links, images, videos
def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
images = []
for img in soup.find_all("img", src=True):
src = str(img["src"]).strip()
if not src or src.startswith(("data:", "javascript:")):
continue
absolute_url = urljoin(base_url, src)
normalized_url = self._normalize_url(absolute_url)
if normalized_url is not None:
alt = str(img.get("alt", "")).strip()
images.append({"url": normalized_url, "alt_text": alt})
return self._dedupe_media(images)
def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
videos: list[dict[str, str]] = []
for video in soup.find_all("video"):
if video.get("src"):
normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip()))
if normalized:
title = str(video.get("title") or video.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for source in video.find_all("source", src=True):
normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip()))
if normalized:
title = str(video.get("title") or video.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for iframe in soup.find_all("iframe", src=True):
raw_src = str(iframe["src"]).strip()
normalized = self._normalize_url(urljoin(base_url, raw_src))
if normalized and self._is_video_url(normalized):
title = str(iframe.get("title") or iframe.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for tag in soup.find_all("a", href=True):
raw_href = str(tag["href"]).strip()
normalized = self._normalize_url(urljoin(base_url, raw_href))
if normalized and self._is_video_url(normalized):
title = " ".join(tag.stripped_strings).strip()
videos.append({"url": normalized, "title": title})
return self._dedupe_media(videos)
def _is_video_url(self, url: str) -> bool:
lowered = url.lower()
return any(
marker in lowered
for marker in (
"youtube.com/watch",
"youtube.com/embed/",
"youtu.be/",
"vimeo.com/",
".mp4",
".webm",
".mov",
".m3u8",
)
)
def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]:
seen: set[str] = set()
unique: list[dict[str, str]] = []
for item in items:
media_url = item.get("url")
if not media_url or media_url in seen:
continue
seen.add(media_url)
unique.append(item)
return unique
def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]:
collected_links: list[str] = []
for tag in soup.find_all("a", href=True):
href = str(tag["href"]).strip()
if not href or href.startswith(("javascript:", "mailto:", "tel:")):
continue
absolute_url = urljoin(base_url, href)
normalized_url = self._normalize_url(absolute_url)
if normalized_url is not None:
collected_links.append(normalized_url)
return self._dedupe_links(collected_links)
def _dedupe_links(self, links: Iterable[str]) -> list[str]:
seen: set[str] = set()
unique_links: list[str] = []
for link in links:
if link in seen:
continue
seen.add(link)
unique_links.append(link)
return unique_links
def _normalize_url(self, url: str) -> str | None:
if not url:
return None
clean_url, _ = urldefrag(url.strip())
parsed = urlsplit(clean_url)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
return None
normalized = parsed._replace(
scheme=parsed.scheme.lower(),
netloc=parsed.netloc.lower(),
)
return urlunsplit(normalized)
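A minimal sketch of driving the crawler directly, without the API layer. It assumes init_db() from database.py has been run so the indexer's inserts have tables to write into; the seed URL is a placeholder:

import asyncio

from crawler import sFetchBot
from database import init_db


async def main() -> None:
    await init_db()  # index_page() expects the schema to exist
    bot = sFetchBot(max_depth=1, same_domain_only=True, max_concurrency=4)
    await bot.start(["https://example.com/"])  # placeholder seed
    print(f"indexed {bot.indexed_count} pages")


asyncio.run(main())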
database.py +395
@@ -0,0 +1,395 @@
"""Async SQLite helpers for sFetch's crawl index."""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import Any, AsyncIterator
import aiosqlite
from config import DB_PATH
@asynccontextmanager
async def _get_connection() -> AsyncIterator[aiosqlite.Connection]:
async with aiosqlite.connect(DB_PATH) as connection:
connection.row_factory = aiosqlite.Row
await connection.execute("PRAGMA foreign_keys = ON;")
await connection.execute("PRAGMA journal_mode = WAL;")
yield connection
def _to_fts_query(query: str) -> str:
tokens: list[str] = []
for raw_token in query.split():
token = raw_token.strip()
if not token:
continue
escaped = token.replace('"', '""')
tokens.append(f'"{escaped}"')
return " OR ".join(tokens)
async def init_db() -> None:
async with _get_connection() as connection:
await connection.executescript(
"""
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
title TEXT,
body_text TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts
USING fts5(title, body_text, content='pages', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS pages_ai
AFTER INSERT ON pages
BEGIN
INSERT INTO pages_fts(rowid, title, body_text)
VALUES (new.id, new.title, new.body_text);
END;
CREATE TRIGGER IF NOT EXISTS pages_ad
AFTER DELETE ON pages
BEGIN
INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
VALUES ('delete', old.id, old.title, old.body_text);
END;
CREATE TRIGGER IF NOT EXISTS pages_au
AFTER UPDATE ON pages
BEGIN
INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
VALUES ('delete', old.id, old.title, old.body_text);
INSERT INTO pages_fts(rowid, title, body_text)
VALUES (new.id, new.title, new.body_text);
END;
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
page_url TEXT NOT NULL,
alt_text TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
);
CREATE VIRTUAL TABLE IF NOT EXISTS images_fts
USING fts5(alt_text, content='images', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS images_ai
AFTER INSERT ON images
BEGIN
INSERT INTO images_fts(rowid, alt_text)
VALUES (new.id, new.alt_text);
END;
CREATE TRIGGER IF NOT EXISTS images_ad
AFTER DELETE ON images
BEGIN
INSERT INTO images_fts(images_fts, rowid, alt_text)
VALUES ('delete', old.id, old.alt_text);
END;
CREATE TRIGGER IF NOT EXISTS images_au
AFTER UPDATE ON images
BEGIN
INSERT INTO images_fts(images_fts, rowid, alt_text)
VALUES ('delete', old.id, old.alt_text);
INSERT INTO images_fts(rowid, alt_text)
VALUES (new.id, new.alt_text);
END;
CREATE TABLE IF NOT EXISTS videos (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
page_url TEXT NOT NULL,
title TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
);
CREATE VIRTUAL TABLE IF NOT EXISTS videos_fts
USING fts5(title, content='videos', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS videos_ai
AFTER INSERT ON videos
BEGIN
INSERT INTO videos_fts(rowid, title)
VALUES (new.id, new.title);
END;
CREATE TRIGGER IF NOT EXISTS videos_ad
AFTER DELETE ON videos
BEGIN
INSERT INTO videos_fts(videos_fts, rowid, title)
VALUES ('delete', old.id, old.title);
END;
CREATE TRIGGER IF NOT EXISTS videos_au
AFTER UPDATE ON videos
BEGIN
INSERT INTO videos_fts(videos_fts, rowid, title)
VALUES ('delete', old.id, old.title);
INSERT INTO videos_fts(rowid, title)
VALUES (new.id, new.title);
END;
CREATE TABLE IF NOT EXISTS app_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
"""
)
await connection.commit()
async def get_meta_value(key: str) -> str | None:
async with _get_connection() as connection:
cursor = await connection.execute(
"SELECT value FROM app_meta WHERE key = ?",
(key,),
)
row = await cursor.fetchone()
await cursor.close()
return str(row["value"]) if row else None
async def set_meta_value(key: str, value: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO app_meta (key, value)
VALUES (?, ?)
ON CONFLICT(key) DO UPDATE SET
value = excluded.value,
updated_at = CURRENT_TIMESTAMP
""",
(key, value),
)
await connection.commit()
async def insert_page(url: str, title: str, body_text: str) -> int:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO pages (url, title, body_text)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
title = excluded.title,
body_text = excluded.body_text,
indexed_at = CURRENT_TIMESTAMP
""",
(url, title, body_text),
)
await connection.commit()
cursor = await connection.execute(
"SELECT id FROM pages WHERE url = ?",
(url,),
)
row = await cursor.fetchone()
await cursor.close()
if row is None:
raise RuntimeError("Inserted page could not be reloaded from the database.")
return int(row["id"])
async def insert_image(url: str, page_url: str, alt_text: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO images (url, page_url, alt_text)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
page_url = excluded.page_url,
alt_text = excluded.alt_text,
indexed_at = CURRENT_TIMESTAMP
""",
(url, page_url, alt_text),
)
await connection.commit()
async def insert_video(url: str, page_url: str, title: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO videos (url, page_url, title)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
page_url = excluded.page_url,
title = excluded.title,
indexed_at = CURRENT_TIMESTAMP
""",
(url, page_url, title),
)
await connection.commit()
async def search_pages(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
p.id,
p.url,
p.title,
p.body_text,
p.indexed_at
FROM pages_fts
JOIN pages AS p ON p.id = pages_fts.rowid
WHERE pages_fts MATCH ?
ORDER BY bm25(pages_fts), p.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_search_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM pages_fts
WHERE pages_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def search_images(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
i.id,
i.url,
i.page_url,
i.alt_text,
i.indexed_at
FROM images_fts
JOIN images AS i ON i.id = images_fts.rowid
WHERE images_fts MATCH ?
ORDER BY bm25(images_fts), i.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_image_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM images_fts
WHERE images_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def search_videos(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
v.id,
v.url,
v.page_url,
v.title,
v.indexed_at
FROM videos_fts
JOIN videos AS v ON v.id = videos_fts.rowid
WHERE videos_fts MATCH ?
ORDER BY bm25(videos_fts), v.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_video_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM videos_fts
WHERE videos_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def get_stats() -> dict[str, Any]:
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
COUNT(*) AS total_pages,
MAX(indexed_at) AS last_indexed_at
FROM pages
"""
)
row = await cursor.fetchone()
await cursor.close()
return {
"total_pages": int(row["total_pages"]) if row and row["total_pages"] is not None else 0,
"last_indexed_at": row["last_indexed_at"] if row else None,
}
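A short end-to-end sketch of the storage layer on its own, assuming a fresh database at DB_PATH; the URL and text are placeholders:

import asyncio

from database import count_search_results, init_db, insert_page, search_pages


async def demo() -> None:
    await init_db()
    page_id = await insert_page(
        url="https://example.com/",
        title="Example Domain",
        body_text="This domain is for use in illustrative examples.",
    )
    print(page_id, await count_search_results("illustrative"))  # 1 1 on a fresh database
    for row in await search_pages("illustrative examples"):
        print(row["url"], row["title"])


asyncio.run(demo())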
indexer.py +41
@@ -0,0 +1,41 @@
"""Normalization and indexing helpers for crawled pages."""
from __future__ import annotations
import re
from database import insert_image, insert_page, insert_video
MAX_BODY_LENGTH = 10_000
def _normalize_text(body_text: str) -> str:
collapsed = re.sub(r"\s+", " ", body_text).strip()
return collapsed[:MAX_BODY_LENGTH]
async def index_page(
url: str,
title: str,
body_text: str,
images: list[dict[str, str]] | None = None,
videos: list[dict[str, str]] | None = None,
) -> None:
normalized_title = title.strip() or url
normalized_body = _normalize_text(body_text)
if not normalized_body:
return
await insert_page(url=url, title=normalized_title, body_text=normalized_body)
if images:
for img in images:
img_url = img.get("url")
alt_text = img.get("alt_text", "")
if img_url:
await insert_image(url=img_url, page_url=url, alt_text=alt_text)
if videos:
for video in videos:
video_url = video.get("url")
video_title = video.get("title") or normalized_title
if video_url:
await insert_video(url=video_url, page_url=url, title=video_title.strip())
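What these helpers do to a noisy page, sketched with placeholder values (assumes init_db() has run):

import asyncio

from database import init_db
from indexer import index_page


async def demo() -> None:
    await init_db()
    await index_page(
        url="https://example.com/about",
        title="  About  ",                 # titles only lose leading/trailing whitespace
        body_text="word " * 5_000,         # 25,000 chars; body whitespace is collapsed,
                                           # then truncated to MAX_BODY_LENGTH (10,000)
        images=[{"url": "https://example.com/logo.png", "alt_text": "logo"}],
    )


asyncio.run(demo())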
main.py +207
@@ -0,0 +1,207 @@
"""FastAPI entry point for the sFetch backend."""
from __future__ import annotations
import asyncio
from datetime import UTC, datetime
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
count_image_results,
count_search_results,
count_video_results,
get_meta_value,
get_stats,
init_db,
set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls
app = FastAPI(title="sFetch API", version="1.0.0")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=False,
allow_methods=["*"],
allow_headers=["*"],
)
def _utc_now() -> str:
return datetime.now(UTC).isoformat()
def _set_seed_status(**updates: object) -> None:
current = getattr(app.state, "_top_scrape_status", {}).copy()
current.update({"updated_at": _utc_now(), **updates})
app.state._top_scrape_status = current
async def _scrape_top_sites(force: bool = False) -> None:
await init_db()
async with app.state._crawl_lock:
if app.state._top_scrape_done and not force:
return
existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
if existing_seed and not force:
stats = await get_stats()
_set_seed_status(
state="stored",
message="Top-site seed already stored in the database.",
total=TOP_SITE_SEED_LIMIT,
indexed=stats["total_pages"],
source=existing_seed,
)
app.state._top_scrape_done = True
return
stats = await get_stats()
if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
source = "existing database"
await set_meta_value(TOP_SITE_SEED_META_KEY, source)
_set_seed_status(
state="stored",
message="Top-site seed already stored in the database.",
total=TOP_SITE_SEED_LIMIT,
indexed=stats["total_pages"],
source=source,
)
app.state._top_scrape_done = True
return
_set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
_set_seed_status(
state="running",
message=f"Seeding {len(seed_urls)} non-adult top sites.",
total=len(seed_urls),
indexed=0,
source=source,
)
print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
try:
await bot.start(seed_urls)
except Exception as exc:
_set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
print(f"sFetch: top-site seed failed ({exc})")
return
await set_meta_value(TOP_SITE_SEED_META_KEY, source)
_set_seed_status(
state="complete",
message="Top-site seed complete.",
total=len(seed_urls),
indexed=bot.indexed_count,
source=source,
)
print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
app.state._top_scrape_done = True
@app.on_event("startup")
async def startup_event() -> None:
app.state._top_scrape_done = False
app.state._crawl_lock = asyncio.Lock()
app.state._top_scrape_status = {
"state": "idle",
"message": "Waiting to check top-site seed.",
"total": TOP_SITE_SEED_LIMIT,
"indexed": 0,
"source": None,
"updated_at": _utc_now(),
}
asyncio.create_task(_scrape_top_sites())
@app.get("/")
async def health_check() -> dict[str, str]:
return {"status": "sFetch is alive"}
@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
q: str = Query(..., description="Search query"),
type: str = Query("web", description="Search type: web, image, or video"),
limit: int = Query(10, ge=1, le=50),
offset: int = Query(0, ge=0),
) -> SearchResponse:
query = q.strip()
if not query:
raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")
if type == "image":
results = await search_images_api(query=query, limit=limit, offset=offset)
total = await count_image_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
if type == "video":
results = await search_videos_api(query=query, limit=limit, offset=offset)
total = await count_video_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
if type != "web":
raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")
results = await search(query=query, limit=limit, offset=offset)
total = await count_search_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
async def _run_crawl_job(request: CrawlRequest) -> None:
try:
bot = sFetchBot(
max_depth=request.max_depth,
max_pages_per_domain=request.max_pages_per_domain,
same_domain_only=request.same_domain_only,
)
await bot.start(request.seed_urls)
except Exception as exc:
print(f"sFetch: crawl job failed ({exc})")
@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
background_tasks.add_task(_run_crawl_job, request)
return {"message": "Crawl started", "seed_urls": request.seed_urls}
@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
background_tasks: BackgroundTasks,
force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
background_tasks.add_task(_scrape_top_sites, force)
return {"message": "Top-site crawl queued", "force": force}
@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
return getattr(
app.state,
"_top_scrape_status",
{
"state": "idle",
"message": "Top-site seed has not started.",
"total": TOP_SITE_SEED_LIMIT,
"indexed": 0,
"source": None,
"updated_at": None,
},
)
@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
stats = await get_stats()
return stats
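Assuming the app is served locally (for example with uvicorn main:app; the host and port below are placeholders), the endpoints can be exercised from Python like this:

import httpx

BASE = "http://127.0.0.1:8000"  # placeholder: wherever uvicorn is serving main:app

with httpx.Client(base_url=BASE) as client:
    print(client.get("/").json())  # {"status": "sFetch is alive"}
    client.post("/crawl", json={"seed_urls": ["https://example.com/"], "max_depth": 1})
    print(client.get("/crawl/top-sites/status").json())
    print(client.get("/search", params={"q": "example", "type": "web", "limit": 5}).json())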
models.py +43
@@ -0,0 +1,43 @@
"""Pydantic models for sFetch's API."""
from __future__ import annotations
from pydantic import BaseModel, Field
class SearchResult(BaseModel):
id: int
url: str
title: str
snippet: str
indexed_at: str
class ImageResult(BaseModel):
id: int
url: str
page_url: str
alt_text: str
indexed_at: str
class VideoResult(BaseModel):
id: int
url: str
page_url: str
title: str
indexed_at: str
class SearchResponse(BaseModel):
query: str
type: str = "web"
total: int
results: list[SearchResult] | list[ImageResult] | list[VideoResult]
class CrawlRequest(BaseModel):
seed_urls: list[str] = Field(min_length=1)
max_depth: int = Field(default=2, ge=0, le=5)
max_pages_per_domain: int = Field(default=50, ge=1, le=500)
same_domain_only: bool = True
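The Field constraints reject bad crawl jobs before they reach the crawler. A small sketch of the validation behavior (min_length on a list is the Pydantic v2 spelling, which this file's usage implies):

from pydantic import ValidationError

from models import CrawlRequest

req = CrawlRequest(seed_urls=["https://example.com/"])
print(req.max_depth, req.max_pages_per_domain, req.same_domain_only)  # 2 50 True

try:
    CrawlRequest(seed_urls=[], max_depth=9)  # empty list and depth > 5 both fail
except ValidationError as exc:
    print(exc.error_count())  # 2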
requirements.txt +6
@@ -0,0 +1,6 @@
fastapi
uvicorn[standard]
httpx
beautifulsoup4
pydantic
aiosqlite
searcher.py +90
@@ -0,0 +1,90 @@
"""Search result shaping for sFetch."""
from __future__ import annotations
import html
import re
from database import search_images, search_pages, search_videos
SNIPPET_LENGTH = 200
def _extract_terms(query: str) -> list[str]:
terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
return sorted(terms, key=len, reverse=True)
def _build_snippet(body_text: str) -> str:
snippet = body_text[:SNIPPET_LENGTH].strip()
if not snippet:
return "No preview available."
if len(body_text) > SNIPPET_LENGTH:
return f"{snippet}..."
return snippet
def _highlight_terms(snippet: str, query: str) -> str:
safe_snippet = html.escape(snippet)
for term in _extract_terms(query):
pattern = re.compile(re.escape(html.escape(term)), flags=re.IGNORECASE)
safe_snippet = pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", safe_snippet)
return safe_snippet
async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_pages(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
title = (row.get("title") or row.get("url") or "Untitled").strip()
body_text = row.get("body_text") or ""
snippet = _highlight_terms(_build_snippet(body_text), query)
results.append(
{
"id": row["id"],
"url": row["url"],
"title": title,
"snippet": snippet,
"indexed_at": row["indexed_at"],
}
)
return results
async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_images(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
results.append(
{
"id": row["id"],
"url": row["url"],
"page_url": row["page_url"],
"alt_text": row["alt_text"] or "",
"indexed_at": row["indexed_at"],
}
)
return results
async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_videos(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
title = (row.get("title") or "Video result").strip()
results.append(
{
"id": row["id"],
"url": row["url"],
"page_url": row["page_url"],
"title": title,
"indexed_at": row["indexed_at"],
}
)
return results
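Because _highlight_terms escapes the snippet before wrapping matches, markup in crawled text is neutralized rather than rendered. A quick sketch using the module-private helpers, imported here just for illustration:

from searcher import _build_snippet, _highlight_terms

body = "<b>Rust</b> is a systems language. " + "filler " * 40
print(_highlight_terms(_build_snippet(body), "rust systems"))
# -> &lt;b&gt;<mark>Rust</mark>&lt;/b&gt; is a <mark>systems</mark> language. filler ...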
top_sites.py +110
@@ -0,0 +1,110 @@
"""Load and sanitize the top-site seed list for first-launch indexing."""
from __future__ import annotations
import csv
import io
import zipfile
from collections.abc import Iterable
from urllib.parse import urlsplit, urlunsplit
import httpx
from config import (
TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
TOP_SITE_SEED_LIMIT,
TOP_SITE_SOURCE_URL,
TOP_SITES,
USER_AGENT,
)
from content_filter import is_adult_url
def _normalize_site_url(value: str) -> str | None:
raw_value = value.strip()
if not raw_value:
return None
candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
parsed = urlsplit(candidate)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
return None
normalized = parsed._replace(
scheme=parsed.scheme.lower(),
netloc=parsed.netloc.lower(),
path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
query="",
fragment="",
)
return urlunsplit(normalized)
def _host_key(url: str) -> str:
return urlsplit(url).netloc.lower().removeprefix("www.")
def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
safe_urls: list[str] = []
seen_hosts: set[str] = set()
for candidate in candidates:
normalized = _normalize_site_url(candidate)
if normalized is None:
continue
host_key = _host_key(normalized)
if host_key in seen_hosts or is_adult_url(normalized):
continue
seen_hosts.add(host_key)
safe_urls.append(normalized)
if len(safe_urls) >= limit:
break
return safe_urls
def _domains_from_csv_text(csv_text: str) -> list[str]:
domains: list[str] = []
reader = csv.reader(io.StringIO(csv_text))
for row in reader:
if not row:
continue
domain = row[1] if len(row) > 1 else row[0]
if domain and domain.lower() != "domain":
domains.append(domain)
return domains
def _domains_from_zip(payload: bytes) -> list[str]:
with zipfile.ZipFile(io.BytesIO(payload)) as archive:
csv_name = next((name for name in archive.namelist() if name.endswith(".csv")), None)
if csv_name is None:
raise ValueError("Tranco archive did not contain a CSV file.")
with archive.open(csv_name) as csv_file:
text = csv_file.read().decode("utf-8", errors="replace")
return _domains_from_csv_text(text)
async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
"""Return the latest safe top-site URLs, falling back to the bundled list if needed."""
timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
headers = {"User-Agent": USER_AGENT}
try:
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as client:
response = await client.get(TOP_SITE_SOURCE_URL)
response.raise_for_status()
if response.content.startswith(b"PK"):
candidates = _domains_from_zip(response.content)
else:
candidates = _domains_from_csv_text(response.text)
safe_urls = _safe_top_urls(candidates, limit=limit)
if safe_urls:
return safe_urls, TOP_SITE_SOURCE_URL
except Exception as exc:
print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")
return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
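How the sanitizer treats raw Tranco-style rows (bare domains) versus full URLs; illustrative values only (redtube.com is caught by the hard-coded host markers in content_filter):

from top_sites import _normalize_site_url, _safe_top_urls

print(_normalize_site_url("Example.com"))           # https://example.com
print(_normalize_site_url("http://www.a.com/x/"))   # http://www.a.com/x (trailing slash trimmed)
print(_normalize_site_url("ftp://mirror.net"))      # None (non-http scheme)

# Dedupes on the www-stripped host and drops anything is_adult_url flags.
print(_safe_top_urls(["example.com", "www.example.com", "redtube.com"], limit=10))
# -> ['https://example.com']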