Initial commit

Ned Halksworth
2026-05-04 19:31:46 +01:00
commit e0f2eedcd9
14 changed files with 3718 additions and 0 deletions
+1226
File diff suppressed because it is too large.
content_filter.py +61
@@ -0,0 +1,61 @@
"""Adult-content filtering helpers used before URLs reach the index."""
from __future__ import annotations
import re
from urllib.parse import urlsplit
from config import ADULT_DOMAINS, ADULT_KEYWORDS
EXPLICIT_HOST_MARKERS = (
"porn",
"xxx",
"xvideo",
"xnxx",
"hentai",
"camgirl",
"camsoda",
"chaturbate",
"stripchat",
"redtube",
)
EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"}
def _clean_host(url: str) -> str:
host = urlsplit(url.lower()).netloc
return host.removeprefix("www.")
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
clean_domain = domain.lower().removeprefix("www.")
return host == clean_domain or host.endswith(f".{clean_domain}")
def is_adult_url(url: str) -> bool:
"""Return True when a URL appears to point at pornographic/adult content."""
lowered = url.lower()
parsed = urlsplit(lowered)
host = _clean_host(lowered)
if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS):
return True
if any(marker in host for marker in EXPLICIT_HOST_MARKERS):
return True
host_tokens = set(re.split(r"[^a-z0-9]+", host))
if any(token in host_tokens for token in EXPLICIT_HOST_TOKENS):
return True
path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}"))
return any(keyword in path_tokens for keyword in ADULT_KEYWORDS)
def is_adult_text(text: str) -> bool:
"""Use a conservative keyword threshold so one incidental word does not block a page."""
lowered = text.lower()
hits = sum(1 for keyword in ADULT_KEYWORDS if keyword in lowered)
return hits >= 3
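A quick sanity sketch of the filter tiers. The host-marker and host-token checks are hard-coded above, so those calls do not depend on config; the is_adult_text assertion assumes the unseen ADULT_KEYWORDS list does not match three substrings of the sample string:

from content_filter import is_adult_text, is_adult_url

assert is_adult_url("https://www.redtube.com/video/1")          # hard-coded host marker
assert is_adult_url("https://sexy.example.org/")                # hard-coded host token
assert not is_adult_url("https://en.wikipedia.org/wiki/Essex")  # "essex" is one token, not "sex"
assert not is_adult_text("a single incidental keyword")         # below the 3-hit threshold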
crawler.py +309
@@ -0,0 +1,309 @@
"""Async web crawler used to build the sFetch index."""
from __future__ import annotations
import asyncio
from collections import defaultdict
from typing import Iterable
from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser
import httpx
from bs4 import BeautifulSoup
from config import (
CRAWL_DELAY_SECONDS,
DEFAULT_CRAWL_CONCURRENCY,
MAX_CRAWL_DEPTH,
MAX_PAGES_PER_DOMAIN,
USER_AGENT,
)
from content_filter import is_adult_text, is_adult_url
from indexer import index_page
class sFetchBot:
"""A polite async crawler that stays within configurable crawl limits and filters adult content."""
def __init__(
self,
max_depth: int = MAX_CRAWL_DEPTH,
same_domain_only: bool = True,
crawl_delay: float = CRAWL_DELAY_SECONDS,
max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN,
max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY,
timeout_seconds: float = 15.0,
) -> None:
self.max_depth = max_depth
self.same_domain_only = same_domain_only
self.crawl_delay = crawl_delay
self.max_pages_per_domain = max_pages_per_domain
self.max_concurrency = max(1, max_concurrency)
self.timeout_seconds = timeout_seconds
self.visited: set[str] = set()
self.domain_counts: defaultdict[str, int] = defaultdict(int)
self.robots_cache: dict[str, RobotFileParser] = {}
self.indexed_count = 0
self._state_lock = asyncio.Lock()
self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency)
self._client: httpx.AsyncClient | None = None
async def start(self, seed_urls: list[str]) -> None:
if not seed_urls:
return
timeout = httpx.Timeout(self.timeout_seconds)
headers = {"User-Agent": USER_AGENT}
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
headers=headers,
) as client:
self._client = client
tasks = []
for seed_url in seed_urls:
normalized_seed = self._normalize_url(seed_url)
if normalized_seed is None:
print(f"sFetch: skipped {seed_url} (invalid URL)")
continue
if is_adult_url(normalized_seed):
print(f"sFetch: skipped {seed_url} (adult content filtered)")
continue
root_domain = urlsplit(normalized_seed).netloc.lower()
tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0))
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
self._client = None
async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None:
try:
if depth > self.max_depth:
return
normalized_url = self._normalize_url(url)
if normalized_url is None:
return
if is_adult_url(normalized_url):
print(f"sFetch: skipped {normalized_url} (adult)")
return
parsed = urlsplit(normalized_url)
current_domain = parsed.netloc.lower()
if self.same_domain_only and current_domain != root_domain:
return
if await self._already_seen(normalized_url):
return
if await self._domain_limit_reached(current_domain):
return
if not await self._is_allowed_by_robots(normalized_url):
return
client = self._require_client()
async with self._fetch_semaphore:
await asyncio.sleep(self.crawl_delay)
response = await client.get(normalized_url)
response.raise_for_status()
content_type = response.headers.get("content-type", "").lower()
if "text/html" not in content_type:
return
title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text)
if is_adult_text(body_text):
print(f"sFetch: skipped {normalized_url} (adult text)")
return
await index_page(normalized_url, title, body_text, images, videos)
await self._increment_domain_count(current_domain)
self.indexed_count += 1
print(f"sFetch: indexed {normalized_url}")
for link in links:
await self._crawl_url(link, root_domain, depth + 1)
except httpx.HTTPError as exc:
print(f"sFetch: HTTP error {url} ({exc})")
except Exception as exc:
print(f"sFetch: error {url} ({exc})")
def _require_client(self) -> httpx.AsyncClient:
if self._client is None:
raise RuntimeError("Crawler client is not initialized.")
return self._client
async def _already_seen(self, url: str) -> bool:
async with self._state_lock:
if url in self.visited:
return True
self.visited.add(url)
return False
async def _domain_limit_reached(self, domain: str) -> bool:
async with self._state_lock:
return self.domain_counts[domain] >= self.max_pages_per_domain
async def _increment_domain_count(self, domain: str) -> None:
async with self._state_lock:
self.domain_counts[domain] += 1
async def _is_allowed_by_robots(self, url: str) -> bool:
parsed = urlsplit(url)
robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}"
parser = self.robots_cache.get(robots_key)
if parser is None:
parser = await self._fetch_robots_parser(robots_key)
self.robots_cache[robots_key] = parser
return parser.can_fetch(USER_AGENT, url)
async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser:
parser = RobotFileParser()
robots_url = f"{domain_base}/robots.txt"
parser.set_url(robots_url)
try:
client = self._require_client()
response = await client.get(robots_url)
if response.status_code == 200:
parser.parse(response.text.splitlines())
else:
parser.parse([])
except Exception:
parser.parse([])
return parser
def _extract_page_content(
self,
url: str,
html_text: str,
) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]:
soup = BeautifulSoup(html_text, "html.parser")
images = self._extract_images(url, soup)
videos = self._extract_videos(url, soup)
for element in soup(["script", "style", "noscript"]):
element.decompose()
title = ""
if soup.title and soup.title.string:
title = soup.title.string.strip()
if not title:
title = url
body_text = soup.get_text(separator=" ", strip=True)
links = self._extract_links(url, soup)
return title, body_text, links, images, videos
def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
images = []
for img in soup.find_all("img", src=True):
src = str(img["src"]).strip()
if not src or src.startswith(("data:", "javascript:")):
continue
absolute_url = urljoin(base_url, src)
normalized_url = self._normalize_url(absolute_url)
if normalized_url is not None:
alt = str(img.get("alt", "")).strip()
images.append({"url": normalized_url, "alt_text": alt})
return self._dedupe_media(images)
def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
videos: list[dict[str, str]] = []
for video in soup.find_all("video"):
if video.get("src"):
normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip()))
if normalized:
title = str(video.get("title") or video.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for source in video.find_all("source", src=True):
normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip()))
if normalized:
title = str(video.get("title") or video.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for iframe in soup.find_all("iframe", src=True):
raw_src = str(iframe["src"]).strip()
normalized = self._normalize_url(urljoin(base_url, raw_src))
if normalized and self._is_video_url(normalized):
title = str(iframe.get("title") or iframe.get("aria-label") or "").strip()
videos.append({"url": normalized, "title": title})
for tag in soup.find_all("a", href=True):
raw_href = str(tag["href"]).strip()
normalized = self._normalize_url(urljoin(base_url, raw_href))
if normalized and self._is_video_url(normalized):
title = " ".join(tag.stripped_strings).strip()
videos.append({"url": normalized, "title": title})
return self._dedupe_media(videos)
def _is_video_url(self, url: str) -> bool:
lowered = url.lower()
return any(
marker in lowered
for marker in (
"youtube.com/watch",
"youtube.com/embed/",
"youtu.be/",
"vimeo.com/",
".mp4",
".webm",
".mov",
".m3u8",
)
)
def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]:
seen: set[str] = set()
unique: list[dict[str, str]] = []
for item in items:
media_url = item.get("url")
if not media_url or media_url in seen:
continue
seen.add(media_url)
unique.append(item)
return unique
def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]:
collected_links: list[str] = []
for tag in soup.find_all("a", href=True):
href = str(tag["href"]).strip()
if not href or href.startswith(("javascript:", "mailto:", "tel:")):
continue
absolute_url = urljoin(base_url, href)
normalized_url = self._normalize_url(absolute_url)
if normalized_url is not None:
collected_links.append(normalized_url)
return self._dedupe_links(collected_links)
def _dedupe_links(self, links: Iterable[str]) -> list[str]:
seen: set[str] = set()
unique_links: list[str] = []
for link in links:
if link in seen:
continue
seen.add(link)
unique_links.append(link)
return unique_links
def _normalize_url(self, url: str) -> str | None:
if not url:
return None
clean_url, _ = urldefrag(url.strip())
parsed = urlsplit(clean_url)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
return None
normalized = parsed._replace(
scheme=parsed.scheme.lower(),
netloc=parsed.netloc.lower(),
)
return urlunsplit(normalized)
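A minimal sketch of driving the crawler directly, without the API layer. It assumes init_db() from database.py has been run so the indexer's inserts have tables to write into; the seed URL is a placeholder:

import asyncio

from crawler import sFetchBot
from database import init_db


async def main() -> None:
    await init_db()  # index_page() expects the schema to exist
    bot = sFetchBot(max_depth=1, same_domain_only=True, max_concurrency=4)
    await bot.start(["https://example.com/"])  # placeholder seed
    print(f"indexed {bot.indexed_count} pages")


asyncio.run(main())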
database.py +395
@@ -0,0 +1,395 @@
"""Async SQLite helpers for sFetch's crawl index."""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import Any, AsyncIterator
import aiosqlite
from config import DB_PATH
@asynccontextmanager
async def _get_connection() -> AsyncIterator[aiosqlite.Connection]:
async with aiosqlite.connect(DB_PATH) as connection:
connection.row_factory = aiosqlite.Row
await connection.execute("PRAGMA foreign_keys = ON;")
await connection.execute("PRAGMA journal_mode = WAL;")
yield connection
def _to_fts_query(query: str) -> str:
tokens: list[str] = []
for raw_token in query.split():
token = raw_token.strip()
if not token:
continue
escaped = token.replace('"', '""')
tokens.append(f'"{escaped}"')
return " OR ".join(tokens)
async def init_db() -> None:
async with _get_connection() as connection:
await connection.executescript(
"""
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
title TEXT,
body_text TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts
USING fts5(title, body_text, content='pages', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS pages_ai
AFTER INSERT ON pages
BEGIN
INSERT INTO pages_fts(rowid, title, body_text)
VALUES (new.id, new.title, new.body_text);
END;
CREATE TRIGGER IF NOT EXISTS pages_ad
AFTER DELETE ON pages
BEGIN
INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
VALUES ('delete', old.id, old.title, old.body_text);
END;
CREATE TRIGGER IF NOT EXISTS pages_au
AFTER UPDATE ON pages
BEGIN
INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
VALUES ('delete', old.id, old.title, old.body_text);
INSERT INTO pages_fts(rowid, title, body_text)
VALUES (new.id, new.title, new.body_text);
END;
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
page_url TEXT NOT NULL,
alt_text TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
);
CREATE VIRTUAL TABLE IF NOT EXISTS images_fts
USING fts5(alt_text, content='images', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS images_ai
AFTER INSERT ON images
BEGIN
INSERT INTO images_fts(rowid, alt_text)
VALUES (new.id, new.alt_text);
END;
CREATE TRIGGER IF NOT EXISTS images_ad
AFTER DELETE ON images
BEGIN
INSERT INTO images_fts(images_fts, rowid, alt_text)
VALUES ('delete', old.id, old.alt_text);
END;
CREATE TRIGGER IF NOT EXISTS images_au
AFTER UPDATE ON images
BEGIN
INSERT INTO images_fts(images_fts, rowid, alt_text)
VALUES ('delete', old.id, old.alt_text);
INSERT INTO images_fts(rowid, alt_text)
VALUES (new.id, new.alt_text);
END;
CREATE TABLE IF NOT EXISTS videos (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
page_url TEXT NOT NULL,
title TEXT,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
);
CREATE VIRTUAL TABLE IF NOT EXISTS videos_fts
USING fts5(title, content='videos', content_rowid='id');
CREATE TRIGGER IF NOT EXISTS videos_ai
AFTER INSERT ON videos
BEGIN
INSERT INTO videos_fts(rowid, title)
VALUES (new.id, new.title);
END;
CREATE TRIGGER IF NOT EXISTS videos_ad
AFTER DELETE ON videos
BEGIN
INSERT INTO videos_fts(videos_fts, rowid, title)
VALUES ('delete', old.id, old.title);
END;
CREATE TRIGGER IF NOT EXISTS videos_au
AFTER UPDATE ON videos
BEGIN
INSERT INTO videos_fts(videos_fts, rowid, title)
VALUES ('delete', old.id, old.title);
INSERT INTO videos_fts(rowid, title)
VALUES (new.id, new.title);
END;
CREATE TABLE IF NOT EXISTS app_meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
"""
)
await connection.commit()
async def get_meta_value(key: str) -> str | None:
async with _get_connection() as connection:
cursor = await connection.execute(
"SELECT value FROM app_meta WHERE key = ?",
(key,),
)
row = await cursor.fetchone()
await cursor.close()
return str(row["value"]) if row else None
async def set_meta_value(key: str, value: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO app_meta (key, value)
VALUES (?, ?)
ON CONFLICT(key) DO UPDATE SET
value = excluded.value,
updated_at = CURRENT_TIMESTAMP
""",
(key, value),
)
await connection.commit()
async def insert_page(url: str, title: str, body_text: str) -> int:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO pages (url, title, body_text)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
title = excluded.title,
body_text = excluded.body_text,
indexed_at = CURRENT_TIMESTAMP
""",
(url, title, body_text),
)
await connection.commit()
cursor = await connection.execute(
"SELECT id FROM pages WHERE url = ?",
(url,),
)
row = await cursor.fetchone()
await cursor.close()
if row is None:
raise RuntimeError("Inserted page could not be reloaded from the database.")
return int(row["id"])
async def insert_image(url: str, page_url: str, alt_text: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO images (url, page_url, alt_text)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
page_url = excluded.page_url,
alt_text = excluded.alt_text,
indexed_at = CURRENT_TIMESTAMP
""",
(url, page_url, alt_text),
)
await connection.commit()
async def insert_video(url: str, page_url: str, title: str) -> None:
async with _get_connection() as connection:
await connection.execute(
"""
INSERT INTO videos (url, page_url, title)
VALUES (?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
page_url = excluded.page_url,
title = excluded.title,
indexed_at = CURRENT_TIMESTAMP
""",
(url, page_url, title),
)
await connection.commit()
async def search_pages(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
p.id,
p.url,
p.title,
p.body_text,
p.indexed_at
FROM pages_fts
JOIN pages AS p ON p.id = pages_fts.rowid
WHERE pages_fts MATCH ?
ORDER BY bm25(pages_fts), p.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_search_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM pages_fts
WHERE pages_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def search_images(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
i.id,
i.url,
i.page_url,
i.alt_text,
i.indexed_at
FROM images_fts
JOIN images AS i ON i.id = images_fts.rowid
WHERE images_fts MATCH ?
ORDER BY bm25(images_fts), i.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_image_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM images_fts
WHERE images_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def search_videos(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
fts_query = _to_fts_query(query)
if not fts_query:
return []
safe_limit = max(1, min(limit, 50))
safe_offset = max(0, offset)
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
v.id,
v.url,
v.page_url,
v.title,
v.indexed_at
FROM videos_fts
JOIN videos AS v ON v.id = videos_fts.rowid
WHERE videos_fts MATCH ?
ORDER BY bm25(videos_fts), v.indexed_at DESC
LIMIT ? OFFSET ?
""",
(fts_query, safe_limit, safe_offset),
)
rows = await cursor.fetchall()
await cursor.close()
return [dict(row) for row in rows]
async def count_video_results(query: str) -> int:
fts_query = _to_fts_query(query)
if not fts_query:
return 0
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT COUNT(*) AS total
FROM videos_fts
WHERE videos_fts MATCH ?
""",
(fts_query,),
)
row = await cursor.fetchone()
await cursor.close()
return int(row["total"]) if row and row["total"] is not None else 0
async def get_stats() -> dict[str, Any]:
async with _get_connection() as connection:
cursor = await connection.execute(
"""
SELECT
COUNT(*) AS total_pages,
MAX(indexed_at) AS last_indexed_at
FROM pages
"""
)
row = await cursor.fetchone()
await cursor.close()
return {
"total_pages": int(row["total_pages"]) if row and row["total_pages"] is not None else 0,
"last_indexed_at": row["last_indexed_at"] if row else None,
}
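A short end-to-end sketch of the storage layer on its own, assuming a fresh database at DB_PATH; the URL and text are placeholders:

import asyncio

from database import count_search_results, init_db, insert_page, search_pages


async def demo() -> None:
    await init_db()
    page_id = await insert_page(
        url="https://example.com/",
        title="Example Domain",
        body_text="This domain is for use in illustrative examples.",
    )
    print(page_id, await count_search_results("illustrative"))  # 1 1 on a fresh database
    for row in await search_pages("illustrative examples"):
        print(row["url"], row["title"])


asyncio.run(demo())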
indexer.py +41
@@ -0,0 +1,41 @@
"""Normalization and indexing helpers for crawled pages."""
from __future__ import annotations
import re
from database import insert_image, insert_page, insert_video
MAX_BODY_LENGTH = 10_000
def _normalize_text(body_text: str) -> str:
collapsed = re.sub(r"\s+", " ", body_text).strip()
return collapsed[:MAX_BODY_LENGTH]
async def index_page(
url: str,
title: str,
body_text: str,
images: list[dict[str, str]] | None = None,
videos: list[dict[str, str]] | None = None,
) -> None:
normalized_title = title.strip() or url
normalized_body = _normalize_text(body_text)
if not normalized_body:
return
await insert_page(url=url, title=normalized_title, body_text=normalized_body)
if images:
for img in images:
img_url = img.get("url")
alt_text = img.get("alt_text", "")
if img_url:
await insert_image(url=img_url, page_url=url, alt_text=alt_text)
if videos:
for video in videos:
video_url = video.get("url")
video_title = video.get("title") or normalized_title
if video_url:
await insert_video(url=video_url, page_url=url, title=video_title.strip())
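What these helpers do to a noisy page, sketched with placeholder values (assumes init_db() has run):

import asyncio

from database import init_db
from indexer import index_page


async def demo() -> None:
    await init_db()
    await index_page(
        url="https://example.com/about",
        title="  About  ",                 # titles only lose leading/trailing whitespace
        body_text="word " * 5_000,         # 25,000 chars; body whitespace is collapsed,
                                           # then truncated to MAX_BODY_LENGTH (10,000)
        images=[{"url": "https://example.com/logo.png", "alt_text": "logo"}],
    )


asyncio.run(demo())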
main.py +207
@@ -0,0 +1,207 @@
"""FastAPI entry point for the sFetch backend."""
from __future__ import annotations
import asyncio
from datetime import UTC, datetime
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
count_image_results,
count_search_results,
count_video_results,
get_meta_value,
get_stats,
init_db,
set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls
app = FastAPI(title="sFetch API", version="1.0.0")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=False,
allow_methods=["*"],
allow_headers=["*"],
)
def _utc_now() -> str:
return datetime.now(UTC).isoformat()
def _set_seed_status(**updates: object) -> None:
current = getattr(app.state, "_top_scrape_status", {}).copy()
current.update({"updated_at": _utc_now(), **updates})
app.state._top_scrape_status = current
async def _scrape_top_sites(force: bool = False) -> None:
await init_db()
async with app.state._crawl_lock:
if app.state._top_scrape_done and not force:
return
existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
if existing_seed and not force:
stats = await get_stats()
_set_seed_status(
state="stored",
message="Top-site seed already stored in the database.",
total=TOP_SITE_SEED_LIMIT,
indexed=stats["total_pages"],
source=existing_seed,
)
app.state._top_scrape_done = True
return
stats = await get_stats()
if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
source = "existing database"
await set_meta_value(TOP_SITE_SEED_META_KEY, source)
_set_seed_status(
state="stored",
message="Top-site seed already stored in the database.",
total=TOP_SITE_SEED_LIMIT,
indexed=stats["total_pages"],
source=source,
)
app.state._top_scrape_done = True
return
_set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
_set_seed_status(
state="running",
message=f"Seeding {len(seed_urls)} non-adult top sites.",
total=len(seed_urls),
indexed=0,
source=source,
)
print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
try:
await bot.start(seed_urls)
except Exception as exc:
_set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
print(f"sFetch: top-site seed failed ({exc})")
return
await set_meta_value(TOP_SITE_SEED_META_KEY, source)
_set_seed_status(
state="complete",
message="Top-site seed complete.",
total=len(seed_urls),
indexed=bot.indexed_count,
source=source,
)
print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
app.state._top_scrape_done = True
@app.on_event("startup")
async def startup_event() -> None:
app.state._top_scrape_done = False
app.state._crawl_lock = asyncio.Lock()
app.state._top_scrape_status = {
"state": "idle",
"message": "Waiting to check top-site seed.",
"total": TOP_SITE_SEED_LIMIT,
"indexed": 0,
"source": None,
"updated_at": _utc_now(),
}
asyncio.create_task(_scrape_top_sites())
@app.get("/")
async def health_check() -> dict[str, str]:
return {"status": "sFetch is alive"}
@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
q: str = Query(..., description="Search query"),
type: str = Query("web", description="Search type: web, image, or video"),
limit: int = Query(10, ge=1, le=50),
offset: int = Query(0, ge=0),
) -> SearchResponse:
query = q.strip()
if not query:
raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")
if type == "image":
results = await search_images_api(query=query, limit=limit, offset=offset)
total = await count_image_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
if type == "video":
results = await search_videos_api(query=query, limit=limit, offset=offset)
total = await count_video_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
if type != "web":
raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")
results = await search(query=query, limit=limit, offset=offset)
total = await count_search_results(query)
return SearchResponse(query=query, type=type, total=total, results=results)
async def _run_crawl_job(request: CrawlRequest) -> None:
try:
bot = sFetchBot(
max_depth=request.max_depth,
max_pages_per_domain=request.max_pages_per_domain,
same_domain_only=request.same_domain_only,
)
await bot.start(request.seed_urls)
except Exception as exc:
print(f"sFetch: crawl job failed ({exc})")
@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
background_tasks.add_task(_run_crawl_job, request)
return {"message": "Crawl started", "seed_urls": request.seed_urls}
@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
background_tasks: BackgroundTasks,
force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
background_tasks.add_task(_scrape_top_sites, force)
return {"message": "Top-site crawl queued", "force": force}
@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
return getattr(
app.state,
"_top_scrape_status",
{
"state": "idle",
"message": "Top-site seed has not started.",
"total": TOP_SITE_SEED_LIMIT,
"indexed": 0,
"source": None,
"updated_at": None,
},
)
@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
stats = await get_stats()
return stats
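Assuming the app is served locally (for example with uvicorn main:app; the host and port below are placeholders), the endpoints can be exercised from Python like this:

import httpx

BASE = "http://127.0.0.1:8000"  # placeholder: wherever uvicorn is serving main:app

with httpx.Client(base_url=BASE) as client:
    print(client.get("/").json())  # {"status": "sFetch is alive"}
    client.post("/crawl", json={"seed_urls": ["https://example.com/"], "max_depth": 1})
    print(client.get("/crawl/top-sites/status").json())
    print(client.get("/search", params={"q": "example", "type": "web", "limit": 5}).json())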
models.py +43
@@ -0,0 +1,43 @@
"""Pydantic models for sFetch's API."""
from __future__ import annotations
from pydantic import BaseModel, Field
class SearchResult(BaseModel):
id: int
url: str
title: str
snippet: str
indexed_at: str
class ImageResult(BaseModel):
id: int
url: str
page_url: str
alt_text: str
indexed_at: str
class VideoResult(BaseModel):
id: int
url: str
page_url: str
title: str
indexed_at: str
class SearchResponse(BaseModel):
query: str
type: str = "web"
total: int
results: list[SearchResult] | list[ImageResult] | list[VideoResult]
class CrawlRequest(BaseModel):
seed_urls: list[str] = Field(min_length=1)
max_depth: int = Field(default=2, ge=0, le=5)
max_pages_per_domain: int = Field(default=50, ge=1, le=500)
same_domain_only: bool = True
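The Field constraints reject bad crawl jobs before they reach the crawler. A small sketch of the validation behavior (min_length on a list is the Pydantic v2 spelling, which this file's usage implies):

from pydantic import ValidationError

from models import CrawlRequest

req = CrawlRequest(seed_urls=["https://example.com/"])
print(req.max_depth, req.max_pages_per_domain, req.same_domain_only)  # 2 50 True

try:
    CrawlRequest(seed_urls=[], max_depth=9)  # empty list and depth > 5 both fail
except ValidationError as exc:
    print(exc.error_count())  # 2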
requirements.txt +6
@@ -0,0 +1,6 @@
fastapi
uvicorn[standard]
httpx
beautifulsoup4
pydantic
aiosqlite
searcher.py +90
@@ -0,0 +1,90 @@
"""Search result shaping for sFetch."""
from __future__ import annotations
import html
import re
from database import search_images, search_pages, search_videos
SNIPPET_LENGTH = 200
def _extract_terms(query: str) -> list[str]:
terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
return sorted(terms, key=len, reverse=True)
def _build_snippet(body_text: str) -> str:
snippet = body_text[:SNIPPET_LENGTH].strip()
if not snippet:
return "No preview available."
if len(body_text) > SNIPPET_LENGTH:
return f"{snippet}..."
return snippet
def _highlight_terms(snippet: str, query: str) -> str:
safe_snippet = html.escape(snippet)
for term in _extract_terms(query):
pattern = re.compile(re.escape(html.escape(term)), flags=re.IGNORECASE)
safe_snippet = pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", safe_snippet)
return safe_snippet
async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_pages(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
title = (row.get("title") or row.get("url") or "Untitled").strip()
body_text = row.get("body_text") or ""
snippet = _highlight_terms(_build_snippet(body_text), query)
results.append(
{
"id": row["id"],
"url": row["url"],
"title": title,
"snippet": snippet,
"indexed_at": row["indexed_at"],
}
)
return results
async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_images(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
results.append(
{
"id": row["id"],
"url": row["url"],
"page_url": row["page_url"],
"alt_text": row["alt_text"] or "",
"indexed_at": row["indexed_at"],
}
)
return results
async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
rows = await search_videos(query=query, limit=limit, offset=offset)
results: list[dict] = []
for row in rows:
title = (row.get("title") or "Video result").strip()
results.append(
{
"id": row["id"],
"url": row["url"],
"page_url": row["page_url"],
"title": title,
"indexed_at": row["indexed_at"],
}
)
return results
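Because _highlight_terms escapes the snippet before wrapping matches, markup in crawled text is neutralized rather than rendered. A quick sketch using the module-private helpers, imported here just for illustration:

from searcher import _build_snippet, _highlight_terms

body = "<b>Rust</b> is a systems language. " + "filler " * 40
print(_highlight_terms(_build_snippet(body), "rust systems"))
# -> &lt;b&gt;<mark>Rust</mark>&lt;/b&gt; is a <mark>systems</mark> language. filler ...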
top_sites.py +110
@@ -0,0 +1,110 @@
"""Load and sanitize the top-site seed list for first-launch indexing."""
from __future__ import annotations
import csv
import io
import zipfile
from collections.abc import Iterable
from urllib.parse import urlsplit, urlunsplit
import httpx
from config import (
TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
TOP_SITE_SEED_LIMIT,
TOP_SITE_SOURCE_URL,
TOP_SITES,
USER_AGENT,
)
from content_filter import is_adult_url
def _normalize_site_url(value: str) -> str | None:
raw_value = value.strip()
if not raw_value:
return None
candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
parsed = urlsplit(candidate)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
return None
normalized = parsed._replace(
scheme=parsed.scheme.lower(),
netloc=parsed.netloc.lower(),
path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
query="",
fragment="",
)
return urlunsplit(normalized)
def _host_key(url: str) -> str:
return urlsplit(url).netloc.lower().removeprefix("www.")
def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
safe_urls: list[str] = []
seen_hosts: set[str] = set()
for candidate in candidates:
normalized = _normalize_site_url(candidate)
if normalized is None:
continue
host_key = _host_key(normalized)
if host_key in seen_hosts or is_adult_url(normalized):
continue
seen_hosts.add(host_key)
safe_urls.append(normalized)
if len(safe_urls) >= limit:
break
return safe_urls
def _domains_from_csv_text(csv_text: str) -> list[str]:
domains: list[str] = []
reader = csv.reader(io.StringIO(csv_text))
for row in reader:
if not row:
continue
domain = row[1] if len(row) > 1 else row[0]
if domain and domain.lower() != "domain":
domains.append(domain)
return domains
def _domains_from_zip(payload: bytes) -> list[str]:
with zipfile.ZipFile(io.BytesIO(payload)) as archive:
csv_name = next((name for name in archive.namelist() if name.endswith(".csv")), None)
if csv_name is None:
raise ValueError("Tranco archive did not contain a CSV file.")
with archive.open(csv_name) as csv_file:
text = csv_file.read().decode("utf-8", errors="replace")
return _domains_from_csv_text(text)
async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
"""Return the latest safe top-site URLs, falling back to the bundled list if needed."""
timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
headers = {"User-Agent": USER_AGENT}
try:
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as client:
response = await client.get(TOP_SITE_SOURCE_URL)
response.raise_for_status()
if response.content.startswith(b"PK"):
candidates = _domains_from_zip(response.content)
else:
candidates = _domains_from_csv_text(response.text)
safe_urls = _safe_top_urls(candidates, limit=limit)
if safe_urls:
return safe_urls, TOP_SITE_SOURCE_URL
except Exception as exc:
print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")
return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
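How the sanitizer treats raw Tranco-style rows (bare domains) versus full URLs; illustrative values only (redtube.com is caught by the hard-coded host markers in content_filter):

from top_sites import _normalize_site_url, _safe_top_urls

print(_normalize_site_url("Example.com"))           # https://example.com
print(_normalize_site_url("http://www.a.com/x/"))   # http://www.a.com/x (trailing slash trimmed)
print(_normalize_site_url("ftp://mirror.net"))      # None (non-http scheme)

# Dedupes on the www-stripped host and drops anything is_adult_url flags.
print(_safe_top_urls(["example.com", "www.example.com", "redtube.com"], limit=10))
# -> ['https://example.com']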