initial commit

2026-05-04 19:31:46 +01:00
commit e0f2eedcd9
14 changed files with 3718 additions and 0 deletions
@@ -0,0 +1,207 @@
+"""FastAPI entry point for the sFetch backend."""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import UTC, datetime
+
+from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+
+from crawler import sFetchBot
+from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
+from database import (
+    count_image_results,
+    count_search_results,
+    count_video_results,
+    get_meta_value,
+    get_stats,
+    init_db,
+    set_meta_value,
+)
+from models import CrawlRequest, SearchResponse
+from searcher import search, search_images_api, search_videos_api
+from top_sites import load_top_site_seed_urls
+
+app = FastAPI(title="sFetch API", version="1.0.0")  # application object; every route below registers on it
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # any origin is allowed
+    allow_credentials=False,  # credentials are not allowed on cross-origin requests
+    allow_methods=["*"],  # every HTTP method is allowed
+    allow_headers=["*"],  # every request header is allowed
+)
+
+
+def _utc_now() -> str:
+    return datetime.now(UTC).isoformat()
+
+
+def _set_seed_status(**updates: object) -> None:
+    current = getattr(app.state, "_top_scrape_status", {}).copy()
+    current.update({"updated_at": _utc_now(), **updates})
+    app.state._top_scrape_status = current
+
+
+async def _scrape_top_sites(force: bool = False) -> None:
+    await init_db()
+
+    async with app.state._crawl_lock:
+        if app.state._top_scrape_done and not force:
+            return
+
+        existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
+        if existing_seed and not force:
+            stats = await get_stats()
+            _set_seed_status(
+                state="stored",
+                message="Top-site seed already stored in the database.",
+                total=TOP_SITE_SEED_LIMIT,
+                indexed=stats["total_pages"],
+                source=existing_seed,
+            )
+            app.state._top_scrape_done = True
+            return
+
+        stats = await get_stats()
+        if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
+            source = "existing database"
+            await set_meta_value(TOP_SITE_SEED_META_KEY, source)
+            _set_seed_status(
+                state="stored",
+                message="Top-site seed already stored in the database.",
+                total=TOP_SITE_SEED_LIMIT,
+                indexed=stats["total_pages"],
+                source=source,
+            )
+            app.state._top_scrape_done = True
+            return
+
+        _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
+        seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
+        _set_seed_status(
+            state="running",
+            message=f"Seeding {len(seed_urls)} non-adult top sites.",
+            total=len(seed_urls),
+            indexed=0,
+            source=source,
+        )
+
+        print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
+        bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
+        try:
+            await bot.start(seed_urls)
+        except Exception as exc:
+            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
+            print(f"sFetch: top-site seed failed ({exc})")
+            return
+
+        await set_meta_value(TOP_SITE_SEED_META_KEY, source)
+        _set_seed_status(
+            state="complete",
+            message="Top-site seed complete.",
+            total=len(seed_urls),
+            indexed=bot.indexed_count,
+            source=source,
+        )
+        print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
+        app.state._top_scrape_done = True
+
+
+@app.on_event("startup")
+async def startup_event() -> None:
+    app.state._top_scrape_done = False
+    app.state._crawl_lock = asyncio.Lock()
+    app.state._top_scrape_status = {
+        "state": "idle",
+        "message": "Waiting to check top-site seed.",
+        "total": TOP_SITE_SEED_LIMIT,
+        "indexed": 0,
+        "source": None,
+        "updated_at": _utc_now(),
+    }
+    asyncio.create_task(_scrape_top_sites())
+
+
+@app.get("/")
+async def health_check() -> dict[str, str]:
+    return {"status": "sFetch is alive"}
+
+
+@app.get("/search", response_model=SearchResponse)
+async def search_endpoint(
+    q: str = Query(..., description="Search query"),
+    type: str = Query("web", description="Search type: web, image, or video"),
+    limit: int = Query(10, ge=1, le=50),
+    offset: int = Query(0, ge=0),
+) -> SearchResponse:
+    query = q.strip()
+    if not query:
+        raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")
+
+    if type == "image":
+        results = await search_images_api(query=query, limit=limit, offset=offset)
+        total = await count_image_results(query)
+        return SearchResponse(query=query, type=type, total=total, results=results)
+
+    if type == "video":
+        results = await search_videos_api(query=query, limit=limit, offset=offset)
+        total = await count_video_results(query)
+        return SearchResponse(query=query, type=type, total=total, results=results)
+
+    if type != "web":
+        raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")
+
+    results = await search(query=query, limit=limit, offset=offset)
+    total = await count_search_results(query)
+    return SearchResponse(query=query, type=type, total=total, results=results)
+
+
+async def _run_crawl_job(request: CrawlRequest) -> None:
+    try:
+        bot = sFetchBot(
+            max_depth=request.max_depth,
+            max_pages_per_domain=request.max_pages_per_domain,
+            same_domain_only=request.same_domain_only,
+        )
+        await bot.start(request.seed_urls)
+    except Exception as exc:
+        print(f"sFetch: crawl job failed ({exc})")
+
+
+@app.post("/crawl")
+async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
+    background_tasks.add_task(_run_crawl_job, request)
+    return {"message": "Crawl started", "seed_urls": request.seed_urls}
+
+
+@app.post("/crawl/top-sites")
+async def crawl_top_sites_endpoint(
+    background_tasks: BackgroundTasks,
+    force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
+) -> dict[str, object]:
+    background_tasks.add_task(_scrape_top_sites, force)
+    return {"message": "Top-site crawl queued", "force": force}
+
+
+@app.get("/crawl/top-sites/status")
+async def crawl_top_sites_status_endpoint() -> dict[str, object]:
+    return getattr(
+        app.state,
+        "_top_scrape_status",
+        {
+            "state": "idle",
+            "message": "Top-site seed has not started.",
+            "total": TOP_SITE_SEED_LIMIT,
+            "indexed": 0,
+            "source": None,
+            "updated_at": None,
+        },
+    )
+
+
+@app.get("/stats")
+async def stats_endpoint() -> dict[str, object]:
+    stats = await get_stats()
+    return stats