"""FastAPI entry point for the sFetch backend.""" from __future__ import annotations import asyncio from datetime import UTC, datetime from fastapi import FastAPI, HTTPException, Query, BackgroundTasks from fastapi.middleware.cors import CORSMiddleware from crawler import sFetchBot from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY from database import ( count_image_results, count_search_results, count_video_results, get_meta_value, get_stats, init_db, set_meta_value, ) from models import CrawlRequest, SearchResponse from searcher import search, search_images_api, search_videos_api from top_sites import load_top_site_seed_urls app = FastAPI(title="sFetch API", version="1.0.0") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=False, allow_methods=["*"], allow_headers=["*"], ) def _utc_now() -> str: return datetime.now(UTC).isoformat() def _set_seed_status(**updates: object) -> None: current = getattr(app.state, "_top_scrape_status", {}).copy() current.update({"updated_at": _utc_now(), **updates}) app.state._top_scrape_status = current async def _scrape_top_sites(force: bool = False) -> None: await init_db() async with app.state._crawl_lock: if app.state._top_scrape_done and not force: return existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY) if existing_seed and not force: stats = await get_stats() _set_seed_status( state="stored", message="Top-site seed already stored in the database.", total=TOP_SITE_SEED_LIMIT, indexed=stats["total_pages"], source=existing_seed, ) app.state._top_scrape_done = True return stats = await get_stats() if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force: source = "existing database" await set_meta_value(TOP_SITE_SEED_META_KEY, source) _set_seed_status( state="stored", message="Top-site seed already stored in the database.", total=TOP_SITE_SEED_LIMIT, indexed=stats["total_pages"], source=source, ) app.state._top_scrape_done = True return _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0) seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT) _set_seed_status( state="running", message=f"Seeding {len(seed_urls)} non-adult top sites.", total=len(seed_urls), indexed=0, source=source, ) print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...") bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12) try: await bot.start(seed_urls) except Exception as exc: _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count) print(f"sFetch: top-site seed failed ({exc})") return await set_meta_value(TOP_SITE_SEED_META_KEY, source) _set_seed_status( state="complete", message="Top-site seed complete.", total=len(seed_urls), indexed=bot.indexed_count, source=source, ) print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.") app.state._top_scrape_done = True @app.on_event("startup") async def startup_event() -> None: app.state._top_scrape_done = False app.state._crawl_lock = asyncio.Lock() app.state._top_scrape_status = { "state": "idle", "message": "Waiting to check top-site seed.", "total": TOP_SITE_SEED_LIMIT, "indexed": 0, "source": None, "updated_at": _utc_now(), } asyncio.create_task(_scrape_top_sites()) @app.get("/") async def health_check() -> dict[str, str]: return {"status": "sFetch is alive"} @app.get("/search", response_model=SearchResponse) async def search_endpoint( q: str = Query(..., description="Search query"), type: str = Query("web", description="Search type: web, image, or video"), limit: int = Query(10, ge=1, le=50), offset: int = Query(0, ge=0), ) -> SearchResponse: query = q.strip() if not query: raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.") if type == "image": results = await search_images_api(query=query, limit=limit, offset=offset) total = await count_image_results(query) return SearchResponse(query=query, type=type, total=total, results=results) if type == "video": results = await search_videos_api(query=query, limit=limit, offset=offset) total = await count_video_results(query) return SearchResponse(query=query, type=type, total=total, results=results) if type != "web": raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.") results = await search(query=query, limit=limit, offset=offset) total = await count_search_results(query) return SearchResponse(query=query, type=type, total=total, results=results) async def _run_crawl_job(request: CrawlRequest) -> None: try: bot = sFetchBot( max_depth=request.max_depth, max_pages_per_domain=request.max_pages_per_domain, same_domain_only=request.same_domain_only, ) await bot.start(request.seed_urls) except Exception as exc: print(f"sFetch: crawl job failed ({exc})") @app.post("/crawl") async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]: background_tasks.add_task(_run_crawl_job, request) return {"message": "Crawl started", "seed_urls": request.seed_urls} @app.post("/crawl/top-sites") async def crawl_top_sites_endpoint( background_tasks: BackgroundTasks, force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."), ) -> dict[str, object]: background_tasks.add_task(_scrape_top_sites, force) return {"message": "Top-site crawl queued", "force": force} @app.get("/crawl/top-sites/status") async def crawl_top_sites_status_endpoint() -> dict[str, object]: return getattr( app.state, "_top_scrape_status", { "state": "idle", "message": "Top-site seed has not started.", "total": TOP_SITE_SEED_LIMIT, "indexed": 0, "source": None, "updated_at": None, }, ) @app.get("/stats") async def stats_endpoint() -> dict[str, object]: stats = await get_stats() return stats