# sFetch/backend/main.py

"""FastAPI entry point for the sFetch backend."""

from __future__ import annotations

import asyncio
from datetime import UTC, datetime

from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware

from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
    count_image_results,
    count_search_results,
    count_video_results,
    get_meta_value,
    get_stats,
    init_db,
    set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls

# The ASGI application object that every route in this module is registered on.
app = FastAPI(version="1.0.0", title="sFetch API")

# CORS is left fully open (any origin, method, and header) while credentialed
# cross-origin requests stay disabled.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=False,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)


def _utc_now() -> str:
    return datetime.now(UTC).isoformat()


def _set_seed_status(**updates: object) -> None:
    """Merge *updates* into the top-site seed status stored on ``app.state``.

    A new dict is assembled from the previous status (or an empty one when no
    status has been set yet), a fresh ``updated_at`` timestamp, and finally
    the caller's fields, so an explicit ``updated_at`` in *updates* wins over
    the generated one. The previous status dict is never mutated.
    """
    previous = getattr(app.state, "_top_scrape_status", {})
    app.state._top_scrape_status = {**previous, "updated_at": _utc_now(), **updates}


async def _scrape_top_sites(force: bool = False) -> None:
    """Seed the search index from the top-site list unless already seeded.

    Without ``force`` the run is skipped when this process has already marked
    the seed as done, when a value is stored under ``TOP_SITE_SEED_META_KEY``,
    or when the database already reports at least ``TOP_SITE_SEED_LIMIT``
    pages (in which case the marker is written as ``"existing database"``).
    With ``force=True`` none of those checks are consulted.

    Progress is published through ``_set_seed_status``. A failure while
    loading the site list or while crawling is recorded as state ``"error"``
    and printed; it is not re-raised.

    Args:
        force: Run the seed crawl again even if it is marked complete.
    """
    await init_db()

    # Seed runs are serialised: a second caller waits here and then sees the
    # done flag set by the first one.
    async with app.state._crawl_lock:
        if not force:
            if app.state._top_scrape_done:
                return

            stored_source = await get_meta_value(TOP_SITE_SEED_META_KEY)
            stats = await get_stats()
            if not stored_source and stats["total_pages"] >= TOP_SITE_SEED_LIMIT:
                # The index is already large enough; persist a marker so the
                # next check can stop at the meta lookup.
                stored_source = "existing database"
                await set_meta_value(TOP_SITE_SEED_META_KEY, stored_source)

            if stored_source:
                _set_seed_status(
                    state="stored",
                    message="Top-site seed already stored in the database.",
                    total=TOP_SITE_SEED_LIMIT,
                    indexed=stats["total_pages"],
                    source=stored_source,
                )
                app.state._top_scrape_done = True
                return

        _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
        try:
            seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
        except Exception as exc:
            # Without this the published status would stay on "loading"
            # forever; report it the same way a crawl failure is reported.
            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=0)
            print(f"sFetch: top-site seed failed ({exc})")
            return

        _set_seed_status(
            state="running",
            message=f"Seeding {len(seed_urls)} non-adult top sites.",
            total=len(seed_urls),
            indexed=0,
            source=source,
        )

        print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
        bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
        try:
            await bot.start(seed_urls)
        except Exception as exc:
            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
            print(f"sFetch: top-site seed failed ({exc})")
            return

        # Persist the marker only after a successful crawl so a failed run is
        # retried on the next call.
        await set_meta_value(TOP_SITE_SEED_META_KEY, source)
        _set_seed_status(
            state="complete",
            message="Top-site seed complete.",
            total=len(seed_urls),
            indexed=bot.indexed_count,
            source=source,
        )
        print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
        app.state._top_scrape_done = True


@app.on_event("startup")
async def startup_event() -> None:
    """Initialise seed bookkeeping on ``app.state`` and launch the seed check.

    Sets the done flag, the lock used by ``_scrape_top_sites``, and the
    initial "idle" status, then schedules ``_scrape_top_sites()`` as a
    background task so startup does not wait for it.
    """
    app.state._top_scrape_done = False
    app.state._crawl_lock = asyncio.Lock()
    app.state._top_scrape_status = {
        "state": "idle",
        "message": "Waiting to check top-site seed.",
        "total": TOP_SITE_SEED_LIMIT,
        "indexed": 0,
        "source": None,
        "updated_at": _utc_now(),
    }
    # The event loop keeps only weak references to tasks; hold a strong one so
    # the seed task cannot be garbage-collected before it finishes.
    app.state._top_scrape_task = asyncio.create_task(_scrape_top_sites())


@app.get("/")
async def health_check() -> dict[str, str]:
    """Liveness probe: report that the service is responding."""
    payload = {"status": "sFetch is alive"}
    return payload


@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
    q: str = Query(..., description="Search query"),
    type: str = Query("web", description="Search type: web, image, or video"),
    limit: int = Query(10, ge=1, le=50),
    offset: int = Query(0, ge=0),
) -> SearchResponse:
    query = q.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")

    if type == "image":
        results = await search_images_api(query=query, limit=limit, offset=offset)
        total = await count_image_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type == "video":
        results = await search_videos_api(query=query, limit=limit, offset=offset)
        total = await count_video_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type != "web":
        raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")

    results = await search(query=query, limit=limit, offset=offset)
    total = await count_search_results(query)
    return SearchResponse(query=query, type=type, total=total, results=results)


async def _run_crawl_job(request: CrawlRequest) -> None:
    """Run one crawl described by *request*; print any failure instead of raising."""
    try:
        options = {
            "max_depth": request.max_depth,
            "max_pages_per_domain": request.max_pages_per_domain,
            "same_domain_only": request.same_domain_only,
        }
        crawler = sFetchBot(**options)
        await crawler.start(request.seed_urls)
    except Exception as error:
        # Nothing awaits this job's result, so the failure is only logged.
        print(f"sFetch: crawl job failed ({error})")


@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
    """Queue a crawl for the requested seed URLs and acknowledge immediately."""
    background_tasks.add_task(_run_crawl_job, request=request)
    acknowledgement: dict[str, object] = dict(message="Crawl started", seed_urls=request.seed_urls)
    return acknowledgement


@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
    """Queue the top-site seed crawl as a background task and acknowledge it."""
    background_tasks.add_task(_scrape_top_sites, force=force)
    acknowledgement: dict[str, object] = dict(message="Top-site crawl queued", force=force)
    return acknowledgement


@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
    """Return the current top-site seed status, or an idle placeholder if none is set."""
    try:
        return app.state._top_scrape_status
    except AttributeError:
        # No status has been stored on app.state yet.
        return {
            "state": "idle",
            "message": "Top-site seed has not started.",
            "total": TOP_SITE_SEED_LIMIT,
            "indexed": 0,
            "source": None,
            "updated_at": None,
        }


@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
    """Return the statistics reported by ``get_stats`` unchanged."""
    return await get_stats()