initial commit
@@ -0,0 +1,207 @@
"""FastAPI entry point for the sFetch backend."""

from __future__ import annotations

import asyncio
from datetime import UTC, datetime

from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware

from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
    count_image_results,
    count_search_results,
    count_video_results,
    get_meta_value,
    get_stats,
    init_db,
    set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls
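
CrawlRequest and SearchResponse come from models.py elsewhere in this commit. Judging only from how this file uses them, they plausibly look like the Pydantic sketch below; the defaults and the shape of results are guesses, not the committed definitions.

    from pydantic import BaseModel

    class CrawlRequest(BaseModel):
        seed_urls: list[str]
        max_depth: int = 1              # guessed default
        max_pages_per_domain: int = 50  # guessed default
        same_domain_only: bool = True   # guessed default

    class SearchResponse(BaseModel):
        query: str
        type: str
        total: int
        results: list[dict]  # the real per-result model lives in models.py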

app = FastAPI(title="sFetch API", version="1.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
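
Allowing every origin is convenient for local development but worth tightening before deployment. One option, sketched here with a made-up SFETCH_ALLOWED_ORIGINS variable, is to read the origin list from the environment:

    import os

    # Hypothetical: comma-separated origins, defaulting to "*" for local dev.
    allowed_origins = os.getenv("SFETCH_ALLOWED_ORIGINS", "*").split(",")

    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )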

def _utc_now() -> str:
    return datetime.now(UTC).isoformat()


def _set_seed_status(**updates: object) -> None:
    current = getattr(app.state, "_top_scrape_status", {}).copy()
    current.update({"updated_at": _utc_now(), **updates})
    app.state._top_scrape_status = current
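
_set_seed_status merges partial updates into the existing status dict, so keys set by an earlier phase survive later calls. A standalone illustration of that merge behaviour, with a plain module-level dict standing in for app.state and 500 standing in for TOP_SITE_SEED_LIMIT:

    state = {"state": "idle", "total": 500, "indexed": 0}

    def set_status(**updates: object) -> None:
        global state
        current = state.copy()  # copy, update, rebind: same pattern as above
        current.update(updates)
        state = current

    set_status(state="loading")              # "total" and "indexed" carry over
    set_status(state="running", indexed=42)
    print(state)  # {'state': 'running', 'total': 500, 'indexed': 42}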

async def _scrape_top_sites(force: bool = False) -> None:
    await init_db()

    # The lock keeps a startup seed and a queued /crawl/top-sites run
    # from racing each other.
    async with app.state._crawl_lock:
        if app.state._top_scrape_done and not force:
            return

        # Skip the seed if a previous run already recorded its source.
        existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
        if existing_seed and not force:
            stats = await get_stats()
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=existing_seed,
            )
            app.state._top_scrape_done = True
            return

        # Also skip if the index is already at least seed-sized, and record
        # that fact so later startups take the fast path above.
        stats = await get_stats()
        if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
            source = "existing database"
            await set_meta_value(TOP_SITE_SEED_META_KEY, source)
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=source,
            )
            app.state._top_scrape_done = True
            return

        _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
        seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
        _set_seed_status(
            state="running",
            message=f"Seeding {len(seed_urls)} non-adult top sites.",
            total=len(seed_urls),
            indexed=0,
            source=source,
        )

        print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
        # Depth 0 with one page per domain fetches just the seed pages themselves.
        bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
        try:
            await bot.start(seed_urls)
        except Exception as exc:
            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
            print(f"sFetch: top-site seed failed ({exc})")
            return

        await set_meta_value(TOP_SITE_SEED_META_KEY, source)
        _set_seed_status(
            state="complete",
            message="Top-site seed complete.",
            total=len(seed_urls),
            indexed=bot.indexed_count,
            source=source,
        )
        print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
        app.state._top_scrape_done = True
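
load_top_site_seed_urls is defined in top_sites.py in this commit; from the call site it returns a (urls, source) pair. A minimal sketch of that contract, assuming a bundled one-domain-per-line list (the file name top_sites.txt is hypothetical, and the real loader may fetch a remote list and filter adult domains):

    from pathlib import Path

    async def load_top_site_seed_urls(limit: int) -> tuple[list[str], str]:
        # Hypothetical: normalise bundled domains to https:// URLs.
        domains = Path("top_sites.txt").read_text().splitlines()
        urls = [f"https://{d.strip()}" for d in domains if d.strip()][:limit]
        return urls, "top_sites.txt"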

@app.on_event("startup")
async def startup_event() -> None:
    app.state._top_scrape_done = False
    app.state._crawl_lock = asyncio.Lock()
    app.state._top_scrape_status = {
        "state": "idle",
        "message": "Waiting to check top-site seed.",
        "total": TOP_SITE_SEED_LIMIT,
        "indexed": 0,
        "source": None,
        "updated_at": _utc_now(),
    }
    # Hold a reference so the background task is not garbage-collected mid-run.
    app.state._seed_task = asyncio.create_task(_scrape_top_sites())
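
@app.on_event("startup") still works but is deprecated in current FastAPI releases in favour of a lifespan handler. An equivalent sketch, not part of this commit:

    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Status-dict initialisation omitted for brevity.
        app.state._top_scrape_done = False
        app.state._crawl_lock = asyncio.Lock()
        app.state._seed_task = asyncio.create_task(_scrape_top_sites())
        yield
        app.state._seed_task.cancel()  # stop the seed on shutdown

    # app = FastAPI(title="sFetch API", version="1.0.0", lifespan=lifespan)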

@app.get("/")
async def health_check() -> dict[str, str]:
    return {"status": "sFetch is alive"}


@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
    q: str = Query(..., description="Search query"),
    type: str = Query("web", description="Search type: web, image, or video"),
    limit: int = Query(10, ge=1, le=50),
    offset: int = Query(0, ge=0),
) -> SearchResponse:
    query = q.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")

    if type == "image":
        results = await search_images_api(query=query, limit=limit, offset=offset)
        total = await count_image_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type == "video":
        results = await search_videos_api(query=query, limit=limit, offset=offset)
        total = await count_video_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type != "web":
        raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")

    results = await search(query=query, limit=limit, offset=offset)
    total = await count_search_results(query)
    return SearchResponse(query=query, type=type, total=total, results=results)
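
A hypothetical client call against a locally running server (httpx is assumed to be installed, and the base URL is whatever uvicorn binds to):

    import asyncio
    import httpx

    async def demo() -> None:
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.get(
                "/search", params={"q": "python", "type": "image", "limit": 5}
            )
            resp.raise_for_status()
            body = resp.json()
            print(body["total"], len(body["results"]))

    asyncio.run(demo())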

async def _run_crawl_job(request: CrawlRequest) -> None:
    try:
        bot = sFetchBot(
            max_depth=request.max_depth,
            max_pages_per_domain=request.max_pages_per_domain,
            same_domain_only=request.same_domain_only,
        )
        await bot.start(request.seed_urls)
    except Exception as exc:
        print(f"sFetch: crawl job failed ({exc})")


@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
    background_tasks.add_task(_run_crawl_job, request)
    return {"message": "Crawl started", "seed_urls": request.seed_urls}


@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
    background_tasks.add_task(_scrape_top_sites, force)
    return {"message": "Top-site crawl queued", "force": force}
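
Both crawl endpoints return immediately and do the work in the background. A hypothetical trigger from a client, using the CrawlRequest fields referenced above (the values are examples only):

    import asyncio
    import httpx

    async def start_crawl() -> None:
        payload = {
            "seed_urls": ["https://example.com"],
            "max_depth": 1,
            "max_pages_per_domain": 25,
            "same_domain_only": True,
        }
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.post("/crawl", json=payload)
            print(resp.json())  # {'message': 'Crawl started', 'seed_urls': [...]}

    asyncio.run(start_crawl())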

@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
    return getattr(
        app.state,
        "_top_scrape_status",
        {
            "state": "idle",
            "message": "Top-site seed has not started.",
            "total": TOP_SITE_SEED_LIMIT,
            "indexed": 0,
            "source": None,
            "updated_at": None,
        },
    )


@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
    return await get_stats()
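
Because the seed runs in the background, progress is visible only through the status endpoint. A polling sketch (local base URL assumed; the terminal states match the _set_seed_status calls above):

    import asyncio
    import httpx

    async def wait_for_seed() -> dict:
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            while True:
                status = (await client.get("/crawl/top-sites/status")).json()
                print(status["state"], status["indexed"])
                if status["state"] in {"stored", "complete", "error"}:
                    return status
                await asyncio.sleep(2)

    asyncio.run(wait_for_seed())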