# sFetch/backend/searcher.py

"""Search result shaping for sFetch."""

from __future__ import annotations

import html
import re

from database import search_images, search_pages, search_videos

SNIPPET_LENGTH = 200


def _extract_terms(query: str) -> list[str]:
    terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
    return sorted(terms, key=len, reverse=True)


def _build_snippet(body_text: str) -> str:
    """Return a plain-text preview of *body_text*.

    Surrounding whitespace is removed before measuring, so leading blank
    space does not consume the snippet budget and an ellipsis is only added
    when visible text was actually cut off.

    Returns:
        ``"No preview available."`` when the body has no visible text, the
        whole trimmed body when it fits in ``SNIPPET_LENGTH`` characters, and
        otherwise its first ``SNIPPET_LENGTH`` characters followed by
        ``"..."``.
    """
    text = body_text.strip()
    if not text:
        return "No preview available."
    if len(text) <= SNIPPET_LENGTH:
        return text
    # The cut may land just after a space; drop it so the ellipsis hugs the text.
    return f"{text[:SNIPPET_LENGTH].rstrip()}..."


def _highlight_terms(snippet: str, query: str) -> str:
    """Return *snippet* as HTML-escaped text with query terms in ``<mark>`` tags.

    Matching is case-insensitive and done in a single pass over the raw
    (unescaped) snippet; every piece is escaped as it is emitted. This keeps
    terms from matching inside entities created by escaping (``amp`` in
    ``&amp;``) or inside ``<mark>`` tags inserted for another term, and it
    avoids nested marks when one term is a substring of another.

    Args:
        snippet: Plain-text preview to render.
        query: Raw user query whose word terms should be highlighted.

    Returns:
        An HTML fragment in which all text is escaped and each matched term
        is wrapped in ``<mark>...</mark>``.
    """
    terms = _extract_terms(query)
    if not terms:
        # An empty alternation would match at every position.
        return html.escape(snippet)

    # _extract_terms yields longer terms first, so at any position the
    # alternation prefers the longest term that matches there.
    pattern = re.compile("|".join(map(re.escape, terms)), flags=re.IGNORECASE)

    pieces: list[str] = []
    cursor = 0
    for match in pattern.finditer(snippet):
        pieces.append(html.escape(snippet[cursor:match.start()]))
        pieces.append(f"<mark>{html.escape(match.group(0))}</mark>")
        cursor = match.end()
    pieces.append(html.escape(snippet[cursor:]))
    return "".join(pieces)


async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Fetch page matches for *query* and shape them for the response.

    Args:
        query: Raw user query; forwarded to ``search_pages`` and used to
            highlight matching terms in each snippet.
        limit: Forwarded to ``search_pages`` as ``limit``.
        offset: Forwarded to ``search_pages`` as ``offset``.

    Returns:
        One dict per row with the keys ``id``, ``url``, ``title``,
        ``snippet`` and ``indexed_at``. ``snippet`` is an HTML fragment built
        by ``_highlight_terms``; ``title`` is not escaped here.
    """
    rows = await search_pages(query=query, limit=limit, offset=offset)
    results: list[dict] = []

    for row in rows:
        # Strip each candidate before testing it so a whitespace-only title
        # falls through to the URL (and then "Untitled") instead of yielding
        # an empty title.
        title = (
            (row.get("title") or "").strip()
            or (row.get("url") or "").strip()
            or "Untitled"
        )
        preview = _build_snippet(row.get("body_text") or "")
        results.append(
            {
                "id": row["id"],
                "url": row["url"],
                "title": title,
                "snippet": _highlight_terms(preview, query),
                "indexed_at": row["indexed_at"],
            }
        )

    return results


async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Fetch image matches for *query* and shape them for the response.

    ``query``, ``limit`` and ``offset`` are forwarded unchanged to
    ``search_images``. Each returned dict carries ``id``, ``url``,
    ``page_url``, ``alt_text`` and ``indexed_at``; a falsy ``alt_text`` is
    replaced by an empty string.
    """
    image_rows = await search_images(query=query, limit=limit, offset=offset)
    return [
        {
            "id": record["id"],
            "url": record["url"],
            "page_url": record["page_url"],
            "alt_text": record["alt_text"] or "",
            "indexed_at": record["indexed_at"],
        }
        for record in image_rows
    ]


async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Fetch video matches for *query* and shape them for the response.

    ``query``, ``limit`` and ``offset`` are forwarded unchanged to
    ``search_videos``. Each returned dict carries ``id``, ``url``,
    ``page_url``, ``title`` and ``indexed_at``. A missing, empty, or
    whitespace-only title is replaced by ``"Video result"``.
    """
    rows = await search_videos(query=query, limit=limit, offset=offset)
    results: list[dict] = []

    for row in rows:
        # Strip before applying the fallback so a whitespace-only title does
        # not come out as an empty string.
        title = (row.get("title") or "").strip() or "Video result"
        results.append(
            {
                "id": row["id"],
                "url": row["url"],
                "page_url": row["page_url"],
                "title": title,
                "indexed_at": row["indexed_at"],
            }
        )

    return results