"""Normalization and indexing helpers for crawled pages."""

from __future__ import annotations

import re
from database import insert_image, insert_page, insert_video

# Maximum number of characters of normalized body text that
# _normalize_text keeps; anything beyond this is cut off.
MAX_BODY_LENGTH = 10_000


def _normalize_text(body_text: str) -> str:
    """Collapse whitespace in *body_text* and cap it at ``MAX_BODY_LENGTH``.

    Every run of whitespace is replaced by a single space, leading and
    trailing whitespace is removed, and the result is cut to at most
    ``MAX_BODY_LENGTH`` characters.

    Args:
        body_text: Raw text to normalize.

    Returns:
        The normalized text. It is an empty string when *body_text* is
        empty or consists only of whitespace.
    """
    collapsed = re.sub(r"\s+", " ", body_text).strip()
    # The cut can land directly after a word, leaving the separating space
    # as the final character; strip again so the result never ends in
    # whitespace.
    return collapsed[:MAX_BODY_LENGTH].rstrip()


async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    """Normalize a crawled page and pass it and its media to the insert helpers.

    The title is stripped and falls back to *url* when blank. The body is
    run through ``_normalize_text``; if nothing remains, the function
    returns without inserting the page or any of its media.

    Args:
        url: Address of the page; also passed as ``page_url`` for every
            image and video.
        title: Page title as extracted by the caller.
        body_text: Raw page text.
        images: Optional dicts read via the ``"url"`` and ``"alt_text"``
            keys. Entries without a truthy ``"url"`` are skipped.
        videos: Optional dicts read via the ``"url"`` and ``"title"``
            keys. Entries without a truthy ``"url"`` are skipped; a
            missing or blank title falls back to the page title.
    """
    normalized_title = title.strip() or url
    normalized_body = _normalize_text(body_text)
    if not normalized_body:
        return
    await insert_page(url=url, title=normalized_title, body_text=normalized_body)

    for img in images or ():
        img_url = img.get("url")
        if not img_url:
            continue
        await insert_image(
            url=img_url,
            page_url=url,
            alt_text=img.get("alt_text", ""),
        )

    for video in videos or ():
        video_url = video.get("url")
        if not video_url:
            continue
        # Strip BEFORE applying the fallback: a whitespace-only title is
        # truthy, so checking first and stripping afterwards would insert
        # an empty title instead of the page title.
        own_title = (video.get("title") or "").strip()
        # normalized_title may be the raw url, hence the strip on the
        # fallback as well.
        video_title = own_title or normalized_title.strip()
        await insert_video(url=video_url, page_url=url, title=video_title)