"""Normalization and indexing helpers for crawled pages.""" from __future__ import annotations import re from database import insert_image, insert_page, insert_video MAX_BODY_LENGTH = 10_000 def _normalize_text(body_text: str) -> str: collapsed = re.sub(r"\s+", " ", body_text).strip() return collapsed[:MAX_BODY_LENGTH] async def index_page( url: str, title: str, body_text: str, images: list[dict[str, str]] | None = None, videos: list[dict[str, str]] | None = None, ) -> None: normalized_title = title.strip() or url normalized_body = _normalize_text(body_text) if not normalized_body: return await insert_page(url=url, title=normalized_title, body_text=normalized_body) if images: for img in images: img_url = img.get("url") alt_text = img.get("alt_text", "") if img_url: await insert_image(url=img_url, page_url=url, alt_text=alt_text) if videos: for video in videos: video_url = video.get("url") video_title = video.get("title") or normalized_title if video_url: await insert_video(url=video_url, page_url=url, title=video_title.strip())