Files
sFetch/backend/indexer.py
T
Ned Halksworth e0f2eedcd9 initial commit
2026-05-04 19:31:46 +01:00

42 lines
1.2 KiB
Python

"""Normalization and indexing helpers for crawled pages."""
from __future__ import annotations
import re
from database import insert_image, insert_page, insert_video
MAX_BODY_LENGTH = 10_000
def _normalize_text(body_text: str) -> str:
collapsed = re.sub(r"\s+", " ", body_text).strip()
return collapsed[:MAX_BODY_LENGTH]
async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    """Store a crawled page and its media via the database insert helpers.

    A page whose body is empty after normalization is skipped entirely:
    neither the page nor any of its images or videos are inserted.

    Args:
        url: Page URL; also used as the title when ``title`` is blank.
        title: Raw page title; surrounding whitespace is stripped.
        body_text: Raw page text, passed through ``_normalize_text``.
        images: Optional dicts with a ``"url"`` key and an optional
            ``"alt_text"`` key (defaults to ``""`` when missing). Entries
            without a truthy URL are skipped.
        videos: Optional dicts with a ``"url"`` key and an optional
            ``"title"`` key. Entries without a truthy URL are skipped; a
            missing or blank title falls back to the page title.
    """
    normalized_title = title.strip() or url
    normalized_body = _normalize_text(body_text)
    if not normalized_body:
        return

    await insert_page(url=url, title=normalized_title, body_text=normalized_body)

    for img in images or ():
        img_url = img.get("url")
        if not img_url:
            continue
        alt_text = img.get("alt_text", "")
        await insert_image(url=img_url, page_url=url, alt_text=alt_text)

    for video in videos or ():
        video_url = video.get("url")
        if not video_url:
            continue
        # Strip BEFORE applying the fallback: a whitespace-only title is
        # truthy, so falling back first would store an empty title instead
        # of the page title.
        video_title = (video.get("title") or "").strip() or normalized_title.strip()
        await insert_video(url=video_url, page_url=url, title=video_title)