42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
"""Normalization and indexing helpers for crawled pages."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from database import insert_image, insert_page, insert_video
|
|
|
|
# Maximum number of characters of normalized body text that _normalize_text keeps.
MAX_BODY_LENGTH = 10_000
|
|
|
|
|
|
def _normalize_text(body_text: str) -> str:
|
|
collapsed = re.sub(r"\s+", " ", body_text).strip()
|
|
return collapsed[:MAX_BODY_LENGTH]
|
|
|
|
|
|
async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    """Normalize a crawled page and pass it, with its media, to the insert helpers.

    The page is handed to ``insert_page`` first; each image and video that
    carries a non-empty ``"url"`` is then handed to ``insert_image`` /
    ``insert_video`` one at a time, in the order given. If the body text is
    empty after normalization, nothing at all is inserted (media included).

    Args:
        url: Page URL; also used as the title when *title* is blank.
        title: Page title; surrounding whitespace is stripped.
        body_text: Raw page text, normalized with ``_normalize_text``.
        images: Optional dicts read for the keys ``"url"`` and ``"alt_text"``.
        videos: Optional dicts read for the keys ``"url"`` and ``"title"``.

    Exceptions raised by the insert helpers are not caught here.
    """
    normalized_title = title.strip() or url
    normalized_body = _normalize_text(body_text)
    if not normalized_body:
        return

    await insert_page(url=url, title=normalized_title, body_text=normalized_body)

    for img in images or ():
        img_url = img.get("url")
        if not img_url:
            continue
        # `or ""` also covers an explicit None stored under "alt_text", which
        # a plain .get() default would let through.
        alt_text = img.get("alt_text") or ""
        await insert_image(url=img_url, page_url=url, alt_text=alt_text)

    for video in videos or ():
        video_url = video.get("url")
        if not video_url:
            continue
        # Strip before falling back, mirroring the page-title rule above, so a
        # whitespace-only video title inherits the page title instead of
        # being stored as an empty string.
        video_title = (video.get("title") or "").strip() or normalized_title
        await insert_video(url=video_url, page_url=url, title=video_title)
|