Initial commit
This commit is contained in:
@@ -0,0 +1,41 @@
|
||||
"""Normalization and indexing helpers for crawled pages."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from database import insert_image, insert_page, insert_video
|
||||
|
||||
# Maximum number of characters of normalized body text kept by
# `_normalize_text`; anything beyond this is cut off.
MAX_BODY_LENGTH = 10_000
|
||||
|
||||
|
||||
def _normalize_text(body_text: str) -> str:
    """Return *body_text* with whitespace collapsed, trimmed and length-capped.

    Every run of whitespace becomes a single space, leading and trailing
    whitespace is removed, and the result is cut to at most
    ``MAX_BODY_LENGTH`` characters.
    """
    single_spaced = re.sub(r"\s+", " ", body_text)
    trimmed = single_spaced.strip()
    # Truncation happens last, so the cap applies to the cleaned-up text.
    return trimmed[:MAX_BODY_LENGTH]
|
||||
|
||||
|
||||
async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    """Normalize a crawled page and store it together with its media.

    When the body is empty after normalization nothing is inserted at all,
    including the page's images and videos.

    Args:
        url: Address of the crawled page; used as the title when *title*
            is blank.
        title: Raw page title; surrounding whitespace is stripped.
        body_text: Raw page text, cleaned up by ``_normalize_text``.
        images: Optional image records, read through their ``"url"`` and
            ``"alt_text"`` keys. Records without a URL are skipped.
        videos: Optional video records, read through their ``"url"`` and
            ``"title"`` keys. Records without a URL are skipped.
    """
    normalized_title = title.strip() or url
    normalized_body = _normalize_text(body_text)
    if not normalized_body:
        return
    await insert_page(url=url, title=normalized_title, body_text=normalized_body)

    for img in images or ():
        img_url = img.get("url")
        if not img_url:
            continue
        # A key that is present but holds None must still yield a string.
        alt_text = img.get("alt_text") or ""
        await insert_image(url=img_url, page_url=url, alt_text=alt_text)

    # Title used for videos that carry no usable title of their own.
    fallback_title = normalized_title.strip()
    for video in videos or ():
        video_url = video.get("url")
        if not video_url:
            continue
        # Strip BEFORE falling back: a whitespace-only title is truthy and
        # would otherwise bypass the fallback and be stored as "".
        video_title = (video.get("title") or "").strip() or fallback_title
        await insert_video(url=video_url, page_url=url, title=video_title)
|
||||
Reference in New Issue
Block a user