initial commit
@@ -0,0 +1,110 @@
"""Load and sanitize the top-site seed list for first-launch indexing."""

from __future__ import annotations

import csv
import io
import zipfile
from collections.abc import Iterable
from urllib.parse import urlsplit, urlunsplit

import httpx

from config import (
    TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
    TOP_SITE_SEED_LIMIT,
    TOP_SITE_SOURCE_URL,
    TOP_SITES,
    USER_AGENT,
)
from content_filter import is_adult_url


def _normalize_site_url(value: str) -> str | None:
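    """Coerce a raw domain or URL into a canonical https:// URL, or None if invalid."""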
    raw_value = value.strip()
    if not raw_value:
        return None

    candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
    parsed = urlsplit(candidate)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        return None

    normalized = parsed._replace(
        scheme=parsed.scheme.lower(),
        netloc=parsed.netloc.lower(),
        path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
        query="",
        fragment="",
    )
    return urlunsplit(normalized)


def _host_key(url: str) -> str:
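    """Reduce a URL to its hostname, ignoring a leading "www." for deduplication."""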
    return urlsplit(url).netloc.lower().removeprefix("www.")


def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
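    """Keep at most `limit` unique, non-adult URLs from the candidate list."""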
    safe_urls: list[str] = []
    seen_hosts: set[str] = set()

    for candidate in candidates:
        normalized = _normalize_site_url(candidate)
        if normalized is None:
            continue
        host_key = _host_key(normalized)
        if host_key in seen_hosts or is_adult_url(normalized):
            continue
        seen_hosts.add(host_key)
        safe_urls.append(normalized)
        if len(safe_urls) >= limit:
            break

    return safe_urls


def _domains_from_csv_text(csv_text: str) -> list[str]:
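    """Parse Tranco-style CSV text (rank,domain rows) into a list of domains."""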
    domains: list[str] = []
    reader = csv.reader(io.StringIO(csv_text))
    for row in reader:
        if not row:
            continue
        domain = row[1] if len(row) > 1 else row[0]
        if domain and domain.lower() != "domain":
            domains.append(domain)
    return domains


def _domains_from_zip(payload: bytes) -> list[str]:
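    """Extract domains from the first CSV member of a zipped Tranco list."""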
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        csv_name = next((name for name in archive.namelist() if name.endswith(".csv")), None)
        if csv_name is None:
            raise ValueError("Tranco archive did not contain a CSV file.")
        with archive.open(csv_name) as csv_file:
            text = csv_file.read().decode("utf-8", errors="replace")
    return _domains_from_csv_text(text)


async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
    """Return the latest safe top-site URLs, falling back to the bundled list if needed."""

    timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
    headers = {"User-Agent": USER_AGENT}

    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as client:
            response = await client.get(TOP_SITE_SOURCE_URL)
            response.raise_for_status()

            # Zip archives begin with the "PK" magic bytes; otherwise assume plain CSV.
            if response.content.startswith(b"PK"):
                candidates = _domains_from_zip(response.content)
            else:
                candidates = _domains_from_csv_text(response.text)

            safe_urls = _safe_top_urls(candidates, limit=limit)
            if safe_urls:
                return safe_urls, TOP_SITE_SOURCE_URL
    except Exception as exc:
        print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")

    return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
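For reference, a minimal sketch of how a first-launch indexer might consume this module. The module name `top_sites` and the `limit` value are assumptions for illustration; only `load_top_site_seed_urls` itself comes from this commit.

import asyncio

from top_sites import load_top_site_seed_urls  # module name assumed, not shown in this commit


async def main() -> None:
    # Ask for up to 50 seeds; the helper falls back to the bundled list on any failure.
    urls, source = await load_top_site_seed_urls(limit=50)
    print(f"seeding index with {len(urls)} sites from {source}")
    for url in urls[:5]:
        print("  ", url)


if __name__ == "__main__":
    asyncio.run(main())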