"""Adult-content filtering helpers used before URLs reach the index.""" from __future__ import annotations import re from urllib.parse import urlsplit from config import ADULT_DOMAINS, ADULT_KEYWORDS EXPLICIT_HOST_MARKERS = ( "porn", "xxx", "xvideo", "xnxx", "hentai", "camgirl", "camsoda", "chaturbate", "stripchat", "redtube", ) EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"} def _clean_host(url: str) -> str: host = urlsplit(url.lower()).netloc return host.removeprefix("www.") def _host_matches_blocked_domain(host: str, domain: str) -> bool: clean_domain = domain.lower().removeprefix("www.") return host == clean_domain or host.endswith(f".{clean_domain}") def is_adult_url(url: str) -> bool: """Return True when a URL appears to point at pornographic/adult content.""" lowered = url.lower() parsed = urlsplit(lowered) host = _clean_host(lowered) if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS): return True if any(marker in host for marker in EXPLICIT_HOST_MARKERS): return True host_tokens = set(re.split(r"[^a-z0-9]+", host)) if any(token in host_tokens for token in EXPLICIT_HOST_TOKENS): return True path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}")) return any(keyword in path_tokens for keyword in ADULT_KEYWORDS) def is_adult_text(text: str) -> bool: """Use a conservative keyword threshold so one incidental word does not block a page.""" lowered = text.lower() hits = sum(1 for keyword in ADULT_KEYWORDS if keyword in lowered) return hits >= 3