Files
sFetch/backend/content_filter.py
T
Ned Halksworth e0f2eedcd9 initial commit
2026-05-04 19:31:46 +01:00

62 lines
1.7 KiB
Python

"""Adult-content filtering helpers used before URLs reach the index."""
from __future__ import annotations
import re
from urllib.parse import urlsplit
from config import ADULT_DOMAINS, ADULT_KEYWORDS
# Substrings that flag a host as explicit wherever they occur inside it
# (matched with ``in`` against the whole hostname).
EXPLICIT_HOST_MARKERS = (
    "porn",
    "xxx",
    "xvideo",
    "xnxx",
    "hentai",
    "camgirl",
    "camsoda",
    "chaturbate",
    "stripchat",
    "redtube",
)

# Words that flag a host only when they appear as a complete token, i.e. a
# full run of [a-z0-9] between separators, so that e.g. "essex" is not caught
# by "sex".
EXPLICIT_HOST_TOKENS = {
    "adult",
    "bdsm",
    "erotic",
    "escort",
    "nude",
    "sex",
    "sexy",
}
def _clean_host(url: str) -> str:
host = urlsplit(url.lower()).netloc
return host.removeprefix("www.")
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
clean_domain = domain.lower().removeprefix("www.")
return host == clean_domain or host.endswith(f".{clean_domain}")
def is_adult_url(url: str) -> bool:
    """Return True when a URL appears to point at pornographic/adult content.

    The lowercased URL is checked in order, stopping at the first hit:

    1. the host equals, or is a subdomain of, an entry in ``ADULT_DOMAINS``;
    2. the host contains any ``EXPLICIT_HOST_MARKERS`` substring;
    3. the host, split on non-alphanumerics, contains an
       ``EXPLICIT_HOST_TOKENS`` word as a whole token;
    4. the path or query, split the same way, contains an ``ADULT_KEYWORDS``
       entry as a whole token.
    """
    normalized = url.lower()
    host = _clean_host(normalized)

    for blocked in ADULT_DOMAINS:
        if _host_matches_blocked_domain(host, blocked):
            return True

    for marker in EXPLICIT_HOST_MARKERS:
        if marker in host:
            return True

    host_words = set(re.split(r"[^a-z0-9]+", host))
    if not host_words.isdisjoint(EXPLICIT_HOST_TOKENS):
        return True

    # Path and query are tokenised together; only whole-token matches count.
    parts = urlsplit(normalized)
    path_words = set(re.split(r"[^a-z0-9]+", f"{parts.path} {parts.query}"))
    return not path_words.isdisjoint(ADULT_KEYWORDS)
def is_adult_text(text: str) -> bool:
    """Return True when at least three distinct adult keywords occur in *text*.

    Each ``ADULT_KEYWORDS`` entry counts at most once and is matched as a
    substring of the lowercased text. The threshold of three keeps a single
    incidental word from blocking a page.
    """
    haystack = text.lower()
    matched = [keyword for keyword in ADULT_KEYWORDS if keyword in haystack]
    return len(matched) >= 3