62 lines
1.7 KiB
Python
62 lines
1.7 KiB
Python
"""Adult-content filtering helpers used before URLs reach the index."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import urlsplit
|
|
|
|
from config import ADULT_DOMAINS, ADULT_KEYWORDS
|
|
|
|
# Fragments that flag a host as explicit wherever they occur inside it
# (checked with a plain substring test against the hostname).
EXPLICIT_HOST_MARKERS = (
    "porn", "xxx", "xvideo", "xnxx", "hentai",
    "camgirl", "camsoda", "chaturbate", "stripchat", "redtube",
)

# Words that flag a host only when they form a complete alphanumeric token of
# the hostname, so that longer words merely containing them are not matched.
EXPLICIT_HOST_TOKENS = {
    "sex",
    "sexy",
    "adult",
    "nude",
    "erotic",
    "escort",
    "bdsm",
}
|
|
|
|
|
|
def _clean_host(url: str) -> str:
|
|
host = urlsplit(url.lower()).netloc
|
|
return host.removeprefix("www.")
|
|
|
|
|
|
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
|
|
clean_domain = domain.lower().removeprefix("www.")
|
|
return host == clean_domain or host.endswith(f".{clean_domain}")
|
|
|
|
|
|
def is_adult_url(url: str) -> bool:
    """Decide whether *url* looks like it leads to pornographic/adult content.

    The lower-cased URL is run through four checks, and the first one that
    fires makes the function return True:

    1. the host equals, or is a subdomain of, an entry in ``ADULT_DOMAINS``;
    2. the host contains any ``EXPLICIT_HOST_MARKERS`` fragment as a substring;
    3. any alphanumeric token of the host is in ``EXPLICIT_HOST_TOKENS``;
    4. any alphanumeric token of the path or query is in ``ADULT_KEYWORDS``.

    Returns False when none of the checks fire.
    """
    normalized = url.lower()
    parts = urlsplit(normalized)
    host = _clean_host(normalized)

    # 1. Explicit blocklist: exact domain or any of its subdomains.
    for blocked in ADULT_DOMAINS:
        if _host_matches_blocked_domain(host, blocked):
            return True

    # 2. Fragments that are distinctive enough to match anywhere in the host.
    for marker in EXPLICIT_HOST_MARKERS:
        if marker in host:
            return True

    # 3. Words that only count when they are a whole token of the host.
    host_words = set(re.split(r"[^a-z0-9]+", host))
    if not host_words.isdisjoint(EXPLICIT_HOST_TOKENS):
        return True

    # 4. Whole-token keyword match over the path and query string.
    path_words = set(re.split(r"[^a-z0-9]+", f"{parts.path} {parts.query}"))
    for keyword in ADULT_KEYWORDS:
        if keyword in path_words:
            return True
    return False
|
|
|
|
|
|
def is_adult_text(text: str) -> bool:
    """Return True when at least three ``ADULT_KEYWORDS`` entries occur in *text*.

    Each keyword is looked up as a substring of the lower-cased text and is
    counted once per entry, however often it appears. Requiring three hits
    keeps a page with a single incidental word from being blocked.
    """
    haystack = text.lower()
    matches = 0
    for keyword in ADULT_KEYWORDS:
        if keyword in haystack:
            matches += 1
    return matches >= 3
|