initial commit
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
"""Adult-content filtering helpers used before URLs reach the index."""
|
||||
|
||||
from __future__ import annotations

import re
from urllib.parse import unquote, urlsplit

from config import ADULT_DOMAINS, ADULT_KEYWORDS
|
||||
|
||||
# Substrings that flag a host wherever they occur inside it.
EXPLICIT_HOST_MARKERS = (
    "porn", "xxx", "xvideo", "xnxx", "hentai",
    "camgirl", "camsoda", "chaturbate", "stripchat", "redtube",
)

# Words that flag a host only when they form one of its whole alphanumeric
# tokens (the host is split on every non [a-z0-9] run before the lookup).
EXPLICIT_HOST_TOKENS = {
    "sex",
    "sexy",
    "adult",
    "nude",
    "erotic",
    "escort",
    "bdsm",
}
|
||||
|
||||
|
||||
def _clean_host(url: str) -> str:
|
||||
host = urlsplit(url.lower()).netloc
|
||||
return host.removeprefix("www.")
|
||||
|
||||
|
||||
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
|
||||
clean_domain = domain.lower().removeprefix("www.")
|
||||
return host == clean_domain or host.endswith(f".{clean_domain}")
|
||||
|
||||
|
||||
def is_adult_url(url: str) -> bool:
    """Return True when a URL appears to point at pornographic/adult content.

    The checks run in this order and stop at the first hit:

    1. the host is, or is a subdomain of, an entry in ``ADULT_DOMAINS``;
    2. the host contains one of ``EXPLICIT_HOST_MARKERS`` as a substring;
    3. one of the host's alphanumeric tokens is in ``EXPLICIT_HOST_TOKENS``;
    4. one of the path/query tokens equals an entry in ``ADULT_KEYWORDS``
       (compared case-insensitively, after percent-decoding).
    """
    lowered = url.lower()
    parsed = urlsplit(lowered)
    host = _clean_host(lowered)

    if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS):
        return True

    # Markers match anywhere inside the host; the tokens checked next must
    # match a whole token.
    if any(marker in host for marker in EXPLICIT_HOST_MARKERS):
        return True

    host_tokens = set(re.split(r"[^a-z0-9]+", host))
    if not EXPLICIT_HOST_TOKENS.isdisjoint(host_tokens):
        return True

    # Decode percent-escapes before tokenising: the raw text of
    # "?q=free%20porn" splits into "q", "free", "20porn" and hides the keyword.
    # Lowercase again because an escape can decode to an upper-case letter.
    path_and_query = unquote(f"{parsed.path} {parsed.query}").lower()
    path_tokens = set(re.split(r"[^a-z0-9]+", path_and_query))
    # The tokens are lower-case, so compare keywords in lower-case too.
    return any(keyword.lower() in path_tokens for keyword in ADULT_KEYWORDS)
|
||||
|
||||
|
||||
def is_adult_text(text: str, *, min_hits: int = 3) -> bool:
    """Return True when *text* contains at least *min_hits* distinct keywords.

    Use a conservative keyword threshold so one incidental word does not
    block a page.

    Each entry of ``ADULT_KEYWORDS`` is matched case-insensitively as a
    substring of *text* and counts at most once, however often it occurs.

    Args:
        text: The page text to inspect.
        min_hits: Number of distinct keywords that must be present.
    """
    lowered = text.lower()
    # Lowercase the entries to match the lowered text, skip blank entries
    # (every string contains ""), and de-duplicate so a repeated config entry
    # cannot count twice.
    keywords = {keyword.lower() for keyword in ADULT_KEYWORDS if keyword}
    hits = sum(keyword in lowered for keyword in keywords)
    return hits >= min_hits
|
||||
Reference in New Issue
Block a user