# sFetch/backend/content_filter.py

"""Adult-content filtering helpers used before URLs reach the index."""

from __future__ import annotations

import re
from urllib.parse import unquote, urlsplit

from config import ADULT_DOMAINS, ADULT_KEYWORDS

# Substrings that flag a host as adult wherever they occur in the hostname
# (is_adult_url tests them with ``marker in host``, so "xxx" also matches
# "myxxxsite.example").
EXPLICIT_HOST_MARKERS = (
    "porn",
    "xxx",
    "xvideo",
    "xnxx",
    "hentai",
    "camgirl",
    "camsoda",
    "chaturbate",
    "stripchat",
    "redtube",
)
# Words that flag a host only when they appear as a complete token, i.e. a
# run of [a-z0-9] characters delimited by dots, hyphens or other separators.
# Whole-token matching keeps hosts such as "essex.example" from matching "sex".
EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"}


def _clean_host(url: str) -> str:
    host = urlsplit(url.lower()).netloc
    return host.removeprefix("www.")


def _host_matches_blocked_domain(host: str, domain: str) -> bool:
    clean_domain = domain.lower().removeprefix("www.")
    return host == clean_domain or host.endswith(f".{clean_domain}")


def is_adult_url(url: str) -> bool:
    """Return True when a URL appears to point at pornographic/adult content.

    The checks run in order and stop at the first hit:

    1. the host equals, or is a subdomain of, an entry in ``ADULT_DOMAINS``;
    2. the host contains one of ``EXPLICIT_HOST_MARKERS`` as a substring;
    3. one of the host's alphanumeric tokens is in ``EXPLICIT_HOST_TOKENS``;
    4. one of the alphanumeric tokens of the percent-decoded path or query
       string is in ``ADULT_KEYWORDS``.

    Args:
        url: URL to classify; matching is case-insensitive.

    Returns:
        True if any of the checks matches, otherwise False.
    """

    lowered = url.lower()
    parsed = urlsplit(lowered)
    host = _clean_host(lowered)

    if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS):
        return True

    if any(marker in host for marker in EXPLICIT_HOST_MARKERS):
        return True

    host_tokens = set(re.split(r"[^a-z0-9]+", host))
    if not host_tokens.isdisjoint(EXPLICIT_HOST_TOKENS):
        return True

    # Percent-decode before tokenising: otherwise "/free%20porn" splits into
    # "free" and "20porn" and the keyword is missed. Lowercase again afterwards
    # because escapes such as "%50" decode to uppercase letters.
    decoded = unquote(f"{parsed.path} {parsed.query}").lower()
    path_tokens = set(re.split(r"[^a-z0-9]+", decoded))
    return any(keyword in path_tokens for keyword in ADULT_KEYWORDS)


def is_adult_text(text: str) -> bool:
    """Flag text only when at least three distinct adult keywords occur in it.

    Each entry of ``ADULT_KEYWORDS`` counts at most once and is matched as a
    case-insensitive substring, so a single incidental word never blocks a page.
    """

    haystack = text.lower()
    matched = [keyword for keyword in ADULT_KEYWORDS if keyword in haystack]
    return len(matched) >= 3