Initial commit
@@ -0,0 +1,309 @@
"""Async web crawler used to build the sFetch index."""

from __future__ import annotations

import asyncio
from collections import defaultdict
from typing import Iterable
from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

import httpx
from bs4 import BeautifulSoup

from config import (
    CRAWL_DELAY_SECONDS,
    DEFAULT_CRAWL_CONCURRENCY,
    MAX_CRAWL_DEPTH,
    MAX_PAGES_PER_DOMAIN,
    USER_AGENT,
)
from content_filter import is_adult_text, is_adult_url
from indexer import index_page


class sFetchBot:
    """A polite async crawler that stays within configurable crawl limits and filters adult content."""

    def __init__(
        self,
        max_depth: int = MAX_CRAWL_DEPTH,
        same_domain_only: bool = True,
        crawl_delay: float = CRAWL_DELAY_SECONDS,
        max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN,
        max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY,
        timeout_seconds: float = 15.0,
    ) -> None:
        self.max_depth = max_depth
        self.same_domain_only = same_domain_only
        self.crawl_delay = crawl_delay
        self.max_pages_per_domain = max_pages_per_domain
        self.max_concurrency = max(1, max_concurrency)
        self.timeout_seconds = timeout_seconds
        self.visited: set[str] = set()
        self.domain_counts: defaultdict[str, int] = defaultdict(int)
        self.robots_cache: dict[str, RobotFileParser] = {}
        self.indexed_count = 0
        self._state_lock = asyncio.Lock()
        self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency)
        self._client: httpx.AsyncClient | None = None

    async def start(self, seed_urls: list[str]) -> None:
        if not seed_urls:
            return

        timeout = httpx.Timeout(self.timeout_seconds)
        headers = {"User-Agent": USER_AGENT}
        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers=headers,
        ) as client:
            self._client = client
            tasks = []
            for seed_url in seed_urls:
                normalized_seed = self._normalize_url(seed_url)
                if normalized_seed is None:
                    print(f"sFetch: skipped {seed_url} (invalid URL)")
                    continue
                if is_adult_url(normalized_seed):
                    print(f"sFetch: skipped {seed_url} (adult content filtered)")
                    continue
                root_domain = urlsplit(normalized_seed).netloc.lower()
                tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0))

            if tasks:
                await asyncio.gather(*tasks, return_exceptions=True)

        self._client = None
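    # Per-URL pipeline: depth check -> URL normalization -> adult-URL filter ->
    # same-domain check -> visited/domain-cap/robots checks -> rate-limited fetch ->
    # HTML-only content-type check -> text/media extraction -> adult-text filter ->
    # indexing -> recursive crawl of the extracted links.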
    async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None:
        try:
            if depth > self.max_depth:
                return

            normalized_url = self._normalize_url(url)
            if normalized_url is None:
                return

            if is_adult_url(normalized_url):
                print(f"sFetch: skipped {normalized_url} (adult)")
                return

            parsed = urlsplit(normalized_url)
            current_domain = parsed.netloc.lower()
            if self.same_domain_only and current_domain != root_domain:
                return

            if await self._already_seen(normalized_url):
                return

            if await self._domain_limit_reached(current_domain):
                return

            if not await self._is_allowed_by_robots(normalized_url):
                return

            client = self._require_client()
            async with self._fetch_semaphore:
                await asyncio.sleep(self.crawl_delay)
                response = await client.get(normalized_url)
                response.raise_for_status()

            content_type = response.headers.get("content-type", "").lower()
            if "text/html" not in content_type:
                return

            title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text)

            if is_adult_text(body_text):
                print(f"sFetch: skipped {normalized_url} (adult text)")
                return

            await index_page(normalized_url, title, body_text, images, videos)
            await self._increment_domain_count(current_domain)
            self.indexed_count += 1
            print(f"sFetch: indexed {normalized_url}")

            for link in links:
                await self._crawl_url(link, root_domain, depth + 1)
        except httpx.HTTPError as exc:
            print(f"sFetch: HTTP error {url} ({exc})")
        except Exception as exc:
            print(f"sFetch: error {url} ({exc})")

    def _require_client(self) -> httpx.AsyncClient:
        if self._client is None:
            raise RuntimeError("Crawler client is not initialized.")
        return self._client

    async def _already_seen(self, url: str) -> bool:
        async with self._state_lock:
            if url in self.visited:
                return True
            self.visited.add(url)
            return False

    async def _domain_limit_reached(self, domain: str) -> bool:
        async with self._state_lock:
            return self.domain_counts[domain] >= self.max_pages_per_domain

    async def _increment_domain_count(self, domain: str) -> None:
        async with self._state_lock:
            self.domain_counts[domain] += 1
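    # robots.txt handling: one RobotFileParser is fetched and cached per
    # scheme://host; an unreachable or non-200 robots.txt is parsed as an
    # empty rule set, which allows every URL.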
    async def _is_allowed_by_robots(self, url: str) -> bool:
        parsed = urlsplit(url)
        robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}"
        parser = self.robots_cache.get(robots_key)
        if parser is None:
            parser = await self._fetch_robots_parser(robots_key)
            self.robots_cache[robots_key] = parser
        return parser.can_fetch(USER_AGENT, url)

    async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser:
        parser = RobotFileParser()
        robots_url = f"{domain_base}/robots.txt"
        parser.set_url(robots_url)

        try:
            client = self._require_client()
            response = await client.get(robots_url)
            if response.status_code == 200:
                parser.parse(response.text.splitlines())
            else:
                parser.parse([])
        except Exception:
            parser.parse([])
        return parser

    def _extract_page_content(
        self,
        url: str,
        html_text: str,
    ) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]:
        soup = BeautifulSoup(html_text, "html.parser")

        images = self._extract_images(url, soup)
        videos = self._extract_videos(url, soup)

        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        if not title:
            title = url

        body_text = soup.get_text(separator=" ", strip=True)
        links = self._extract_links(url, soup)
        return title, body_text, links, images, videos

    def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        images = []
        for img in soup.find_all("img", src=True):
            src = str(img["src"]).strip()
            if not src or src.startswith(("data:", "javascript:")):
                continue
            absolute_url = urljoin(base_url, src)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                alt = str(img.get("alt", "")).strip()
                images.append({"url": normalized_url, "alt_text": alt})
        return self._dedupe_media(images)

    def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        videos: list[dict[str, str]] = []

        for video in soup.find_all("video"):
            if video.get("src"):
                normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})
            for source in video.find_all("source", src=True):
                normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})

        for iframe in soup.find_all("iframe", src=True):
            raw_src = str(iframe["src"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_src))
            if normalized and self._is_video_url(normalized):
                title = str(iframe.get("title") or iframe.get("aria-label") or "").strip()
                videos.append({"url": normalized, "title": title})

        for tag in soup.find_all("a", href=True):
            raw_href = str(tag["href"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_href))
            if normalized and self._is_video_url(normalized):
                title = " ".join(tag.stripped_strings).strip()
                videos.append({"url": normalized, "title": title})

        return self._dedupe_media(videos)

    def _is_video_url(self, url: str) -> bool:
        lowered = url.lower()
        return any(
            marker in lowered
            for marker in (
                "youtube.com/watch",
                "youtube.com/embed/",
                "youtu.be/",
                "vimeo.com/",
                ".mp4",
                ".webm",
                ".mov",
                ".m3u8",
            )
        )

    def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]:
        seen: set[str] = set()
        unique: list[dict[str, str]] = []
        for item in items:
            media_url = item.get("url")
            if not media_url or media_url in seen:
                continue
            seen.add(media_url)
            unique.append(item)
        return unique

    def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]:
        collected_links: list[str] = []
        for tag in soup.find_all("a", href=True):
            href = str(tag["href"]).strip()
            if not href or href.startswith(("javascript:", "mailto:", "tel:")):
                continue
            absolute_url = urljoin(base_url, href)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                collected_links.append(normalized_url)
        return self._dedupe_links(collected_links)

    def _dedupe_links(self, links: Iterable[str]) -> list[str]:
        seen: set[str] = set()
        unique_links: list[str] = []
        for link in links:
            if link in seen:
                continue
            seen.add(link)
            unique_links.append(link)
        return unique_links
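    # URL normalization: strips the fragment, requires an http(s) scheme and a
    # non-empty host, and lowercases the scheme and host so duplicate URLs
    # compare equal in the visited set.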
    def _normalize_url(self, url: str) -> str | None:
        if not url:
            return None

        clean_url, _ = urldefrag(url.strip())
        parsed = urlsplit(clean_url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            return None

        normalized = parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=parsed.netloc.lower(),
        )
        return urlunsplit(normalized)
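
# Minimal usage sketch (not part of the original module): assumes the config,
# content_filter, and indexer modules are importable and that index_page is an
# async coroutine, as the crawler above expects. The seed URL is a placeholder.
if __name__ == "__main__":
    async def _demo() -> None:
        bot = sFetchBot(max_depth=1, max_pages_per_domain=5)
        await bot.start(["https://example.com/"])
        print(f"sFetch: finished, {bot.indexed_count} pages indexed")

    asyncio.run(_demo())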