initial commit

.gitignore
@@ -0,0 +1,16 @@
.DS_Store
__pycache__/
*.py[cod]
.pytest_cache/

.venv/
venv/
backend/venv/

*.db
*.db-*
*.sqlite
*.sqlite3

.env
.env.*

README.md
@@ -0,0 +1,119 @@
# sFetch

sFetch is a full-stack search engine prototype with a lightweight Google/DDG-inspired frontend, a FastAPI search API, and an async crawler that indexes pages into a local SQLite FTS5 database.

On first backend launch, sFetch downloads the latest Tranco top-site list, filters out pornographic/adult domains, and seeds up to 1,000 non-adult sites if that seed has not already been recorded in the database.

## Project Structure

```text
sFetch/
├── backend/
│   ├── main.py
│   ├── crawler.py
│   ├── top_sites.py
│   ├── content_filter.py
│   ├── indexer.py
│   ├── searcher.py
│   ├── models.py
│   ├── database.py
│   ├── config.py
│   └── requirements.txt
├── frontend/
│   ├── index.html
│   └── results.html
└── README.md
```

## Setup

1. Create a virtual environment and install the backend dependencies:

   ```bash
   cd backend
   python3 -m venv venv
   source venv/bin/activate
   pip install -r requirements.txt
   ```

2. Start the API:

   ```bash
   uvicorn main:app --reload
   ```

3. Open `frontend/index.html` in your browser.

The frontend talks to the API through `const API_BASE = "http://localhost:8000";` at the top of each page script.
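
Once both pieces are running, a quick smoke test against the health-check endpoint confirms the wiring (a minimal sketch using the project's own `httpx` dependency; port and payload match the defaults above):

```python
# Minimal smoke test for a locally running sFetch backend.
import httpx

API_BASE = "http://localhost:8000"  # same default the frontend uses

response = httpx.get(f"{API_BASE}/")
response.raise_for_status()
print(response.json())  # {"status": "sFetch is alive"}
```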

## Crawling

The home page has index controls for:

- seeding the top 1,000 non-adult sites
- launching a custom crawl with seed URLs, depth, per-domain page limits, and same-domain filtering
- viewing current index and seed status

You can also call the API directly:

```bash
curl -X POST "http://localhost:8000/crawl" \
  -H "Content-Type: application/json" \
  -d '{
    "seed_urls": ["https://example.com"],
    "max_depth": 2,
    "max_pages_per_domain": 50,
    "same_domain_only": true
  }'
```

Seed the top-site list manually:

```bash
curl -X POST "http://localhost:8000/crawl/top-sites"
```

The crawler:

- respects `robots.txt`
- filters adult URLs and adult-heavy page text
- stays on the same domain by default
- avoids revisiting URLs
- indexes HTML pages, images, and videos into SQLite
- records top-site seeding completion in `app_meta`
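
The crawler can also be driven straight from Python without the API layer; a minimal sketch, run from `backend/` with the dependencies installed (`init_db()` creates the tables on first use, exactly as the server does at startup):

```python
# Sketch: run sFetchBot directly, bypassing the FastAPI layer.
import asyncio

from crawler import sFetchBot
from database import init_db


async def main() -> None:
    await init_db()  # ensure the SQLite tables and FTS5 indexes exist
    bot = sFetchBot(max_depth=1, same_domain_only=True, max_pages_per_domain=10)
    await bot.start(["https://example.com"])
    print(f"indexed {bot.indexed_count} pages")


asyncio.run(main())
```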

## API Endpoints

| Method | Path | Purpose |
| --- | --- | --- |
| `GET` | `/` | Health check |
| `GET` | `/search` | Full-text search endpoint |
| `POST` | `/crawl` | Start a custom background crawl job |
| `POST` | `/crawl/top-sites` | Queue the top-site seed crawl |
| `GET` | `/crawl/top-sites/status` | Check top-site seed state |
| `GET` | `/stats` | Total indexed pages and latest index time |
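
`/search` takes `q` plus optional `type` (`web`, `image`, or `video`), `limit` (1-50), and `offset` parameters; a short client sketch:

```python
# Sketch: query the search endpoint from Python.
import httpx

params = {"q": "python tutorial", "type": "web", "limit": 10, "offset": 0}
response = httpx.get("http://localhost:8000/search", params=params)
response.raise_for_status()

payload = response.json()
print(payload["total"], "results")
for result in payload["results"]:
    print(result["title"], "-", result["url"])
```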

## Configuration

sFetch's crawl and storage behavior lives in `backend/config.py`:

| Setting | Description |
| --- | --- |
| `MAX_CRAWL_DEPTH` | Default link depth followed from each seed URL |
| `MAX_PAGES_PER_DOMAIN` | Default per-domain crawl cap |
| `CRAWL_DELAY_SECONDS` | Delay before each request |
| `DEFAULT_CRAWL_CONCURRENCY` | Concurrent fetch limit |
| `DB_PATH` | SQLite database path |
| `TOP_SITE_SOURCE_URL` | Top-site list source |
| `TOP_SITE_SEED_LIMIT` | Number of safe top sites to seed |
| `USER_AGENT` | User agent sent by `sFetchBot` |
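
The module itself is too large to show in this commit view (it also bundles the adult-domain and keyword lists plus the fallback top-site list), but a sketch of the settings above could look like this — every value here is an illustrative assumption, not the committed default:

```python
# Illustrative sketch of the tunables in backend/config.py; values are assumptions.
MAX_CRAWL_DEPTH = 2
MAX_PAGES_PER_DOMAIN = 50
CRAWL_DELAY_SECONDS = 1.0
DEFAULT_CRAWL_CONCURRENCY = 8
DB_PATH = "sfetch.db"
TOP_SITE_SOURCE_URL = "https://tranco-list.eu/top-1m.csv.zip"  # assumed Tranco ZIP URL
TOP_SITE_SEED_LIMIT = 1000  # matches the "top 1,000" seeding described above
USER_AGENT = "sFetchBot/1.0 (+https://example.com/bot)"  # hypothetical UA string
```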

## Tech Stack

| Layer | Technology |
| --- | --- |
| Frontend | HTML, TailwindCSS CDN, Vanilla JavaScript |
| Backend | Python, FastAPI |
| Crawler | Python, `httpx`, `BeautifulSoup4`, `asyncio` |
| Search Index | SQLite FTS5 via `aiosqlite` |
| Top Sites | Tranco daily top-site ZIP with bundled fallback |

backend/config.py
@@ -0,0 +1,1226 @@ (file diff suppressed because it is too large)

backend/content_filter.py
@@ -0,0 +1,61 @@
"""Adult-content filtering helpers used before URLs reach the index."""

from __future__ import annotations

import re
from urllib.parse import urlsplit

from config import ADULT_DOMAINS, ADULT_KEYWORDS

EXPLICIT_HOST_MARKERS = (
    "porn",
    "xxx",
    "xvideo",
    "xnxx",
    "hentai",
    "camgirl",
    "camsoda",
    "chaturbate",
    "stripchat",
    "redtube",
)
EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"}


def _clean_host(url: str) -> str:
    host = urlsplit(url.lower()).netloc
    return host.removeprefix("www.")


def _host_matches_blocked_domain(host: str, domain: str) -> bool:
    clean_domain = domain.lower().removeprefix("www.")
    return host == clean_domain or host.endswith(f".{clean_domain}")


def is_adult_url(url: str) -> bool:
    """Return True when a URL appears to point at pornographic/adult content."""

    lowered = url.lower()
    parsed = urlsplit(lowered)
    host = _clean_host(lowered)

    if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS):
        return True

    if any(marker in host for marker in EXPLICIT_HOST_MARKERS):
        return True

    host_tokens = set(re.split(r"[^a-z0-9]+", host))
    if any(token in host_tokens for token in EXPLICIT_HOST_TOKENS):
        return True

    path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}"))
    return any(keyword in path_tokens for keyword in ADULT_KEYWORDS)


def is_adult_text(text: str) -> bool:
    """Use a conservative keyword threshold so one incidental word does not block a page."""

    lowered = text.lower()
    hits = sum(1 for keyword in ADULT_KEYWORDS if keyword in lowered)
    return hits >= 3
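
A quick check of the URL filter's behavior (a sketch; verdicts that hinge on keywords depend on the `ADULT_KEYWORDS` list bundled in `config.py`):

```python
# Sketch: exercising is_adult_url from backend/ (keyword-based verdicts
# depend on the ADULT_KEYWORDS list defined in config.py).
from content_filter import is_adult_url

print(is_adult_url("https://docs.python.org/3/"))       # False: ordinary host
print(is_adult_url("https://www.redtube.com/video/1"))  # True: explicit host marker
print(is_adult_url("https://adult.example.com/"))       # True: "adult" host token
```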

backend/crawler.py
@@ -0,0 +1,309 @@
"""Async web crawler used to build the sFetch index."""

from __future__ import annotations

import asyncio
from collections import defaultdict
from typing import Iterable
from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

import httpx
from bs4 import BeautifulSoup

from config import (
    CRAWL_DELAY_SECONDS,
    DEFAULT_CRAWL_CONCURRENCY,
    MAX_CRAWL_DEPTH,
    MAX_PAGES_PER_DOMAIN,
    USER_AGENT,
)
from content_filter import is_adult_text, is_adult_url
from indexer import index_page


class sFetchBot:
    """A polite async crawler that stays within configurable crawl limits and filters adult content."""

    def __init__(
        self,
        max_depth: int = MAX_CRAWL_DEPTH,
        same_domain_only: bool = True,
        crawl_delay: float = CRAWL_DELAY_SECONDS,
        max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN,
        max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY,
        timeout_seconds: float = 15.0,
    ) -> None:
        self.max_depth = max_depth
        self.same_domain_only = same_domain_only
        self.crawl_delay = crawl_delay
        self.max_pages_per_domain = max_pages_per_domain
        self.max_concurrency = max(1, max_concurrency)
        self.timeout_seconds = timeout_seconds
        self.visited: set[str] = set()
        self.domain_counts: defaultdict[str, int] = defaultdict(int)
        self.robots_cache: dict[str, RobotFileParser] = {}
        self.indexed_count = 0
        self._state_lock = asyncio.Lock()
        self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency)
        self._client: httpx.AsyncClient | None = None

    async def start(self, seed_urls: list[str]) -> None:
        if not seed_urls:
            return

        timeout = httpx.Timeout(self.timeout_seconds)
        headers = {"User-Agent": USER_AGENT}
        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers=headers,
        ) as client:
            self._client = client
            tasks = []
            for seed_url in seed_urls:
                normalized_seed = self._normalize_url(seed_url)
                if normalized_seed is None:
                    print(f"sFetch: skipped {seed_url} (invalid URL)")
                    continue
                if is_adult_url(normalized_seed):
                    print(f"sFetch: skipped {seed_url} (adult content filtered)")
                    continue
                root_domain = urlsplit(normalized_seed).netloc.lower()
                tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0))

            if tasks:
                await asyncio.gather(*tasks, return_exceptions=True)

        self._client = None

    async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None:
        try:
            if depth > self.max_depth:
                return

            normalized_url = self._normalize_url(url)
            if normalized_url is None:
                return

            if is_adult_url(normalized_url):
                print(f"sFetch: skipped {normalized_url} (adult)")
                return

            parsed = urlsplit(normalized_url)
            current_domain = parsed.netloc.lower()
            if self.same_domain_only and current_domain != root_domain:
                return

            if await self._already_seen(normalized_url):
                return

            if await self._domain_limit_reached(current_domain):
                return

            if not await self._is_allowed_by_robots(normalized_url):
                return

            client = self._require_client()
            async with self._fetch_semaphore:
                await asyncio.sleep(self.crawl_delay)
                response = await client.get(normalized_url)
                response.raise_for_status()

            content_type = response.headers.get("content-type", "").lower()
            if "text/html" not in content_type:
                return

            title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text)

            if is_adult_text(body_text):
                print(f"sFetch: skipped {normalized_url} (adult text)")
                return

            await index_page(normalized_url, title, body_text, images, videos)
            await self._increment_domain_count(current_domain)
            self.indexed_count += 1
            print(f"sFetch: indexed {normalized_url}")

            for link in links:
                await self._crawl_url(link, root_domain, depth + 1)
        except httpx.HTTPError as exc:
            print(f"sFetch: HTTP error {url} ({exc})")
        except Exception as exc:
            print(f"sFetch: error {url} ({exc})")

    def _require_client(self) -> httpx.AsyncClient:
        if self._client is None:
            raise RuntimeError("Crawler client is not initialized.")
        return self._client

    async def _already_seen(self, url: str) -> bool:
        async with self._state_lock:
            if url in self.visited:
                return True
            self.visited.add(url)
            return False

    async def _domain_limit_reached(self, domain: str) -> bool:
        async with self._state_lock:
            return self.domain_counts[domain] >= self.max_pages_per_domain

    async def _increment_domain_count(self, domain: str) -> None:
        async with self._state_lock:
            self.domain_counts[domain] += 1

    async def _is_allowed_by_robots(self, url: str) -> bool:
        parsed = urlsplit(url)
        robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}"
        parser = self.robots_cache.get(robots_key)
        if parser is None:
            parser = await self._fetch_robots_parser(robots_key)
            self.robots_cache[robots_key] = parser
        return parser.can_fetch(USER_AGENT, url)

    async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser:
        parser = RobotFileParser()
        robots_url = f"{domain_base}/robots.txt"
        parser.set_url(robots_url)

        try:
            client = self._require_client()
            response = await client.get(robots_url)
            if response.status_code == 200:
                parser.parse(response.text.splitlines())
            else:
                parser.parse([])
        except Exception:
            parser.parse([])
        return parser

    def _extract_page_content(
        self,
        url: str,
        html_text: str,
    ) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]:
        soup = BeautifulSoup(html_text, "html.parser")

        images = self._extract_images(url, soup)
        videos = self._extract_videos(url, soup)

        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        if not title:
            title = url

        body_text = soup.get_text(separator=" ", strip=True)
        links = self._extract_links(url, soup)
        return title, body_text, links, images, videos

    def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        images = []
        for img in soup.find_all("img", src=True):
            src = str(img["src"]).strip()
            if not src or src.startswith(("data:", "javascript:")):
                continue
            absolute_url = urljoin(base_url, src)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                alt = str(img.get("alt", "")).strip()
                images.append({"url": normalized_url, "alt_text": alt})
        return self._dedupe_media(images)

    def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        videos: list[dict[str, str]] = []

        for video in soup.find_all("video"):
            if video.get("src"):
                normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})
            for source in video.find_all("source", src=True):
                normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})

        for iframe in soup.find_all("iframe", src=True):
            raw_src = str(iframe["src"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_src))
            if normalized and self._is_video_url(normalized):
                title = str(iframe.get("title") or iframe.get("aria-label") or "").strip()
                videos.append({"url": normalized, "title": title})

        for tag in soup.find_all("a", href=True):
            raw_href = str(tag["href"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_href))
            if normalized and self._is_video_url(normalized):
                title = " ".join(tag.stripped_strings).strip()
                videos.append({"url": normalized, "title": title})

        return self._dedupe_media(videos)

    def _is_video_url(self, url: str) -> bool:
        lowered = url.lower()
        return any(
            marker in lowered
            for marker in (
                "youtube.com/watch",
                "youtube.com/embed/",
                "youtu.be/",
                "vimeo.com/",
                ".mp4",
                ".webm",
                ".mov",
                ".m3u8",
            )
        )

    def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]:
        seen: set[str] = set()
        unique: list[dict[str, str]] = []
        for item in items:
            media_url = item.get("url")
            if not media_url or media_url in seen:
                continue
            seen.add(media_url)
            unique.append(item)
        return unique

    def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]:
        collected_links: list[str] = []
        for tag in soup.find_all("a", href=True):
            href = str(tag["href"]).strip()
            if not href or href.startswith(("javascript:", "mailto:", "tel:")):
                continue
            absolute_url = urljoin(base_url, href)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                collected_links.append(normalized_url)
        return self._dedupe_links(collected_links)

    def _dedupe_links(self, links: Iterable[str]) -> list[str]:
        seen: set[str] = set()
        unique_links: list[str] = []
        for link in links:
            if link in seen:
                continue
            seen.add(link)
            unique_links.append(link)
        return unique_links

    def _normalize_url(self, url: str) -> str | None:
        if not url:
            return None

        clean_url, _ = urldefrag(url.strip())
        parsed = urlsplit(clean_url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            return None

        normalized = parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=parsed.netloc.lower(),
        )
        return urlunsplit(normalized)
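
`_normalize_url` is the crawler's canonical-form gate: it strips fragments, lowercases the scheme and host, and rejects anything that is not absolute HTTP(S). A short sketch of its behavior:

```python
# Sketch: URL normalization as implemented by sFetchBot._normalize_url.
from crawler import sFetchBot

bot = sFetchBot()
print(bot._normalize_url("HTTPS://Example.COM/Docs#intro"))  # https://example.com/Docs
print(bot._normalize_url("ftp://example.com/file"))          # None: non-HTTP(S) scheme
print(bot._normalize_url("/relative/path"))                  # None: no host
```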

backend/database.py
@@ -0,0 +1,395 @@
"""Async SQLite helpers for sFetch's crawl index."""

from __future__ import annotations

from contextlib import asynccontextmanager
from typing import Any, AsyncIterator

import aiosqlite

from config import DB_PATH


@asynccontextmanager
async def _get_connection() -> AsyncIterator[aiosqlite.Connection]:
    async with aiosqlite.connect(DB_PATH) as connection:
        connection.row_factory = aiosqlite.Row
        await connection.execute("PRAGMA foreign_keys = ON;")
        await connection.execute("PRAGMA journal_mode = WAL;")
        yield connection


def _to_fts_query(query: str) -> str:
    tokens: list[str] = []
    for raw_token in query.split():
        token = raw_token.strip()
        if not token:
            continue
        escaped = token.replace('"', '""')
        tokens.append(f'"{escaped}"')
    return " OR ".join(tokens)


async def init_db() -> None:
    async with _get_connection() as connection:
        await connection.executescript(
            """
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                title TEXT,
                body_text TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );

            CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts
            USING fts5(title, body_text, content='pages', content_rowid='id');

            CREATE TRIGGER IF NOT EXISTS pages_ai
            AFTER INSERT ON pages
            BEGIN
                INSERT INTO pages_fts(rowid, title, body_text)
                VALUES (new.id, new.title, new.body_text);
            END;

            CREATE TRIGGER IF NOT EXISTS pages_ad
            AFTER DELETE ON pages
            BEGIN
                INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
                VALUES ('delete', old.id, old.title, old.body_text);
            END;

            CREATE TRIGGER IF NOT EXISTS pages_au
            AFTER UPDATE ON pages
            BEGIN
                INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
                VALUES ('delete', old.id, old.title, old.body_text);
                INSERT INTO pages_fts(rowid, title, body_text)
                VALUES (new.id, new.title, new.body_text);
            END;

            CREATE TABLE IF NOT EXISTS images (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                page_url TEXT NOT NULL,
                alt_text TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
            );

            CREATE VIRTUAL TABLE IF NOT EXISTS images_fts
            USING fts5(alt_text, content='images', content_rowid='id');

            CREATE TRIGGER IF NOT EXISTS images_ai
            AFTER INSERT ON images
            BEGIN
                INSERT INTO images_fts(rowid, alt_text)
                VALUES (new.id, new.alt_text);
            END;

            CREATE TRIGGER IF NOT EXISTS images_ad
            AFTER DELETE ON images
            BEGIN
                INSERT INTO images_fts(images_fts, rowid, alt_text)
                VALUES ('delete', old.id, old.alt_text);
            END;

            CREATE TRIGGER IF NOT EXISTS images_au
            AFTER UPDATE ON images
            BEGIN
                INSERT INTO images_fts(images_fts, rowid, alt_text)
                VALUES ('delete', old.id, old.alt_text);
                INSERT INTO images_fts(rowid, alt_text)
                VALUES (new.id, new.alt_text);
            END;

            CREATE TABLE IF NOT EXISTS videos (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                page_url TEXT NOT NULL,
                title TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
            );

            CREATE VIRTUAL TABLE IF NOT EXISTS videos_fts
            USING fts5(title, content='videos', content_rowid='id');

            CREATE TRIGGER IF NOT EXISTS videos_ai
            AFTER INSERT ON videos
            BEGIN
                INSERT INTO videos_fts(rowid, title)
                VALUES (new.id, new.title);
            END;

            CREATE TRIGGER IF NOT EXISTS videos_ad
            AFTER DELETE ON videos
            BEGIN
                INSERT INTO videos_fts(videos_fts, rowid, title)
                VALUES ('delete', old.id, old.title);
            END;

            CREATE TRIGGER IF NOT EXISTS videos_au
            AFTER UPDATE ON videos
            BEGIN
                INSERT INTO videos_fts(videos_fts, rowid, title)
                VALUES ('delete', old.id, old.title);
                INSERT INTO videos_fts(rowid, title)
                VALUES (new.id, new.title);
            END;

            CREATE TABLE IF NOT EXISTS app_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );
            """
        )
        await connection.commit()


async def get_meta_value(key: str) -> str | None:
    async with _get_connection() as connection:
        cursor = await connection.execute(
            "SELECT value FROM app_meta WHERE key = ?",
            (key,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        return str(row["value"]) if row else None


async def set_meta_value(key: str, value: str) -> None:
    async with _get_connection() as connection:
        await connection.execute(
            """
            INSERT INTO app_meta (key, value)
            VALUES (?, ?)
            ON CONFLICT(key) DO UPDATE SET
                value = excluded.value,
                updated_at = CURRENT_TIMESTAMP
            """,
            (key, value),
        )
        await connection.commit()


async def insert_page(url: str, title: str, body_text: str) -> int:
    async with _get_connection() as connection:
        await connection.execute(
            """
            INSERT INTO pages (url, title, body_text)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                title = excluded.title,
                body_text = excluded.body_text,
                indexed_at = CURRENT_TIMESTAMP
            """,
            (url, title, body_text),
        )
        await connection.commit()

        cursor = await connection.execute(
            "SELECT id FROM pages WHERE url = ?",
            (url,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        if row is None:
            raise RuntimeError("Inserted page could not be reloaded from the database.")
        return int(row["id"])


async def insert_image(url: str, page_url: str, alt_text: str) -> None:
    async with _get_connection() as connection:
        await connection.execute(
            """
            INSERT INTO images (url, page_url, alt_text)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_url = excluded.page_url,
                alt_text = excluded.alt_text,
                indexed_at = CURRENT_TIMESTAMP
            """,
            (url, page_url, alt_text),
        )
        await connection.commit()


async def insert_video(url: str, page_url: str, title: str) -> None:
    async with _get_connection() as connection:
        await connection.execute(
            """
            INSERT INTO videos (url, page_url, title)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_url = excluded.page_url,
                title = excluded.title,
                indexed_at = CURRENT_TIMESTAMP
            """,
            (url, page_url, title),
        )
        await connection.commit()


async def search_pages(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []

    safe_limit = max(1, min(limit, 50))
    safe_offset = max(0, offset)
    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT
                p.id,
                p.url,
                p.title,
                p.body_text,
                p.indexed_at
            FROM pages_fts
            JOIN pages AS p ON p.id = pages_fts.rowid
            WHERE pages_fts MATCH ?
            ORDER BY bm25(pages_fts), p.indexed_at DESC
            LIMIT ? OFFSET ?
            """,
            (fts_query, safe_limit, safe_offset),
        )
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]


async def count_search_results(query: str) -> int:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0

    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT COUNT(*) AS total
            FROM pages_fts
            WHERE pages_fts MATCH ?
            """,
            (fts_query,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        return int(row["total"]) if row and row["total"] is not None else 0


async def search_images(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []

    safe_limit = max(1, min(limit, 50))
    safe_offset = max(0, offset)
    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT
                i.id,
                i.url,
                i.page_url,
                i.alt_text,
                i.indexed_at
            FROM images_fts
            JOIN images AS i ON i.id = images_fts.rowid
            WHERE images_fts MATCH ?
            ORDER BY bm25(images_fts), i.indexed_at DESC
            LIMIT ? OFFSET ?
            """,
            (fts_query, safe_limit, safe_offset),
        )
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]


async def count_image_results(query: str) -> int:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0

    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT COUNT(*) AS total
            FROM images_fts
            WHERE images_fts MATCH ?
            """,
            (fts_query,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        return int(row["total"]) if row and row["total"] is not None else 0


async def search_videos(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []

    safe_limit = max(1, min(limit, 50))
    safe_offset = max(0, offset)
    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT
                v.id,
                v.url,
                v.page_url,
                v.title,
                v.indexed_at
            FROM videos_fts
            JOIN videos AS v ON v.id = videos_fts.rowid
            WHERE videos_fts MATCH ?
            ORDER BY bm25(videos_fts), v.indexed_at DESC
            LIMIT ? OFFSET ?
            """,
            (fts_query, safe_limit, safe_offset),
        )
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]


async def count_video_results(query: str) -> int:
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0

    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT COUNT(*) AS total
            FROM videos_fts
            WHERE videos_fts MATCH ?
            """,
            (fts_query,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        return int(row["total"]) if row and row["total"] is not None else 0


async def get_stats() -> dict[str, Any]:
    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT
                COUNT(*) AS total_pages,
                MAX(indexed_at) AS last_indexed_at
            FROM pages
            """
        )
        row = await cursor.fetchone()
        await cursor.close()

        return {
            "total_pages": int(row["total_pages"]) if row and row["total_pages"] is not None else 0,
            "last_indexed_at": row["last_indexed_at"] if row else None,
        }
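
`_to_fts_query` quotes every whitespace-separated token and joins them with `OR`, so multi-word queries match pages containing any term and embedded quotes cannot break out of the FTS5 `MATCH` expression:

```python
# Sketch: how raw queries become FTS5 MATCH expressions.
from database import _to_fts_query

print(_to_fts_query("python tutorial"))  # "python" OR "tutorial"
print(_to_fts_query('say "hi"'))         # "say" OR """hi"""
print(repr(_to_fts_query("   ")))        # '' -> callers return no results
```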

backend/indexer.py
@@ -0,0 +1,41 @@
"""Normalization and indexing helpers for crawled pages."""

from __future__ import annotations

import re
from database import insert_image, insert_page, insert_video

MAX_BODY_LENGTH = 10_000


def _normalize_text(body_text: str) -> str:
    collapsed = re.sub(r"\s+", " ", body_text).strip()
    return collapsed[:MAX_BODY_LENGTH]


async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    normalized_title = title.strip() or url
    normalized_body = _normalize_text(body_text)
    if not normalized_body:
        return
    await insert_page(url=url, title=normalized_title, body_text=normalized_body)

    if images:
        for img in images:
            img_url = img.get("url")
            alt_text = img.get("alt_text", "")
            if img_url:
                await insert_image(url=img_url, page_url=url, alt_text=alt_text)

    if videos:
        for video in videos:
            video_url = video.get("url")
            video_title = video.get("title") or normalized_title
            if video_url:
                await insert_video(url=video_url, page_url=url, title=video_title.strip())

backend/main.py
@@ -0,0 +1,207 @@
"""FastAPI entry point for the sFetch backend."""

from __future__ import annotations

import asyncio
from datetime import UTC, datetime

from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware

from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
    count_image_results,
    count_search_results,
    count_video_results,
    get_meta_value,
    get_stats,
    init_db,
    set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls

app = FastAPI(title="sFetch API", version="1.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _utc_now() -> str:
    return datetime.now(UTC).isoformat()


def _set_seed_status(**updates: object) -> None:
    current = getattr(app.state, "_top_scrape_status", {}).copy()
    current.update({"updated_at": _utc_now(), **updates})
    app.state._top_scrape_status = current


async def _scrape_top_sites(force: bool = False) -> None:
    await init_db()

    async with app.state._crawl_lock:
        if app.state._top_scrape_done and not force:
            return

        existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
        if existing_seed and not force:
            stats = await get_stats()
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=existing_seed,
            )
            app.state._top_scrape_done = True
            return

        stats = await get_stats()
        if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
            source = "existing database"
            await set_meta_value(TOP_SITE_SEED_META_KEY, source)
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=source,
            )
            app.state._top_scrape_done = True
            return

        _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
        seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
        _set_seed_status(
            state="running",
            message=f"Seeding {len(seed_urls)} non-adult top sites.",
            total=len(seed_urls),
            indexed=0,
            source=source,
        )

        print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
        bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
        try:
            await bot.start(seed_urls)
        except Exception as exc:
            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
            print(f"sFetch: top-site seed failed ({exc})")
            return

        await set_meta_value(TOP_SITE_SEED_META_KEY, source)
        _set_seed_status(
            state="complete",
            message="Top-site seed complete.",
            total=len(seed_urls),
            indexed=bot.indexed_count,
            source=source,
        )
        print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
        app.state._top_scrape_done = True


@app.on_event("startup")
async def startup_event() -> None:
    app.state._top_scrape_done = False
    app.state._crawl_lock = asyncio.Lock()
    app.state._top_scrape_status = {
        "state": "idle",
        "message": "Waiting to check top-site seed.",
        "total": TOP_SITE_SEED_LIMIT,
        "indexed": 0,
        "source": None,
        "updated_at": _utc_now(),
    }
    asyncio.create_task(_scrape_top_sites())


@app.get("/")
async def health_check() -> dict[str, str]:
    return {"status": "sFetch is alive"}


@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
    q: str = Query(..., description="Search query"),
    type: str = Query("web", description="Search type: web, image, or video"),
    limit: int = Query(10, ge=1, le=50),
    offset: int = Query(0, ge=0),
) -> SearchResponse:
    query = q.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")

    if type == "image":
        results = await search_images_api(query=query, limit=limit, offset=offset)
        total = await count_image_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type == "video":
        results = await search_videos_api(query=query, limit=limit, offset=offset)
        total = await count_video_results(query)
        return SearchResponse(query=query, type=type, total=total, results=results)

    if type != "web":
        raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")

    results = await search(query=query, limit=limit, offset=offset)
    total = await count_search_results(query)
    return SearchResponse(query=query, type=type, total=total, results=results)


async def _run_crawl_job(request: CrawlRequest) -> None:
    try:
        bot = sFetchBot(
            max_depth=request.max_depth,
            max_pages_per_domain=request.max_pages_per_domain,
            same_domain_only=request.same_domain_only,
        )
        await bot.start(request.seed_urls)
    except Exception as exc:
        print(f"sFetch: crawl job failed ({exc})")


@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
    background_tasks.add_task(_run_crawl_job, request)
    return {"message": "Crawl started", "seed_urls": request.seed_urls}


@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
    background_tasks.add_task(_scrape_top_sites, force)
    return {"message": "Top-site crawl queued", "force": force}


@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
    return getattr(
        app.state,
        "_top_scrape_status",
        {
            "state": "idle",
            "message": "Top-site seed has not started.",
            "total": TOP_SITE_SEED_LIMIT,
            "indexed": 0,
            "source": None,
            "updated_at": None,
        },
    )


@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
    stats = await get_stats()
    return stats
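
Startup launches `_scrape_top_sites` as a background task, so the API answers immediately while seeding runs; a small sketch that polls the status endpoint until the seed settles:

```python
# Sketch: poll the top-site seed status of a locally running backend.
import time

import httpx

while True:
    status = httpx.get("http://localhost:8000/crawl/top-sites/status").json()
    print(status["state"], "-", status["message"])
    if status["state"] in {"stored", "complete", "error"}:
        break
    time.sleep(2)
```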

backend/models.py
@@ -0,0 +1,43 @@
"""Pydantic models for sFetch's API."""

from __future__ import annotations

from pydantic import BaseModel, Field


class SearchResult(BaseModel):
    id: int
    url: str
    title: str
    snippet: str
    indexed_at: str


class ImageResult(BaseModel):
    id: int
    url: str
    page_url: str
    alt_text: str
    indexed_at: str


class VideoResult(BaseModel):
    id: int
    url: str
    page_url: str
    title: str
    indexed_at: str


class SearchResponse(BaseModel):
    query: str
    type: str = "web"
    total: int
    results: list[SearchResult] | list[ImageResult] | list[VideoResult]


class CrawlRequest(BaseModel):
    seed_urls: list[str] = Field(min_length=1)
    max_depth: int = Field(default=2, ge=0, le=5)
    max_pages_per_domain: int = Field(default=50, ge=1, le=500)
    same_domain_only: bool = True

backend/requirements.txt
@@ -0,0 +1,6 @@
fastapi
uvicorn[standard]
httpx
beautifulsoup4
pydantic
aiosqlite

backend/searcher.py
@@ -0,0 +1,90 @@
"""Search result shaping for sFetch."""

from __future__ import annotations

import html
import re

from database import search_images, search_pages, search_videos

SNIPPET_LENGTH = 200


def _extract_terms(query: str) -> list[str]:
    terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
    return sorted(terms, key=len, reverse=True)


def _build_snippet(body_text: str) -> str:
    snippet = body_text[:SNIPPET_LENGTH].strip()
    if not snippet:
        return "No preview available."
    if len(body_text) > SNIPPET_LENGTH:
        return f"{snippet}..."
    return snippet


def _highlight_terms(snippet: str, query: str) -> str:
    safe_snippet = html.escape(snippet)
    for term in _extract_terms(query):
        pattern = re.compile(re.escape(html.escape(term)), flags=re.IGNORECASE)
        safe_snippet = pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", safe_snippet)
    return safe_snippet


async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    rows = await search_pages(query=query, limit=limit, offset=offset)
    results: list[dict] = []

    for row in rows:
        title = (row.get("title") or row.get("url") or "Untitled").strip()
        body_text = row.get("body_text") or ""
        snippet = _highlight_terms(_build_snippet(body_text), query)
        results.append(
            {
                "id": row["id"],
                "url": row["url"],
                "title": title,
                "snippet": snippet,
                "indexed_at": row["indexed_at"],
            }
        )

    return results


async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    rows = await search_images(query=query, limit=limit, offset=offset)
    results: list[dict] = []

    for row in rows:
        results.append(
            {
                "id": row["id"],
                "url": row["url"],
                "page_url": row["page_url"],
                "alt_text": row["alt_text"] or "",
                "indexed_at": row["indexed_at"],
            }
        )

    return results


async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    rows = await search_videos(query=query, limit=limit, offset=offset)
    results: list[dict] = []

    for row in rows:
        title = (row.get("title") or "Video result").strip()
        results.append(
            {
                "id": row["id"],
                "url": row["url"],
                "page_url": row["page_url"],
                "title": title,
                "indexed_at": row["indexed_at"],
            }
        )

    return results
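
Snippets are HTML-escaped before highlighting, so indexed page text cannot inject markup into the results page; only the `<mark>` wrappers added here reach the frontend:

```python
# Sketch: snippet building and highlighting as done in searcher.py.
from searcher import _build_snippet, _highlight_terms

body = "Python is a programming language. " * 20
snippet = _build_snippet(body)  # first 200 characters plus "..."
print(_highlight_terms(snippet, "python"))
# -> <mark>Python</mark> is a programming language. <mark>Python</mark> is ...
```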

backend/top_sites.py
@@ -0,0 +1,110 @@
"""Load and sanitize the top-site seed list for first-launch indexing."""

from __future__ import annotations

import csv
import io
import zipfile
from collections.abc import Iterable
from urllib.parse import urlsplit, urlunsplit

import httpx

from config import (
    TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
    TOP_SITE_SEED_LIMIT,
    TOP_SITE_SOURCE_URL,
    TOP_SITES,
    USER_AGENT,
)
from content_filter import is_adult_url


def _normalize_site_url(value: str) -> str | None:
    raw_value = value.strip()
    if not raw_value:
        return None

    candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
    parsed = urlsplit(candidate)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        return None

    normalized = parsed._replace(
        scheme=parsed.scheme.lower(),
        netloc=parsed.netloc.lower(),
        path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
        query="",
        fragment="",
    )
    return urlunsplit(normalized)


def _host_key(url: str) -> str:
    return urlsplit(url).netloc.lower().removeprefix("www.")


def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
    safe_urls: list[str] = []
    seen_hosts: set[str] = set()

    for candidate in candidates:
        normalized = _normalize_site_url(candidate)
        if normalized is None:
            continue
        host_key = _host_key(normalized)
        if host_key in seen_hosts or is_adult_url(normalized):
            continue
        seen_hosts.add(host_key)
        safe_urls.append(normalized)
        if len(safe_urls) >= limit:
            break

    return safe_urls


def _domains_from_csv_text(csv_text: str) -> list[str]:
    domains: list[str] = []
    reader = csv.reader(io.StringIO(csv_text))
    for row in reader:
        if not row:
            continue
        domain = row[1] if len(row) > 1 else row[0]
        if domain and domain.lower() != "domain":
            domains.append(domain)
    return domains


def _domains_from_zip(payload: bytes) -> list[str]:
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        csv_name = next((name for name in archive.namelist() if name.endswith(".csv")), None)
        if csv_name is None:
            raise ValueError("Tranco archive did not contain a CSV file.")
        with archive.open(csv_name) as csv_file:
            text = csv_file.read().decode("utf-8", errors="replace")
    return _domains_from_csv_text(text)


async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
    """Return the latest safe top-site URLs, falling back to the bundled list if needed."""

    timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
    headers = {"User-Agent": USER_AGENT}

    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as client:
            response = await client.get(TOP_SITE_SOURCE_URL)
            response.raise_for_status()

            if response.content.startswith(b"PK"):
                candidates = _domains_from_zip(response.content)
            else:
                candidates = _domains_from_csv_text(response.text)

            safe_urls = _safe_top_urls(candidates, limit=limit)
            if safe_urls:
                return safe_urls, TOP_SITE_SOURCE_URL
    except Exception as exc:
        print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")

    return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
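
Seed candidates arrive as bare domains from the Tranco CSV; normalization upgrades them to canonical HTTPS URLs before the adult filter and host-level dedupe run:

```python
# Sketch: seed-list normalization and dedupe from backend/top_sites.py.
from top_sites import _normalize_site_url, _safe_top_urls

print(_normalize_site_url("Example.COM"))            # https://example.com
print(_normalize_site_url("http://news.site/a/b/"))  # http://news.site/a/b
print(_safe_top_urls(["example.com", "www.example.com", "other.org"], limit=10))
# -> ['https://example.com', 'https://other.org']  (www duplicate collapsed)
```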
@@ -0,0 +1,402 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>sFetch</title>
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script>
|
||||
tailwind.config = {
|
||||
theme: {
|
||||
extend: {
|
||||
colors: {
|
||||
sfetch: {
|
||||
bg: "#f8fafc",
|
||||
surface: "#ffffff",
|
||||
surfaceSoft: "#f1f5f9",
|
||||
ink: "#202124",
|
||||
muted: "#5f6368",
|
||||
border: "#dadce0",
|
||||
blue: "#1a73e8",
|
||||
orange: "#de5833",
|
||||
green: "#0b8043",
|
||||
},
|
||||
},
|
||||
boxShadow: {
|
||||
search: "0 2px 8px rgba(60, 64, 67, 0.14), 0 1px 3px rgba(60, 64, 67, 0.12)",
|
||||
panel: "0 16px 40px rgba(15, 23, 42, 0.08)",
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
</script>
|
||||
<style>
|
||||
:root {
|
||||
color-scheme: light;
|
||||
}
|
||||
|
||||
body {
|
||||
background: #f8fafc;
|
||||
color: #202124;
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
}
|
||||
|
||||
.brand {
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
font-weight: 700;
|
||||
letter-spacing: 0;
|
||||
}
|
||||
|
||||
.brand span:nth-child(1) { color: #de5833; }
|
||||
.brand span:nth-child(2) { color: #1a73e8; }
|
||||
.brand span:nth-child(3) { color: #188038; }
|
||||
.brand span:nth-child(4) { color: #fbbc04; }
|
||||
.brand span:nth-child(5) { color: #1a73e8; }
|
||||
.brand span:nth-child(6) { color: #de5833; }
|
||||
|
||||
.modal-open {
|
||||
overflow: hidden;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body class="min-h-screen">
|
||||
<main class="flex min-h-screen flex-col">
|
||||
<header class="flex items-center justify-between px-5 py-4 text-sm text-sfetch-muted sm:px-8">
|
||||
<a href="./index.html" class="brand text-2xl" aria-label="sFetch home">
|
||||
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
|
||||
</a>
|
||||
<button
|
||||
id="openCrawlerModal"
|
||||
class="rounded-full border border-sfetch-border bg-white px-4 py-2 font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
|
||||
>
|
||||
Index tools
|
||||
</button>
|
||||
</header>
|
||||
|
||||
<section class="mx-auto flex w-full max-w-5xl flex-1 flex-col items-center justify-center px-5 pb-24 pt-10">
|
||||
<h1 class="brand text-center text-6xl leading-none sm:text-7xl">
|
||||
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
|
||||
</h1>
|
||||
|
||||
<form id="searchForm" class="mt-9 w-full max-w-2xl">
|
||||
<label
|
||||
for="searchInput"
|
||||
class="flex min-h-14 items-center gap-3 rounded-full border border-sfetch-border bg-white px-5 transition focus-within:border-transparent focus-within:shadow-search"
|
||||
>
|
||||
<svg class="h-5 w-5 shrink-0 text-sfetch-muted" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" aria-hidden="true">
|
||||
<circle cx="11" cy="11" r="6"></circle>
|
||||
<path d="M20 20L16.65 16.65"></path>
|
||||
</svg>
|
||||
<input
|
||||
id="searchInput"
|
||||
type="text"
|
||||
autocomplete="off"
|
||||
placeholder="Search sFetch"
|
||||
class="w-full bg-transparent text-base text-sfetch-ink outline-none placeholder:text-sfetch-muted sm:text-lg"
|
||||
/>
|
||||
</label>
|
||||
|
||||
<div class="mt-6 flex flex-wrap items-center justify-center gap-3">
|
||||
<button
|
||||
type="submit"
|
||||
class="rounded-md bg-sfetch-blue px-5 py-2.5 text-sm font-medium text-white transition hover:bg-[#1558b0]"
|
||||
>
|
||||
sFetch Search
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
data-search-type="image"
|
||||
class="rounded-md border border-sfetch-border bg-white px-5 py-2.5 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-blue hover:text-sfetch-blue"
|
||||
>
|
||||
Images
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
data-search-type="video"
|
||||
class="rounded-md border border-sfetch-border bg-white px-5 py-2.5 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-blue hover:text-sfetch-blue"
|
||||
>
|
||||
Videos
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<section class="mt-12 w-full max-w-3xl rounded-lg border border-sfetch-border bg-white p-4 shadow-panel" aria-label="Index controls">
|
||||
<div class="flex flex-col gap-4 sm:flex-row sm:items-center sm:justify-between">
|
||||
<div>
|
||||
<p class="text-xs font-semibold uppercase text-sfetch-orange">Index</p>
|
||||
<p id="statsSummary" class="mt-1 text-sm text-sfetch-muted">Checking index...</p>
|
||||
</div>
|
||||
<div class="flex flex-wrap gap-2">
|
||||
<button
|
||||
id="seedTopSites"
|
||||
class="rounded-md bg-sfetch-orange px-4 py-2 text-sm font-medium text-white transition hover:bg-[#c44724]"
|
||||
>
|
||||
Seed top 1000
|
||||
</button>
|
||||
<button
|
||||
id="openCrawlerModalSecondary"
|
||||
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
|
||||
>
|
||||
Custom crawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="mt-4 h-2 overflow-hidden rounded-full bg-sfetch-surfaceSoft">
|
||||
<div id="seedProgress" class="h-full w-0 bg-sfetch-orange transition-all duration-300"></div>
|
||||
</div>
|
||||
<p id="seedStatus" class="mt-3 min-h-5 text-sm text-sfetch-muted">Top-site seed status unavailable.</p>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
<footer class="border-t border-sfetch-border bg-white px-5 py-4 text-center text-xs text-sfetch-muted">
|
||||
© 2026 sFetch
|
||||
</footer>
|
||||
</main>
|
||||
|
||||
<div
|
||||
id="crawlerModal"
|
||||
class="pointer-events-none fixed inset-0 z-30 flex items-center justify-center bg-slate-900/35 px-4 opacity-0 transition"
|
||||
aria-hidden="true"
|
||||
>
|
||||
<div class="w-full max-w-xl rounded-lg border border-sfetch-border bg-white p-5 shadow-panel">
|
||||
<div class="flex items-center justify-between gap-4 border-b border-sfetch-border pb-4">
|
||||
<h2 class="text-lg font-semibold text-sfetch-ink">Custom crawl</h2>
|
||||
<button
|
||||
id="closeCrawlerModal"
|
||||
class="flex h-9 w-9 items-center justify-center rounded-full text-sfetch-muted transition hover:bg-sfetch-surfaceSoft hover:text-sfetch-ink"
|
||||
aria-label="Close crawler modal"
|
||||
>
|
||||
X
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<form id="crawlerForm" class="mt-5 space-y-4">
|
||||
<div>
|
||||
<label for="seedUrls" class="mb-2 block text-sm font-medium text-sfetch-ink">Seed URLs</label>
|
||||
<textarea
|
||||
id="seedUrls"
|
||||
rows="6"
|
||||
placeholder="https://example.com https://docs.python.org/"
|
||||
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
|
||||
></textarea>
|
||||
</div>
|
||||
|
||||
<div class="grid gap-4 sm:grid-cols-2">
|
||||
<div>
|
||||
<label for="crawlDepth" class="mb-2 block text-sm font-medium text-sfetch-ink">Max depth</label>
|
||||
<input
|
||||
id="crawlDepth"
|
||||
type="number"
|
||||
min="0"
|
||||
max="5"
|
||||
value="2"
|
||||
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<label for="maxPagesPerDomain" class="mb-2 block text-sm font-medium text-sfetch-ink">Pages per domain</label>
|
||||
<input
|
||||
id="maxPagesPerDomain"
|
||||
type="number"
|
||||
min="1"
|
||||
max="500"
|
||||
value="50"
|
||||
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<label class="flex items-center gap-3 text-sm text-sfetch-ink">
|
||||
<input id="sameDomainOnly" type="checkbox" checked class="h-4 w-4 rounded border-sfetch-border text-sfetch-blue" />
|
||||
Same domain only
|
||||
</label>
|
||||
|
||||
<p id="crawlerStatus" class="min-h-5 text-sm text-sfetch-muted"></p>
|
||||
|
||||
<div class="flex flex-col-reverse gap-3 sm:flex-row sm:justify-end">
|
||||
<button
|
||||
type="button"
|
||||
id="cancelCrawler"
|
||||
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:bg-sfetch-surfaceSoft"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
<button
|
||||
type="submit"
|
||||
class="rounded-md bg-sfetch-blue px-4 py-2 text-sm font-medium text-white transition hover:bg-[#1558b0]"
|
||||
>
|
||||
Launch crawl
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const API_BASE = "http://localhost:8000";
|
||||
|
||||
const searchForm = document.getElementById("searchForm");
|
||||
const searchInput = document.getElementById("searchInput");
|
||||
const openCrawlerModal = document.getElementById("openCrawlerModal");
|
||||
const openCrawlerModalSecondary = document.getElementById("openCrawlerModalSecondary");
|
||||
const closeCrawlerModal = document.getElementById("closeCrawlerModal");
|
||||
const cancelCrawler = document.getElementById("cancelCrawler");
|
||||
const crawlerModal = document.getElementById("crawlerModal");
|
||||
const crawlerForm = document.getElementById("crawlerForm");
|
||||
const crawlerStatus = document.getElementById("crawlerStatus");
|
||||
const seedUrlsField = document.getElementById("seedUrls");
|
||||
const crawlDepthField = document.getElementById("crawlDepth");
|
||||
const maxPagesPerDomainField = document.getElementById("maxPagesPerDomain");
|
||||
const sameDomainOnlyField = document.getElementById("sameDomainOnly");
|
||||
const statsSummary = document.getElementById("statsSummary");
|
||||
const seedStatus = document.getElementById("seedStatus");
|
||||
const seedProgress = document.getElementById("seedProgress");
|
||||
const seedTopSites = document.getElementById("seedTopSites");
|
||||
|
||||
function runSearch(type = "all") {
|
||||
const query = searchInput.value.trim();
|
||||
if (!query) {
|
||||
searchInput.focus();
|
||||
return;
|
||||
}
|
||||
const params = new URLSearchParams({ q: query });
|
||||
if (type !== "all") {
|
||||
params.set("type", type);
|
||||
}
|
||||
window.location.href = `results.html?${params.toString()}`;
|
||||
}
|
||||
|
||||
function setModalOpen(isOpen) {
|
||||
crawlerModal.classList.toggle("opacity-0", !isOpen);
|
||||
crawlerModal.classList.toggle("pointer-events-none", !isOpen);
|
||||
crawlerModal.setAttribute("aria-hidden", String(!isOpen));
|
||||
document.body.classList.toggle("modal-open", isOpen);
|
||||
if (isOpen) {
|
||||
seedUrlsField.focus();
|
||||
} else {
|
||||
crawlerStatus.textContent = "";
|
||||
}
|
||||
}
|
||||
|
||||
async function refreshStats() {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/stats`);
|
||||
const stats = await response.json();
|
||||
if (!response.ok) {
|
||||
throw new Error();
|
||||
}
|
||||
const lastIndexed = stats.last_indexed_at ? `, last indexed ${stats.last_indexed_at}` : "";
|
||||
statsSummary.textContent = `${stats.total_pages.toLocaleString()} pages${lastIndexed}`;
|
||||
} catch {
|
||||
statsSummary.textContent = "Backend unavailable";
|
||||
}
|
||||
}
|
||||
|
||||
async function refreshSeedStatus() {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/crawl/top-sites/status`);
|
||||
const status = await response.json();
|
||||
if (!response.ok) {
|
||||
throw new Error();
|
||||
}
|
||||
const total = Number(status.total || 0);
|
||||
const indexed = Number(status.indexed || 0);
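        // Cap the bar at 96% while seeding is still running; it only reaches 100% once the API reports state "complete".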
        const percent = total > 0 && status.state === "complete" ? 100 : total > 0 ? Math.min(96, (indexed / total) * 100) : 0;
        seedProgress.style.width = `${percent}%`;
        seedStatus.textContent = `${status.message || "Idle"}${status.source ? ` Source: ${status.source}` : ""}`;
      } catch {
        seedProgress.style.width = "0%";
        seedStatus.textContent = "Top-site seed status unavailable.";
      }
    }

    async function seedTopSitesNow() {
      seedTopSites.disabled = true;
      seedTopSites.textContent = "Queued";
      try {
        const response = await fetch(`${API_BASE}/crawl/top-sites`, { method: "POST" });
        const data = await response.json().catch(() => ({}));
        if (!response.ok) {
          throw new Error(data.detail || "Unable to queue top-site seed.");
        }
        seedStatus.textContent = "Top-site seed queued.";
        await refreshSeedStatus();
      } catch (error) {
        seedStatus.textContent = error.message || "Unable to queue top-site seed.";
      } finally {
        setTimeout(() => {
          seedTopSites.disabled = false;
          seedTopSites.textContent = "Seed top 1000";
        }, 1200);
      }
    }

    async function handleCrawlerSubmit(event) {
      event.preventDefault();
      const seedUrls = seedUrlsField.value
        .split("\n")
        .map((value) => value.trim())
        .filter(Boolean);

      if (!seedUrls.length) {
        crawlerStatus.textContent = "Add at least one seed URL.";
        return;
      }

      const payload = {
        seed_urls: seedUrls,
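        // Fall back to safe defaults when a numeric field is empty or unparseable.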
        max_depth: Number.parseInt(crawlDepthField.value, 10) || 0,
        max_pages_per_domain: Number.parseInt(maxPagesPerDomainField.value, 10) || 1,
        same_domain_only: sameDomainOnlyField.checked,
      };

      crawlerStatus.textContent = "Starting crawl...";

      try {
        const response = await fetch(`${API_BASE}/crawl`, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(payload),
        });
        const data = await response.json().catch(() => ({}));
        if (!response.ok) {
          throw new Error(data.detail || "Unable to start the crawler.");
        }
        crawlerStatus.textContent = `Crawl started for ${seedUrls.length} seed URL${seedUrls.length === 1 ? "" : "s"}.`;
        setTimeout(() => {
          setModalOpen(false);
          refreshStats();
        }, 900);
      } catch (error) {
        crawlerStatus.textContent = error.message || "Unable to start the crawler.";
      }
    }

    searchForm.addEventListener("submit", (event) => {
      event.preventDefault();
      runSearch("all");
    });

    document.querySelectorAll("[data-search-type]").forEach((button) => {
      button.addEventListener("click", () => runSearch(button.dataset.searchType || "all"));
    });

    openCrawlerModal.addEventListener("click", () => setModalOpen(true));
    openCrawlerModalSecondary.addEventListener("click", () => setModalOpen(true));
    closeCrawlerModal.addEventListener("click", () => setModalOpen(false));
    cancelCrawler.addEventListener("click", () => setModalOpen(false));
    crawlerModal.addEventListener("click", (event) => {
      if (event.target === crawlerModal) {
        setModalOpen(false);
      }
    });
    seedTopSites.addEventListener("click", seedTopSitesNow);
    crawlerForm.addEventListener("submit", handleCrawlerSubmit);

    refreshStats();
    refreshSeedStatus();
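    // Keep the dashboard fresh: poll stats every 10 s and seed progress every 5 s.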
    setInterval(refreshStats, 10000);
    setInterval(refreshSeedStatus, 5000);
  </script>
</body>
</html>
@@ -0,0 +1,693 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>sFetch Results</title>
  <script src="https://cdn.tailwindcss.com"></script>
  <script>
    tailwind.config = {
      theme: {
        extend: {
          colors: {
            sfetch: {
              bg: "#f8fafc",
              surface: "#ffffff",
              surfaceSoft: "#f1f5f9",
              ink: "#202124",
              muted: "#5f6368",
              border: "#dadce0",
              blue: "#1a73e8",
              orange: "#de5833",
              green: "#0b8043",
            },
          },
          boxShadow: {
            search: "0 2px 8px rgba(60, 64, 67, 0.14), 0 1px 3px rgba(60, 64, 67, 0.12)",
            panel: "0 16px 40px rgba(15, 23, 42, 0.08)",
          },
        },
      },
    };
  </script>
  <style>
    :root {
      color-scheme: light;
    }

    body {
      background: #ffffff;
      color: #202124;
      font-family: Arial, Helvetica, sans-serif;
    }

    .brand {
      font-family: Arial, Helvetica, sans-serif;
      font-weight: 700;
      letter-spacing: 0;
    }

    .brand span:nth-child(1) { color: #de5833; }
    .brand span:nth-child(2) { color: #1a73e8; }
    .brand span:nth-child(3) { color: #188038; }
    .brand span:nth-child(4) { color: #fbbc04; }
    .brand span:nth-child(5) { color: #1a73e8; }
    .brand span:nth-child(6) { color: #de5833; }

    .skeleton {
      background: linear-gradient(90deg, #eef2f7 25%, #f8fafc 37%, #eef2f7 63%);
      background-size: 400% 100%;
      animation: shimmer 1.4s ease infinite;
    }

    mark {
      background: rgba(251, 188, 4, 0.28);
      color: #202124;
      padding: 0 0.12rem;
      border-radius: 0.2rem;
    }

    @keyframes shimmer {
      0% { background-position: 100% 50%; }
      100% { background-position: 0 50%; }
    }

    @keyframes barrel-roll {
      0% { transform: rotateZ(0deg); }
      100% { transform: rotateZ(360deg); }
    }

    .barrel-roll {
      animation: barrel-roll 1.2s cubic-bezier(0.25, 0.46, 0.45, 0.94) forwards;
    }
  </style>
</head>
<body class="min-h-screen">
|
||||
<div class="min-h-screen">
|
||||
<header class="sticky top-0 z-20 border-b border-sfetch-border bg-white/95 backdrop-blur">
|
||||
<div class="mx-auto flex max-w-6xl flex-col gap-4 px-5 py-4 sm:flex-row sm:items-center">
|
||||
<a href="./index.html" class="brand text-3xl leading-none" aria-label="sFetch home">
|
||||
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
|
||||
</a>
|
||||
|
||||
<form id="searchForm" class="flex flex-1 items-center gap-3">
|
||||
<label
|
||||
for="searchInput"
|
||||
class="flex min-h-12 flex-1 items-center gap-3 rounded-full border border-sfetch-border bg-white px-4 transition focus-within:border-transparent focus-within:shadow-search"
|
||||
>
|
||||
<svg class="h-5 w-5 shrink-0 text-sfetch-muted" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" aria-hidden="true">
|
||||
<circle cx="11" cy="11" r="6"></circle>
|
||||
<path d="M20 20L16.65 16.65"></path>
|
||||
</svg>
|
||||
<input
|
||||
id="searchInput"
|
||||
type="text"
|
||||
autocomplete="off"
|
||||
class="w-full bg-transparent text-base text-sfetch-ink outline-none placeholder:text-sfetch-muted"
|
||||
placeholder="Search sFetch"
|
||||
/>
|
||||
</label>
|
||||
<button
|
||||
id="searchButton"
|
||||
type="submit"
|
||||
class="rounded-md bg-sfetch-blue px-5 py-3 text-sm font-medium text-white transition hover:bg-[#1558b0]"
|
||||
>
|
||||
Search
|
||||
</button>
|
||||
</form>
|
||||
|
||||
<a
|
||||
href="./index.html"
|
||||
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
|
||||
>
|
||||
Index tools
|
||||
</a>
|
||||
</div>
|
||||
<nav class="mx-auto flex max-w-6xl gap-7 px-5 text-sm" aria-label="Search verticals">
|
||||
<button id="tabAll" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">All</button>
|
||||
<button id="tabImages" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">Images</button>
|
||||
<button id="tabVideos" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">Videos</button>
|
||||
</nav>
|
||||
</header>
|
||||
|
||||
<main class="mx-auto max-w-6xl px-5 py-8">
|
||||
<p id="metaText" class="text-sm text-sfetch-muted"></p>
|
||||
<section id="resultsContainer" class="mt-6"></section>
|
||||
<nav id="pagination" class="mt-10 flex items-center justify-start gap-2" aria-label="Pagination"></nav>
|
||||
</main>
|
||||
</div>
|
||||
|
||||
<div id="imageModal" class="fixed inset-0 z-50 hidden bg-slate-950/60">
|
||||
<div class="absolute inset-y-0 right-0 w-full max-w-4xl border-l border-sfetch-border bg-white shadow-panel">
|
||||
<div class="flex items-center justify-between border-b border-sfetch-border px-6 py-4">
|
||||
<h3 id="modalTitle" class="truncate text-base font-medium text-sfetch-ink">Image preview</h3>
|
||||
<button id="closeModal" class="flex h-9 w-9 items-center justify-center rounded-full text-sfetch-muted transition hover:bg-sfetch-surfaceSoft hover:text-sfetch-ink">
|
||||
X
|
||||
</button>
|
||||
</div>
|
||||
<div class="h-[calc(100vh-73px)] overflow-y-auto px-6 py-5">
|
||||
<div class="overflow-hidden rounded-lg bg-sfetch-surfaceSoft">
|
||||
<img id="modalImage" class="max-h-[62vh] w-full object-contain" alt="Preview" />
|
||||
</div>
|
||||
<div class="mt-6">
|
||||
<h4 class="mb-3 text-sm font-medium text-sfetch-muted">Related images</h4>
|
||||
<div id="relatedImages" class="grid grid-cols-2 gap-3 sm:grid-cols-3"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const API_BASE = "http://localhost:8000";
|
||||
const RESULTS_PER_PAGE = 10;
|
||||
|
||||
const searchForm = document.getElementById("searchForm");
|
||||
const searchInput = document.getElementById("searchInput");
|
||||
const resultsContainer = document.getElementById("resultsContainer");
|
||||
const metaText = document.getElementById("metaText");
|
||||
const paginationNav = document.getElementById("pagination");
|
||||
const tabAll = document.getElementById("tabAll");
|
||||
const tabImages = document.getElementById("tabImages");
|
||||
const tabVideos = document.getElementById("tabVideos");
|
||||
const imageModal = document.getElementById("imageModal");
|
||||
const closeModalBtn = document.getElementById("closeModal");
|
||||
const modalImage = document.getElementById("modalImage");
|
||||
const modalTitle = document.getElementById("modalTitle");
|
||||
const relatedImagesContainer = document.getElementById("relatedImages");
|
||||
|
||||
let currentType = "all";
|
||||
|
||||
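    // Escape the five HTML-special characters before untrusted strings are interpolated into innerHTML templates.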
    function escapeHTML(value) {
      return String(value || "")
        .replaceAll("&", "&amp;")
        .replaceAll("<", "&lt;")
        .replaceAll(">", "&gt;")
        .replaceAll('"', "&quot;")
        .replaceAll("'", "&#39;");
    }

    function getTypeFromUrl() {
      const typeValue = new URLSearchParams(window.location.search).get("type");
      if (typeValue === "image" || typeValue === "video" || typeValue === "all") {
        return typeValue;
      }
      return "all";
    }

    function getQueryFromUrl() {
      return (new URLSearchParams(window.location.search).get("q") || "").trim();
    }

    function getPageFromUrl() {
      const raw = new URLSearchParams(window.location.search).get("page") || "1";
      const page = Number.parseInt(raw, 10);
      return Number.isNaN(page) || page < 1 ? 1 : page;
    }

    function updateUrl(query, page) {
      const params = new URLSearchParams(window.location.search);
      params.set("q", query);
      page > 1 ? params.set("page", String(page)) : params.delete("page");
      currentType === "all" ? params.delete("type") : params.set("type", currentType);
      window.history.replaceState({}, "", `${window.location.pathname}?${params.toString()}`);
    }

    function updateTabsUI() {
      const tabs = [
        [tabAll, currentType === "all"],
        [tabImages, currentType === "image"],
        [tabVideos, currentType === "video"],
      ];
      tabs.forEach(([tab, active]) => {
        tab.classList.toggle("border-sfetch-orange", active);
        tab.classList.toggle("text-sfetch-ink", active);
        tab.classList.toggle("border-transparent", !active);
        tab.classList.toggle("text-sfetch-muted", !active);
      });
    }

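    // Thin wrapper around GET /search; bubbles up the API's "detail" message on a non-2xx response.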
    async function fetchSearch(type, query, limit, offset) {
      const response = await fetch(
        `${API_BASE}/search?q=${encodeURIComponent(query)}&type=${type}&limit=${limit}&offset=${offset}`
      );
      const data = await response.json().catch(() => ({}));
      if (!response.ok) {
        throw new Error(data.detail || "Search request failed.");
      }
      return data;
    }

    function extractHost(url) {
      try {
        return new URL(url).hostname.replace(/^www\./, "");
      } catch {
        return url;
      }
    }

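    // Recognizes youtube.com/watch?v=..., youtube.com/embed/..., and youtu.be/... URLs; anything else yields null.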
    function getYouTubeId(url) {
      try {
        const parsed = new URL(url);
        if (parsed.hostname.includes("youtube.com")) {
          if (parsed.pathname.startsWith("/watch")) {
            return parsed.searchParams.get("v");
          }
          if (parsed.pathname.startsWith("/embed/")) {
            return parsed.pathname.split("/embed/")[1] || null;
          }
        }
        if (parsed.hostname.includes("youtu.be")) {
          return parsed.pathname.slice(1) || null;
        }
      } catch {
        return null;
      }
      return null;
    }

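    // Thumbnails can only be derived client-side for YouTube; other hosts get a text placeholder tile instead.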
    function videoThumbnail(url) {
      const ytId = getYouTubeId(url);
      return ytId ? `https://img.youtube.com/vi/${ytId}/hqdefault.jpg` : null;
    }

    function renderError(message) {
      metaText.textContent = "Search unavailable";
      resultsContainer.className = "mt-6";
      resultsContainer.innerHTML = `
        <section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-6">
          <p class="text-lg text-sfetch-ink">Unable to load results.</p>
          <p class="mt-2 text-sm text-sfetch-muted">${escapeHTML(message)}</p>
        </section>
      `;
      paginationNav.innerHTML = "";
    }

    function renderEmpty(query) {
      metaText.textContent = "About 0 results";
      resultsContainer.className = "mt-6";
      resultsContainer.innerHTML = `
        <section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-8">
          <div class="flex h-12 w-12 items-center justify-center rounded-full bg-sfetch-surfaceSoft text-lg font-bold text-sfetch-orange">s</div>
          <h2 class="mt-4 text-xl text-sfetch-ink">No results found</h2>
          <p class="mt-2 text-sm text-sfetch-muted">No indexed pages matched "${escapeHTML(query)}".</p>
        </section>
      `;
      paginationNav.innerHTML = "";
    }

    function renderPagination(total, currentPage, query) {
      paginationNav.innerHTML = "";
      const totalPages = Math.ceil(total / RESULTS_PER_PAGE);
      if (totalPages <= 1) {
        paginationNav.style.display = "none";
        return;
      }
      paginationNav.style.display = "flex";

      const button = (label, page, disabled = false, active = false) => {
        const btn = document.createElement("button");
        btn.textContent = label;
        btn.disabled = disabled;
        btn.className = `flex h-10 min-w-10 items-center justify-center rounded-md border px-3 text-sm transition ${
          active
            ? "border-sfetch-blue bg-sfetch-blue text-white"
            : disabled
            ? "cursor-not-allowed border-sfetch-border text-sfetch-muted/50"
            : "border-sfetch-border text-sfetch-ink hover:border-sfetch-blue hover:text-sfetch-blue"
        }`;
        if (!disabled && !active) {
          btn.addEventListener("click", () => runSearch(query, page));
        }
        return btn;
      };

      paginationNav.appendChild(button("<", currentPage - 1, currentPage === 1));

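      // Sliding window of at most five numbered buttons, kept centered on the current page where possible.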
      const maxVisiblePages = 5;
      let start = Math.max(1, currentPage - 2);
      let end = Math.min(totalPages, start + maxVisiblePages - 1);
      if (end - start < maxVisiblePages - 1) {
        start = Math.max(1, end - maxVisiblePages + 1);
      }

      for (let i = start; i <= end; i += 1) {
        paginationNav.appendChild(button(String(i), i, false, i === currentPage));
      }

      paginationNav.appendChild(button(">", currentPage + 1, currentPage === totalPages));
    }

    function openImageModal(imageResult, imageIndex, relatedPool) {
      modalImage.src = imageResult.url;
      modalImage.alt = imageResult.alt_text || "Image preview";
      modalTitle.textContent = imageResult.alt_text || extractHost(imageResult.page_url);
      relatedImagesContainer.innerHTML = "";

      relatedPool
        .filter((_, idx) => idx !== imageIndex)
        .slice(0, 8)
        .forEach((item) => {
          const thumb = document.createElement("button");
          thumb.className = "overflow-hidden rounded-md border border-sfetch-border transition hover:border-sfetch-orange";
          thumb.innerHTML = `
            <img
              src="${escapeHTML(item.url)}"
              alt="${escapeHTML(item.alt_text || "Related image")}"
              class="h-24 w-full object-cover"
              loading="lazy"
            />
          `;
          thumb.addEventListener("click", () => {
            const realIndex = relatedPool.findIndex((candidate) => candidate.id === item.id);
            openImageModal(item, realIndex, relatedPool);
          });
          relatedImagesContainer.appendChild(thumb);
        });

      imageModal.classList.remove("hidden");
    }

    function closeImageModal() {
      imageModal.classList.add("hidden");
    }

    function renderImageGrid(results) {
      resultsContainer.className = "mt-6 grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-4";
      resultsContainer.innerHTML = "";

      results.forEach((result, index) => {
        const card = document.createElement("article");
        card.className = "group cursor-pointer overflow-hidden rounded-lg border border-sfetch-border bg-white transition hover:border-sfetch-orange";
        card.innerHTML = `
          <div class="aspect-square overflow-hidden bg-sfetch-surfaceSoft">
            <img
              src="${escapeHTML(result.url)}"
              alt="${escapeHTML(result.alt_text || "Image result")}"
              class="h-full w-full object-cover transition duration-200 group-hover:scale-105"
              loading="lazy"
            />
          </div>
          <div class="truncate px-3 py-2 text-xs text-sfetch-muted">${escapeHTML(result.alt_text || extractHost(result.page_url))}</div>
        `;
        card.addEventListener("click", () => openImageModal(result, index, results));
        resultsContainer.appendChild(card);
      });
    }

    function renderVideoCards(results) {
      resultsContainer.className = "mt-6 space-y-4";
      resultsContainer.innerHTML = "";

      results.forEach((result) => {
        const thumbnail = videoThumbnail(result.url);
        const card = document.createElement("article");
        card.className = "overflow-hidden rounded-lg border border-sfetch-border bg-white";
        card.innerHTML = `
          <a href="${escapeHTML(result.url)}" target="_blank" rel="noreferrer noopener" class="block md:flex">
            <div class="relative h-44 w-full shrink-0 overflow-hidden bg-sfetch-surfaceSoft md:w-72">
              ${
                thumbnail
                  ? `<img src="${escapeHTML(thumbnail)}" alt="${escapeHTML(result.title)}" class="h-full w-full object-cover" loading="lazy" />`
                  : `<div class="flex h-full items-center justify-center text-sfetch-muted">Video</div>`
              }
            </div>
            <div class="space-y-2 p-5">
              <p class="text-xs uppercase text-sfetch-green">${escapeHTML(extractHost(result.url))}</p>
              <h3 class="text-xl font-medium text-sfetch-blue">${escapeHTML(result.title)}</h3>
              <p class="text-sm text-sfetch-muted">Source: ${escapeHTML(extractHost(result.page_url))}</p>
            </div>
          </a>
        `;
        resultsContainer.appendChild(card);
      });
    }

    function renderWebList(results) {
      const wrapper = document.createElement("div");
      wrapper.className = "max-w-3xl space-y-7";

      results.forEach((result) => {
        const article = document.createElement("article");
        const host = extractHost(result.url);
        article.className = "space-y-1";
        article.innerHTML = `
          <div class="flex items-center gap-2 text-sm text-sfetch-muted">
            <div class="flex h-7 w-7 shrink-0 items-center justify-center rounded-full bg-sfetch-surfaceSoft text-xs font-bold text-sfetch-orange">${escapeHTML(host.slice(0, 1).toUpperCase())}</div>
            <div class="min-w-0">
              <p class="text-sfetch-ink">${escapeHTML(host)}</p>
              <p class="truncate text-xs">${escapeHTML(result.url)}</p>
            </div>
          </div>
          <a
            href="${escapeHTML(result.url)}"
            target="_blank"
            rel="noreferrer noopener"
            class="block text-xl leading-tight text-sfetch-blue hover:underline"
          >${escapeHTML(result.title)}</a>
          <p class="text-sm leading-6 text-sfetch-muted">${result.snippet}</p>
        `;
        wrapper.appendChild(article);
      });

      return wrapper;
    }

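    // "All" tab layout: an image strip first, then web results, then a short video list, each with a "See all" shortcut.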
    function renderAllMode(webData, imageData, videoData, page) {
      const start = (page - 1) * RESULTS_PER_PAGE + 1;
      const end = Math.min(start + webData.results.length - 1, webData.total);
      if (webData.total === 0 && imageData.total === 0 && videoData.total === 0) {
        renderEmpty(webData.query);
        return;
      }

      metaText.textContent = webData.total > 0
        ? `${start}-${end} of about ${webData.total} web results`
        : "No direct web matches, showing media results";

      resultsContainer.className = "mt-6 space-y-9";
      resultsContainer.innerHTML = "";

      if (imageData.results.length) {
        const imageSection = document.createElement("section");
        imageSection.innerHTML = `
          <div class="mb-3 flex max-w-3xl items-center justify-between">
            <h2 class="text-sm font-semibold text-sfetch-ink">Images</h2>
            <button id="seeAllImagesBtn" class="text-sm font-medium text-sfetch-blue hover:underline">See all</button>
          </div>
        `;
        const grid = document.createElement("div");
        grid.className = "grid max-w-3xl grid-cols-3 gap-2 sm:grid-cols-4 md:grid-cols-6";
        imageData.results.slice(0, 6).forEach((result, index) => {
          const button = document.createElement("button");
          button.className = "overflow-hidden rounded-md border border-sfetch-border bg-sfetch-surfaceSoft";
          button.innerHTML = `<img src="${escapeHTML(result.url)}" alt="${escapeHTML(result.alt_text || "Image result")}" class="aspect-square w-full object-cover" loading="lazy" />`;
          button.addEventListener("click", () => openImageModal(result, index, imageData.results));
          grid.appendChild(button);
        });
        imageSection.appendChild(grid);
        resultsContainer.appendChild(imageSection);
        imageSection.querySelector("#seeAllImagesBtn").addEventListener("click", () => {
          currentType = "image";
          runSearch(searchInput.value.trim(), 1);
        });
      }

      if (webData.results.length) {
        resultsContainer.appendChild(renderWebList(webData.results));
      }

      if (videoData.results.length) {
        const videoSection = document.createElement("section");
        videoSection.innerHTML = `
          <div class="mb-3 flex max-w-3xl items-center justify-between">
            <h2 class="text-sm font-semibold text-sfetch-ink">Videos</h2>
            <button id="seeAllVideosBtn" class="text-sm font-medium text-sfetch-blue hover:underline">See all</button>
          </div>
        `;
        const list = document.createElement("div");
        list.className = "max-w-3xl space-y-3";
        videoData.results.slice(0, 3).forEach((result) => {
          const thumb = videoThumbnail(result.url);
          const card = document.createElement("a");
          card.href = result.url;
          card.target = "_blank";
          card.rel = "noreferrer noopener";
          card.className = "block overflow-hidden rounded-lg border border-sfetch-border bg-white transition hover:border-sfetch-orange sm:flex";
          card.innerHTML = `
            <div class="h-36 w-full shrink-0 overflow-hidden bg-sfetch-surfaceSoft sm:w-56">
              ${
                thumb
                  ? `<img src="${escapeHTML(thumb)}" alt="${escapeHTML(result.title)}" class="h-full w-full object-cover" loading="lazy" />`
                  : `<div class="flex h-full items-center justify-center text-sfetch-muted">Video</div>`
              }
            </div>
            <div class="space-y-2 p-4">
              <p class="text-xs uppercase text-sfetch-green">${escapeHTML(extractHost(result.url))}</p>
              <h3 class="text-lg font-medium text-sfetch-blue">${escapeHTML(result.title)}</h3>
              <p class="text-sm text-sfetch-muted">${escapeHTML(extractHost(result.page_url))}</p>
            </div>
          `;
          list.appendChild(card);
        });
        videoSection.appendChild(list);
        resultsContainer.appendChild(videoSection);
        videoSection.querySelector("#seeAllVideosBtn").addEventListener("click", () => {
          currentType = "video";
          runSearch(searchInput.value.trim(), 1);
        });
      }

      renderPagination(webData.total, page, webData.query);
    }

    function renderVerticalMode(data, page) {
      const start = (page - 1) * RESULTS_PER_PAGE + 1;
      const end = Math.min(start + data.results.length - 1, data.total);
      if (data.total === 0) {
        renderEmpty(data.query);
        return;
      }

      metaText.textContent = `${start}-${end} of about ${data.total} ${data.type} results`;

      if (data.type === "image") {
        renderImageGrid(data.results);
      } else if (data.type === "video") {
        renderVideoCards(data.results);
      } else {
        resultsContainer.className = "mt-6";
        resultsContainer.innerHTML = "";
        resultsContainer.appendChild(renderWebList(data.results));
      }

      renderPagination(data.total, page, data.query);
    }

    function renderLoadingSkeleton() {
      if (currentType === "image") {
        resultsContainer.className = "mt-6 grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-4";
        resultsContainer.innerHTML = Array.from({ length: 8 })
          .map(() => '<div class="skeleton aspect-square rounded-lg"></div>')
          .join("");
        metaText.textContent = "Searching images...";
      } else if (currentType === "video") {
        resultsContainer.className = "mt-6 max-w-3xl space-y-4";
        resultsContainer.innerHTML = Array.from({ length: 4 })
          .map(() => `
            <div class="overflow-hidden rounded-lg border border-sfetch-border bg-white">
              <div class="skeleton h-36 w-full"></div>
              <div class="space-y-3 p-4">
                <div class="skeleton h-3 w-24 rounded-full"></div>
                <div class="skeleton h-6 w-3/4 rounded-full"></div>
                <div class="skeleton h-3 w-1/2 rounded-full"></div>
              </div>
            </div>
          `)
          .join("");
        metaText.textContent = "Searching videos...";
      } else {
        resultsContainer.className = "mt-6 max-w-3xl space-y-6";
        resultsContainer.innerHTML = Array.from({ length: 4 })
          .map(() => `
            <article class="space-y-3">
              <div class="skeleton h-3 w-56 rounded-full"></div>
              <div class="skeleton h-6 w-2/3 rounded-full"></div>
              <div class="space-y-2">
                <div class="skeleton h-3 w-full rounded-full"></div>
                <div class="skeleton h-3 w-11/12 rounded-full"></div>
              </div>
            </article>
          `)
          .join("");
        metaText.textContent = "Searching...";
      }
    }

    async function runSearch(query, page = 1) {
      const normalizedQuery = query.trim();
      if (!normalizedQuery) {
        metaText.textContent = "Enter a search query.";
        resultsContainer.className = "mt-6";
        resultsContainer.innerHTML = `
          <section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-6 text-sm text-sfetch-muted">
            Type a query above and press Search.
          </section>
        `;
        paginationNav.innerHTML = "";
        return;
      }

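      // Easter egg: spin the whole page once when the classic query is typed.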
      if (normalizedQuery.toLowerCase() === "do a barrel roll") {
        document.documentElement.classList.add("barrel-roll");
        setTimeout(() => document.documentElement.classList.remove("barrel-roll"), 1200);
      }

      updateTabsUI();
      updateUrl(normalizedQuery, page);
      searchInput.value = normalizedQuery;
      renderLoadingSkeleton();
      paginationNav.innerHTML = "";

      const offset = (page - 1) * RESULTS_PER_PAGE;

      try {
        if (currentType === "all") {
          const [webData, imageData, videoData] = await Promise.all([
            fetchSearch("web", normalizedQuery, RESULTS_PER_PAGE, offset),
            fetchSearch("image", normalizedQuery, 8, offset),
            fetchSearch("video", normalizedQuery, 6, offset),
          ]);
          renderAllMode(webData, imageData, videoData, page);
        } else {
          const data = await fetchSearch(currentType, normalizedQuery, RESULTS_PER_PAGE, offset);
          renderVerticalMode(data, page);
        }
        window.scrollTo({ top: 0, behavior: "smooth" });
      } catch (error) {
        renderError(error.message || "The search request failed.");
      }
    }

    tabAll.addEventListener("click", () => {
      if (currentType !== "all") {
        currentType = "all";
        runSearch(searchInput.value || getQueryFromUrl(), 1);
      }
    });

    tabImages.addEventListener("click", () => {
      if (currentType !== "image") {
        currentType = "image";
        runSearch(searchInput.value || getQueryFromUrl(), 1);
      }
    });

    tabVideos.addEventListener("click", () => {
      if (currentType !== "video") {
        currentType = "video";
        runSearch(searchInput.value || getQueryFromUrl(), 1);
      }
    });

    searchForm.addEventListener("submit", (event) => {
      event.preventDefault();
      runSearch(searchInput.value, 1);
    });

    closeModalBtn.addEventListener("click", closeImageModal);
    imageModal.addEventListener("click", (event) => {
      if (event.target === imageModal) {
        closeImageModal();
      }
    });
    document.addEventListener("keydown", (event) => {
      if (event.key === "Escape" && !imageModal.classList.contains("hidden")) {
        closeImageModal();
      }
    });

    currentType = getTypeFromUrl();
    runSearch(getQueryFromUrl(), getPageFromUrl());
  </script>
</body>
</html>