initial commit

This commit is contained in:
Ned Halksworth
2026-05-04 19:31:46 +01:00
commit e0f2eedcd9
14 changed files with 3718 additions and 0 deletions
+16
View File
@@ -0,0 +1,16 @@
.DS_Store
__pycache__/
*.py[cod]
.pytest_cache/
.venv/
venv/
backend/venv/
*.db
*.db-*
*.sqlite
*.sqlite3
.env
.env.*
+119
View File
@@ -0,0 +1,119 @@
# sFetch
sFetch is a full-stack search engine prototype with a lightweight Google/DDG-inspired frontend, a FastAPI search API, and an async crawler that indexes pages into a local SQLite FTS5 database.
On first backend launch, sFetch downloads the latest Tranco top-site list, filters pornographic/adult domains, and seeds up to 1,000 non-adult sites if that seed has not already been recorded in the database.
## Project Structure
```text
sFetch/
├── backend/
│ ├── main.py
│ ├── crawler.py
│ ├── top_sites.py
│ ├── content_filter.py
│ ├── indexer.py
│ ├── searcher.py
│ ├── models.py
│ ├── database.py
│ ├── config.py
│ └── requirements.txt
├── frontend/
│ ├── index.html
│ └── results.html
└── README.md
```
## Setup
1. Create a virtual environment and install the backend dependencies:
```bash
cd backend
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
2. Start the API:
```bash
uvicorn main:app --reload
```
3. Open `frontend/index.html` in your browser.
The frontend uses `const API_BASE = "http://localhost:8000";` at the top of each page script.
## Crawling
The home page has index controls for:
- seeding the top 1,000 non-adult sites
- launching a custom crawl with seed URLs, depth, per-domain page limits, and same-domain filtering
- viewing current index and seed status
You can also call the API directly:
```bash
curl -X POST "http://localhost:8000/crawl" \
-H "Content-Type: application/json" \
-d '{
"seed_urls": ["https://example.com"],
"max_depth": 2,
"max_pages_per_domain": 50,
"same_domain_only": true
}'
```
Seed the top-site list manually:
```bash
curl -X POST "http://localhost:8000/crawl/top-sites"
```
The crawler:
- respects `robots.txt`
- filters adult URLs and adult-heavy page text
- stays on the same domain by default
- avoids revisiting URLs
- indexes HTML pages, images, and videos into SQLite
- records top-site seeding completion in `app_meta`
## API Endpoints
| Method | Path | Purpose |
| --- | --- | --- |
| `GET` | `/` | Health check |
| `GET` | `/search` | Full-text search endpoint |
| `POST` | `/crawl` | Start a custom background crawl job |
| `POST` | `/crawl/top-sites` | Queue the top-site seed crawl |
| `GET` | `/crawl/top-sites/status` | Check top-site seed state |
| `GET` | `/stats` | Total indexed pages and latest index time |
## Configuration
sFetch's crawl and storage behavior lives in `backend/config.py`:
| Setting | Description |
| --- | --- |
| `MAX_CRAWL_DEPTH` | Default link depth followed from each seed URL |
| `MAX_PAGES_PER_DOMAIN` | Default per-domain crawl cap |
| `CRAWL_DELAY_SECONDS` | Delay before requests |
| `DEFAULT_CRAWL_CONCURRENCY` | Concurrent fetch limit |
| `DB_PATH` | SQLite database path |
| `TOP_SITE_SOURCE_URL` | Top-site list source |
| `TOP_SITE_SEED_LIMIT` | Number of safe top sites to seed |
| `USER_AGENT` | User agent sent by `sFetchBot` |
## Tech Stack
| Layer | Technology |
| --- | --- |
| Frontend | HTML, TailwindCSS CDN, Vanilla JavaScript |
| Backend | Python, FastAPI |
| Crawler | Python, `httpx`, `BeautifulSoup4`, `asyncio` |
| Search Index | SQLite FTS5 via `aiosqlite` |
| Top Sites | Tranco daily top-site ZIP with bundled fallback |
+1226
View File
File diff suppressed because it is too large Load Diff
+61
View File
@@ -0,0 +1,61 @@
"""Adult-content filtering helpers used before URLs reach the index."""
from __future__ import annotations
import re
from urllib.parse import urlsplit
from config import ADULT_DOMAINS, ADULT_KEYWORDS
# Substrings that mark a hostname as explicit wherever they appear
# (matched with `in`, no token boundaries).
EXPLICIT_HOST_MARKERS = (
    "porn",
    "xxx",
    "xvideo",
    "xnxx",
    "hentai",
    "camgirl",
    "camsoda",
    "chaturbate",
    "stripchat",
    "redtube",
)
# Whole tokens (hostname split on non-alphanumerics) that mark a host as
# explicit; token matching avoids false hits like "essex" or "adultery".
EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"}
def _clean_host(url: str) -> str:
host = urlsplit(url.lower()).netloc
return host.removeprefix("www.")
def _host_matches_blocked_domain(host: str, domain: str) -> bool:
clean_domain = domain.lower().removeprefix("www.")
return host == clean_domain or host.endswith(f".{clean_domain}")
def is_adult_url(url: str) -> bool:
    """Return True when a URL appears to point at pornographic/adult content."""
    lowered = url.lower()
    parsed = urlsplit(lowered)
    host = _clean_host(lowered)
    # Exact/subdomain hits against the curated blocklist.
    for domain in ADULT_DOMAINS:
        if _host_matches_blocked_domain(host, domain):
            return True
    # Unambiguous substrings anywhere in the hostname.
    for marker in EXPLICIT_HOST_MARKERS:
        if marker in host:
            return True
    # Whole-token matches in the hostname.
    host_tokens = set(re.split(r"[^a-z0-9]+", host))
    if host_tokens & EXPLICIT_HOST_TOKENS:
        return True
    # Finally, keyword tokens in the path or query string.
    path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}"))
    return bool(path_tokens & set(ADULT_KEYWORDS))
def is_adult_text(text: str) -> bool:
    """Use a conservative keyword threshold so one incidental word does not block a page."""
    haystack = text.lower()
    matched = sum(keyword in haystack for keyword in ADULT_KEYWORDS)
    return matched >= 3
+309
View File
@@ -0,0 +1,309 @@
"""Async web crawler used to build the sFetch index."""
from __future__ import annotations
import asyncio
from collections import defaultdict
from typing import Iterable
from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser
import httpx
from bs4 import BeautifulSoup
from config import (
CRAWL_DELAY_SECONDS,
DEFAULT_CRAWL_CONCURRENCY,
MAX_CRAWL_DEPTH,
MAX_PAGES_PER_DOMAIN,
USER_AGENT,
)
from content_filter import is_adult_text, is_adult_url
from indexer import index_page
class sFetchBot:
    """A polite async crawler that stays within configurable crawl limits and filters adult content."""

    def __init__(
        self,
        max_depth: int = MAX_CRAWL_DEPTH,
        same_domain_only: bool = True,
        crawl_delay: float = CRAWL_DELAY_SECONDS,
        max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN,
        max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY,
        timeout_seconds: float = 15.0,
    ) -> None:
        # Crawl policy knobs (defaults come from config.py).
        self.max_depth = max_depth
        self.same_domain_only = same_domain_only
        self.crawl_delay = crawl_delay
        self.max_pages_per_domain = max_pages_per_domain
        self.max_concurrency = max(1, max_concurrency)  # guard against 0/negative
        self.timeout_seconds = timeout_seconds
        # Shared mutable crawl state; cross-task updates go through _state_lock.
        self.visited: set[str] = set()
        self.domain_counts: defaultdict[str, int] = defaultdict(int)
        self.robots_cache: dict[str, RobotFileParser] = {}
        self.indexed_count = 0
        self._state_lock = asyncio.Lock()
        # Caps how many HTTP fetches run at once.
        self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency)
        # Shared HTTP client; only set while start() is running.
        self._client: httpx.AsyncClient | None = None

    async def start(self, seed_urls: list[str]) -> None:
        """Crawl every valid, non-adult seed URL concurrently with one shared client."""
        if not seed_urls:
            return
        timeout = httpx.Timeout(self.timeout_seconds)
        headers = {"User-Agent": USER_AGENT}
        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers=headers,
        ) as client:
            self._client = client
            tasks = []
            for seed_url in seed_urls:
                normalized_seed = self._normalize_url(seed_url)
                if normalized_seed is None:
                    print(f"sFetch: skipped {seed_url} (invalid URL)")
                    continue
                if is_adult_url(normalized_seed):
                    print(f"sFetch: skipped {seed_url} (adult content filtered)")
                    continue
                # Each seed's own host is its crawl root for same-domain checks.
                root_domain = urlsplit(normalized_seed).netloc.lower()
                tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0))
            if tasks:
                # return_exceptions keeps one failing seed from cancelling the rest.
                await asyncio.gather(*tasks, return_exceptions=True)
        self._client = None

    async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None:
        """Fetch, filter, index, and recurse into one URL; logs errors instead of raising."""
        try:
            if depth > self.max_depth:
                return
            normalized_url = self._normalize_url(url)
            if normalized_url is None:
                return
            if is_adult_url(normalized_url):
                print(f"sFetch: skipped {normalized_url} (adult)")
                return
            parsed = urlsplit(normalized_url)
            current_domain = parsed.netloc.lower()
            if self.same_domain_only and current_domain != root_domain:
                return
            # Atomically marks the URL as seen, so the check below is race-free.
            if await self._already_seen(normalized_url):
                return
            if await self._domain_limit_reached(current_domain):
                return
            if not await self._is_allowed_by_robots(normalized_url):
                return
            client = self._require_client()
            # Politeness: bound concurrent fetches and pause before each request.
            async with self._fetch_semaphore:
                await asyncio.sleep(self.crawl_delay)
                response = await client.get(normalized_url)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "").lower()
            if "text/html" not in content_type:
                return
            title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text)
            if is_adult_text(body_text):
                print(f"sFetch: skipped {normalized_url} (adult text)")
                return
            await index_page(normalized_url, title, body_text, images, videos)
            await self._increment_domain_count(current_domain)
            self.indexed_count += 1
            print(f"sFetch: indexed {normalized_url}")
            # Depth-first, sequential recursion into discovered links.
            for link in links:
                await self._crawl_url(link, root_domain, depth + 1)
        except httpx.HTTPError as exc:
            print(f"sFetch: HTTP error {url} ({exc})")
        except Exception as exc:
            print(f"sFetch: error {url} ({exc})")

    def _require_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client; only valid while start() is running."""
        if self._client is None:
            raise RuntimeError("Crawler client is not initialized.")
        return self._client

    async def _already_seen(self, url: str) -> bool:
        """Atomically test-and-mark a URL as visited."""
        async with self._state_lock:
            if url in self.visited:
                return True
            self.visited.add(url)
            return False

    async def _domain_limit_reached(self, domain: str) -> bool:
        """True when the per-domain page cap has already been reached."""
        async with self._state_lock:
            return self.domain_counts[domain] >= self.max_pages_per_domain

    async def _increment_domain_count(self, domain: str) -> None:
        """Record one more indexed page for *domain*."""
        async with self._state_lock:
            self.domain_counts[domain] += 1

    async def _is_allowed_by_robots(self, url: str) -> bool:
        """Consult robots.txt for the URL's origin, caching one parser per origin."""
        parsed = urlsplit(url)
        robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}"
        parser = self.robots_cache.get(robots_key)
        if parser is None:
            parser = await self._fetch_robots_parser(robots_key)
            self.robots_cache[robots_key] = parser
        return parser.can_fetch(USER_AGENT, url)

    async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser:
        """Download and parse robots.txt; any failure yields an allow-all parser."""
        parser = RobotFileParser()
        robots_url = f"{domain_base}/robots.txt"
        parser.set_url(robots_url)
        try:
            client = self._require_client()
            response = await client.get(robots_url)
            if response.status_code == 200:
                parser.parse(response.text.splitlines())
            else:
                # Missing robots.txt -> empty ruleset (everything allowed).
                parser.parse([])
        except Exception:
            parser.parse([])
        return parser

    def _extract_page_content(
        self,
        url: str,
        html_text: str,
    ) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]:
        """Parse HTML into (title, body text, links, images, videos).

        Media is collected before script/style removal; text and links after.
        """
        soup = BeautifulSoup(html_text, "html.parser")
        images = self._extract_images(url, soup)
        videos = self._extract_videos(url, soup)
        # Strip non-content nodes before computing the visible text.
        for element in soup(["script", "style", "noscript"]):
            element.decompose()
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        if not title:
            title = url  # fall back to the URL when the page has no usable <title>
        body_text = soup.get_text(separator=" ", strip=True)
        links = self._extract_links(url, soup)
        return title, body_text, links, images, videos

    def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        """Collect absolute, deduplicated <img> URLs with their alt text."""
        images = []
        for img in soup.find_all("img", src=True):
            src = str(img["src"]).strip()
            if not src or src.startswith(("data:", "javascript:")):
                continue
            absolute_url = urljoin(base_url, src)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                alt = str(img.get("alt", "")).strip()
                images.append({"url": normalized_url, "alt_text": alt})
        return self._dedupe_media(images)

    def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]:
        """Collect video URLs from <video>/<source>, embed iframes, and direct links."""
        videos: list[dict[str, str]] = []
        for video in soup.find_all("video"):
            if video.get("src"):
                normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})
            # <source> children inherit the parent <video>'s title/aria-label.
            for source in video.find_all("source", src=True):
                normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip()))
                if normalized:
                    title = str(video.get("title") or video.get("aria-label") or "").strip()
                    videos.append({"url": normalized, "title": title})
        for iframe in soup.find_all("iframe", src=True):
            raw_src = str(iframe["src"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_src))
            if normalized and self._is_video_url(normalized):
                title = str(iframe.get("title") or iframe.get("aria-label") or "").strip()
                videos.append({"url": normalized, "title": title})
        for tag in soup.find_all("a", href=True):
            raw_href = str(tag["href"]).strip()
            normalized = self._normalize_url(urljoin(base_url, raw_href))
            if normalized and self._is_video_url(normalized):
                # Use the anchor's visible text as the video title.
                title = " ".join(tag.stripped_strings).strip()
                videos.append({"url": normalized, "title": title})
        return self._dedupe_media(videos)

    def _is_video_url(self, url: str) -> bool:
        """Heuristic: known video hosts or common video file extensions."""
        lowered = url.lower()
        return any(
            marker in lowered
            for marker in (
                "youtube.com/watch",
                "youtube.com/embed/",
                "youtu.be/",
                "vimeo.com/",
                ".mp4",
                ".webm",
                ".mov",
                ".m3u8",
            )
        )

    def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]:
        """Drop media dicts with duplicate or missing URLs, preserving order."""
        seen: set[str] = set()
        unique: list[dict[str, str]] = []
        for item in items:
            media_url = item.get("url")
            if not media_url or media_url in seen:
                continue
            seen.add(media_url)
            unique.append(item)
        return unique

    def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]:
        """Collect absolute, normalized, deduplicated outbound links."""
        collected_links: list[str] = []
        for tag in soup.find_all("a", href=True):
            href = str(tag["href"]).strip()
            if not href or href.startswith(("javascript:", "mailto:", "tel:")):
                continue
            absolute_url = urljoin(base_url, href)
            normalized_url = self._normalize_url(absolute_url)
            if normalized_url is not None:
                collected_links.append(normalized_url)
        return self._dedupe_links(collected_links)

    def _dedupe_links(self, links: Iterable[str]) -> list[str]:
        """Remove duplicate links while preserving first-seen order."""
        seen: set[str] = set()
        unique_links: list[str] = []
        for link in links:
            if link in seen:
                continue
            seen.add(link)
            unique_links.append(link)
        return unique_links

    def _normalize_url(self, url: str) -> str | None:
        """Canonicalize a URL: strip fragment, lower-case scheme/host.

        Returns None for empty, non-http(s), or host-less URLs.
        """
        if not url:
            return None
        clean_url, _ = urldefrag(url.strip())
        parsed = urlsplit(clean_url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            return None
        normalized = parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=parsed.netloc.lower(),
        )
        return urlunsplit(normalized)
+395
View File
@@ -0,0 +1,395 @@
"""Async SQLite helpers for sFetch's crawl index."""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import Any, AsyncIterator
import aiosqlite
from config import DB_PATH
@asynccontextmanager
async def _get_connection() -> AsyncIterator[aiosqlite.Connection]:
    """Yield a connection with dict-like rows, FK enforcement, and WAL journaling."""
    async with aiosqlite.connect(DB_PATH) as connection:
        connection.row_factory = aiosqlite.Row
        await connection.execute("PRAGMA foreign_keys = ON;")
        # WAL lets readers proceed while the crawler is writing.
        await connection.execute("PRAGMA journal_mode = WAL;")
        yield connection
def _to_fts_query(query: str) -> str:
tokens: list[str] = []
for raw_token in query.split():
token = raw_token.strip()
if not token:
continue
escaped = token.replace('"', '""')
tokens.append(f'"{escaped}"')
return " OR ".join(tokens)
async def init_db() -> None:
    """Create all tables, FTS5 indexes, and sync triggers if they do not exist.

    Each content table (pages/images/videos) is mirrored by an external-content
    FTS5 table kept in sync via AFTER INSERT/DELETE/UPDATE triggers, plus an
    app_meta key/value table for bookkeeping (e.g. top-site seed state).
    """
    async with _get_connection() as connection:
        # Idempotent: every statement uses IF NOT EXISTS, so this is safe to
        # run on every startup.
        await connection.executescript(
            """
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                title TEXT,
                body_text TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts
            USING fts5(title, body_text, content='pages', content_rowid='id');
            CREATE TRIGGER IF NOT EXISTS pages_ai
            AFTER INSERT ON pages
            BEGIN
                INSERT INTO pages_fts(rowid, title, body_text)
                VALUES (new.id, new.title, new.body_text);
            END;
            CREATE TRIGGER IF NOT EXISTS pages_ad
            AFTER DELETE ON pages
            BEGIN
                INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
                VALUES ('delete', old.id, old.title, old.body_text);
            END;
            CREATE TRIGGER IF NOT EXISTS pages_au
            AFTER UPDATE ON pages
            BEGIN
                INSERT INTO pages_fts(pages_fts, rowid, title, body_text)
                VALUES ('delete', old.id, old.title, old.body_text);
                INSERT INTO pages_fts(rowid, title, body_text)
                VALUES (new.id, new.title, new.body_text);
            END;
            CREATE TABLE IF NOT EXISTS images (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                page_url TEXT NOT NULL,
                alt_text TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS images_fts
            USING fts5(alt_text, content='images', content_rowid='id');
            CREATE TRIGGER IF NOT EXISTS images_ai
            AFTER INSERT ON images
            BEGIN
                INSERT INTO images_fts(rowid, alt_text)
                VALUES (new.id, new.alt_text);
            END;
            CREATE TRIGGER IF NOT EXISTS images_ad
            AFTER DELETE ON images
            BEGIN
                INSERT INTO images_fts(images_fts, rowid, alt_text)
                VALUES ('delete', old.id, old.alt_text);
            END;
            CREATE TRIGGER IF NOT EXISTS images_au
            AFTER UPDATE ON images
            BEGIN
                INSERT INTO images_fts(images_fts, rowid, alt_text)
                VALUES ('delete', old.id, old.alt_text);
                INSERT INTO images_fts(rowid, alt_text)
                VALUES (new.id, new.alt_text);
            END;
            CREATE TABLE IF NOT EXISTS videos (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                page_url TEXT NOT NULL,
                title TEXT,
                indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS videos_fts
            USING fts5(title, content='videos', content_rowid='id');
            CREATE TRIGGER IF NOT EXISTS videos_ai
            AFTER INSERT ON videos
            BEGIN
                INSERT INTO videos_fts(rowid, title)
                VALUES (new.id, new.title);
            END;
            CREATE TRIGGER IF NOT EXISTS videos_ad
            AFTER DELETE ON videos
            BEGIN
                INSERT INTO videos_fts(videos_fts, rowid, title)
                VALUES ('delete', old.id, old.title);
            END;
            CREATE TRIGGER IF NOT EXISTS videos_au
            AFTER UPDATE ON videos
            BEGIN
                INSERT INTO videos_fts(videos_fts, rowid, title)
                VALUES ('delete', old.id, old.title);
                INSERT INTO videos_fts(rowid, title)
                VALUES (new.id, new.title);
            END;
            CREATE TABLE IF NOT EXISTS app_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );
            """
        )
        await connection.commit()
async def get_meta_value(key: str) -> str | None:
    """Look up *key* in app_meta; return its value, or None when absent."""
    async with _get_connection() as connection:
        cursor = await connection.execute(
            "SELECT value FROM app_meta WHERE key = ?",
            (key,),
        )
        row = await cursor.fetchone()
        await cursor.close()
        if row is None:
            return None
        return str(row["value"])
async def set_meta_value(key: str, value: str) -> None:
    """Upsert one app_meta key/value pair, refreshing its updated_at stamp."""
    upsert_sql = """
            INSERT INTO app_meta (key, value)
            VALUES (?, ?)
            ON CONFLICT(key) DO UPDATE SET
                value = excluded.value,
                updated_at = CURRENT_TIMESTAMP
            """
    async with _get_connection() as connection:
        await connection.execute(upsert_sql, (key, value))
        await connection.commit()
async def insert_page(url: str, title: str, body_text: str) -> int:
    """Insert or refresh a page row by URL and return its integer id."""
    upsert_sql = """
            INSERT INTO pages (url, title, body_text)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                title = excluded.title,
                body_text = excluded.body_text,
                indexed_at = CURRENT_TIMESTAMP
            """
    async with _get_connection() as connection:
        await connection.execute(upsert_sql, (url, title, body_text))
        await connection.commit()
        # Upserts make lastrowid unreliable, so reload the id by unique URL.
        cursor = await connection.execute("SELECT id FROM pages WHERE url = ?", (url,))
        row = await cursor.fetchone()
        await cursor.close()
        if row is None:
            raise RuntimeError("Inserted page could not be reloaded from the database.")
        return int(row["id"])
async def insert_image(url: str, page_url: str, alt_text: str) -> None:
    """Insert or refresh an image record keyed by its URL."""
    upsert_sql = """
            INSERT INTO images (url, page_url, alt_text)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_url = excluded.page_url,
                alt_text = excluded.alt_text,
                indexed_at = CURRENT_TIMESTAMP
            """
    async with _get_connection() as connection:
        await connection.execute(upsert_sql, (url, page_url, alt_text))
        await connection.commit()
async def insert_video(url: str, page_url: str, title: str) -> None:
    """Insert or refresh a video record keyed by its URL."""
    upsert_sql = """
            INSERT INTO videos (url, page_url, title)
            VALUES (?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_url = excluded.page_url,
                title = excluded.title,
                indexed_at = CURRENT_TIMESTAMP
            """
    async with _get_connection() as connection:
        await connection.execute(upsert_sql, (url, page_url, title))
        await connection.commit()
async def search_pages(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    """Full-text match over pages, best bm25 rank first, newest as tiebreak."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []
    # Clamp paging to sane bounds before it reaches SQL.
    bounded_limit = min(max(limit, 1), 50)
    bounded_offset = offset if offset > 0 else 0
    select_sql = """
            SELECT
                p.id,
                p.url,
                p.title,
                p.body_text,
                p.indexed_at
            FROM pages_fts
            JOIN pages AS p ON p.id = pages_fts.rowid
            WHERE pages_fts MATCH ?
            ORDER BY bm25(pages_fts), p.indexed_at DESC
            LIMIT ? OFFSET ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(select_sql, (fts_query, bounded_limit, bounded_offset))
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]
async def count_search_results(query: str) -> int:
    """Total number of pages matching *query* (0 for an empty query)."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0
    count_sql = """
            SELECT COUNT(*) AS total
            FROM pages_fts
            WHERE pages_fts MATCH ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(count_sql, (fts_query,))
        row = await cursor.fetchone()
        await cursor.close()
        if row is None or row["total"] is None:
            return 0
        return int(row["total"])
async def search_images(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    """Full-text match over image alt text, best bm25 rank first."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []
    bounded_limit = min(max(limit, 1), 50)
    bounded_offset = offset if offset > 0 else 0
    select_sql = """
            SELECT
                i.id,
                i.url,
                i.page_url,
                i.alt_text,
                i.indexed_at
            FROM images_fts
            JOIN images AS i ON i.id = images_fts.rowid
            WHERE images_fts MATCH ?
            ORDER BY bm25(images_fts), i.indexed_at DESC
            LIMIT ? OFFSET ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(select_sql, (fts_query, bounded_limit, bounded_offset))
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]
async def count_image_results(query: str) -> int:
    """Total number of images matching *query* (0 for an empty query)."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0
    count_sql = """
            SELECT COUNT(*) AS total
            FROM images_fts
            WHERE images_fts MATCH ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(count_sql, (fts_query,))
        row = await cursor.fetchone()
        await cursor.close()
        if row is None or row["total"] is None:
            return 0
        return int(row["total"])
async def search_videos(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]:
    """Full-text match over video titles, best bm25 rank first."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return []
    bounded_limit = min(max(limit, 1), 50)
    bounded_offset = offset if offset > 0 else 0
    select_sql = """
            SELECT
                v.id,
                v.url,
                v.page_url,
                v.title,
                v.indexed_at
            FROM videos_fts
            JOIN videos AS v ON v.id = videos_fts.rowid
            WHERE videos_fts MATCH ?
            ORDER BY bm25(videos_fts), v.indexed_at DESC
            LIMIT ? OFFSET ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(select_sql, (fts_query, bounded_limit, bounded_offset))
        rows = await cursor.fetchall()
        await cursor.close()
        return [dict(row) for row in rows]
async def count_video_results(query: str) -> int:
    """Total number of videos matching *query* (0 for an empty query)."""
    fts_query = _to_fts_query(query)
    if not fts_query:
        return 0
    count_sql = """
            SELECT COUNT(*) AS total
            FROM videos_fts
            WHERE videos_fts MATCH ?
            """
    async with _get_connection() as connection:
        cursor = await connection.execute(count_sql, (fts_query,))
        row = await cursor.fetchone()
        await cursor.close()
        if row is None or row["total"] is None:
            return 0
        return int(row["total"])
async def get_stats() -> dict[str, Any]:
    """Return the total page count and the most recent index timestamp."""
    async with _get_connection() as connection:
        cursor = await connection.execute(
            """
            SELECT
                COUNT(*) AS total_pages,
                MAX(indexed_at) AS last_indexed_at
            FROM pages
            """
        )
        row = await cursor.fetchone()
        await cursor.close()
        total = int(row["total_pages"]) if row and row["total_pages"] is not None else 0
        latest = row["last_indexed_at"] if row else None
        return {"total_pages": total, "last_indexed_at": latest}
+41
View File
@@ -0,0 +1,41 @@
"""Normalization and indexing helpers for crawled pages."""
from __future__ import annotations
import re
from database import insert_image, insert_page, insert_video
MAX_BODY_LENGTH = 10_000
def _normalize_text(body_text: str) -> str:
collapsed = re.sub(r"\s+", " ", body_text).strip()
return collapsed[:MAX_BODY_LENGTH]
async def index_page(
    url: str,
    title: str,
    body_text: str,
    images: list[dict[str, str]] | None = None,
    videos: list[dict[str, str]] | None = None,
) -> None:
    """Store one crawled page plus its media records.

    Pages whose normalized body is empty are skipped entirely; media items
    without a URL are ignored, and videos without a title inherit the page's.
    """
    page_title = title.strip() or url
    page_body = _normalize_text(body_text)
    if not page_body:
        return
    await insert_page(url=url, title=page_title, body_text=page_body)
    for img in images or []:
        img_url = img.get("url")
        if img_url:
            await insert_image(url=img_url, page_url=url, alt_text=img.get("alt_text", ""))
    for clip in videos or []:
        clip_url = clip.get("url")
        if clip_url:
            caption = clip.get("title") or page_title
            await insert_video(url=clip_url, page_url=url, title=caption.strip())
+207
View File
@@ -0,0 +1,207 @@
"""FastAPI entry point for the sFetch backend."""
from __future__ import annotations
import asyncio
from datetime import UTC, datetime
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from crawler import sFetchBot
from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY
from database import (
count_image_results,
count_search_results,
count_video_results,
get_meta_value,
get_stats,
init_db,
set_meta_value,
)
from models import CrawlRequest, SearchResponse
from searcher import search, search_images_api, search_videos_api
from top_sites import load_top_site_seed_urls
app = FastAPI(title="sFetch API", version="1.0.0")
# Wide-open CORS: the static frontend is opened from file:// or another
# origin, so any origin may call the API (credentials stay disabled).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
def _utc_now() -> str:
return datetime.now(UTC).isoformat()
def _set_seed_status(**updates: object) -> None:
    """Merge *updates* into the shared seed-status dict with a fresh timestamp."""
    snapshot = dict(getattr(app.state, "_top_scrape_status", {}))
    snapshot["updated_at"] = _utc_now()
    snapshot.update(updates)
    app.state._top_scrape_status = snapshot
async def _scrape_top_sites(force: bool = False) -> None:
    """Seed the index from the top-site list unless a seed is already recorded.

    Serialized by ``app.state._crawl_lock`` so concurrent triggers (startup
    plus the /crawl/top-sites endpoint) cannot seed twice. ``force=True``
    bypasses the done-flag, app_meta record, and page-count short circuits.
    """
    await init_db()
    async with app.state._crawl_lock:
        # Fast path: this process already finished a seed run.
        if app.state._top_scrape_done and not force:
            return
        # A previous process recorded its seed source in app_meta: nothing to do.
        existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY)
        if existing_seed and not force:
            stats = await get_stats()
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=existing_seed,
            )
            app.state._top_scrape_done = True
            return
        stats = await get_stats()
        # A pre-existing sufficiently large index counts as an implicit seed;
        # record it so future launches skip the download.
        if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force:
            source = "existing database"
            await set_meta_value(TOP_SITE_SEED_META_KEY, source)
            _set_seed_status(
                state="stored",
                message="Top-site seed already stored in the database.",
                total=TOP_SITE_SEED_LIMIT,
                indexed=stats["total_pages"],
                source=source,
            )
            app.state._top_scrape_done = True
            return
        _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0)
        seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT)
        _set_seed_status(
            state="running",
            message=f"Seeding {len(seed_urls)} non-adult top sites.",
            total=len(seed_urls),
            indexed=0,
            source=source,
        )
        print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...")
        # Depth 0 + one page per domain: fetch only each site's front page.
        bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12)
        try:
            await bot.start(seed_urls)
        except Exception as exc:
            # On failure the seed is NOT recorded, so the next launch retries.
            _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count)
            print(f"sFetch: top-site seed failed ({exc})")
            return
        await set_meta_value(TOP_SITE_SEED_META_KEY, source)
        _set_seed_status(
            state="complete",
            message="Top-site seed complete.",
            total=len(seed_urls),
            indexed=bot.indexed_count,
            source=source,
        )
        print(f"sFetch: seeding complete. {bot.indexed_count} pages indexed.")
        app.state._top_scrape_done = True
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize shared crawl state and launch the top-site seed in the background."""
    app.state._top_scrape_done = False
    app.state._crawl_lock = asyncio.Lock()
    app.state._top_scrape_status = {
        "state": "idle",
        "message": "Waiting to check top-site seed.",
        "total": TOP_SITE_SEED_LIMIT,
        "indexed": 0,
        "source": None,
        "updated_at": _utc_now(),
    }
    # Keep a reference to the task: the event loop holds only weak references
    # to running tasks, so a fire-and-forget create_task() result can be
    # garbage-collected before the seed run finishes.
    app.state._seed_task = asyncio.create_task(_scrape_top_sites())
@app.get("/")
async def health_check() -> dict[str, str]:
    """Liveness probe for the API."""
    payload = {"status": "sFetch is alive"}
    return payload
@app.get("/search", response_model=SearchResponse)
async def search_endpoint(
    q: str = Query(..., description="Search query"),
    # Renamed from `type` (shadowed the builtin); alias keeps the public
    # HTTP query parameter name `type` unchanged.
    search_type: str = Query("web", alias="type", description="Search type: web, image, or video"),
    limit: int = Query(10, ge=1, le=50),
    offset: int = Query(0, ge=0),
) -> SearchResponse:
    """Full-text search across pages, images, or videos.

    Raises 400 for an empty query or an unknown search type; validation now
    happens before any database work instead of after the typed branches.
    """
    query = q.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.")
    if search_type not in {"web", "image", "video"}:
        raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.")
    if search_type == "image":
        results = await search_images_api(query=query, limit=limit, offset=offset)
        total = await count_image_results(query)
    elif search_type == "video":
        results = await search_videos_api(query=query, limit=limit, offset=offset)
        total = await count_video_results(query)
    else:
        results = await search(query=query, limit=limit, offset=offset)
        total = await count_search_results(query)
    return SearchResponse(query=query, type=search_type, total=total, results=results)
async def _run_crawl_job(request: CrawlRequest) -> None:
    """Background task: run one custom crawl, logging (never raising) failures."""
    try:
        crawler = sFetchBot(
            max_depth=request.max_depth,
            max_pages_per_domain=request.max_pages_per_domain,
            same_domain_only=request.same_domain_only,
        )
        await crawler.start(request.seed_urls)
    except Exception as exc:
        print(f"sFetch: crawl job failed ({exc})")
@app.post("/crawl")
async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]:
    """Queue a custom crawl job and return immediately."""
    background_tasks.add_task(_run_crawl_job, request)
    acknowledgement: dict[str, object] = {"message": "Crawl started", "seed_urls": request.seed_urls}
    return acknowledgement
@app.post("/crawl/top-sites")
async def crawl_top_sites_endpoint(
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."),
) -> dict[str, object]:
    """Queue the top-site seeding job in the background."""
    background_tasks.add_task(_scrape_top_sites, force)
    acknowledgement: dict[str, object] = {"message": "Top-site crawl queued", "force": force}
    return acknowledgement
@app.get("/crawl/top-sites/status")
async def crawl_top_sites_status_endpoint() -> dict[str, object]:
    """Report the current top-site seeding state (idle fallback pre-startup)."""
    fallback: dict[str, object] = {
        "state": "idle",
        "message": "Top-site seed has not started.",
        "total": TOP_SITE_SEED_LIMIT,
        "indexed": 0,
        "source": None,
        "updated_at": None,
    }
    return getattr(app.state, "_top_scrape_status", fallback)
@app.get("/stats")
async def stats_endpoint() -> dict[str, object]:
    """Return total indexed pages and the latest index timestamp."""
    return await get_stats()
+43
View File
@@ -0,0 +1,43 @@
"""Pydantic models for sFetch's API."""
from __future__ import annotations
from pydantic import BaseModel, Field
class SearchResult(BaseModel):
    """One web page hit returned by /search?type=web."""

    id: int
    url: str
    title: str
    # HTML fragment: escaped body preview with <mark> highlights.
    snippet: str
    indexed_at: str
class ImageResult(BaseModel):
    """One image hit returned by /search?type=image."""

    id: int
    url: str
    # Page the image was discovered on.
    page_url: str
    alt_text: str
    indexed_at: str
class VideoResult(BaseModel):
    """One video hit returned by /search?type=video."""

    id: int
    url: str
    # Page the video was discovered on.
    page_url: str
    title: str
    indexed_at: str
class SearchResponse(BaseModel):
    """Envelope for every /search response."""

    query: str
    type: str = "web"  # "web", "image", or "video"
    total: int
    # Element shape matches the requested search type.
    results: list[SearchResult] | list[ImageResult] | list[VideoResult]
class CrawlRequest(BaseModel):
    """Payload accepted by POST /crawl."""

    seed_urls: list[str] = Field(min_length=1)  # at least one starting URL
    max_depth: int = Field(default=2, ge=0, le=5)
    max_pages_per_domain: int = Field(default=50, ge=1, le=500)
    same_domain_only: bool = True  # restrict links to each seed's own domain
+6
View File
@@ -0,0 +1,6 @@
fastapi
uvicorn[standard]
httpx
beautifulsoup4
pydantic
aiosqlite
+90
View File
@@ -0,0 +1,90 @@
"""Search result shaping for sFetch."""
from __future__ import annotations
import html
import re
from database import search_images, search_pages, search_videos
SNIPPET_LENGTH = 200
def _extract_terms(query: str) -> list[str]:
terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
return sorted(terms, key=len, reverse=True)
def _build_snippet(body_text: str) -> str:
    """Take the first SNIPPET_LENGTH characters of the body as a preview.

    Appends an ellipsis when the body was truncated; falls back to a fixed
    placeholder when there is no usable text.
    """
    head = body_text[:SNIPPET_LENGTH].strip()
    if not head:
        return "No preview available."
    return f"{head}..." if len(body_text) > SNIPPET_LENGTH else head
def _highlight_terms(snippet: str, query: str) -> str:
safe_snippet = html.escape(snippet)
for term in _extract_terms(query):
pattern = re.compile(re.escape(html.escape(term)), flags=re.IGNORECASE)
safe_snippet = pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", safe_snippet)
return safe_snippet
async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Run a web-page search and shape each row into an API result dict."""
    rows = await search_pages(query=query, limit=limit, offset=offset)
    shaped: list[dict] = []
    for row in rows:
        # Fall back to the URL (then a placeholder) when the page has no title.
        page_title = (row.get("title") or row.get("url") or "Untitled").strip()
        highlighted = _highlight_terms(_build_snippet(row.get("body_text") or ""), query)
        shaped.append(
            {
                "id": row["id"],
                "url": row["url"],
                "title": page_title,
                "snippet": highlighted,
                "indexed_at": row["indexed_at"],
            }
        )
    return shaped
async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Run an image search and shape each row into an API result dict."""
    rows = await search_images(query=query, limit=limit, offset=offset)
    return [
        {
            "id": row["id"],
            "url": row["url"],
            "page_url": row["page_url"],
            # Missing alt text is normalized to an empty string for the API.
            "alt_text": row["alt_text"] or "",
            "indexed_at": row["indexed_at"],
        }
        for row in rows
    ]
async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
    """Run a video search and shape each row into an API result dict."""
    rows = await search_videos(query=query, limit=limit, offset=offset)
    return [
        {
            "id": row["id"],
            "url": row["url"],
            "page_url": row["page_url"],
            # A missing title gets a generic placeholder label.
            "title": (row.get("title") or "Video result").strip(),
            "indexed_at": row["indexed_at"],
        }
        for row in rows
    ]
+110
View File
@@ -0,0 +1,110 @@
"""Load and sanitize the top-site seed list for first-launch indexing."""
from __future__ import annotations
import csv
import io
import zipfile
from collections.abc import Iterable
from urllib.parse import urlsplit, urlunsplit
import httpx
from config import (
TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
TOP_SITE_SEED_LIMIT,
TOP_SITE_SOURCE_URL,
TOP_SITES,
USER_AGENT,
)
from content_filter import is_adult_url
def _normalize_site_url(value: str) -> str | None:
raw_value = value.strip()
if not raw_value:
return None
candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
parsed = urlsplit(candidate)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
return None
normalized = parsed._replace(
scheme=parsed.scheme.lower(),
netloc=parsed.netloc.lower(),
path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
query="",
fragment="",
)
return urlunsplit(normalized)
def _host_key(url: str) -> str:
return urlsplit(url).netloc.lower().removeprefix("www.")
def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
    """Normalize *candidates* into at most *limit* unique, non-adult URLs.

    Duplicate hosts (ignoring ``www.``) are dropped, as is anything the
    content filter flags as adult.
    """
    accepted: list[str] = []
    hosts_seen: set[str] = set()
    for raw_candidate in candidates:
        url = _normalize_site_url(raw_candidate)
        if url is None:
            continue
        host = _host_key(url)
        # Duplicate check first so the adult filter is skipped for known hosts.
        if host in hosts_seen:
            continue
        if is_adult_url(url):
            continue
        hosts_seen.add(host)
        accepted.append(url)
        if len(accepted) >= limit:
            break
    return accepted
def _domains_from_csv_text(csv_text: str) -> list[str]:
domains: list[str] = []
reader = csv.reader(io.StringIO(csv_text))
for row in reader:
if not row:
continue
domain = row[1] if len(row) > 1 else row[0]
if domain and domain.lower() != "domain":
domains.append(domain)
return domains
def _domains_from_zip(payload: bytes) -> list[str]:
    """Extract domains from the first CSV member of a downloaded ZIP payload.

    Raises ``ValueError`` when the archive holds no CSV file.
    """
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        csv_members = [name for name in archive.namelist() if name.endswith(".csv")]
        if not csv_members:
            raise ValueError("Tranco archive did not contain a CSV file.")
        with archive.open(csv_members[0]) as csv_file:
            csv_text = csv_file.read().decode("utf-8", errors="replace")
    return _domains_from_csv_text(csv_text)
async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
    """Fetch the newest safe top-site URLs, with the bundled list as a fallback.

    Returns ``(urls, source)`` where ``source`` is the download URL on success
    or the string "bundled fallback list" when the download or parse fails (or
    yields nothing usable).
    """
    request_timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
    request_headers = {"User-Agent": USER_AGENT}
    try:
        async with httpx.AsyncClient(
            timeout=request_timeout, follow_redirects=True, headers=request_headers
        ) as client:
            response = await client.get(TOP_SITE_SOURCE_URL)
            response.raise_for_status()
            # ZIP archives begin with the "PK" magic bytes; otherwise treat as CSV.
            if response.content.startswith(b"PK"):
                candidates = _domains_from_zip(response.content)
            else:
                candidates = _domains_from_csv_text(response.text)
            safe_urls = _safe_top_urls(candidates, limit=limit)
            if safe_urls:
                return safe_urls, TOP_SITE_SOURCE_URL
    except Exception as exc:  # best-effort download: any failure falls back below
        print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")
    return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
+402
View File
@@ -0,0 +1,402 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>sFetch</title>
<script src="https://cdn.tailwindcss.com"></script>
<script>
tailwind.config = {
theme: {
extend: {
colors: {
sfetch: {
bg: "#f8fafc",
surface: "#ffffff",
surfaceSoft: "#f1f5f9",
ink: "#202124",
muted: "#5f6368",
border: "#dadce0",
blue: "#1a73e8",
orange: "#de5833",
green: "#0b8043",
},
},
boxShadow: {
search: "0 2px 8px rgba(60, 64, 67, 0.14), 0 1px 3px rgba(60, 64, 67, 0.12)",
panel: "0 16px 40px rgba(15, 23, 42, 0.08)",
},
},
},
};
</script>
<style>
:root {
color-scheme: light;
}
body {
background: #f8fafc;
color: #202124;
font-family: Arial, Helvetica, sans-serif;
}
.brand {
font-family: Arial, Helvetica, sans-serif;
font-weight: 700;
letter-spacing: 0;
}
.brand span:nth-child(1) { color: #de5833; }
.brand span:nth-child(2) { color: #1a73e8; }
.brand span:nth-child(3) { color: #188038; }
.brand span:nth-child(4) { color: #fbbc04; }
.brand span:nth-child(5) { color: #1a73e8; }
.brand span:nth-child(6) { color: #de5833; }
.modal-open {
overflow: hidden;
}
</style>
</head>
<body class="min-h-screen">
<main class="flex min-h-screen flex-col">
<header class="flex items-center justify-between px-5 py-4 text-sm text-sfetch-muted sm:px-8">
<a href="./index.html" class="brand text-2xl" aria-label="sFetch home">
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
</a>
<button
id="openCrawlerModal"
class="rounded-full border border-sfetch-border bg-white px-4 py-2 font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
>
Index tools
</button>
</header>
<section class="mx-auto flex w-full max-w-5xl flex-1 flex-col items-center justify-center px-5 pb-24 pt-10">
<h1 class="brand text-center text-6xl leading-none sm:text-7xl">
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
</h1>
<form id="searchForm" class="mt-9 w-full max-w-2xl">
<label
for="searchInput"
class="flex min-h-14 items-center gap-3 rounded-full border border-sfetch-border bg-white px-5 transition focus-within:border-transparent focus-within:shadow-search"
>
<svg class="h-5 w-5 shrink-0 text-sfetch-muted" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" aria-hidden="true">
<circle cx="11" cy="11" r="6"></circle>
<path d="M20 20L16.65 16.65"></path>
</svg>
<input
id="searchInput"
type="text"
autocomplete="off"
placeholder="Search sFetch"
class="w-full bg-transparent text-base text-sfetch-ink outline-none placeholder:text-sfetch-muted sm:text-lg"
/>
</label>
<div class="mt-6 flex flex-wrap items-center justify-center gap-3">
<button
type="submit"
class="rounded-md bg-sfetch-blue px-5 py-2.5 text-sm font-medium text-white transition hover:bg-[#1558b0]"
>
sFetch Search
</button>
<button
type="button"
data-search-type="image"
class="rounded-md border border-sfetch-border bg-white px-5 py-2.5 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-blue hover:text-sfetch-blue"
>
Images
</button>
<button
type="button"
data-search-type="video"
class="rounded-md border border-sfetch-border bg-white px-5 py-2.5 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-blue hover:text-sfetch-blue"
>
Videos
</button>
</div>
</form>
<section class="mt-12 w-full max-w-3xl rounded-lg border border-sfetch-border bg-white p-4 shadow-panel" aria-label="Index controls">
<div class="flex flex-col gap-4 sm:flex-row sm:items-center sm:justify-between">
<div>
<p class="text-xs font-semibold uppercase text-sfetch-orange">Index</p>
<p id="statsSummary" class="mt-1 text-sm text-sfetch-muted">Checking index...</p>
</div>
<div class="flex flex-wrap gap-2">
<button
id="seedTopSites"
class="rounded-md bg-sfetch-orange px-4 py-2 text-sm font-medium text-white transition hover:bg-[#c44724]"
>
Seed top 1000
</button>
<button
id="openCrawlerModalSecondary"
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
>
Custom crawl
</button>
</div>
</div>
<div class="mt-4 h-2 overflow-hidden rounded-full bg-sfetch-surfaceSoft">
<div id="seedProgress" class="h-full w-0 bg-sfetch-orange transition-all duration-300"></div>
</div>
<p id="seedStatus" class="mt-3 min-h-5 text-sm text-sfetch-muted">Top-site seed status unavailable.</p>
</section>
</section>
<footer class="border-t border-sfetch-border bg-white px-5 py-4 text-center text-xs text-sfetch-muted">
&copy; 2026 sFetch
</footer>
</main>
<div
id="crawlerModal"
class="pointer-events-none fixed inset-0 z-30 flex items-center justify-center bg-slate-900/35 px-4 opacity-0 transition"
aria-hidden="true"
>
<div class="w-full max-w-xl rounded-lg border border-sfetch-border bg-white p-5 shadow-panel">
<div class="flex items-center justify-between gap-4 border-b border-sfetch-border pb-4">
<h2 class="text-lg font-semibold text-sfetch-ink">Custom crawl</h2>
<button
id="closeCrawlerModal"
class="flex h-9 w-9 items-center justify-center rounded-full text-sfetch-muted transition hover:bg-sfetch-surfaceSoft hover:text-sfetch-ink"
aria-label="Close crawler modal"
>
X
</button>
</div>
<form id="crawlerForm" class="mt-5 space-y-4">
<div>
<label for="seedUrls" class="mb-2 block text-sm font-medium text-sfetch-ink">Seed URLs</label>
<textarea
id="seedUrls"
rows="6"
placeholder="https://example.com&#10;https://docs.python.org/"
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
></textarea>
</div>
<div class="grid gap-4 sm:grid-cols-2">
<div>
<label for="crawlDepth" class="mb-2 block text-sm font-medium text-sfetch-ink">Max depth</label>
<input
id="crawlDepth"
type="number"
min="0"
max="5"
value="2"
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
/>
</div>
<div>
<label for="maxPagesPerDomain" class="mb-2 block text-sm font-medium text-sfetch-ink">Pages per domain</label>
<input
id="maxPagesPerDomain"
type="number"
min="1"
max="500"
value="50"
class="w-full rounded-md border border-sfetch-border bg-white px-3 py-2 text-sm text-sfetch-ink outline-none transition focus:border-sfetch-blue focus:ring-2 focus:ring-blue-100"
/>
</div>
</div>
<label class="flex items-center gap-3 text-sm text-sfetch-ink">
<input id="sameDomainOnly" type="checkbox" checked class="h-4 w-4 rounded border-sfetch-border text-sfetch-blue" />
Same domain only
</label>
<p id="crawlerStatus" class="min-h-5 text-sm text-sfetch-muted"></p>
<div class="flex flex-col-reverse gap-3 sm:flex-row sm:justify-end">
<button
type="button"
id="cancelCrawler"
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:bg-sfetch-surfaceSoft"
>
Cancel
</button>
<button
type="submit"
class="rounded-md bg-sfetch-blue px-4 py-2 text-sm font-medium text-white transition hover:bg-[#1558b0]"
>
Launch crawl
</button>
</div>
</form>
</div>
</div>
<script>
const API_BASE = "http://localhost:8000";
const searchForm = document.getElementById("searchForm");
const searchInput = document.getElementById("searchInput");
const openCrawlerModal = document.getElementById("openCrawlerModal");
const openCrawlerModalSecondary = document.getElementById("openCrawlerModalSecondary");
const closeCrawlerModal = document.getElementById("closeCrawlerModal");
const cancelCrawler = document.getElementById("cancelCrawler");
const crawlerModal = document.getElementById("crawlerModal");
const crawlerForm = document.getElementById("crawlerForm");
const crawlerStatus = document.getElementById("crawlerStatus");
const seedUrlsField = document.getElementById("seedUrls");
const crawlDepthField = document.getElementById("crawlDepth");
const maxPagesPerDomainField = document.getElementById("maxPagesPerDomain");
const sameDomainOnlyField = document.getElementById("sameDomainOnly");
const statsSummary = document.getElementById("statsSummary");
const seedStatus = document.getElementById("seedStatus");
const seedProgress = document.getElementById("seedProgress");
const seedTopSites = document.getElementById("seedTopSites");
function runSearch(type = "all") {
const query = searchInput.value.trim();
if (!query) {
searchInput.focus();
return;
}
const params = new URLSearchParams({ q: query });
if (type !== "all") {
params.set("type", type);
}
window.location.href = `results.html?${params.toString()}`;
}
function setModalOpen(isOpen) {
crawlerModal.classList.toggle("opacity-0", !isOpen);
crawlerModal.classList.toggle("pointer-events-none", !isOpen);
crawlerModal.setAttribute("aria-hidden", String(!isOpen));
document.body.classList.toggle("modal-open", isOpen);
if (isOpen) {
seedUrlsField.focus();
} else {
crawlerStatus.textContent = "";
}
}
async function refreshStats() {
try {
const response = await fetch(`${API_BASE}/stats`);
const stats = await response.json();
if (!response.ok) {
throw new Error();
}
const lastIndexed = stats.last_indexed_at ? `, last indexed ${stats.last_indexed_at}` : "";
statsSummary.textContent = `${stats.total_pages.toLocaleString()} pages${lastIndexed}`;
} catch {
statsSummary.textContent = "Backend unavailable";
}
}
async function refreshSeedStatus() {
try {
const response = await fetch(`${API_BASE}/crawl/top-sites/status`);
const status = await response.json();
if (!response.ok) {
throw new Error();
}
const total = Number(status.total || 0);
const indexed = Number(status.indexed || 0);
const percent = total > 0 && status.state === "complete" ? 100 : total > 0 ? Math.min(96, (indexed / total) * 100) : 0;
seedProgress.style.width = `${percent}%`;
seedStatus.textContent = `${status.message || "Idle"}${status.source ? ` Source: ${status.source}` : ""}`;
} catch {
seedProgress.style.width = "0%";
seedStatus.textContent = "Top-site seed status unavailable.";
}
}
async function seedTopSitesNow() {
seedTopSites.disabled = true;
seedTopSites.textContent = "Queued";
try {
const response = await fetch(`${API_BASE}/crawl/top-sites`, { method: "POST" });
const data = await response.json().catch(() => ({}));
if (!response.ok) {
throw new Error(data.detail || "Unable to queue top-site seed.");
}
seedStatus.textContent = "Top-site seed queued.";
await refreshSeedStatus();
} catch (error) {
seedStatus.textContent = error.message || "Unable to queue top-site seed.";
} finally {
setTimeout(() => {
seedTopSites.disabled = false;
seedTopSites.textContent = "Seed top 1000";
}, 1200);
}
}
async function handleCrawlerSubmit(event) {
event.preventDefault();
const seedUrls = seedUrlsField.value
.split("\n")
.map((value) => value.trim())
.filter(Boolean);
if (!seedUrls.length) {
crawlerStatus.textContent = "Add at least one seed URL.";
return;
}
const payload = {
seed_urls: seedUrls,
max_depth: Number.parseInt(crawlDepthField.value, 10) || 0,
max_pages_per_domain: Number.parseInt(maxPagesPerDomainField.value, 10) || 1,
same_domain_only: sameDomainOnlyField.checked,
};
crawlerStatus.textContent = "Starting crawl...";
try {
const response = await fetch(`${API_BASE}/crawl`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload),
});
const data = await response.json().catch(() => ({}));
if (!response.ok) {
throw new Error(data.detail || "Unable to start the crawler.");
}
crawlerStatus.textContent = `Crawl started for ${seedUrls.length} seed URL${seedUrls.length === 1 ? "" : "s"}.`;
setTimeout(() => {
setModalOpen(false);
refreshStats();
}, 900);
} catch (error) {
crawlerStatus.textContent = error.message || "Unable to start the crawler.";
}
}
searchForm.addEventListener("submit", (event) => {
event.preventDefault();
runSearch("all");
});
document.querySelectorAll("[data-search-type]").forEach((button) => {
button.addEventListener("click", () => runSearch(button.dataset.searchType || "all"));
});
openCrawlerModal.addEventListener("click", () => setModalOpen(true));
openCrawlerModalSecondary.addEventListener("click", () => setModalOpen(true));
closeCrawlerModal.addEventListener("click", () => setModalOpen(false));
cancelCrawler.addEventListener("click", () => setModalOpen(false));
crawlerModal.addEventListener("click", (event) => {
if (event.target === crawlerModal) {
setModalOpen(false);
}
});
seedTopSites.addEventListener("click", seedTopSitesNow);
crawlerForm.addEventListener("submit", handleCrawlerSubmit);
refreshStats();
refreshSeedStatus();
setInterval(refreshStats, 10000);
setInterval(refreshSeedStatus, 5000);
</script>
</body>
</html>
+693
View File
@@ -0,0 +1,693 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>sFetch Results</title>
<script src="https://cdn.tailwindcss.com"></script>
<script>
tailwind.config = {
theme: {
extend: {
colors: {
sfetch: {
bg: "#f8fafc",
surface: "#ffffff",
surfaceSoft: "#f1f5f9",
ink: "#202124",
muted: "#5f6368",
border: "#dadce0",
blue: "#1a73e8",
orange: "#de5833",
green: "#0b8043",
},
},
boxShadow: {
search: "0 2px 8px rgba(60, 64, 67, 0.14), 0 1px 3px rgba(60, 64, 67, 0.12)",
panel: "0 16px 40px rgba(15, 23, 42, 0.08)",
},
},
},
};
</script>
<style>
:root {
color-scheme: light;
}
body {
background: #ffffff;
color: #202124;
font-family: Arial, Helvetica, sans-serif;
}
.brand {
font-family: Arial, Helvetica, sans-serif;
font-weight: 700;
letter-spacing: 0;
}
.brand span:nth-child(1) { color: #de5833; }
.brand span:nth-child(2) { color: #1a73e8; }
.brand span:nth-child(3) { color: #188038; }
.brand span:nth-child(4) { color: #fbbc04; }
.brand span:nth-child(5) { color: #1a73e8; }
.brand span:nth-child(6) { color: #de5833; }
.skeleton {
background: linear-gradient(90deg, #eef2f7 25%, #f8fafc 37%, #eef2f7 63%);
background-size: 400% 100%;
animation: shimmer 1.4s ease infinite;
}
mark {
background: rgba(251, 188, 4, 0.28);
color: #202124;
padding: 0 0.12rem;
border-radius: 0.2rem;
}
@keyframes shimmer {
0% { background-position: 100% 50%; }
100% { background-position: 0 50%; }
}
@keyframes barrel-roll {
0% { transform: rotateZ(0deg); }
100% { transform: rotateZ(360deg); }
}
.barrel-roll {
animation: barrel-roll 1.2s cubic-bezier(0.25, 0.46, 0.45, 0.94) forwards;
}
</style>
</head>
<body class="min-h-screen">
<div class="min-h-screen">
<header class="sticky top-0 z-20 border-b border-sfetch-border bg-white/95 backdrop-blur">
<div class="mx-auto flex max-w-6xl flex-col gap-4 px-5 py-4 sm:flex-row sm:items-center">
<a href="./index.html" class="brand text-3xl leading-none" aria-label="sFetch home">
<span>s</span><span>F</span><span>e</span><span>t</span><span>c</span><span>h</span>
</a>
<form id="searchForm" class="flex flex-1 items-center gap-3">
<label
for="searchInput"
class="flex min-h-12 flex-1 items-center gap-3 rounded-full border border-sfetch-border bg-white px-4 transition focus-within:border-transparent focus-within:shadow-search"
>
<svg class="h-5 w-5 shrink-0 text-sfetch-muted" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" aria-hidden="true">
<circle cx="11" cy="11" r="6"></circle>
<path d="M20 20L16.65 16.65"></path>
</svg>
<input
id="searchInput"
type="text"
autocomplete="off"
class="w-full bg-transparent text-base text-sfetch-ink outline-none placeholder:text-sfetch-muted"
placeholder="Search sFetch"
/>
</label>
<button
id="searchButton"
type="submit"
class="rounded-md bg-sfetch-blue px-5 py-3 text-sm font-medium text-white transition hover:bg-[#1558b0]"
>
Search
</button>
</form>
<a
href="./index.html"
class="rounded-md border border-sfetch-border bg-white px-4 py-2 text-sm font-medium text-sfetch-ink transition hover:border-sfetch-orange hover:text-sfetch-orange"
>
Index tools
</a>
</div>
<nav class="mx-auto flex max-w-6xl gap-7 px-5 text-sm" aria-label="Search verticals">
<button id="tabAll" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">All</button>
<button id="tabImages" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">Images</button>
<button id="tabVideos" class="tab-btn border-b-2 border-transparent pb-3 font-medium text-sfetch-muted">Videos</button>
</nav>
</header>
<main class="mx-auto max-w-6xl px-5 py-8">
<p id="metaText" class="text-sm text-sfetch-muted"></p>
<section id="resultsContainer" class="mt-6"></section>
<nav id="pagination" class="mt-10 flex items-center justify-start gap-2" aria-label="Pagination"></nav>
</main>
</div>
<div id="imageModal" class="fixed inset-0 z-50 hidden bg-slate-950/60">
<div class="absolute inset-y-0 right-0 w-full max-w-4xl border-l border-sfetch-border bg-white shadow-panel">
<div class="flex items-center justify-between border-b border-sfetch-border px-6 py-4">
<h3 id="modalTitle" class="truncate text-base font-medium text-sfetch-ink">Image preview</h3>
<button id="closeModal" class="flex h-9 w-9 items-center justify-center rounded-full text-sfetch-muted transition hover:bg-sfetch-surfaceSoft hover:text-sfetch-ink">
X
</button>
</div>
<div class="h-[calc(100vh-73px)] overflow-y-auto px-6 py-5">
<div class="overflow-hidden rounded-lg bg-sfetch-surfaceSoft">
<img id="modalImage" class="max-h-[62vh] w-full object-contain" alt="Preview" />
</div>
<div class="mt-6">
<h4 class="mb-3 text-sm font-medium text-sfetch-muted">Related images</h4>
<div id="relatedImages" class="grid grid-cols-2 gap-3 sm:grid-cols-3"></div>
</div>
</div>
</div>
</div>
<script>
const API_BASE = "http://localhost:8000";
const RESULTS_PER_PAGE = 10;
const searchForm = document.getElementById("searchForm");
const searchInput = document.getElementById("searchInput");
const resultsContainer = document.getElementById("resultsContainer");
const metaText = document.getElementById("metaText");
const paginationNav = document.getElementById("pagination");
const tabAll = document.getElementById("tabAll");
const tabImages = document.getElementById("tabImages");
const tabVideos = document.getElementById("tabVideos");
const imageModal = document.getElementById("imageModal");
const closeModalBtn = document.getElementById("closeModal");
const modalImage = document.getElementById("modalImage");
const modalTitle = document.getElementById("modalTitle");
const relatedImagesContainer = document.getElementById("relatedImages");
let currentType = "all";
function escapeHTML(value) {
return String(value || "")
.replaceAll("&", "&amp;")
.replaceAll("<", "&lt;")
.replaceAll(">", "&gt;")
.replaceAll('"', "&quot;")
.replaceAll("'", "&#039;");
}
function getTypeFromUrl() {
const typeValue = new URLSearchParams(window.location.search).get("type");
if (typeValue === "image" || typeValue === "video" || typeValue === "all") {
return typeValue;
}
return "all";
}
function getQueryFromUrl() {
return (new URLSearchParams(window.location.search).get("q") || "").trim();
}
function getPageFromUrl() {
const raw = new URLSearchParams(window.location.search).get("page") || "1";
const page = Number.parseInt(raw, 10);
return Number.isNaN(page) || page < 1 ? 1 : page;
}
function updateUrl(query, page) {
const params = new URLSearchParams(window.location.search);
params.set("q", query);
page > 1 ? params.set("page", String(page)) : params.delete("page");
currentType === "all" ? params.delete("type") : params.set("type", currentType);
window.history.replaceState({}, "", `${window.location.pathname}?${params.toString()}`);
}
function updateTabsUI() {
const tabs = [
[tabAll, currentType === "all"],
[tabImages, currentType === "image"],
[tabVideos, currentType === "video"],
];
tabs.forEach(([tab, active]) => {
tab.classList.toggle("border-sfetch-orange", active);
tab.classList.toggle("text-sfetch-ink", active);
tab.classList.toggle("border-transparent", !active);
tab.classList.toggle("text-sfetch-muted", !active);
});
}
async function fetchSearch(type, query, limit, offset) {
const response = await fetch(
`${API_BASE}/search?q=${encodeURIComponent(query)}&type=${type}&limit=${limit}&offset=${offset}`
);
const data = await response.json().catch(() => ({}));
if (!response.ok) {
throw new Error(data.detail || "Search request failed.");
}
return data;
}
function extractHost(url) {
try {
return new URL(url).hostname.replace(/^www\./, "");
} catch {
return url;
}
}
function getYouTubeId(url) {
try {
const parsed = new URL(url);
if (parsed.hostname.includes("youtube.com")) {
if (parsed.pathname.startsWith("/watch")) {
return parsed.searchParams.get("v");
}
if (parsed.pathname.startsWith("/embed/")) {
return parsed.pathname.split("/embed/")[1] || null;
}
}
if (parsed.hostname.includes("youtu.be")) {
return parsed.pathname.slice(1) || null;
}
} catch {
return null;
}
return null;
}
function videoThumbnail(url) {
const ytId = getYouTubeId(url);
return ytId ? `https://img.youtube.com/vi/${ytId}/hqdefault.jpg` : null;
}
function renderError(message) {
metaText.textContent = "Search unavailable";
resultsContainer.className = "mt-6";
resultsContainer.innerHTML = `
<section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-6">
<p class="text-lg text-sfetch-ink">Unable to load results.</p>
<p class="mt-2 text-sm text-sfetch-muted">${escapeHTML(message)}</p>
</section>
`;
paginationNav.innerHTML = "";
}
function renderEmpty(query) {
metaText.textContent = "About 0 results";
resultsContainer.className = "mt-6";
resultsContainer.innerHTML = `
<section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-8">
<div class="flex h-12 w-12 items-center justify-center rounded-full bg-sfetch-surfaceSoft text-lg font-bold text-sfetch-orange">s</div>
<h2 class="mt-4 text-xl text-sfetch-ink">No results found</h2>
<p class="mt-2 text-sm text-sfetch-muted">No indexed pages matched "${escapeHTML(query)}".</p>
</section>
`;
paginationNav.innerHTML = "";
}
function renderPagination(total, currentPage, query) {
paginationNav.innerHTML = "";
const totalPages = Math.ceil(total / RESULTS_PER_PAGE);
if (totalPages <= 1) {
paginationNav.style.display = "none";
return;
}
paginationNav.style.display = "flex";
const button = (label, page, disabled = false, active = false) => {
const btn = document.createElement("button");
btn.textContent = label;
btn.disabled = disabled;
btn.className = `flex h-10 min-w-10 items-center justify-center rounded-md border px-3 text-sm transition ${
active
? "border-sfetch-blue bg-sfetch-blue text-white"
: disabled
? "cursor-not-allowed border-sfetch-border text-sfetch-muted/50"
: "border-sfetch-border text-sfetch-ink hover:border-sfetch-blue hover:text-sfetch-blue"
}`;
if (!disabled && !active) {
btn.addEventListener("click", () => runSearch(query, page));
}
return btn;
};
paginationNav.appendChild(button("<", currentPage - 1, currentPage === 1));
const maxVisiblePages = 5;
let start = Math.max(1, currentPage - 2);
let end = Math.min(totalPages, start + maxVisiblePages - 1);
if (end - start < maxVisiblePages - 1) {
start = Math.max(1, end - maxVisiblePages + 1);
}
for (let i = start; i <= end; i += 1) {
paginationNav.appendChild(button(String(i), i, false, i === currentPage));
}
paginationNav.appendChild(button(">", currentPage + 1, currentPage === totalPages));
}
function openImageModal(imageResult, imageIndex, relatedPool) {
modalImage.src = imageResult.url;
modalImage.alt = imageResult.alt_text || "Image preview";
modalTitle.textContent = imageResult.alt_text || extractHost(imageResult.page_url);
relatedImagesContainer.innerHTML = "";
relatedPool
.filter((_, idx) => idx !== imageIndex)
.slice(0, 8)
.forEach((item) => {
const thumb = document.createElement("button");
thumb.className = "overflow-hidden rounded-md border border-sfetch-border transition hover:border-sfetch-orange";
thumb.innerHTML = `
<img
src="${escapeHTML(item.url)}"
alt="${escapeHTML(item.alt_text || "Related image")}"
class="h-24 w-full object-cover"
loading="lazy"
/>
`;
thumb.addEventListener("click", () => {
const realIndex = relatedPool.findIndex((candidate) => candidate.id === item.id);
openImageModal(item, realIndex, relatedPool);
});
relatedImagesContainer.appendChild(thumb);
});
imageModal.classList.remove("hidden");
}
function closeImageModal() {
imageModal.classList.add("hidden");
}
function renderImageGrid(results) {
resultsContainer.className = "mt-6 grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-4";
resultsContainer.innerHTML = "";
results.forEach((result, index) => {
const card = document.createElement("article");
card.className = "group cursor-pointer overflow-hidden rounded-lg border border-sfetch-border bg-white transition hover:border-sfetch-orange";
card.innerHTML = `
<div class="aspect-square overflow-hidden bg-sfetch-surfaceSoft">
<img
src="${escapeHTML(result.url)}"
alt="${escapeHTML(result.alt_text || "Image result")}"
class="h-full w-full object-cover transition duration-200 group-hover:scale-105"
loading="lazy"
/>
</div>
<div class="truncate px-3 py-2 text-xs text-sfetch-muted">${escapeHTML(result.alt_text || extractHost(result.page_url))}</div>
`;
card.addEventListener("click", () => openImageModal(result, index, results));
resultsContainer.appendChild(card);
});
}
function renderVideoCards(results) {
resultsContainer.className = "mt-6 space-y-4";
resultsContainer.innerHTML = "";
results.forEach((result) => {
const thumbnail = videoThumbnail(result.url);
const card = document.createElement("article");
card.className = "overflow-hidden rounded-lg border border-sfetch-border bg-white";
card.innerHTML = `
<a href="${escapeHTML(result.url)}" target="_blank" rel="noreferrer noopener" class="block md:flex">
<div class="relative h-44 w-full shrink-0 overflow-hidden bg-sfetch-surfaceSoft md:w-72">
${
thumbnail
? `<img src="${escapeHTML(thumbnail)}" alt="${escapeHTML(result.title)}" class="h-full w-full object-cover" loading="lazy" />`
: `<div class="flex h-full items-center justify-center text-sfetch-muted">Video</div>`
}
</div>
<div class="space-y-2 p-5">
<p class="text-xs uppercase text-sfetch-green">${escapeHTML(extractHost(result.url))}</p>
<h3 class="text-xl font-medium text-sfetch-blue">${escapeHTML(result.title)}</h3>
<p class="text-sm text-sfetch-muted">Source: ${escapeHTML(extractHost(result.page_url))}</p>
</div>
</a>
`;
resultsContainer.appendChild(card);
});
}
function renderWebList(results) {
const wrapper = document.createElement("div");
wrapper.className = "max-w-3xl space-y-7";
results.forEach((result) => {
const article = document.createElement("article");
const host = extractHost(result.url);
article.className = "space-y-1";
article.innerHTML = `
<div class="flex items-center gap-2 text-sm text-sfetch-muted">
<div class="flex h-7 w-7 shrink-0 items-center justify-center rounded-full bg-sfetch-surfaceSoft text-xs font-bold text-sfetch-orange">${escapeHTML(host.slice(0, 1).toUpperCase())}</div>
<div class="min-w-0">
<p class="text-sfetch-ink">${escapeHTML(host)}</p>
<p class="truncate text-xs">${escapeHTML(result.url)}</p>
</div>
</div>
<a
href="${escapeHTML(result.url)}"
target="_blank"
rel="noreferrer noopener"
class="block text-xl leading-tight text-sfetch-blue hover:underline"
>${escapeHTML(result.title)}</a>
<p class="text-sm leading-6 text-sfetch-muted">${result.snippet}</p>
`;
wrapper.appendChild(article);
});
return wrapper;
}
function renderAllMode(webData, imageData, videoData, page) {
const start = (page - 1) * RESULTS_PER_PAGE + 1;
const end = Math.min(start + webData.results.length - 1, webData.total);
if (webData.total === 0 && imageData.total === 0 && videoData.total === 0) {
renderEmpty(webData.query);
return;
}
metaText.textContent = webData.total > 0
? `${start}-${end} of about ${webData.total} web results`
: "No direct web matches, showing media results";
resultsContainer.className = "mt-6 space-y-9";
resultsContainer.innerHTML = "";
if (imageData.results.length) {
const imageSection = document.createElement("section");
imageSection.innerHTML = `
<div class="mb-3 flex max-w-3xl items-center justify-between">
<h2 class="text-sm font-semibold text-sfetch-ink">Images</h2>
<button id="seeAllImagesBtn" class="text-sm font-medium text-sfetch-blue hover:underline">See all</button>
</div>
`;
const grid = document.createElement("div");
grid.className = "grid max-w-3xl grid-cols-3 gap-2 sm:grid-cols-4 md:grid-cols-6";
imageData.results.slice(0, 6).forEach((result, index) => {
const button = document.createElement("button");
button.className = "overflow-hidden rounded-md border border-sfetch-border bg-sfetch-surfaceSoft";
button.innerHTML = `<img src="${escapeHTML(result.url)}" alt="${escapeHTML(result.alt_text || "Image result")}" class="aspect-square w-full object-cover" loading="lazy" />`;
button.addEventListener("click", () => openImageModal(result, index, imageData.results));
grid.appendChild(button);
});
imageSection.appendChild(grid);
resultsContainer.appendChild(imageSection);
imageSection.querySelector("#seeAllImagesBtn").addEventListener("click", () => {
currentType = "image";
runSearch(searchInput.value.trim(), 1);
});
}
if (webData.results.length) {
resultsContainer.appendChild(renderWebList(webData.results));
}
if (videoData.results.length) {
const videoSection = document.createElement("section");
videoSection.innerHTML = `
<div class="mb-3 flex max-w-3xl items-center justify-between">
<h2 class="text-sm font-semibold text-sfetch-ink">Videos</h2>
<button id="seeAllVideosBtn" class="text-sm font-medium text-sfetch-blue hover:underline">See all</button>
</div>
`;
const list = document.createElement("div");
list.className = "max-w-3xl space-y-3";
videoData.results.slice(0, 3).forEach((result) => {
const thumb = videoThumbnail(result.url);
const card = document.createElement("a");
card.href = result.url;
card.target = "_blank";
card.rel = "noreferrer noopener";
card.className = "block overflow-hidden rounded-lg border border-sfetch-border bg-white transition hover:border-sfetch-orange sm:flex";
card.innerHTML = `
<div class="h-36 w-full shrink-0 overflow-hidden bg-sfetch-surfaceSoft sm:w-56">
${
thumb
? `<img src="${escapeHTML(thumb)}" alt="${escapeHTML(result.title)}" class="h-full w-full object-cover" loading="lazy" />`
: `<div class="flex h-full items-center justify-center text-sfetch-muted">Video</div>`
}
</div>
<div class="space-y-2 p-4">
<p class="text-xs uppercase text-sfetch-green">${escapeHTML(extractHost(result.url))}</p>
<h3 class="text-lg font-medium text-sfetch-blue">${escapeHTML(result.title)}</h3>
<p class="text-sm text-sfetch-muted">${escapeHTML(extractHost(result.page_url))}</p>
</div>
`;
list.appendChild(card);
});
videoSection.appendChild(list);
resultsContainer.appendChild(videoSection);
videoSection.querySelector("#seeAllVideosBtn").addEventListener("click", () => {
currentType = "video";
runSearch(searchInput.value.trim(), 1);
});
}
renderPagination(webData.total, page, webData.query);
}
function renderVerticalMode(data, page) {
const start = (page - 1) * RESULTS_PER_PAGE + 1;
const end = Math.min(start + data.results.length - 1, data.total);
if (data.total === 0) {
renderEmpty(data.query);
return;
}
metaText.textContent = `${start}-${end} of about ${data.total} ${data.type} results`;
if (data.type === "image") {
renderImageGrid(data.results);
} else if (data.type === "video") {
renderVideoCards(data.results);
} else {
resultsContainer.className = "mt-6";
resultsContainer.innerHTML = "";
resultsContainer.appendChild(renderWebList(data.results));
}
renderPagination(data.total, page, data.query);
}
function renderLoadingSkeleton() {
if (currentType === "image") {
resultsContainer.className = "mt-6 grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-4";
resultsContainer.innerHTML = Array.from({ length: 8 })
.map(() => '<div class="skeleton aspect-square rounded-lg"></div>')
.join("");
metaText.textContent = "Searching images...";
} else if (currentType === "video") {
resultsContainer.className = "mt-6 max-w-3xl space-y-4";
resultsContainer.innerHTML = Array.from({ length: 4 })
.map(() => `
<div class="overflow-hidden rounded-lg border border-sfetch-border bg-white">
<div class="skeleton h-36 w-full"></div>
<div class="space-y-3 p-4">
<div class="skeleton h-3 w-24 rounded-full"></div>
<div class="skeleton h-6 w-3/4 rounded-full"></div>
<div class="skeleton h-3 w-1/2 rounded-full"></div>
</div>
</div>
`)
.join("");
metaText.textContent = "Searching videos...";
} else {
resultsContainer.className = "mt-6 max-w-3xl space-y-6";
resultsContainer.innerHTML = Array.from({ length: 4 })
.map(() => `
<article class="space-y-3">
<div class="skeleton h-3 w-56 rounded-full"></div>
<div class="skeleton h-6 w-2/3 rounded-full"></div>
<div class="space-y-2">
<div class="skeleton h-3 w-full rounded-full"></div>
<div class="skeleton h-3 w-11/12 rounded-full"></div>
</div>
</article>
`)
.join("");
metaText.textContent = "Searching...";
}
}
async function runSearch(query, page = 1) {
const normalizedQuery = query.trim();
if (!normalizedQuery) {
metaText.textContent = "Enter a search query.";
resultsContainer.className = "mt-6";
resultsContainer.innerHTML = `
<section class="max-w-2xl rounded-lg border border-sfetch-border bg-sfetch-bg px-5 py-6 text-sm text-sfetch-muted">
Type a query above and press Search.
</section>
`;
paginationNav.innerHTML = "";
return;
}
if (normalizedQuery.toLowerCase() === "do a barrel roll") {
document.documentElement.classList.add("barrel-roll");
setTimeout(() => document.documentElement.classList.remove("barrel-roll"), 1200);
}
updateTabsUI();
updateUrl(normalizedQuery, page);
searchInput.value = normalizedQuery;
renderLoadingSkeleton();
paginationNav.innerHTML = "";
const offset = (page - 1) * RESULTS_PER_PAGE;
try {
if (currentType === "all") {
const [webData, imageData, videoData] = await Promise.all([
fetchSearch("web", normalizedQuery, RESULTS_PER_PAGE, offset),
fetchSearch("image", normalizedQuery, 8, offset),
fetchSearch("video", normalizedQuery, 6, offset),
]);
renderAllMode(webData, imageData, videoData, page);
} else {
const data = await fetchSearch(currentType, normalizedQuery, RESULTS_PER_PAGE, offset);
renderVerticalMode(data, page);
}
window.scrollTo({ top: 0, behavior: "smooth" });
} catch (error) {
renderError(error.message || "The search request failed.");
}
}
tabAll.addEventListener("click", () => {
if (currentType !== "all") {
currentType = "all";
runSearch(searchInput.value || getQueryFromUrl(), 1);
}
});
tabImages.addEventListener("click", () => {
if (currentType !== "image") {
currentType = "image";
runSearch(searchInput.value || getQueryFromUrl(), 1);
}
});
tabVideos.addEventListener("click", () => {
if (currentType !== "video") {
currentType = "video";
runSearch(searchInput.value || getQueryFromUrl(), 1);
}
});
searchForm.addEventListener("submit", (event) => {
event.preventDefault();
runSearch(searchInput.value, 1);
});
closeModalBtn.addEventListener("click", closeImageModal);
imageModal.addEventListener("click", (event) => {
if (event.target === imageModal) {
closeImageModal();
}
});
document.addEventListener("keydown", (event) => {
if (event.key === "Escape" && !imageModal.classList.contains("hidden")) {
closeImageModal();
}
});
currentType = getTypeFromUrl();
runSearch(getQueryFromUrl(), getPageFromUrl());
</script>
</body>
</html>