From e0f2eedcd97ee2e0d57eaab8f641a28ff1617420 Mon Sep 17 00:00:00 2001
From: Ned Halksworth
Date: Mon, 4 May 2026 19:31:46 +0100
Subject: [PATCH] initial commit

---
 .gitignore                |   16 +
 README.md                 |  131 ++++
 backend/config.py         | 1226 +++++++++++++++++++++++++++++++++++++
 backend/content_filter.py |   61 ++
 backend/crawler.py        |  309 ++++
 backend/database.py       |  395 ++++++
 backend/indexer.py        |   41 ++
 backend/main.py           |  207 +++++++
 backend/models.py         |   43 ++
 backend/requirements.txt  |    6 +
 backend/searcher.py       |   90 +++
 backend/top_sites.py      |  110 ++++
 frontend/index.html       |  402 ++++++++++++
 frontend/results.html     |  693 +++++++++++++++++++++
 14 files changed, 3730 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 backend/config.py
 create mode 100644 backend/content_filter.py
 create mode 100644 backend/crawler.py
 create mode 100644 backend/database.py
 create mode 100644 backend/indexer.py
 create mode 100644 backend/main.py
 create mode 100644 backend/models.py
 create mode 100644 backend/requirements.txt
 create mode 100644 backend/searcher.py
 create mode 100644 backend/top_sites.py
 create mode 100644 frontend/index.html
 create mode 100644 frontend/results.html

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f135b94
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+.DS_Store
+__pycache__/
+*.py[cod]
+.pytest_cache/
+
+.venv/
+venv/
+backend/venv/
+
+*.db
+*.db-*
+*.sqlite
+*.sqlite3
+
+.env
+.env.*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e39bb05
--- /dev/null
+++ b/README.md
@@ -0,0 +1,131 @@
+# sFetch
+
+sFetch is a full-stack search engine prototype with a lightweight Google/DDG-inspired frontend, a FastAPI search API, and an async crawler that indexes pages into a local SQLite FTS5 database.
+
+On first backend launch, sFetch downloads the latest Tranco top-site list, filters out pornographic/adult domains, and seeds up to 1,000 non-adult sites unless that seeding run has already been recorded in the database.
+
+## Project Structure
+
+```text
+sFetch/
+├── backend/
+│   ├── main.py
+│   ├── crawler.py
+│   ├── top_sites.py
+│   ├── content_filter.py
+│   ├── indexer.py
+│   ├── searcher.py
+│   ├── models.py
+│   ├── database.py
+│   ├── config.py
+│   └── requirements.txt
+├── frontend/
+│   ├── index.html
+│   └── results.html
+└── README.md
+```
+
+## Setup
+
+1. Create a virtual environment and install the backend dependencies:
+
+   ```bash
+   cd backend
+   python3 -m venv venv
+   source venv/bin/activate
+   pip install -r requirements.txt
+   ```
+
+2. Start the API:
+
+   ```bash
+   uvicorn main:app --reload
+   ```
+
+3. Open `frontend/index.html` in your browser.
+
+The frontend uses `const API_BASE = "http://localhost:8000";` at the top of each page script.
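+
+To try the search API without the frontend, the sketch below queries `/search` with `httpx` (already a backend dependency); the `q` parameter name is an assumption here, so check `backend/main.py` for the exact query signature:
+
+```python
+import httpx
+
+# Hypothetical query against a locally running sFetch backend; the "q"
+# parameter name is an assumption, not a confirmed part of the API.
+response = httpx.get("http://localhost:8000/search", params={"q": "python"})
+response.raise_for_status()
+print(response.json())
+```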
+ +## Crawling + +The home page has index controls for: + +- seeding the top 1,000 non-adult sites +- launching a custom crawl with seed URLs, depth, per-domain page limits, and same-domain filtering +- viewing current index and seed status + +You can also call the API directly: + +```bash +curl -X POST "http://localhost:8000/crawl" \ + -H "Content-Type: application/json" \ + -d '{ + "seed_urls": ["https://example.com"], + "max_depth": 2, + "max_pages_per_domain": 50, + "same_domain_only": true + }' +``` + +Seed the top-site list manually: + +```bash +curl -X POST "http://localhost:8000/crawl/top-sites" +``` + +The crawler: + +- respects `robots.txt` +- filters adult URLs and adult-heavy page text +- stays on the same domain by default +- avoids revisiting URLs +- indexes HTML pages, images, and videos into SQLite +- records top-site seeding completion in `app_meta` + +## API Endpoints + +| Method | Path | Purpose | +| --- | --- | --- | +| `GET` | `/` | Health check | +| `GET` | `/search` | Full-text search endpoint | +| `POST` | `/crawl` | Start a custom background crawl job | +| `POST` | `/crawl/top-sites` | Queue the top-site seed crawl | +| `GET` | `/crawl/top-sites/status` | Check top-site seed state | +| `GET` | `/stats` | Total indexed pages and latest index time | + +## Configuration + +sFetch's crawl and storage behavior lives in `backend/config.py`: + +| Setting | Description | +| --- | --- | +| `MAX_CRAWL_DEPTH` | Default link depth followed from each seed URL | +| `MAX_PAGES_PER_DOMAIN` | Default per-domain crawl cap | +| `CRAWL_DELAY_SECONDS` | Delay before requests | +| `DEFAULT_CRAWL_CONCURRENCY` | Concurrent fetch limit | +| `DB_PATH` | SQLite database path | +| `TOP_SITE_SOURCE_URL` | Top-site list source | +| `TOP_SITE_SEED_LIMIT` | Number of safe top sites to seed | +| `USER_AGENT` | User agent sent by `sFetchBot` | + +## Tech Stack + +| Layer | Technology | +| --- | --- | +| Frontend | HTML, TailwindCSS CDN, Vanilla JavaScript | +| Backend | Python, FastAPI | +| Crawler | Python, `httpx`, `BeautifulSoup4`, `asyncio` | +| Search Index | SQLite FTS5 via `aiosqlite` | +| Top Sites | Tranco daily top-site ZIP with bundled fallback | diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..2c08761 --- /dev/null +++ b/backend/config.py @@ -0,0 +1,1226 @@ +"""Application-wide configuration for sFetch.""" + +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent + +MAX_CRAWL_DEPTH = 2 +MAX_PAGES_PER_DOMAIN = 50 +CRAWL_DELAY_SECONDS = 1.0 +DEFAULT_CRAWL_CONCURRENCY = 12 +DB_PATH = str(BASE_DIR / "sfetch_index.db") +USER_AGENT = "sFetchBot/1.0 (+https://sfetch.io/bot)" +TOP_SITE_SOURCE_URL = "https://tranco-list.eu/top-1m.csv.zip" +TOP_SITE_SEED_LIMIT = 1000 +TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS = 30.0 +TOP_SITE_SEED_META_KEY = "top_site_seed_v1" + +ADULT_DOMAINS = { + "pornhub.com", "xvideos.com", "xnxx.com", "xhamster.com", "redtube.com", + "youporn.com", "spankbang.com", "chaturbate.com", "onlyfans.com", + "bongacams.com", "livejasmin.com", "stripchat.com", "cam4.com", + "myfreecams.com", "flirt4free.com", "camsoda.com", "streamate.com", + "brazzers.com", "realitykings.com", "naughtyamerica.com", "bangbros.com", + "vixen.com", "tushy.com", "blacked.com", "deeper.com", "adulttime.com", + "jerkmate.com", "imlive.com", "fap18.com", "fapster.com", + "nudevista.com", "eporner.com", "hqporner.com", "porntrex.com", + "porndoe.com", "perfectgirls.net", "txxx.com", "tubegalore.com", + "theporndude.com", "porn.com", "pornpics.com", "nuvid.com", 
+ "fuq.com", "porn00.org", "anysex.com", "tubxporn.com", + "gotporn.com", "drtuber.com", "tube8.com", "xhamster.com", + "sex.com", "youjizz.com", "tnaflix.com", "sunporno.com", + "thumbzilla.com", "motherless.com", "beeg.com", "efukt.com", + "rule34.xxx", "rule34video.com", "erome.com", "fansly.com", + "manyvids.com", "clips4sale.com", "literotica.com", +} + +ADULT_KEYWORDS = [ + "porn", "xxx", "adult", "sex", "nude", "naked", "erotic", + "escort", "camgirl", "cams", "fetish", "bdsm", "hentai", + "onlyfans", "milf", "teen", +] + +TOP_SITES = [ + "https://www.google.com", + "https://www.youtube.com", + "https://www.facebook.com", + "https://www.instagram.com", + "https://www.twitter.com", + "https://www.baidu.com", + "https://www.wikipedia.org", + "https://www.yahoo.com", + "https://www.yandex.com", + "https://www.whatsapp.com", + "https://www.amazon.com", + "https://www.tiktok.com", + "https://www.reddit.com", + "https://www.linkedin.com", + "https://www.live.com", + "https://www.netflix.com", + "https://www.openai.com", + "https://www.office.com", + "https://www.discord.com", + "https://www.bing.com", + "https://www.microsoft.com", + "https://www.twitch.tv", + "https://www.naver.com", + "https://www.pinterest.com", + "https://www.telegram.org", + "https://www.zoom.us", + "https://www.ebay.com", + "https://www.github.com", + "https://www.duckduckgo.com", + "https://www.spotify.com", + "https://www.canva.com", + "https://www.quora.com", + "https://www.fandom.com", + "https://www.samsung.com", + "https://www.globo.com", + "https://www.indeed.com", + "https://www.nytimes.com", + "https://www.cnbc.com", + "https://www.foxnews.com", + "https://www.espn.com", + "https://www.walmart.com", + "https://www.bbc.com", + "https://www.cnn.com", + "https://www.accuweather.com", + "https://www.stackoverflow.com", + "https://www.shopify.com", + "https://www.paypal.com", + "https://www.adobe.com", + "https://www.chase.com", + "https://www.intuit.com", + "https://www.dropbox.com", + "https://www.airbnb.com", + "https://www.booking.com", + "https://www.expedia.com", + "https://www.homedepot.com", + "https://www.target.com", + "https://www.bestbuy.com", + "https://www.costco.com", + "https://www.etsy.com", + "https://www.zillow.com", + "https://www.realtor.com", + "https://www.indiatimes.com", + "https://www.flipkart.com", + "https://www.aliexpress.com", + "https://www.stackexchange.com", + "https://www.trello.com", + "https://www.notion.so", + "https://www.slack.com", + "https://www.atlassian.com", + "https://www.figma.com", + "https://www.vercel.com", + "https://www.netlify.com", + "https://www.cloudflare.com", + "https://www.digitalocean.com", + "https://www.linode.com", + "https://www.heroku.com", + "https://www.mongodb.com", + "https://www.postgresql.org", + "https://www.mysql.com", + "https://www.docker.com", + "https://www.kubernetes.io", + "https://www.nginx.com", + "https://www.apache.org", + "https://www.nodejs.org", + "https://www.python.org", + "https://www.r-project.org", + "https://www.ruby-lang.org", + "https://www.golang.org", + "https://www.rust-lang.org", + "https://www.npmjs.com", + "https://www.pypi.org", + "https://www.dev.to", + "https://www.medium.com", + "https://www.hashnode.com", + "https://www.techcrunch.com", + "https://www.theverge.com", + "https://www.wired.com", + "https://www.arstechnica.com", + "https://www.engadget.com", + "https://www.zdnet.com", + "https://www.cnet.com", + "https://www.businessinsider.com", + "https://www.forbes.com", + "https://www.bloomberg.com", + 
"https://www.wsj.com", + "https://www.reuters.com", + "https://www.npr.org", + "https://www.theguardian.com", + "https://www.washingtonpost.com", + "https://www.usatoday.com", + "https://www.latimes.com", + "https://www.chicagotribune.com", + "https://www.news.yahoo.co.jp", + "https://www.dailymail.co.uk", + "https://www.mirror.co.uk", + "https://www.express.co.uk", + "https://www.rt.com", + "https://www.aljazeera.com", + "https://www.dw.com", + "https://www.france24.com", + "https://www.abc.net.au", + "https://www.9gag.com", + "https://www.imgur.com", + "https://www.snapchat.com", + "https://www.weibo.com", + "https://www.zhihu.com", + "https://www.douyin.com", + "https://www.kuaishou.com", + "https://www.jd.com", + "https://www.taobao.com", + "https://www.tmall.com", + "https://www.sohu.com", + "https://www.sina.com.cn", + "https://www.163.com", + "https://www.qq.com", + "https://www.vk.com", + "https://www.mail.ru", + "https://www.rambler.ru", + "https://www.yahoo.co.jp", + "https://www.line.me", + "https://www.nicovideo.jp", + "https://www.pixiv.net", + "https://www.hatena.ne.jp", + "https://www.nikkei.com", + "https://www.asahi.com", + "https://www.yomiuri.co.jp", + "https://www.mainichi.jp", + "https://www.tokyo-sports.co.jp", + "https://www.cricbuzz.com", + "https://www.espncricinfo.com", + "https://www.nba.com", + "https://www.nfl.com", + "https://www.mlb.com", + "https://www.nhl.com", + "https://www.uefa.com", + "https://www.fifa.com", + "https://www.olympics.com", + "https://www.chess.com", + "https://www.lichess.org", + "https://www.roblox.com", + "https://www.minecraft.net", + "https://www.fortnite.com", + "https://www.ea.com", + "https://www.steampowered.com", + "https://www.epicgames.com", + "https://www.gog.com", + "https://www.battle.net", + "https://www.ubisoft.com", + "https://www.riotgames.com", + "https://www.ign.com", + "https://www.gamespot.com", + "https://www.pcgamer.com", + "https://www.polygon.com", + "https://www.khanacademy.org", + "https://www.coursera.org", + "https://www.udemy.com", + "https://www.edx.org", + "https://www.udacity.com", + "https://www.codecademy.com", + "https://www.freecodecamp.org", + "https://www.w3schools.com", + "https://www.geeksforgeeks.org", + "https://www.tutorialspoint.com", + "https://www.javatpoint.com", + "https://www.programiz.com", + "https://www.leetcode.com", + "https://www.hackerrank.com", + "https://www.codewars.com", + "https://www.topcoder.com", + "https://www.codeforces.com", + "https://www.atcoder.jp", + "https://www.projecteuler.net", + "https://www.rosalind.info", + "https://www.adventofcode.com", + "https://www.kaggle.com", + "https://www.data.gov", + "https://www.archive.org", + "https://www.wikihow.com", + "https://www.dictionary.com", + "https://www.thesaurus.com", + "https://www.merriam-webster.com", + "https://www.urbandictionary.com", + "https://www.britannica.com", + "https://www.howstuffworks.com", + "https://www.space.com", + "https://www.nasa.gov", + "https://www.esa.int", + "https://www.noaa.gov", + "https://www.nature.com", + "https://www.science.org", + "https://www.sciencedaily.com", + "https://www.livescience.com", + "https://www.newscientist.com", + "https://www.scientificamerican.com", + "https://www.healthline.com", + "https://www.webmd.com", + "https://www.mayoclinic.org", + "https://www.nih.gov", + "https://www.who.int", + "https://www.cdc.gov", + "https://www.medscape.com", + "https://www.verywellhealth.com", + "https://www.psychologytoday.com", + "https://www.imdb.com", + 
"https://www.rottentomatoes.com", + "https://www.metacritic.com", + "https://www.hulu.com", + "https://www.disneyplus.com", + "https://www.hbomax.com", + "https://www.primevideo.com", + "https://www.apple.com", + "https://www.deepl.com", + "https://www.grammarly.com", + "https://www.typeform.com", + "https://www.surveymonkey.com", + "https://www.jotform.com", + "https://www.mailchimp.com", + "https://www.hubspot.com", + "https://www.salesforce.com", + "https://www.zendesk.com", + "https://www.intercom.com", + "https://www.drift.com", + "https://www.tidio.com", + "https://www.godaddy.com", + "https://www.namecheap.com", + "https://www.bluehost.com", + "https://www.hostinger.com", + "https://www.wix.com", + "https://www.squarespace.com", + "https://www.wordpress.com", + "https://www.webflow.com", + "https://www.stripe.com", + "https://www.squareup.com", + "https://www.coinbase.com", + "https://www.binance.com", + "https://www.robinhood.com", + "https://www.fidelity.com", + "https://www.vanguard.com", + "https://www.schwab.com", + "https://www.tdameritrade.com", + "https://www.etrade.com", + "https://www.morningstar.com", + "https://www.investopedia.com", + "https://www.marketwatch.com", + "https://www.seekingalpha.com", + "https://www.nerdwallet.com", + "https://www.bankrate.com", + "https://www.creditkarma.com", + "https://www.glassdoor.com", + "https://www.monster.com", + "https://www.ziprecruiter.com", + "https://www.ladders.com", + "https://www.dice.com", + "https://www.freelancer.com", + "https://www.upwork.com", + "https://www.fiverr.com", + "https://www.toptal.com", + "https://www.guru.com", + "https://www.peopleperhour.com", + "https://www.craigslist.org", + "https://www.kijiji.ca", + "https://www.olx.com", + "https://www.letgo.com", + "https://www.mercari.com", + "https://www.poshmark.com", + "https://www.thredup.com", + "https://www.depop.com", + "https://www.overstock.com", + "https://www.wayfair.com", + "https://www.ikea.com", + "https://www.lowes.com", + "https://www.nike.com", + "https://www.adidas.com", + "https://www.zara.com", + "https://www.hm.com", + "https://www.uniqlo.com", + "https://www.asos.com", + "https://www.shein.com", + "https://www.urbanoutfitters.com", + "https://www.nordstrom.com", + "https://www.macys.com", + "https://www.kohls.com", + "https://www.dickssportinggoods.com", + "https://www.rei.com", + "https://www.wholefoodsmarket.com", + "https://www.grubhub.com", + "https://www.doordash.com", + "https://www.ubereats.com", + "https://www.postmates.com", + "https://www.instacart.com", + "https://www.foodnetwork.com", + "https://www.allrecipes.com", + "https://www.tripadvisor.com", + "https://www.kayak.com", + "https://www.priceline.com", + "https://www.hotels.com", + "https://www.trivago.com", + "https://www.agoda.com", + "https://www.hostelworld.com", + "https://www.skyscanner.net", + "https://www.uber.com", + "https://www.lyft.com", + "https://www.hertz.com", + "https://www.enterprise.com", + "https://www.avis.com", + "https://www.united.com", + "https://www.aa.com", + "https://www.delta.com", + "https://www.southwest.com", + "https://www.jetblue.com", + "https://www.emirates.com", + "https://www.qatarairways.com", + "https://www.singaporeair.com", + "https://www.lufthansa.com", + "https://www.ba.com", + "https://www.airfrance.com", + "https://www.klm.com", + "https://www.turkishairlines.com", + "https://www.wunderground.com", + "https://www.weather.com", + "https://www.openweathermap.org", + "https://www.windy.com", + "https://www.wolframalpha.com", + 
"https://www.desmos.com", + "https://www.geogebra.org", + "https://www.symbolab.com", + "https://www.mathway.com", + "https://www.chemicalaid.com", + "https://www.pubchem.ncbi.nlm.nih.gov", + "https://www.chemspider.com", + "https://www.biorxiv.org", + "https://www.arxiv.org", + "https://www.scholar.google.com", + "https://www.researchgate.net", + "https://www.academia.edu", + "https://www.ssrn.com", + "https://www.jstor.org", + "https://www.plos.org", + "https://www.pubmed.ncbi.nlm.nih.gov", + "https://www.clinicaltrials.gov", + "https://www.cochranelibrary.com", + "https://www.frontiersin.org", + "https://www.mdpi.com", + "https://www.springer.com", + "https://www.wiley.com", + "https://www.elsevier.com", + "https://www.tandfonline.com", + "https://www.sagepub.com", + "https://www.oxfordjournals.org", + "https://www.cambridge.org", + "https://www.taylorandfrancis.com", + "https://www.ieee.org", + "https://www.acm.org", + "https://www.aaai.org", + "https://www.neurips.cc", + "https://www.icml.cc", + "https://www.cv-foundation.org", + "https://www.emnlp.org", + "https://www.aclweb.org", + "https://www.usenix.org", + "https://www.sigcomm.org", + "https://www.ndss-symposium.org", + "https://www.ccs.org", + "https://www.sigsac.org", + "https://www.isoc.org", + "https://www.w3.org", + "https://www.ietf.org", + "https://www.whatwg.org", + "https://www.ecma-international.org", + "https://www.iso.org", + "https://www.ansi.org", + "https://www.bsigroup.com", + "https://www.openapis.org", + "https://www.graphql.org", + "https://www.jsonapi.org", + "https://www.grpc.io", + "https://www.protocolbuffers.dev", + "https://www.swagger.io", + "https://www.postman.com", + "https://www.insomnia.rest", + "https://www.getpostman.com", + "https://www.rapidapi.com", + "https://www.apiflash.com", + "https://www.serpapi.com", + "https://www.scrapingbee.com", + "https://www.scraperapi.com", + "https://www.scrapingdog.com", + "https://www.brightdata.com", + "https://www.oxylabs.io", + "https://www.smartproxy.com", + "https://www.crawlbase.com", + "https://www.zenrows.com", + "https://www.scrapingant.com", + "https://www.apify.com", + "https://www.webscraper.io", + "https://www.parsehub.com", + "https://www.octoparse.com", + "https://www.import.io", + "https://www.diffbot.com", + "https://www.datawrapper.de", + "https://www.tableau.com", + "https://www.powerbi.microsoft.com", + "https://www.looker.com", + "https://www.qlik.com", + "https://www.domo.com", + "https://www.sisense.com", + "https://www.metabase.com", + "https://www.redash.io", + "https://www.superset.apache.org", + "https://www.grafana.com", + "https://www.datadoghq.com", + "https://www.newrelic.com", + "https://www.dynatrace.com", + "https://www.appdynamics.com", + "https://www.splunk.com", + "https://www.sumologic.com", + "https://www.elastic.co", + "https://www.logz.io", + "https://www.honeycomb.io", + "https://www.lightstep.com", + "https://www.sentry.io", + "https://www.rollbar.com", + "https://www.bugsnag.com", + "https://www.raygun.com", + "https://www.datadog.com", + "https://www.pagerduty.com", + "https://www.opsgenie.com", + "https://www.victorops.com", + "https://www.monday.com", + "https://www.asana.com", + "https://www.clickup.com", + "https://www.basecamp.com", + "https://www.teamwork.com", + "https://www.wrike.com", + "https://www.airtable.com", + "https://www.smartsheet.com", + "https://www.zapier.com", + "https://www.ifttt.com", + "https://www.make.com", + "https://www.n8n.io", + "https://www.automationanywhere.com", + 
"https://www.uipath.com", + "https://www.blueprism.com", + "https://www.pipedream.com", + "https://www.twilio.com", + "https://www.sendgrid.com", + "https://www.mailgun.com", + "https://www.postmarkapp.com", + "https://www.sendinblue.com", + "https://www.mailerlite.com", + "https://www.convertkit.com", + "https://www.activecampaign.com", + "https://www.drip.com", + "https://www.klaviyo.com", + "https://www.privy.com", + "https://www.opencart.com", + "https://www.magento.com", + "https://www.bigcommerce.com", + "https://www.prestashop.com", + "https://www.woocommerce.com", + "https://www.contentful.com", + "https://www.strapi.io", + "https://www.sanity.io", + "https://www.prismic.io", + "https://www.storyblok.com", + "https://www.ghost.org", + "https://www.drupal.org", + "https://www.joomla.org", + "https://www.gatsbyjs.com", + "https://www.nextjs.org", + "https://www.nuxtjs.org", + "https://www.svelte.dev", + "https://www.reactjs.org", + "https://www.vuejs.org", + "https://www.angular.io", + "https://www.emberjs.com", + "https://www.backbonejs.org", + "https://www.preactjs.com", + "https://www.solidjs.com", + "https://www.qwik.dev", + "https://www.htmx.org", + "https://www.alpinejs.dev", + "https://www.tailwindcss.com", + "https://www.getbootstrap.com", + "https://www.bulma.io", + "https://www.foundation.zurb.com", + "https://www.material-ui.com", + "https://www.chakra-ui.com", + "https://www.ant.design", + "https://www.radix-ui.com", + "https://www.shadcn.com", + "https://www.daisyui.com", + "https://www.flowbite.com", + "https://www.merakiui.com", + "https://www.primefaces.org", + "https://www.quasar.dev", + "https://www.vuetifyjs.com", + "https://www.electronjs.org", + "https://www.tauri.app", + "https://www.reactnative.dev", + "https://www.flutter.dev", + "https://www.ionicframework.com", + "https://www.kotlinlang.org", + "https://www.swift.org", + "https://www.typescriptlang.org", + "https://www.deno.com", + "https://www.bun.sh", + "https://www.webassembly.org", + "https://www.llvm.org", + "https://www.gnu.org", + "https://www.sourceware.org", + "https://www.kernel.org", + "https://www.freebsd.org", + "https://www.openbsd.org", + "https://www.netbsd.org", + "https://www.debian.org", + "https://www.ubuntu.com", + "https://www.fedoraproject.org", + "https://www.centos.org", + "https://www.archlinux.org", + "https://www.gentoo.org", + "https://www.alpinelinux.org", + "https://www.redhat.com", + "https://www.suse.com", + "https://www.oracle.com", + "https://www.ibm.com", + "https://www.hpe.com", + "https://www.dell.com", + "https://www.lenovo.com", + "https://www.hp.com", + "https://www.asus.com", + "https://www.acer.com", + "https://www.msi.com", + "https://www.gigabyte.com", + "https://www.raspberrypi.org", + "https://www.arduino.cc", + "https://www.esp32.com", + "https://www.adafruit.com", + "https://www.sparkfun.com", + "https://www.digikey.com", + "https://www.mouser.com", + "https://www.eeweb.com", + "https://www.electronicdesign.com", + "https://www.allaboutcircuits.com", + "https://www.hackaday.com", + "https://www.instructables.com", + "https://www.makezine.com", + "https://www.thingiverse.com", + "https://www.printables.com", + "https://www.cults3d.com", + "https://www.myminifactory.com", + "https://www.pinshape.com", + "https://www.youmagine.com", + "https://www.thangs.com", + "https://www.sketchfab.com", + "https://www.blendswap.com", + "https://www.turbosquid.com", + "https://www.cgtrader.com", + "https://www.mixamo.com", + "https://www.assetstore.unity.com", + 
"https://www.unity.com", + "https://www.unrealengine.com", + "https://www.godotengine.org", + "https://www.cryengine.com", + "https://www.lumberyard.com", + "https://www.garmin.com", + "https://www.specialized.com", + "https://www.trekbikes.com", + "https://www.cannondale.com", + "https://www.giant-bicycles.com", + "https://www.santacruzbicycles.com", + "https://www.rapha.cc", + "https://www.patagonia.com", + "https://www.thenorthface.com", + "https://www.columbia.com", + "https://www.arcsky.com", + "https://www.black-diamond.com", + "https://www.mammut.com", + "https://www.salomon.com", + "https://www.merrell.com", + "https://www.keenfootwear.com", + "https://www.timberland.com", + "https://www.drmartens.com", + "https://www.vans.com", + "https://www.converse.com", + "https://www.reebok.com", + "https://www.puma.com", + "https://www.newbalance.com", + "https://www.underarmour.com", + "https://www.lululemon.com", + "https://www.gymshark.com", + "https://www.fabletics.com", + "https://www.aloyoga.com", + "https://www.bandier.com", + "https://www.carbon38.com", + "https://www.outdoorvoices.com", + "https://www.nike.com", + "https://www.allbirds.com", + "https://www.brooksrunning.com", + "https://www.hoka.com", + "https://www.saucony.com", + "https://www.asics.com", + "https://www.mizuno.com", + "https://www.on-running.com", + "https://www.skechers.com", + "https://www.clarks.com", + "https://www.ubereats.com", + "https://www.doctorswithoutborders.org", + "https://www.unicef.org", + "https://www.redcross.org", + "https://www.worldwildlife.org", + "https://www.greenpeace.org", + "https://www.nature.org", + "https://www.sierraclub.org", + "https://www.audubon.org", + "https://www.charitynavigator.org", + "https://www.givewell.org", + "https://www.effectivealtruism.org", + "https://www.80000hours.org", + "https://www.givingwhatwecan.org", + "https://www.openphilanthropy.org", + "https://www.gatesfoundation.org", + "https://www.fordfoundation.org", + "https://www.macfound.org", + "https://www.rwjf.org", + "https://www.pewtrusts.org", + "https://www.cgdev.org", + "https://www.brookings.edu", + "https://www.rand.org", + "https://www.urban.org", + "https://www.cato.org", + "https://www.heritage.org", + "https://www.iiea.org", + "https://www.cfr.org", + "https://www.chathamhouse.org", + "https://www.eiu.com", + "https://www.economist.com", + "https://www.foreignaffairs.com", + "https://www.politico.com", + "https://www.axios.com", + "https://www.vox.com", + "https://www.slate.com", + "https://www.theatlantic.com", + "https://www.newyorker.com", + "https://www.newrepublic.com", + "https://www.motherjones.com", + "https://www.thenation.com", + "https://www.jacobinmag.com", + "https://www.propublica.org", + "https://www.revealnews.org", + "https://www.marshallproject.org", + "https://www.themarshallproject.org", + "https://www.texastribune.org", + "https://www.californiamatters.org", + "https://www.governing.com", + "https://www.stateline.org", + "https://www.pewresearch.org", + "https://www.gallup.com", + "https://www.ipsos.com", + "https://www.yougov.com", + "https://www.surveymonkey.com", + "https://www.qualtrics.com", + "https://www.census.gov", + "https://www.bls.gov", + "https://www.bea.gov", + "https://www.federalreserve.gov", + "https://www.treasury.gov", + "https://www.whitehouse.gov", + "https://www.supremecourt.gov", + "https://www.congress.gov", + "https://www.usa.gov", + "https://www.usa.gov/espanol", + "https://www.gov.uk", + "https://www.canada.ca", + "https://www.australia.gov.au", + 
"https://www.india.gov.in", + "https://www.gov.za", + "https://www.japan.go.jp", + "https://www.gov.br", + "https://www.gouvernement.fr", + "https://www.bundesregierung.de", + "https://www.admin.ch", + "https://www.government.se", + "https://www.government.nl", + "https://www.belgium.be", + "https://www.austria.gv.at", + "https://www.italia.it", + "https://www.spain.info", + "https://www.visitportugal.com", + "https://www.greektravel.com", + "https://www.visitnorway.com", + "https://www.visitdenmark.com", + "https://www.visitfinland.com", + "https://www.visiticeland.com", + "https://www.ireland.com", + "https://www.visitscotland.com", + "https://www.visitwales.com", + "https://www.visitengland.com", + "https://www.croatia.hr", + "https://www.slovenia.info", + "https://www.czechtourism.com", + "https://www.poland.travel", + "https://www.hungary.com", + "https://www.romaniatourism.com", + "https://www.bulgariatravel.org", + "https://www.visitestonia.com", + "https://www.latvia.travel", + "https://www.lithuania.travel", + "https://www.belarus.by", + "https://www.ukraine.com", + "https://www.russiatourism.ru", + "https://www.visit-turkey.com", + "https://www.goisrael.com", + "https://www.visitjordan.com", + "https://www.egypt.travel", + "https://www.moroccotourism.com", + "https://www.visittunisia.com", + "https://www.algeria.com", + "https://www.southafrica.net", + "https://www.kenyatourism.org", + "https://www.tanzaniatourism.go.tz", + "https://www.visitrwanda.com", + "https://www.ugandatourism.org", + "https://www.ethiopianairlines.com", + "https://www.nigeriatourism.org", + "https://www.ghanatourism.gov.gh", + "https://www.senegal-tourism.com", + "https://www.visitmorocco.com", + "https://www.tourismthailand.org", + "https://www.vietnamtourism.gov.vn", + "https://www.tourismcambodia.com", + "https://www.tourismlaos.org", + "https://www.myanmartourism.org", + "https://www.indonesia-tourism.com", + "https://www.malaysia.travel", + "https://www.tourism.gov.ph", + "https://www.srilanka.travel", + "https://www.indiatourism.org", + "https://www.nepaltourism.org", + "https://www.tourismbhutan.org", + "https://www.bangladeshtourism.gov.bd", + "https://www.pakistan.gov.pk", + "https://www.uzbekistan.travel", + "https://www.kazakhstan.travel", + "https://www.kyrgyzstan.travel", + "https://www.tajikistan.travel", + "https://www.turkmenistan.travel", + "https://www.mongolia.travel", + "https://www.koreatourism.or.kr", + "https://www.japan.travel", + "https://www.taiwan.net.tw", + "https://www.hongkongtourismboard.com", + "https://www.visitsingapore.com", + "https://www.newzealand.com", + "https://www.australia.com", + "https://www.fiji.travel", + "https://www.tahititourisme.com", + "https://www.hawaii.com", + "https://www.mexicotourism.org", + "https://www.visitcostarica.com", + "https://www.visitpanama.com", + "https://www.colombia.travel", + "https://www.visitperu.com", + "https://www.boliviatravel.org", + "https://www.chile.travel", + "https://www.argentina.travel", + "https://www.uruguaynatural.com", + "https://www.visitbrasil.com", + "https://www.ecuador.travel", + "https://www.aruba.com", + "https://www.bahamas.com", + "https://www.barbados.org", + "https://www.jamaica.com", + "https://www.dominica.dm", + "https://www.grenadagrenadines.com", + "https://www.stlucia.org", + "https://www.antigua-barbuda.org", + "https://www.trinidadtobago.com", + "https://www.caymanislands.com", + "https://www.bermuda.com", + "https://www.maldives.com", + "https://www.mauritius.net", + 
"https://www.seychelles.travel", + "https://www.icelandictouristboard.com", + "https://www.greenland.com", + "https://www.faroeislands.com", + "https://www.duolingo.com", + "https://www.memrise.com", + "https://www.babbel.com", + "https://www.rosettastone.com", + "https://www.busuu.com", + "https://www.lingodeer.com", + "https://www.hellotalk.com", + "https://www.tandem.net", + "https://www.italki.com", + "https://www.verbling.com", + "https://www.preply.com", + "https://www.cambly.com", + "https://www.vipkid.com", + "https://www.lingoda.com", + "https://www.skooli.com", + "https://www.teachertube.com", + "https://www.mathgames.com", + "https://www.coolmathgames.com", + "https://www.abcya.com", + "https://www.starfall.com", + "https://www.pbskids.org", + "https://www.nick.com", + "https://www.cartoonnetwork.com", + "https://www.pokemon.com", + "https://www.lego.com", + "https://www.hasbro.com", + "https://www.mattel.com", + "https://www.barbie.com", + "https://www.hotwheels.com", + "https://www.nerf.com", + "https://www.funko.com", + "https://www.popmart.com", + "https://www.spinmaster.com", + "https://www.moosetoys.com", + "https://www.ravensburger.com", + "https://www.catan.com", + "https://www.magic.wizards.com", + "https://www.pokemon.com/us/pokemon-tcg", + "https://www.yugioh.com", + "https://www.vanguardcardfight.com", + "https://www.digimoncard.com", + "https://www.onepiece-cardgame.com", + "https://www.dragonball-cardgame.com", + "https://www.lorcana.com", + "https://www.flashscore.com", + "https://www.livescore.com", + "https://www.sofascore.com", + "https://www.strava.com", + "https://www.mapmyrun.com", + "https://www.runkeeper.com", + "https://www.fitbit.com", + "https://www.garmin.com", + "https://www.whoop.com", + "https://www.ouraring.com", + "https://www.myfitnesspal.com", + "https://www.loseit.com", + "https://www.cronometer.com", + "https://www.yazio.com", + "https://www.lifesum.com", + "https://www.noom.com", + "https://www.weightwatchers.com", + "https://www.headspace.com", + "https://www.calm.com", + "https://www.insighttimer.com", + "https://www.wakingup.com", + "https://www.tenpercent.com", + "https://www.balanceapp.com", + "https://www.smilingmind.com.au", + "https://www.petalsofapricot.com", + "https://www.sleepcycle.com", + "https://www.pzizz.com", + "https://www.bettersleep.com", + "https://www.sleepio.com", + "https://www.snorelab.com", + "https://www.pillow.app", + "https://www.auto.sleep", + "https://www.sleepwatchapp.com", + "https://www.mindbodygreen.com", + "https://www.goop.com", + "https://www.poosh.com", + "https://www.wellandgood.com", + "https://www.self.com", + "https://www.shape.com", + "https://www.menshealth.com", + "https://www.womenshealthmag.com", + "https://www.runnersworld.com", + "https://www.bicycling.com", + "https://www.yogajournal.com", + "https://www.prevention.com", + "https://www.goodhousekeeping.com", + "https://www.cosmopolitan.com", + "https://www.esquire.com", + "https://www.gq.com", + "https://www.vanityfair.com", + "https://www.vogue.com", + "https://www.harpersbazaar.com", + "https://www.elle.com", + "https://www.marieclaire.com", + "https://www.instyle.com", + "https://www.people.com", + "https://www.usanews.com", + "https://www.time.com", + "https://www.newsweek.com", + "https://www.economist.com", + "https://www.newyorker.com", + "https://www.theatlantic.com", + "https://www.harpers.org", + "https://www.weeklystandard.com", + "https://www.nationalreview.com", + "https://www.commentarymagazine.com", + 
"https://www.dissentmagazine.org", + "https://www.nytmag.com", + "https://www.technologyreview.com", + "https://www.spectrum.ieee.org", + "https://www.wired.co.uk", + "https://www.wired.com", + "https://www.technologyreview.com", + "https://www.cnet.com", + "https://www.tomshardware.com", + "https://www.anandtech.com", + "https://www.pcworld.com", + "https://www.techradar.com", + "https://www.gsmarena.com", + "https://www.phonearena.com", + "https://www.xda-developers.com", + "https://www.androidpolice.com", + "https://www.androidcentral.com", + "https://www.9to5mac.com", + "https://www.macrumors.com", + "https://www.appleinsider.com", + "https://www.cultofmac.com", + "https://www.idownloadblog.com", + "https://www.droid-life.com", + "https://www.wpcentral.com", + "https://www.neowin.net", + "https://www.bleepingcomputer.com", + "https://www.ghacks.net", + "https://www.betanews.com", + "https://www.theguardian.com/technology", + "https://www.bbc.com/news/technology", + "https://www.independent.co.uk/tech", + "https://www.telegraph.co.uk/technology", + "https://www.ft.com/technology", + "https://www.recode.net", + "https://www.protocol.com", + "https://www.theinformation.com", + "https://www.restofworld.org", + "https://www.gizmodo.com", + "https://www.lifehacker.com", + "https://www.jalopnik.com", + "https://www.deadspin.com", + "https://www.kotaku.com", + "https://www.io9.com", + "https://www.jezebel.com", + "https://www.thedailybeast.com", + "https://www.thedailywire.com", + "https://www.breitbart.com", + "https://www.theblaze.com", + "https://www.cnn.com", + "https://www.foxnews.com", + "https://www.msnbc.com", + "https://www.abcnews.go.com", + "https://www.nbcnews.com", + "https://www.cbsnews.com", + "https://www.pbs.org", + "https://www.c-span.org", + "https://www.aljazeera.com", + "https://www.bbc.com", + "https://www.sky.com", + "https://www.france24.com", + "https://www.euronews.com", + "https://www.rtve.es", + "https://www.lemonde.fr", + "https://www.lefigaro.fr", + "https://www.liberation.fr", + "https://www.francetvinfo.fr", + "https://www.lesechos.fr", + "https://www.latribune.fr", + "https://www.lepoint.fr", + "https://www.lexpress.fr", + "https://www.parismatch.com", + "https://www.gala.fr", + "https://www.purepeople.com", + "https://www.voici.fr", + "https://www.closermag.fr", + "https://www.public.fr", + "https://www.bfmtv.com", + "https://www.lci.fr", + "https://www.francetv.fr", + "https://www.radiofrance.fr", + "https://www.spiegel.de", + "https://www.welt.de", + "https://www.zeit.de", + "https://www.sueddeutsche.de", + "https://www.faz.net", + "https://www.handelsblatt.com", + "https://www.bild.de", + "https://www.stern.de", + "https://www.focus.de", + "https://www.wiwo.de", + "https://www.heise.de", + "https://www.golem.de", + "https://www.netzpolitik.org", + "https://www.tagesschau.de", + "https://www.zdf.de", + "https://www.n-tv.de", + "https://www.rp-online.de", + "https://www.rundschau-online.de", + "https://www.express.de", + "https://www.bz-berlin.de", + "https://www.morgenpost.de", + "https://www.abendblatt.de", + "https://www.abendzeitung-muenchen.de", + "https://www.nrc.nl", + "https://www.volkskrant.nl", + "https://www.ad.nl", + "https://www.telegraaf.nl", + "https://www.trouw.nl", + "https://www.gelderlander.nl", + "https://www.parool.nl", + "https://www.bd.nl", + "https://www.nd.nl", + "https://www.fd.nl", + "https://www.nu.nl", + "https://www.rtlnieuws.nl", + "https://www.nos.nl", + "https://www.at5.nl", + "https://www.omroepwest.nl", + 
"https://www.omroepbrabant.nl", + "https://www.limburger.nl", + "https://www.destentor.nl", + "https://www.tubantia.nl", + "https://www.ed.nl", + "https://www.pzc.nl", + "https://www.rvd.nl", + "https://www.lc.nl", + "https://www.gva.be", + "https://www.hln.be", + "https://www.nieuwsblad.be", + "https://www.standaard.be", + "https://www.demorgen.be", + "https://www.tijd.be", + "https://www.rtbf.be", + "https://www.rtl.be", + "https://www.vrt.be", + "https://www.sudinfo.be", + "https://www.lalibre.be", + "https://www.lameuse.be", + "https://www.lanouvellegazette.be", + "https://www.laprovince.be", + "https://www.nordpresse.be", + "https://www.7sur7.be", + "https://www.lecho.be", + "https://www.linkedin.com", + "https://www.xing.com", + "https://www.meetup.com", + "https://www.eventbrite.com", + "https://www.bandsintown.com", + "https://www.songkick.com", + "https://www.ticketmaster.com", + "https://www.ticketfly.com", + "https://www.ticks.com", + "https://www.fandango.com", + "https://www.amctheatres.com", + "https://www.regmovies.com", + "https://www.metopera.org", + "https://www.carnegiehall.org", + "https://www.lincolncenter.org", + "https://www.broadway.com", + "https://www.telecharge.com", + "https://www.ticketcity.com", + "https://www.goldstar.com", + "https://www.todaytix.com", + "https://www.rush49.com", + "https://www.vividseats.com", + "https://www.stubhub.com", + "https://www.seatgeek.com", + "https://www.gametime.co", + "https://www.tickpick.com", + "https://www.thrivemarket.com", + "https://www.freshdirect.com", + "https://www.peapod.com", + "https://www.shipt.com", + "https://www.goinstacart.com", + "https://www.boxed.com", + "https://www.jet.com", + "https://www.bonanza.com", + "https://www.rakuten.com", + "https://www.tophatter.com", + "https://www.wish.com", + "https://www.gearbest.com", + "https://www.banggood.com", + "https://www.dhgate.com", + "https://www.lightinthebox.com", + "https://www.made-in-china.com", + "https://www.alibaba.com", + "https://www.indiamart.com", + "https://www.tradeindia.com", + "https://www.exportersindia.com", + "https://www.indiamart.com", + "https://www.amazon.de", + "https://www.amazon.co.jp", + "https://www.amazon.co.uk", + "https://www.amazon.ca", + "https://www.amazon.fr", + "https://www.amazon.it", + "https://www.amazon.es", + "https://www.amazon.com.mx", + "https://www.amazon.com.br", + "https://www.amazon.in", + "https://www.amazon.com.au", + "https://www.amazon.ae", + "https://www.amazon.sg", + "https://www.amazon.nl", + "https://www.amazon.se", + "https://www.amazon.pl", + "https://www.amazon.com.be", + "https://www.amazon.com.tr", + "https://www.citibank.com", + "https://www.wellsfargo.com", + "https://www.bankofamerica.com", + "https://www.jpmorganchase.com", + "https://www.usbank.com", + "https://www.pnc.com", + "https://www.tdbank.com", + "https://www.capitalone.com", + "https://www.discover.com", + "https://www.amex.com", + "https://www.americanexpress.com", + "https://www.barclays.com", + "https://www.hsbc.com", + "https://www.deutsche-bank.com", + "https://www.credit-suisse.com", + "https://www.ubs.com", + "https://www.bnpparibas.com", + "https://www.santander.com", + "https://www.mastercard.com", + "https://www.visa.com", + "https://www.swift.com", + "https://www.bloomberg.com", + "https://www.sec.gov", + "https://www.edgar-online.com", + "https://www.nasdaq.com", + "https://www.nyse.com", + "https://www.lsengroup.com", + "https://www.londonstockexchange.com", + "https://www.tokyo.jp.exchange", + 
"https://www.shanghai-stock.com", + "https://www.hkex.com.hk", + "https://www.asx.com.au", + "https://www.tsx.com", + "https://www.euronext.com", + "https://www.six-group.com", + "https://www.borsaitaliana.it", + "https://www.bolsasymercados.es", + "https://www.cmegroup.com", + "https://www.cboe.com", + "https://www.interactivebrokers.com", + "https://www.tradingview.com", + "https://www.investing.com", + "https://www.stockcharts.com", + "https://www.finviz.com", + "https://www.tipranks.com", + "https://www.simplywall.st", + "https://www.gurufocus.com", + "https://www.validea.com", + "https://www.stockrover.com", + "https://www.portfolio123.com", + "https://www.alphaquery.com", + "https://www.finviz.com", + "https://www.barchart.com", + "https://www.trading212.com", + "https://www.etoro.com", + "https://www.revolut.com", + "https://www.moneyboxapp.com", + "https://www.freelancetrade.com", +] diff --git a/backend/content_filter.py b/backend/content_filter.py new file mode 100644 index 0000000..2f35199 --- /dev/null +++ b/backend/content_filter.py @@ -0,0 +1,61 @@ +"""Adult-content filtering helpers used before URLs reach the index.""" + +from __future__ import annotations + +import re +from urllib.parse import urlsplit + +from config import ADULT_DOMAINS, ADULT_KEYWORDS + +EXPLICIT_HOST_MARKERS = ( + "porn", + "xxx", + "xvideo", + "xnxx", + "hentai", + "camgirl", + "camsoda", + "chaturbate", + "stripchat", + "redtube", +) +EXPLICIT_HOST_TOKENS = {"sex", "sexy", "adult", "nude", "erotic", "escort", "bdsm"} + + +def _clean_host(url: str) -> str: + host = urlsplit(url.lower()).netloc + return host.removeprefix("www.") + + +def _host_matches_blocked_domain(host: str, domain: str) -> bool: + clean_domain = domain.lower().removeprefix("www.") + return host == clean_domain or host.endswith(f".{clean_domain}") + + +def is_adult_url(url: str) -> bool: + """Return True when a URL appears to point at pornographic/adult content.""" + + lowered = url.lower() + parsed = urlsplit(lowered) + host = _clean_host(lowered) + + if any(_host_matches_blocked_domain(host, domain) for domain in ADULT_DOMAINS): + return True + + if any(marker in host for marker in EXPLICIT_HOST_MARKERS): + return True + + host_tokens = set(re.split(r"[^a-z0-9]+", host)) + if any(token in host_tokens for token in EXPLICIT_HOST_TOKENS): + return True + + path_tokens = set(re.split(r"[^a-z0-9]+", f"{parsed.path} {parsed.query}")) + return any(keyword in path_tokens for keyword in ADULT_KEYWORDS) + + +def is_adult_text(text: str) -> bool: + """Use a conservative keyword threshold so one incidental word does not block a page.""" + + lowered = text.lower() + hits = sum(1 for keyword in ADULT_KEYWORDS if keyword in lowered) + return hits >= 3 diff --git a/backend/crawler.py b/backend/crawler.py new file mode 100644 index 0000000..a6bae40 --- /dev/null +++ b/backend/crawler.py @@ -0,0 +1,309 @@ +"""Async web crawler used to build the sFetch index.""" + +from __future__ import annotations + +import asyncio +from collections import defaultdict +from typing import Iterable +from urllib.parse import urljoin, urldefrag, urlsplit, urlunsplit +from urllib.robotparser import RobotFileParser + +import httpx +from bs4 import BeautifulSoup + +from config import ( + CRAWL_DELAY_SECONDS, + DEFAULT_CRAWL_CONCURRENCY, + MAX_CRAWL_DEPTH, + MAX_PAGES_PER_DOMAIN, + USER_AGENT, +) +from content_filter import is_adult_text, is_adult_url +from indexer import index_page + + +class sFetchBot: + """A polite async crawler that stays within configurable crawl limits 
and filters adult content.""" + + def __init__( + self, + max_depth: int = MAX_CRAWL_DEPTH, + same_domain_only: bool = True, + crawl_delay: float = CRAWL_DELAY_SECONDS, + max_pages_per_domain: int = MAX_PAGES_PER_DOMAIN, + max_concurrency: int = DEFAULT_CRAWL_CONCURRENCY, + timeout_seconds: float = 15.0, + ) -> None: + self.max_depth = max_depth + self.same_domain_only = same_domain_only + self.crawl_delay = crawl_delay + self.max_pages_per_domain = max_pages_per_domain + self.max_concurrency = max(1, max_concurrency) + self.timeout_seconds = timeout_seconds + self.visited: set[str] = set() + self.domain_counts: defaultdict[str, int] = defaultdict(int) + self.robots_cache: dict[str, RobotFileParser] = {} + self.indexed_count = 0 + self._state_lock = asyncio.Lock() + self._fetch_semaphore = asyncio.Semaphore(self.max_concurrency) + self._client: httpx.AsyncClient | None = None + + async def start(self, seed_urls: list[str]) -> None: + if not seed_urls: + return + + timeout = httpx.Timeout(self.timeout_seconds) + headers = {"User-Agent": USER_AGENT} + async with httpx.AsyncClient( + timeout=timeout, + follow_redirects=True, + headers=headers, + ) as client: + self._client = client + tasks = [] + for seed_url in seed_urls: + normalized_seed = self._normalize_url(seed_url) + if normalized_seed is None: + print(f"sFetch: skipped {seed_url} (invalid URL)") + continue + if is_adult_url(normalized_seed): + print(f"sFetch: skipped {seed_url} (adult content filtered)") + continue + root_domain = urlsplit(normalized_seed).netloc.lower() + tasks.append(self._crawl_url(normalized_seed, root_domain, depth=0)) + + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + + self._client = None + + async def _crawl_url(self, url: str, root_domain: str, depth: int) -> None: + try: + if depth > self.max_depth: + return + + normalized_url = self._normalize_url(url) + if normalized_url is None: + return + + if is_adult_url(normalized_url): + print(f"sFetch: skipped {normalized_url} (adult)") + return + + parsed = urlsplit(normalized_url) + current_domain = parsed.netloc.lower() + if self.same_domain_only and current_domain != root_domain: + return + + if await self._already_seen(normalized_url): + return + + if await self._domain_limit_reached(current_domain): + return + + if not await self._is_allowed_by_robots(normalized_url): + return + + client = self._require_client() + async with self._fetch_semaphore: + await asyncio.sleep(self.crawl_delay) + response = await client.get(normalized_url) + response.raise_for_status() + + content_type = response.headers.get("content-type", "").lower() + if "text/html" not in content_type: + return + + title, body_text, links, images, videos = self._extract_page_content(normalized_url, response.text) + + if is_adult_text(body_text): + print(f"sFetch: skipped {normalized_url} (adult text)") + return + + await index_page(normalized_url, title, body_text, images, videos) + await self._increment_domain_count(current_domain) + self.indexed_count += 1 + print(f"sFetch: indexed {normalized_url}") + + for link in links: + await self._crawl_url(link, root_domain, depth + 1) + except httpx.HTTPError as exc: + print(f"sFetch: HTTP error {url} ({exc})") + except Exception as exc: + print(f"sFetch: error {url} ({exc})") + + def _require_client(self) -> httpx.AsyncClient: + if self._client is None: + raise RuntimeError("Crawler client is not initialized.") + return self._client + + async def _already_seen(self, url: str) -> bool: + async with self._state_lock: + if url in 
self.visited: + return True + self.visited.add(url) + return False + + async def _domain_limit_reached(self, domain: str) -> bool: + async with self._state_lock: + return self.domain_counts[domain] >= self.max_pages_per_domain + + async def _increment_domain_count(self, domain: str) -> None: + async with self._state_lock: + self.domain_counts[domain] += 1 + + async def _is_allowed_by_robots(self, url: str) -> bool: + parsed = urlsplit(url) + robots_key = f"{parsed.scheme}://{parsed.netloc.lower()}" + parser = self.robots_cache.get(robots_key) + if parser is None: + parser = await self._fetch_robots_parser(robots_key) + self.robots_cache[robots_key] = parser + return parser.can_fetch(USER_AGENT, url) + + async def _fetch_robots_parser(self, domain_base: str) -> RobotFileParser: + parser = RobotFileParser() + robots_url = f"{domain_base}/robots.txt" + parser.set_url(robots_url) + + try: + client = self._require_client() + response = await client.get(robots_url) + if response.status_code == 200: + parser.parse(response.text.splitlines()) + else: + parser.parse([]) + except Exception: + parser.parse([]) + return parser + + def _extract_page_content( + self, + url: str, + html_text: str, + ) -> tuple[str, str, list[str], list[dict[str, str]], list[dict[str, str]]]: + soup = BeautifulSoup(html_text, "html.parser") + + images = self._extract_images(url, soup) + videos = self._extract_videos(url, soup) + + for element in soup(["script", "style", "noscript"]): + element.decompose() + + title = "" + if soup.title and soup.title.string: + title = soup.title.string.strip() + if not title: + title = url + + body_text = soup.get_text(separator=" ", strip=True) + links = self._extract_links(url, soup) + return title, body_text, links, images, videos + + def _extract_images(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]: + images = [] + for img in soup.find_all("img", src=True): + src = str(img["src"]).strip() + if not src or src.startswith(("data:", "javascript:")): + continue + absolute_url = urljoin(base_url, src) + normalized_url = self._normalize_url(absolute_url) + if normalized_url is not None: + alt = str(img.get("alt", "")).strip() + images.append({"url": normalized_url, "alt_text": alt}) + return self._dedupe_media(images) + + def _extract_videos(self, base_url: str, soup: BeautifulSoup) -> list[dict[str, str]]: + videos: list[dict[str, str]] = [] + + for video in soup.find_all("video"): + if video.get("src"): + normalized = self._normalize_url(urljoin(base_url, str(video["src"]).strip())) + if normalized: + title = str(video.get("title") or video.get("aria-label") or "").strip() + videos.append({"url": normalized, "title": title}) + for source in video.find_all("source", src=True): + normalized = self._normalize_url(urljoin(base_url, str(source["src"]).strip())) + if normalized: + title = str(video.get("title") or video.get("aria-label") or "").strip() + videos.append({"url": normalized, "title": title}) + + for iframe in soup.find_all("iframe", src=True): + raw_src = str(iframe["src"]).strip() + normalized = self._normalize_url(urljoin(base_url, raw_src)) + if normalized and self._is_video_url(normalized): + title = str(iframe.get("title") or iframe.get("aria-label") or "").strip() + videos.append({"url": normalized, "title": title}) + + for tag in soup.find_all("a", href=True): + raw_href = str(tag["href"]).strip() + normalized = self._normalize_url(urljoin(base_url, raw_href)) + if normalized and self._is_video_url(normalized): + title = " 
".join(tag.stripped_strings).strip() + videos.append({"url": normalized, "title": title}) + + return self._dedupe_media(videos) + + def _is_video_url(self, url: str) -> bool: + lowered = url.lower() + return any( + marker in lowered + for marker in ( + "youtube.com/watch", + "youtube.com/embed/", + "youtu.be/", + "vimeo.com/", + ".mp4", + ".webm", + ".mov", + ".m3u8", + ) + ) + + def _dedupe_media(self, items: list[dict[str, str]]) -> list[dict[str, str]]: + seen: set[str] = set() + unique: list[dict[str, str]] = [] + for item in items: + media_url = item.get("url") + if not media_url or media_url in seen: + continue + seen.add(media_url) + unique.append(item) + return unique + + def _extract_links(self, base_url: str, soup: BeautifulSoup) -> list[str]: + collected_links: list[str] = [] + for tag in soup.find_all("a", href=True): + href = str(tag["href"]).strip() + if not href or href.startswith(("javascript:", "mailto:", "tel:")): + continue + absolute_url = urljoin(base_url, href) + normalized_url = self._normalize_url(absolute_url) + if normalized_url is not None: + collected_links.append(normalized_url) + return self._dedupe_links(collected_links) + + def _dedupe_links(self, links: Iterable[str]) -> list[str]: + seen: set[str] = set() + unique_links: list[str] = [] + for link in links: + if link in seen: + continue + seen.add(link) + unique_links.append(link) + return unique_links + + def _normalize_url(self, url: str) -> str | None: + if not url: + return None + + clean_url, _ = urldefrag(url.strip()) + parsed = urlsplit(clean_url) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + return None + + normalized = parsed._replace( + scheme=parsed.scheme.lower(), + netloc=parsed.netloc.lower(), + ) + return urlunsplit(normalized) diff --git a/backend/database.py b/backend/database.py new file mode 100644 index 0000000..e020727 --- /dev/null +++ b/backend/database.py @@ -0,0 +1,395 @@ +"""Async SQLite helpers for sFetch's crawl index.""" + +from __future__ import annotations + +from contextlib import asynccontextmanager +from typing import Any, AsyncIterator + +import aiosqlite + +from config import DB_PATH + + +@asynccontextmanager +async def _get_connection() -> AsyncIterator[aiosqlite.Connection]: + async with aiosqlite.connect(DB_PATH) as connection: + connection.row_factory = aiosqlite.Row + await connection.execute("PRAGMA foreign_keys = ON;") + await connection.execute("PRAGMA journal_mode = WAL;") + yield connection + + +def _to_fts_query(query: str) -> str: + tokens: list[str] = [] + for raw_token in query.split(): + token = raw_token.strip() + if not token: + continue + escaped = token.replace('"', '""') + tokens.append(f'"{escaped}"') + return " OR ".join(tokens) + + +async def init_db() -> None: + async with _get_connection() as connection: + await connection.executescript( + """ + CREATE TABLE IF NOT EXISTS pages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT, + body_text TEXT, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts + USING fts5(title, body_text, content='pages', content_rowid='id'); + + CREATE TRIGGER IF NOT EXISTS pages_ai + AFTER INSERT ON pages + BEGIN + INSERT INTO pages_fts(rowid, title, body_text) + VALUES (new.id, new.title, new.body_text); + END; + + CREATE TRIGGER IF NOT EXISTS pages_ad + AFTER DELETE ON pages + BEGIN + INSERT INTO pages_fts(pages_fts, rowid, title, body_text) + VALUES ('delete', old.id, old.title, old.body_text); + END; + + CREATE 
TRIGGER IF NOT EXISTS pages_au + AFTER UPDATE ON pages + BEGIN + INSERT INTO pages_fts(pages_fts, rowid, title, body_text) + VALUES ('delete', old.id, old.title, old.body_text); + INSERT INTO pages_fts(rowid, title, body_text) + VALUES (new.id, new.title, new.body_text); + END; + + CREATE TABLE IF NOT EXISTS images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + page_url TEXT NOT NULL, + alt_text TEXT, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE + ); + + CREATE VIRTUAL TABLE IF NOT EXISTS images_fts + USING fts5(alt_text, content='images', content_rowid='id'); + + CREATE TRIGGER IF NOT EXISTS images_ai + AFTER INSERT ON images + BEGIN + INSERT INTO images_fts(rowid, alt_text) + VALUES (new.id, new.alt_text); + END; + + CREATE TRIGGER IF NOT EXISTS images_ad + AFTER DELETE ON images + BEGIN + INSERT INTO images_fts(images_fts, rowid, alt_text) + VALUES ('delete', old.id, old.alt_text); + END; + + CREATE TRIGGER IF NOT EXISTS images_au + AFTER UPDATE ON images + BEGIN + INSERT INTO images_fts(images_fts, rowid, alt_text) + VALUES ('delete', old.id, old.alt_text); + INSERT INTO images_fts(rowid, alt_text) + VALUES (new.id, new.alt_text); + END; + + CREATE TABLE IF NOT EXISTS videos ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + page_url TEXT NOT NULL, + title TEXT, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(page_url) REFERENCES pages(url) ON DELETE CASCADE + ); + + CREATE VIRTUAL TABLE IF NOT EXISTS videos_fts + USING fts5(title, content='videos', content_rowid='id'); + + CREATE TRIGGER IF NOT EXISTS videos_ai + AFTER INSERT ON videos + BEGIN + INSERT INTO videos_fts(rowid, title) + VALUES (new.id, new.title); + END; + + CREATE TRIGGER IF NOT EXISTS videos_ad + AFTER DELETE ON videos + BEGIN + INSERT INTO videos_fts(videos_fts, rowid, title) + VALUES ('delete', old.id, old.title); + END; + + CREATE TRIGGER IF NOT EXISTS videos_au + AFTER UPDATE ON videos + BEGIN + INSERT INTO videos_fts(videos_fts, rowid, title) + VALUES ('delete', old.id, old.title); + INSERT INTO videos_fts(rowid, title) + VALUES (new.id, new.title); + END; + + CREATE TABLE IF NOT EXISTS app_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + """ + ) + await connection.commit() + + +async def get_meta_value(key: str) -> str | None: + async with _get_connection() as connection: + cursor = await connection.execute( + "SELECT value FROM app_meta WHERE key = ?", + (key,), + ) + row = await cursor.fetchone() + await cursor.close() + return str(row["value"]) if row else None + + +async def set_meta_value(key: str, value: str) -> None: + async with _get_connection() as connection: + await connection.execute( + """ + INSERT INTO app_meta (key, value) + VALUES (?, ?) + ON CONFLICT(key) DO UPDATE SET + value = excluded.value, + updated_at = CURRENT_TIMESTAMP + """, + (key, value), + ) + await connection.commit() + + +async def insert_page(url: str, title: str, body_text: str) -> int: + async with _get_connection() as connection: + await connection.execute( + """ + INSERT INTO pages (url, title, body_text) + VALUES (?, ?, ?) 
+ ON CONFLICT(url) DO UPDATE SET + title = excluded.title, + body_text = excluded.body_text, + indexed_at = CURRENT_TIMESTAMP + """, + (url, title, body_text), + ) + await connection.commit() + + cursor = await connection.execute( + "SELECT id FROM pages WHERE url = ?", + (url,), + ) + row = await cursor.fetchone() + await cursor.close() + if row is None: + raise RuntimeError("Inserted page could not be reloaded from the database.") + return int(row["id"]) + + +async def insert_image(url: str, page_url: str, alt_text: str) -> None: + async with _get_connection() as connection: + await connection.execute( + """ + INSERT INTO images (url, page_url, alt_text) + VALUES (?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + page_url = excluded.page_url, + alt_text = excluded.alt_text, + indexed_at = CURRENT_TIMESTAMP + """, + (url, page_url, alt_text), + ) + await connection.commit() + + +async def insert_video(url: str, page_url: str, title: str) -> None: + async with _get_connection() as connection: + await connection.execute( + """ + INSERT INTO videos (url, page_url, title) + VALUES (?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + page_url = excluded.page_url, + title = excluded.title, + indexed_at = CURRENT_TIMESTAMP + """, + (url, page_url, title), + ) + await connection.commit() + + +async def search_pages(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]: + fts_query = _to_fts_query(query) + if not fts_query: + return [] + + safe_limit = max(1, min(limit, 50)) + safe_offset = max(0, offset) + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT + p.id, + p.url, + p.title, + p.body_text, + p.indexed_at + FROM pages_fts + JOIN pages AS p ON p.id = pages_fts.rowid + WHERE pages_fts MATCH ? + ORDER BY bm25(pages_fts), p.indexed_at DESC + LIMIT ? OFFSET ? + """, + (fts_query, safe_limit, safe_offset), + ) + rows = await cursor.fetchall() + await cursor.close() + return [dict(row) for row in rows] + + +async def count_search_results(query: str) -> int: + fts_query = _to_fts_query(query) + if not fts_query: + return 0 + + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT COUNT(*) AS total + FROM pages_fts + WHERE pages_fts MATCH ? + """, + (fts_query,), + ) + row = await cursor.fetchone() + await cursor.close() + return int(row["total"]) if row and row["total"] is not None else 0 + + +async def search_images(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]: + fts_query = _to_fts_query(query) + if not fts_query: + return [] + + safe_limit = max(1, min(limit, 50)) + safe_offset = max(0, offset) + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT + i.id, + i.url, + i.page_url, + i.alt_text, + i.indexed_at + FROM images_fts + JOIN images AS i ON i.id = images_fts.rowid + WHERE images_fts MATCH ? + ORDER BY bm25(images_fts), i.indexed_at DESC + LIMIT ? OFFSET ? + """, + (fts_query, safe_limit, safe_offset), + ) + rows = await cursor.fetchall() + await cursor.close() + return [dict(row) for row in rows] + + +async def count_image_results(query: str) -> int: + fts_query = _to_fts_query(query) + if not fts_query: + return 0 + + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT COUNT(*) AS total + FROM images_fts + WHERE images_fts MATCH ? 
+ """, + (fts_query,), + ) + row = await cursor.fetchone() + await cursor.close() + return int(row["total"]) if row and row["total"] is not None else 0 + + +async def search_videos(query: str, limit: int = 10, offset: int = 0) -> list[dict[str, Any]]: + fts_query = _to_fts_query(query) + if not fts_query: + return [] + + safe_limit = max(1, min(limit, 50)) + safe_offset = max(0, offset) + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT + v.id, + v.url, + v.page_url, + v.title, + v.indexed_at + FROM videos_fts + JOIN videos AS v ON v.id = videos_fts.rowid + WHERE videos_fts MATCH ? + ORDER BY bm25(videos_fts), v.indexed_at DESC + LIMIT ? OFFSET ? + """, + (fts_query, safe_limit, safe_offset), + ) + rows = await cursor.fetchall() + await cursor.close() + return [dict(row) for row in rows] + + +async def count_video_results(query: str) -> int: + fts_query = _to_fts_query(query) + if not fts_query: + return 0 + + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT COUNT(*) AS total + FROM videos_fts + WHERE videos_fts MATCH ? + """, + (fts_query,), + ) + row = await cursor.fetchone() + await cursor.close() + return int(row["total"]) if row and row["total"] is not None else 0 + + +async def get_stats() -> dict[str, Any]: + async with _get_connection() as connection: + cursor = await connection.execute( + """ + SELECT + COUNT(*) AS total_pages, + MAX(indexed_at) AS last_indexed_at + FROM pages + """ + ) + row = await cursor.fetchone() + await cursor.close() + + return { + "total_pages": int(row["total_pages"]) if row and row["total_pages"] is not None else 0, + "last_indexed_at": row["last_indexed_at"] if row else None, + } diff --git a/backend/indexer.py b/backend/indexer.py new file mode 100644 index 0000000..19174da --- /dev/null +++ b/backend/indexer.py @@ -0,0 +1,41 @@ +"""Normalization and indexing helpers for crawled pages.""" + +from __future__ import annotations + +import re +from database import insert_image, insert_page, insert_video + +MAX_BODY_LENGTH = 10_000 + + +def _normalize_text(body_text: str) -> str: + collapsed = re.sub(r"\s+", " ", body_text).strip() + return collapsed[:MAX_BODY_LENGTH] + + +async def index_page( + url: str, + title: str, + body_text: str, + images: list[dict[str, str]] | None = None, + videos: list[dict[str, str]] | None = None, +) -> None: + normalized_title = title.strip() or url + normalized_body = _normalize_text(body_text) + if not normalized_body: + return + await insert_page(url=url, title=normalized_title, body_text=normalized_body) + + if images: + for img in images: + img_url = img.get("url") + alt_text = img.get("alt_text", "") + if img_url: + await insert_image(url=img_url, page_url=url, alt_text=alt_text) + + if videos: + for video in videos: + video_url = video.get("url") + video_title = video.get("title") or normalized_title + if video_url: + await insert_video(url=video_url, page_url=url, title=video_title.strip()) diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..910b9ef --- /dev/null +++ b/backend/main.py @@ -0,0 +1,207 @@ +"""FastAPI entry point for the sFetch backend.""" + +from __future__ import annotations + +import asyncio +from datetime import UTC, datetime + +from fastapi import FastAPI, HTTPException, Query, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware + +from crawler import sFetchBot +from config import TOP_SITE_SEED_LIMIT, TOP_SITE_SEED_META_KEY +from database import ( + 
count_image_results, + count_search_results, + count_video_results, + get_meta_value, + get_stats, + init_db, + set_meta_value, +) +from models import CrawlRequest, SearchResponse +from searcher import search, search_images_api, search_videos_api +from top_sites import load_top_site_seed_urls + +app = FastAPI(title="sFetch API", version="1.0.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=False, + allow_methods=["*"], + allow_headers=["*"], +) + + +def _utc_now() -> str: + return datetime.now(UTC).isoformat() + + +def _set_seed_status(**updates: object) -> None: + current = getattr(app.state, "_top_scrape_status", {}).copy() + current.update({"updated_at": _utc_now(), **updates}) + app.state._top_scrape_status = current + + +async def _scrape_top_sites(force: bool = False) -> None: + await init_db() + + async with app.state._crawl_lock: + if app.state._top_scrape_done and not force: + return + + existing_seed = await get_meta_value(TOP_SITE_SEED_META_KEY) + if existing_seed and not force: + stats = await get_stats() + _set_seed_status( + state="stored", + message="Top-site seed already stored in the database.", + total=TOP_SITE_SEED_LIMIT, + indexed=stats["total_pages"], + source=existing_seed, + ) + app.state._top_scrape_done = True + return + + stats = await get_stats() + if stats["total_pages"] >= TOP_SITE_SEED_LIMIT and not force: + source = "existing database" + await set_meta_value(TOP_SITE_SEED_META_KEY, source) + _set_seed_status( + state="stored", + message="Top-site seed already stored in the database.", + total=TOP_SITE_SEED_LIMIT, + indexed=stats["total_pages"], + source=source, + ) + app.state._top_scrape_done = True + return + + _set_seed_status(state="loading", message="Loading top-site list.", total=TOP_SITE_SEED_LIMIT, indexed=0) + seed_urls, source = await load_top_site_seed_urls(limit=TOP_SITE_SEED_LIMIT) + _set_seed_status( + state="running", + message=f"Seeding {len(seed_urls)} non-adult top sites.", + total=len(seed_urls), + indexed=0, + source=source, + ) + + print(f"sFetch: seeding index with {len(seed_urls)} non-adult top sites from {source}...") + bot = sFetchBot(max_depth=0, same_domain_only=True, max_pages_per_domain=1, max_concurrency=12) + try: + await bot.start(seed_urls) + except Exception as exc: + _set_seed_status(state="error", message=f"Top-site seed failed: {exc}", indexed=bot.indexed_count) + print(f"sFetch: top-site seed failed ({exc})") + return + + await set_meta_value(TOP_SITE_SEED_META_KEY, source) + _set_seed_status( + state="complete", + message="Top-site seed complete.", + total=len(seed_urls), + indexed=bot.indexed_count, + source=source, + ) + print(f"sFetch: seeding complete. 
{bot.indexed_count} pages indexed.") + app.state._top_scrape_done = True + + +@app.on_event("startup") +async def startup_event() -> None: + app.state._top_scrape_done = False + app.state._crawl_lock = asyncio.Lock() + app.state._top_scrape_status = { + "state": "idle", + "message": "Waiting to check top-site seed.", + "total": TOP_SITE_SEED_LIMIT, + "indexed": 0, + "source": None, + "updated_at": _utc_now(), + } + asyncio.create_task(_scrape_top_sites()) + + +@app.get("/") +async def health_check() -> dict[str, str]: + return {"status": "sFetch is alive"} + + +@app.get("/search", response_model=SearchResponse) +async def search_endpoint( + q: str = Query(..., description="Search query"), + type: str = Query("web", description="Search type: web, image, or video"), + limit: int = Query(10, ge=1, le=50), + offset: int = Query(0, ge=0), +) -> SearchResponse: + query = q.strip() + if not query: + raise HTTPException(status_code=400, detail="Query parameter 'q' cannot be empty.") + + if type == "image": + results = await search_images_api(query=query, limit=limit, offset=offset) + total = await count_image_results(query) + return SearchResponse(query=query, type=type, total=total, results=results) + + if type == "video": + results = await search_videos_api(query=query, limit=limit, offset=offset) + total = await count_video_results(query) + return SearchResponse(query=query, type=type, total=total, results=results) + + if type != "web": + raise HTTPException(status_code=400, detail="Invalid search type. Use web, image, or video.") + + results = await search(query=query, limit=limit, offset=offset) + total = await count_search_results(query) + return SearchResponse(query=query, type=type, total=total, results=results) + + +async def _run_crawl_job(request: CrawlRequest) -> None: + try: + bot = sFetchBot( + max_depth=request.max_depth, + max_pages_per_domain=request.max_pages_per_domain, + same_domain_only=request.same_domain_only, + ) + await bot.start(request.seed_urls) + except Exception as exc: + print(f"sFetch: crawl job failed ({exc})") + + +@app.post("/crawl") +async def crawl_endpoint(request: CrawlRequest, background_tasks: BackgroundTasks) -> dict[str, object]: + background_tasks.add_task(_run_crawl_job, request) + return {"message": "Crawl started", "seed_urls": request.seed_urls} + + +@app.post("/crawl/top-sites") +async def crawl_top_sites_endpoint( + background_tasks: BackgroundTasks, + force: bool = Query(False, description="Run the top-site seed again even if it is marked complete."), +) -> dict[str, object]: + background_tasks.add_task(_scrape_top_sites, force) + return {"message": "Top-site crawl queued", "force": force} + + +@app.get("/crawl/top-sites/status") +async def crawl_top_sites_status_endpoint() -> dict[str, object]: + return getattr( + app.state, + "_top_scrape_status", + { + "state": "idle", + "message": "Top-site seed has not started.", + "total": TOP_SITE_SEED_LIMIT, + "indexed": 0, + "source": None, + "updated_at": None, + }, + ) + + +@app.get("/stats") +async def stats_endpoint() -> dict[str, object]: + stats = await get_stats() + return stats diff --git a/backend/models.py b/backend/models.py new file mode 100644 index 0000000..fac07d9 --- /dev/null +++ b/backend/models.py @@ -0,0 +1,43 @@ +"""Pydantic models for sFetch's API.""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class SearchResult(BaseModel): + id: int + url: str + title: str + snippet: str + indexed_at: str + + +class ImageResult(BaseModel): + id: int + url: 
str
+    page_url: str
+    alt_text: str
+    indexed_at: str
+
+
+class VideoResult(BaseModel):
+    id: int
+    url: str
+    page_url: str
+    title: str
+    indexed_at: str
+
+
+class SearchResponse(BaseModel):
+    query: str
+    type: str = "web"
+    total: int
+    results: list[SearchResult] | list[ImageResult] | list[VideoResult]
+
+
+class CrawlRequest(BaseModel):
+    seed_urls: list[str] = Field(min_length=1)
+    max_depth: int = Field(default=2, ge=0, le=5)
+    max_pages_per_domain: int = Field(default=50, ge=1, le=500)
+    same_domain_only: bool = True
diff --git a/backend/requirements.txt b/backend/requirements.txt
new file mode 100644
index 0000000..7b71d52
--- /dev/null
+++ b/backend/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+uvicorn[standard]
+httpx
+beautifulsoup4
+pydantic
+aiosqlite
diff --git a/backend/searcher.py b/backend/searcher.py
new file mode 100644
index 0000000..81895d5
--- /dev/null
+++ b/backend/searcher.py
@@ -0,0 +1,90 @@
+"""Search result shaping for sFetch."""
+
+from __future__ import annotations
+
+import html
+import re
+
+from database import search_images, search_pages, search_videos
+
+SNIPPET_LENGTH = 200
+
+
+def _extract_terms(query: str) -> list[str]:
+    terms = {term.lower() for term in re.findall(r"\w+", query, flags=re.UNICODE)}
+    return sorted(terms, key=len, reverse=True)
+
+
+def _build_snippet(body_text: str) -> str:
+    snippet = body_text[:SNIPPET_LENGTH].strip()
+    if not snippet:
+        return "No preview available."
+    if len(body_text) > SNIPPET_LENGTH:
+        return f"{snippet}..."
+    return snippet
+
+
+def _highlight_terms(snippet: str, query: str) -> str:
+    # Escape first, then wrap each matched term in a <mark> tag for the frontend.
+    safe_snippet = html.escape(snippet)
+    for term in _extract_terms(query):
+        pattern = re.compile(re.escape(html.escape(term)), flags=re.IGNORECASE)
+        safe_snippet = pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", safe_snippet)
+    return safe_snippet
+
+
+async def search(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
+    rows = await search_pages(query=query, limit=limit, offset=offset)
+    results: list[dict] = []
+
+    for row in rows:
+        title = (row.get("title") or row.get("url") or "Untitled").strip()
+        body_text = row.get("body_text") or ""
+        snippet = _highlight_terms(_build_snippet(body_text), query)
+        results.append(
+            {
+                "id": row["id"],
+                "url": row["url"],
+                "title": title,
+                "snippet": snippet,
+                "indexed_at": row["indexed_at"],
+            }
+        )
+
+    return results
+
+
+async def search_images_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
+    rows = await search_images(query=query, limit=limit, offset=offset)
+    results: list[dict] = []
+
+    for row in rows:
+        results.append(
+            {
+                "id": row["id"],
+                "url": row["url"],
+                "page_url": row["page_url"],
+                "alt_text": row["alt_text"] or "",
+                "indexed_at": row["indexed_at"],
+            }
+        )
+
+    return results
+
+
+async def search_videos_api(query: str, limit: int = 10, offset: int = 0) -> list[dict]:
+    rows = await search_videos(query=query, limit=limit, offset=offset)
+    results: list[dict] = []
+
+    for row in rows:
+        title = (row.get("title") or "Video result").strip()
+        results.append(
+            {
+                "id": row["id"],
+                "url": row["url"],
+                "page_url": row["page_url"],
+                "title": title,
+                "indexed_at": row["indexed_at"],
+            }
+        )
+
+    return results
diff --git a/backend/top_sites.py b/backend/top_sites.py
new file mode 100644
index 0000000..bed2513
--- /dev/null
+++ b/backend/top_sites.py
@@ -0,0 +1,110 @@
+"""Load and sanitize the top-site seed list for first-launch indexing."""
+
+from __future__ import annotations
+
+import csv
+import io
+import zipfile
+from collections.abc import Iterable
+from urllib.parse import urlsplit, urlunsplit
+
+import httpx
+
+from config import (
+    TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS,
+    TOP_SITE_SEED_LIMIT,
+    TOP_SITE_SOURCE_URL,
+    TOP_SITES,
+    USER_AGENT,
+)
+from content_filter import is_adult_url
+
+
+def _normalize_site_url(value: str) -> str | None:
+    raw_value = value.strip()
+    if not raw_value:
+        return None
+
+    candidate = raw_value if "://" in raw_value else f"https://{raw_value}"
+    parsed = urlsplit(candidate)
+    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
+        return None
+
+    normalized = parsed._replace(
+        scheme=parsed.scheme.lower(),
+        netloc=parsed.netloc.lower(),
+        path=parsed.path.rstrip("/") if parsed.path not in {"", "/"} else "",
+        query="",
+        fragment="",
+    )
+    return urlunsplit(normalized)
+
+
+def _host_key(url: str) -> str:
+    return urlsplit(url).netloc.lower().removeprefix("www.")
+
+
+def _safe_top_urls(candidates: Iterable[str], limit: int = TOP_SITE_SEED_LIMIT) -> list[str]:
+    safe_urls: list[str] = []
+    seen_hosts: set[str] = set()
+
+    for candidate in candidates:
+        normalized = _normalize_site_url(candidate)
+        if normalized is None:
+            continue
+        host_key = _host_key(normalized)
+        if host_key in seen_hosts or is_adult_url(normalized):
+            continue
+        seen_hosts.add(host_key)
+        safe_urls.append(normalized)
+        if len(safe_urls) >= limit:
+            break
+
+    return safe_urls
+
+
+def _domains_from_csv_text(csv_text: str) -> list[str]:
+    domains: list[str] = []
+    reader = csv.reader(io.StringIO(csv_text))
+    for row in reader:
+        if not row:
+            continue
+        domain = row[1] if len(row) > 1 else row[0]
+        if domain and domain.lower() != "domain":
+            domains.append(domain)
+    return domains
+
+
+def _domains_from_zip(payload: bytes) -> list[str]:
+    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
+        csv_name = next((name for name in archive.namelist() if name.endswith(".csv")), None)
+        if csv_name is None:
+            raise ValueError("Tranco archive did not contain a CSV file.")
+        with archive.open(csv_name) as csv_file:
+            text = csv_file.read().decode("utf-8", errors="replace")
+    return _domains_from_csv_text(text)
+
+
+async def load_top_site_seed_urls(limit: int = TOP_SITE_SEED_LIMIT) -> tuple[list[str], str]:
+    """Return the latest safe top-site URLs, falling back to the bundled list if needed."""
+
+    timeout = httpx.Timeout(TOP_SITE_DOWNLOAD_TIMEOUT_SECONDS)
+    headers = {"User-Agent": USER_AGENT}
+
+    try:
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as client:
+            response = await client.get(TOP_SITE_SOURCE_URL)
+            response.raise_for_status()
+
+            if response.content.startswith(b"PK"):
+                candidates = _domains_from_zip(response.content)
+            else:
+                candidates = _domains_from_csv_text(response.text)
+
+        safe_urls = _safe_top_urls(candidates, limit=limit)
+        if safe_urls:
+            return safe_urls, TOP_SITE_SOURCE_URL
+    except Exception as exc:
+        print(f"sFetch: unable to load latest top-site list ({exc}); using bundled fallback.")
+
+    return _safe_top_urls(TOP_SITES, limit=limit), "bundled fallback list"
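The sanitizer above forces an `https://` scheme onto bare domains, lowercases scheme and host, trims trailing slashes, dedupes hosts that differ only by a leading `www.`, and drops non-HTTP schemes and adult domains before capping the list at `limit`. A quick check of that behavior (hypothetical snippet; run from `backend/` so the `top_sites`, `config`, and `content_filter` imports resolve):

```python
# Hypothetical sanity check for the sanitizer helpers in top_sites.py.
from top_sites import _host_key, _normalize_site_url, _safe_top_urls

candidates = [
    "Example.COM",                # bare domain -> https://example.com
    "https://www.example.com/",   # same host once "www." is stripped -> deduped
    "ftp://example.org",          # non-HTTP scheme -> dropped
    "pornhub.com",                # adult domain -> expected to fail is_adult_url
    "https://example.net/path/",  # host lowercased, trailing slash trimmed
]

print(_normalize_site_url("Example.COM"))    # https://example.com
print(_host_key("https://www.example.com"))  # example.com
print(_safe_top_urls(candidates, limit=10))
# Expected: ['https://example.com', 'https://example.net/path']
```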
diff --git a/frontend/index.html b/frontend/index.html
new file mode 100644
index 0000000..7ed3650
--- /dev/null
+++ b/frontend/index.html
@@ -0,0 +1,402 @@
+<!-- frontend/index.html (402 lines): markup not recovered in this copy of the
+     patch. Surviving text: page title "sFetch", the "sFetch" logo, the search
+     form, an "Index" controls panel with a "Checking index..." loading state
+     and a "Top-site seed status unavailable." fallback message, and the
+     footer "© 2026 sFetch". -->
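The home page script was not recoverable, but its status copy ("Checking index...", "Top-site seed status unavailable.") corresponds to the `/crawl/top-sites/status` payload that `_set_seed_status` builds in `main.py`. A minimal stand-in poller (hypothetical; Python rather than the page's JavaScript) that walks the same states:

```python
# Hypothetical poller mirroring the home page's "Checking index..." panel.
# State values come from _set_seed_status in main.py:
# idle -> loading -> running -> stored / complete / error.
import time

import httpx

API_BASE = "http://localhost:8000"

with httpx.Client(base_url=API_BASE, timeout=10.0) as client:
    while True:
        status = client.get("/crawl/top-sites/status").json()
        print(f"{status['state']}: {status['message']} "
              f"({status['indexed']}/{status['total']} indexed)")
        if status["state"] in {"stored", "complete", "error"}:
            break
        time.sleep(2.0)
```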
diff --git a/frontend/results.html b/frontend/results.html
new file mode 100644
index 0000000..e11db49
--- /dev/null
+++ b/frontend/results.html
@@ -0,0 +1,693 @@
+<!-- frontend/results.html (693 lines): markup not recovered in this copy of
+     the patch. Surviving text: page title "sFetch Results", the "sFetch"
+     logo, the header search form, and an "Index tools" link above the
+     results area. -->
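Taken together, the patch leaves the results page talking to the `/search` endpoint and the `SearchResponse` model defined above. A minimal smoke test of that flow (hypothetical snippet; assumes the backend is running locally via `uvicorn main:app --reload` and has already seeded some pages):

```python
# Hypothetical smoke test for the /stats and /search endpoints added above.
import httpx

API_BASE = "http://localhost:8000"

with httpx.Client(base_url=API_BASE, timeout=10.0) as client:
    stats = client.get("/stats").json()
    print(f"{stats['total_pages']} pages indexed (last: {stats['last_indexed_at']})")

    # Web search; swap type for "image" or "video" to hit the other FTS indexes.
    payload = client.get(
        "/search", params={"q": "news", "type": "web", "limit": 5}
    ).json()
    for hit in payload["results"]:
        print(hit["title"], "-", hit["url"])
    print(f"{payload['total']} total matches for {payload['query']!r}")
```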