# Patch metadata (reconstructed from mangled git format-patch header):
# From dfd902fb906ac62311fc050c01ec05aecba0e78e Mon Sep 17 00:00:00 2001
# From: Gene Johnson
# Date: Sat, 14 Mar 2026 22:51:50 -0700
# Subject: [PATCH] feat: add module for fetching RSS feeds, parsing Project
#   Gutenberg texts, and caching headlines
# New file: engine/fetch.py (133 insertions)
"""
RSS feed fetching, Project Gutenberg parsing, and headline caching.

Depends on: config, sources, filter, terminal.
"""

import re
import json
import pathlib
import urllib.request
from datetime import datetime

import feedparser

from engine import config
from engine.sources import FEEDS, POETRY_SOURCES
from engine.filter import strip_tags, skip
from engine.terminal import boot_ln


# ─── SINGLE FEED ────────────────────────────────────────
def fetch_feed(url):
    """Fetch and parse one RSS/Atom feed.

    Returns a feedparser result, or None on any network/parse error —
    callers treat None as "source dark".
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # `with` guarantees the connection is closed even if read() raises
        # (the original leaked the response object).
        with urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT) as resp:
            body = resp.read()
        return feedparser.parse(body)
    except Exception:
        # Best-effort: a dead feed must never crash the boot sequence.
        return None


# ─── ALL RSS FEEDS ──────────────────────────────────────
def fetch_all():
    """Fetch every feed in FEEDS.

    Returns (items, linked, failed): items is a list of
    (title, source, "HH:MM") tuples; linked/failed count sources.
    """
    items = []
    linked = failed = 0
    for src, url in FEEDS.items():
        feed = fetch_feed(url)
        # bozo with zero entries => unreachable or unparseable feed
        if feed is None or (feed.bozo and not feed.entries):
            boot_ln(src, "DARK", False)
            failed += 1
            continue
        n = 0
        for e in feed.entries:
            t = strip_tags(e.get("title", ""))
            if not t or skip(t):
                continue
            pub = e.get("published_parsed") or e.get("updated_parsed")
            try:
                ts = datetime(*pub[:6]).strftime("%H:%M") if pub else "——:——"
            except Exception:
                # Malformed struct_time from feedparser: show the placeholder.
                ts = "——:——"
            items.append((t, src, ts))
            n += 1
        if n:
            boot_ln(src, f"LINKED [{n}]", True)
            linked += 1
        else:
            boot_ln(src, "EMPTY", False)
            failed += 1
    return items, linked, failed
PROJECT GUTENBERG ──────────────────────────────────── +def _fetch_gutenberg(url, label): + """Download and parse stanzas/passages from a Project Gutenberg text.""" + try: + req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"}) + resp = urllib.request.urlopen(req, timeout=15) + text = resp.read().decode('utf-8', errors='replace').replace('\r\n', '\n').replace('\r', '\n') + # Strip PG boilerplate + m = re.search(r'\*\*\*\s*START OF[^\n]*\n', text) + if m: + text = text[m.end():] + m = re.search(r'\*\*\*\s*END OF', text) + if m: + text = text[:m.start()] + # Split on blank lines into stanzas/passages + blocks = re.split(r'\n{2,}', text.strip()) + items = [] + for blk in blocks: + blk = ' '.join(blk.split()) # flatten to one line + if len(blk) < 20 or len(blk) > 280: + continue + if blk.isupper(): # skip all-caps headers + continue + if re.match(r'^[IVXLCDM]+\.?\s*$', blk): # roman numerals + continue + items.append((blk, label, '')) + return items + except Exception: + return [] + + +def fetch_poetry(): + """Fetch all poetry/literature sources.""" + items = [] + linked = failed = 0 + for label, url in POETRY_SOURCES.items(): + stanzas = _fetch_gutenberg(url, label) + if stanzas: + boot_ln(label, f"LOADED [{len(stanzas)}]", True) + items.extend(stanzas) + linked += 1 + else: + boot_ln(label, "DARK", False) + failed += 1 + return items, linked, failed + + +# ─── CACHE ──────────────────────────────────────────────── +_CACHE_DIR = pathlib.Path(__file__).resolve().parent.parent + + +def _cache_path(): + return _CACHE_DIR / f".mainline_cache_{config.MODE}.json" + + +def load_cache(): + """Load cached items from disk if available.""" + p = _cache_path() + if not p.exists(): + return None + try: + data = json.loads(p.read_text()) + items = [tuple(i) for i in data["items"]] + return items if items else None + except Exception: + return None + + +def save_cache(items): + """Save fetched items to disk for fast subsequent runs.""" + try: + 
_cache_path().write_text(json.dumps({"items": items})) + except Exception: + pass