""" RSS feed fetching, Project Gutenberg parsing, and headline caching. Depends on: config, sources, filter, terminal. """ import re import json import pathlib import urllib.request from datetime import datetime import feedparser from engine import config from engine.sources import FEEDS, POETRY_SOURCES from engine.filter import strip_tags, skip from engine.terminal import boot_ln # ─── SINGLE FEED ────────────────────────────────────────── def fetch_feed(url): try: req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"}) resp = urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT) return feedparser.parse(resp.read()) except Exception: return None # ─── ALL RSS FEEDS ──────────────────────────────────────── def fetch_all(): items = [] linked = failed = 0 for src, url in FEEDS.items(): feed = fetch_feed(url) if feed is None or (feed.bozo and not feed.entries): boot_ln(src, "DARK", False) failed += 1 continue n = 0 for e in feed.entries: t = strip_tags(e.get("title", "")) if not t or skip(t): continue pub = e.get("published_parsed") or e.get("updated_parsed") try: ts = datetime(*pub[:6]).strftime("%H:%M") if pub else "——:——" except Exception: ts = "——:——" items.append((t, src, ts)) n += 1 if n: boot_ln(src, f"LINKED [{n}]", True) linked += 1 else: boot_ln(src, "EMPTY", False) failed += 1 return items, linked, failed # ─── PROJECT GUTENBERG ──────────────────────────────────── def _fetch_gutenberg(url, label): """Download and parse stanzas/passages from a Project Gutenberg text.""" try: req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"}) resp = urllib.request.urlopen(req, timeout=15) text = resp.read().decode('utf-8', errors='replace').replace('\r\n', '\n').replace('\r', '\n') # Strip PG boilerplate m = re.search(r'\*\*\*\s*START OF[^\n]*\n', text) if m: text = text[m.end():] m = re.search(r'\*\*\*\s*END OF', text) if m: text = text[:m.start()] # Split on blank lines into stanzas/passages blocks = re.split(r'\n{2,}', text.strip()) items = [] for blk in blocks: blk = ' '.join(blk.split()) # flatten to one line if len(blk) < 20 or len(blk) > 280: continue if blk.isupper(): # skip all-caps headers continue if re.match(r'^[IVXLCDM]+\.?\s*$', blk): # roman numerals continue items.append((blk, label, '')) return items except Exception: return [] def fetch_poetry(): """Fetch all poetry/literature sources.""" items = [] linked = failed = 0 for label, url in POETRY_SOURCES.items(): stanzas = _fetch_gutenberg(url, label) if stanzas: boot_ln(label, f"LOADED [{len(stanzas)}]", True) items.extend(stanzas) linked += 1 else: boot_ln(label, "DARK", False) failed += 1 return items, linked, failed # ─── CACHE ──────────────────────────────────────────────── _CACHE_DIR = pathlib.Path(__file__).resolve().parent.parent def _cache_path(): return _CACHE_DIR / f".mainline_cache_{config.MODE}.json" def load_cache(): """Load cached items from disk if available.""" p = _cache_path() if not p.exists(): return None try: data = json.loads(p.read_text()) items = [tuple(i) for i in data["items"]] return items if items else None except Exception: return None def save_cache(items): """Save fetched items to disk for fast subsequent runs.""" try: _cache_path().write_text(json.dumps({"items": items})) except Exception: pass