""" RSS feed fetching, Project Gutenberg parsing, and headline caching. Depends on: config, sources, filter, terminal. """ import json import pathlib import re import urllib.request from datetime import datetime from typing import Any import feedparser from engine import config from engine.filter import skip, strip_tags from engine.sources import FEEDS, POETRY_SOURCES from engine.terminal import boot_ln # Type alias for headline items HeadlineTuple = tuple[str, str, str] # ─── SINGLE FEED ────────────────────────────────────────── def fetch_feed(url: str) -> Any | None: """Fetch and parse a single RSS feed URL.""" try: req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"}) resp = urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT) return feedparser.parse(resp.read()) except Exception: return None # ─── ALL RSS FEEDS ──────────────────────────────────────── def fetch_all() -> tuple[list[HeadlineTuple], int, int]: """Fetch all RSS feeds and return items, linked count, failed count.""" items: list[HeadlineTuple] = [] linked = failed = 0 for src, url in FEEDS.items(): feed = fetch_feed(url) if feed is None or (feed.bozo and not feed.entries): boot_ln(src, "DARK", False) failed += 1 continue n = 0 for e in feed.entries: t = strip_tags(e.get("title", "")) if not t or skip(t): continue pub = e.get("published_parsed") or e.get("updated_parsed") try: ts = datetime(*pub[:6]).strftime("%H:%M") if pub else "——:——" except Exception: ts = "——:——" items.append((t, src, ts)) n += 1 if n: boot_ln(src, f"LINKED [{n}]", True) linked += 1 else: boot_ln(src, "EMPTY", False) failed += 1 return items, linked, failed # ─── PROJECT GUTENBERG ──────────────────────────────────── def _fetch_gutenberg(url: str, label: str) -> list[HeadlineTuple]: """Download and parse stanzas/passages from a Project Gutenberg text.""" try: req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"}) resp = urllib.request.urlopen(req, timeout=15) text = ( resp.read() .decode("utf-8", errors="replace") .replace("\r\n", "\n") .replace("\r", "\n") ) # Strip PG boilerplate m = re.search(r"\*\*\*\s*START OF[^\n]*\n", text) if m: text = text[m.end() :] m = re.search(r"\*\*\*\s*END OF", text) if m: text = text[: m.start()] # Split on blank lines into stanzas/passages blocks = re.split(r"\n{2,}", text.strip()) items = [] for blk in blocks: blk = " ".join(blk.split()) # flatten to one line if len(blk) < 20 or len(blk) > 280: continue if blk.isupper(): # skip all-caps headers continue if re.match(r"^[IVXLCDM]+\.?\s*$", blk): # roman numerals continue items.append((blk, label, "")) return items except Exception: return [] def fetch_poetry(): """Fetch all poetry/literature sources.""" items = [] linked = failed = 0 for label, url in POETRY_SOURCES.items(): stanzas = _fetch_gutenberg(url, label) if stanzas: boot_ln(label, f"LOADED [{len(stanzas)}]", True) items.extend(stanzas) linked += 1 else: boot_ln(label, "DARK", False) failed += 1 return items, linked, failed # ─── CACHE ──────────────────────────────────────────────── _CACHE_DIR = pathlib.Path(__file__).resolve().parent.parent def _cache_path(): return _CACHE_DIR / f".mainline_cache_{config.MODE}.json" def load_cache(): """Load cached items from disk if available.""" p = _cache_path() if not p.exists(): return None try: data = json.loads(p.read_text()) items = [tuple(i) for i in data["items"]] return items if items else None except Exception: return None def save_cache(items): """Save fetched items to disk for fast subsequent runs.""" try: _cache_path().write_text(json.dumps({"items": items})) except Exception: pass