Files
Mainline/engine/fetch.py
David Gwilliam 0a16e3e564 style: apply ruff auto-fixes across codebase
- Fix import sorting (isort) across all engine modules
- Fix SIM105 try-except-pass patterns (contextlib.suppress)
- Fix nested with statements in tests
- Fix unused loop variables

Run 'uv run pytest' to verify tests still pass.
2026-03-16 04:18:34 -07:00

135 lines
4.4 KiB
Python

"""
RSS feed fetching, Project Gutenberg parsing, and headline caching.
Depends on: config, sources, filter, terminal.
"""
import json
import pathlib
import re
import urllib.request
from datetime import datetime
import feedparser
from engine import config
from engine.filter import skip, strip_tags
from engine.sources import FEEDS, POETRY_SOURCES
from engine.terminal import boot_ln
# ─── SINGLE FEED ──────────────────────────────────────────
def fetch_feed(url):
    """Download and parse a single feed.

    Args:
        url: Address of the feed to request.

    Returns:
        Whatever ``feedparser.parse`` returns for the response body, or
        ``None`` if building the request, downloading, or parsing raised
        any exception.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # Use the response as a context manager so the connection is closed
        # as soon as the body has been read, instead of being left open
        # until the object happens to be garbage-collected.
        with urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT) as resp:
            payload = resp.read()
        return feedparser.parse(payload)
    except Exception:
        # Deliberately broad: every failure is reported to the caller as
        # "no feed" (None) rather than raised.
        return None
# ─── ALL RSS FEEDS ────────────────────────────────────────
def fetch_all():
    """Fetch every feed listed in ``FEEDS`` and gather its usable headlines.

    Each source is reported through ``boot_ln`` as ``DARK`` (fetch failed, or
    the parse was flagged bozo with no entries), ``EMPTY`` (no entry survived
    filtering), or ``LINKED [n]`` (``n`` headlines kept).

    Returns:
        A ``(items, linked, failed)`` tuple. ``items`` is a list of
        ``(title, source, time)`` tuples where ``time`` is ``HH:MM`` or the
        placeholder ``——:——``; ``linked`` counts sources that yielded at
        least one headline; ``failed`` counts DARK and EMPTY sources.
    """
    no_time = "——:——"
    headlines = []
    linked = 0
    failed = 0

    for src, url in FEEDS.items():
        feed = fetch_feed(url)
        unusable = feed is None or (feed.bozo and not feed.entries)
        if unusable:
            boot_ln(src, "DARK", False)
            failed += 1
            continue

        kept = 0
        for entry in feed.entries:
            title = strip_tags(entry.get("title", ""))
            if not title or skip(title):
                continue

            # Prefer the publication time; fall back to the update time.
            stamp = entry.get("published_parsed") or entry.get("updated_parsed")
            if stamp:
                try:
                    when = datetime(*stamp[:6]).strftime("%H:%M")
                except Exception:
                    # Malformed time tuples degrade to the placeholder.
                    when = no_time
            else:
                when = no_time

            headlines.append((title, src, when))
            kept += 1

        if kept:
            boot_ln(src, f"LINKED [{kept}]", True)
            linked += 1
        else:
            boot_ln(src, "EMPTY", False)
            failed += 1

    return headlines, linked, failed
# ─── PROJECT GUTENBERG ────────────────────────────────────
def _fetch_gutenberg(url, label, timeout=15):
    """Download a Project Gutenberg text and split it into stanzas/passages.

    Args:
        url: Address of the plain-text book.
        label: Source name stored as the second field of every item.
        timeout: Seconds passed to ``urlopen`` (default 15, the value that
            was previously hard-coded).

    Returns:
        A list of ``(passage, label, "")`` tuples, or ``[]`` if the download
        raised any exception.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # Context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except Exception:
        # Deliberately broad: any download failure yields "no items".
        return []

    text = raw.decode("utf-8", errors="replace")
    # Normalise CRLF and bare CR line endings so blank-line splitting works.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return _parse_gutenberg_text(text, label)


def _parse_gutenberg_text(text, label):
    """Turn newline-normalised Gutenberg text into ``(passage, label, "")`` items."""
    # Strip PG boilerplate: drop everything up to and including the
    # "*** START OF ..." line, and everything from "*** END OF" onwards.
    start = re.search(r"\*\*\*\s*START OF[^\n]*\n", text)
    if start:
        text = text[start.end():]
    end = re.search(r"\*\*\*\s*END OF", text)
    if end:
        text = text[:end.start()]

    items = []
    # Runs of two or more newlines separate stanzas/passages.
    for block in re.split(r"\n{2,}", text.strip()):
        flat = " ".join(block.split())  # flatten to one line
        if not 20 <= len(flat) <= 280:
            continue
        # All-caps blocks are headers. Roman-numeral section markers are
        # all-caps too, so this check drops them as well.
        if flat.isupper():
            continue
        items.append((flat, label, ""))
    return items
def fetch_poetry():
    """Fetch every entry of ``POETRY_SOURCES`` and pool the passages.

    Each source is reported through ``boot_ln`` as ``LOADED [n]`` when it
    produced ``n`` passages, or ``DARK`` when it produced none.

    Returns:
        A ``(items, linked, failed)`` tuple: the combined passage list, the
        number of sources that loaded, and the number that did not.
    """
    collected = []
    loaded = 0
    dark = 0

    for label, url in POETRY_SOURCES.items():
        passages = _fetch_gutenberg(url, label)
        if not passages:
            boot_ln(label, "DARK", False)
            dark += 1
            continue
        boot_ln(label, f"LOADED [{len(passages)}]", True)
        collected.extend(passages)
        loaded += 1

    return collected, loaded, dark
# ─── CACHE ────────────────────────────────────────────────
# Cache files are written to the parent of the directory that contains this
# module (computed from the module's resolved path).
_CACHE_DIR = pathlib.Path(__file__).resolve().parent.parent


def _cache_path():
    """Return the cache file path for the current ``config.MODE``."""
    filename = f".mainline_cache_{config.MODE}.json"
    return _CACHE_DIR / filename
def load_cache():
    """Read previously cached items back from disk.

    Returns:
        A non-empty list of tuples rebuilt from the ``"items"`` array of the
        cache file, or ``None`` when the file is missing, unreadable,
        malformed, or holds no items.
    """
    cache_file = _cache_path()
    if cache_file.exists():
        try:
            payload = json.loads(cache_file.read_text())
            # JSON stores each item as a list; convert back to tuples.
            cached = list(map(tuple, payload["items"]))
        except Exception:
            # Any read or decode problem is treated as "no cache".
            return None
        return cached or None
    return None
def save_cache(items):
    """Write ``items`` to the cache file as JSON under the ``"items"`` key.

    Saving is best-effort: any exception raised while resolving the path,
    serialising, or writing is swallowed and the function returns ``None``.
    """
    try:
        target = _cache_path()
        serialised = json.dumps({"items": items})
        target.write_text(serialised)
    except Exception:
        # A failed save is ignored rather than raised to the caller.
        return