- Fix pre-existing lint errors in engine/ modules using ruff --unsafe-fixes - Add hk.pkl with pre-commit and pre-push hooks using ruff builtin - Configure hooks to use 'uv run' prefix for tool execution - Update mise.toml to include hk and pkl tools - All 73 tests pass
140 lines
4.4 KiB
Python
140 lines
4.4 KiB
Python
"""
|
|
RSS feed fetching, Project Gutenberg parsing, and headline caching.
|
|
Depends on: config, sources, filter, terminal.
|
|
"""
|
|
|
|
import json
|
|
import pathlib
|
|
import re
|
|
import urllib.request
|
|
from datetime import datetime
|
|
|
|
import feedparser
|
|
|
|
from engine import config
|
|
from engine.filter import skip, strip_tags
|
|
from engine.sources import FEEDS, POETRY_SOURCES
|
|
from engine.terminal import boot_ln
|
|
|
|
|
|
# ─── SINGLE FEED ──────────────────────────────────────────
|
|
def fetch_feed(url):
    """Fetch and parse a single RSS/Atom feed.

    Returns a feedparser result, or None when the request fails for any
    reason (network error, timeout, bad URL) -- callers treat None as a
    dead feed.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # `with` closes the connection promptly instead of leaking the
        # socket until garbage collection.
        with urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT) as resp:
            return feedparser.parse(resp.read())
    except Exception:
        # Deliberate best-effort: any failure marks the feed unreachable.
        return None
|
|
|
|
|
|
# ─── ALL RSS FEEDS ────────────────────────────────────────
|
|
def fetch_all():
    """Pull headlines from every configured RSS feed.

    Returns (items, linked, failed) where items is a list of
    (title, source, timestamp) tuples and linked/failed count sources.
    """
    items = []
    linked = 0
    failed = 0
    for src, url in FEEDS.items():
        feed = fetch_feed(url)
        # A feed is dead if the fetch failed outright, or parsing choked
        # and produced no entries at all.
        if feed is None or (feed.bozo and not feed.entries):
            boot_ln(src, "DARK", False)
            failed += 1
            continue
        count = 0
        for entry in feed.entries:
            title = strip_tags(entry.get("title", ""))
            if not title or skip(title):
                continue
            published = entry.get("published_parsed") or entry.get("updated_parsed")
            stamp = "——:——"
            if published:
                try:
                    stamp = datetime(*published[:6]).strftime("%H:%M")
                except Exception:
                    # Malformed time tuple: fall back to the placeholder.
                    pass
            items.append((title, src, stamp))
            count += 1
        if count:
            boot_ln(src, f"LINKED [{count}]", True)
            linked += 1
        else:
            boot_ln(src, "EMPTY", False)
            failed += 1
    return items, linked, failed
|
|
|
|
|
|
# ─── PROJECT GUTENBERG ────────────────────────────────────
|
|
def _parse_gutenberg(text, label):
    """Extract display-sized stanzas/passages from raw Gutenberg text.

    Returns a list of (passage, label, "") tuples matching the shape of
    RSS items (the empty string stands in for a timestamp).
    """
    # Strip the Project Gutenberg boilerplate header and footer.
    m = re.search(r"\*\*\*\s*START OF[^\n]*\n", text)
    if m:
        text = text[m.end() :]
    m = re.search(r"\*\*\*\s*END OF", text)
    if m:
        text = text[: m.start()]
    items = []
    # Blank lines delimit stanzas/passages.
    for blk in re.split(r"\n{2,}", text.strip()):
        blk = " ".join(blk.split())  # flatten to one line
        if len(blk) < 20 or len(blk) > 280:
            continue  # too short to read / too long to display
        if blk.isupper():
            continue  # skip all-caps headers
        if re.match(r"^[IVXLCDM]+\.?\s*$", blk):
            continue  # skip roman-numeral chapter markers
        items.append((blk, label, ""))
    return items


def _fetch_gutenberg(url, label, timeout=15):
    """Download and parse stanzas/passages from a Project Gutenberg text.

    timeout generalizes the previously hard-coded 15s request timeout.
    Returns [] on any failure -- callers treat an empty list as a dark
    source.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # `with` closes the connection instead of leaking it until GC.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
        text = (
            raw.decode("utf-8", errors="replace")
            .replace("\r\n", "\n")
            .replace("\r", "\n")
        )
        return _parse_gutenberg(text, label)
    except Exception:
        # Deliberate best-effort: any failure yields an empty source.
        return []
|
|
|
|
|
|
def fetch_poetry():
    """Fetch all poetry/literature sources.

    Returns (items, linked, failed), mirroring fetch_all().
    """
    items = []
    linked = 0
    failed = 0
    for label, url in POETRY_SOURCES.items():
        stanzas = _fetch_gutenberg(url, label)
        if not stanzas:
            boot_ln(label, "DARK", False)
            failed += 1
            continue
        boot_ln(label, f"LOADED [{len(stanzas)}]", True)
        items.extend(stanzas)
        linked += 1
    return items, linked, failed
|
|
|
|
|
|
# ─── CACHE ────────────────────────────────────────────────
|
|
# Directory two levels above this file -- presumably the project root,
# given an engine/<module>.py layout (TODO confirm). Cache files land here.
_CACHE_DIR = pathlib.Path(__file__).resolve().parent.parent
|
|
|
|
|
|
def _cache_path():
    """Return the cache file path for the current config.MODE."""
    filename = f".mainline_cache_{config.MODE}.json"
    return _CACHE_DIR / filename
|
|
|
|
|
|
def load_cache():
    """Load cached items from disk if available.

    Returns a list of item tuples, or None when the cache file is
    missing, unreadable, malformed, or empty.
    """
    path = _cache_path()
    if not path.exists():
        return None
    try:
        data = json.loads(path.read_text())
        # JSON round-trips tuples as lists; restore the tuple shape.
        items = [tuple(entry) for entry in data["items"]]
    except Exception:
        # Corrupt/partial cache: behave as if there were no cache.
        return None
    return items or None
|
|
|
|
|
|
def save_cache(items):
    """Save fetched items to disk for fast subsequent runs.

    Failures are silently ignored: caching is purely an optimization.
    """
    try:
        payload = json.dumps({"items": items})
        _cache_path().write_text(payload)
    except Exception:
        pass
|