Mainline/engine/fetch.py
David Gwilliam cdcdb7b172 feat(app): add direct CLI mode, validation framework, fixtures, and UI panel integration
- Add run_pipeline_mode_direct() for constructing pipelines from CLI flags
- Add engine/pipeline/validation.py with validate_pipeline_config() and MVP rules
- Add fixtures system: engine/fixtures/headlines.json for cached test data
- Enhance fetch.py to use fixtures cache path
- Support fixture source in run_pipeline_mode()
- Add --pipeline-* CLI flags: source, effects, camera, display, UI, border
- Integrate UIPanel: raw mode, preset picker, event callbacks, param adjustment
- Add UI_PRESET support in app and hot-rebuild pipeline on preset change
- Add test UIPanel rendering and interaction tests

This provides a flexible pipeline construction interface with validation and interactive control.

Fixes #29, #30, #31
2026-03-18 12:19:18 -07:00
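
The commit message references pipeline entry points that live outside this file. As a rough sketch of how the new --pipeline-* flags could feed validate_pipeline_config() and run_pipeline_mode_direct(), the wiring below is illustrative only: the argparse defaults, the config dict shape, and the validator's return value are assumptions, not the project's confirmed interface.

# Illustrative wiring only; flag names follow the commit message, everything
# else (defaults, config shape, return types) is assumed for this sketch.
import argparse

from engine.pipeline.validation import validate_pipeline_config

parser = argparse.ArgumentParser()
for name in ("source", "effects", "camera", "display", "ui", "border"):
    parser.add_argument(f"--pipeline-{name}")
args = parser.parse_args()

# Collect only the flags the user actually passed.
cfg = {k.removeprefix("pipeline_"): v
       for k, v in vars(args).items() if v is not None}
errors = validate_pipeline_config(cfg)  # assumed to return a list of messages
if errors:
    raise SystemExit("\n".join(errors))
# run_pipeline_mode_direct(cfg) would then construct and run the pipeline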

147 lines
4.8 KiB
Python

"""
RSS feed fetching, Project Gutenberg parsing, and headline caching.
Depends on: config, sources, filter, terminal.
"""
import json
import pathlib
import re
import urllib.request
from datetime import datetime
from typing import Any

import feedparser

from engine import config
from engine.filter import skip, strip_tags
from engine.sources import FEEDS, POETRY_SOURCES
from engine.terminal import boot_ln

# Type alias for headline items
HeadlineTuple = tuple[str, str, str]
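# Illustrative values only (source labels are hypothetical): RSS items carry a
# clock time, e.g. ("Markets steady after rate decision", "WIRE", "14:30");
# poetry items leave the time field empty, e.g. ("First stanza of a poem ...", "POEMS", "").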

# ─── SINGLE FEED ──────────────────────────────────────────
def fetch_feed(url: str) -> Any | None:
    """Fetch and parse a single RSS feed URL."""
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        resp = urllib.request.urlopen(req, timeout=config.FEED_TIMEOUT)
        return feedparser.parse(resp.read())
    except Exception:
        return None
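
# Example usage (illustrative; the URL is a placeholder, not a real source):
#
#     feed = fetch_feed("https://example.com/rss.xml")
#     if feed is not None:
#         for entry in feed.entries[:3]:
#             print(strip_tags(entry.get("title", "")))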

# ─── ALL RSS FEEDS ────────────────────────────────────────
def fetch_all() -> tuple[list[HeadlineTuple], int, int]:
    """Fetch all RSS feeds and return items, linked count, failed count."""
    items: list[HeadlineTuple] = []
    linked = failed = 0
    for src, url in FEEDS.items():
        feed = fetch_feed(url)
        if feed is None or (feed.bozo and not feed.entries):
            boot_ln(src, "DARK", False)
            failed += 1
            continue
        n = 0
        for e in feed.entries:
            t = strip_tags(e.get("title", ""))
            if not t or skip(t):
                continue
            pub = e.get("published_parsed") or e.get("updated_parsed")
            try:
                ts = datetime(*pub[:6]).strftime("%H:%M") if pub else "——:——"
            except Exception:
                ts = "——:——"
            items.append((t, src, ts))
            n += 1
        if n:
            boot_ln(src, f"LINKED [{n}]", True)
            linked += 1
        else:
            boot_ln(src, "EMPTY", False)
            failed += 1
    return items, linked, failed

# ─── PROJECT GUTENBERG ────────────────────────────────────
def _fetch_gutenberg(url: str, label: str) -> list[HeadlineTuple]:
    """Download and parse stanzas/passages from a Project Gutenberg text."""
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        resp = urllib.request.urlopen(req, timeout=15)
        text = (
            resp.read()
            .decode("utf-8", errors="replace")
            .replace("\r\n", "\n")
            .replace("\r", "\n")
        )
        # Strip PG boilerplate
        m = re.search(r"\*\*\*\s*START OF[^\n]*\n", text)
        if m:
            text = text[m.end() :]
        m = re.search(r"\*\*\*\s*END OF", text)
        if m:
            text = text[: m.start()]
        # Split on blank lines into stanzas/passages
        blocks = re.split(r"\n{2,}", text.strip())
        items = []
        for blk in blocks:
            blk = " ".join(blk.split())  # flatten to one line
            if len(blk) < 20 or len(blk) > 280:
                continue
            if blk.isupper():  # skip all-caps headers
                continue
            if re.match(r"^[IVXLCDM]+\.?\s*$", blk):  # roman numerals
                continue
            items.append((blk, label, ""))
        return items
    except Exception:
        return []

def fetch_poetry() -> tuple[list[HeadlineTuple], int, int]:
    """Fetch all poetry/literature sources."""
    items: list[HeadlineTuple] = []
    linked = failed = 0
    for label, url in POETRY_SOURCES.items():
        stanzas = _fetch_gutenberg(url, label)
        if stanzas:
            boot_ln(label, f"LOADED [{len(stanzas)}]", True)
            items.extend(stanzas)
            linked += 1
        else:
            boot_ln(label, "DARK", False)
            failed += 1
    return items, linked, failed

# ─── CACHE ────────────────────────────────────────────────
# Cache moved to engine/fixtures/headlines.json
_CACHE_DIR = pathlib.Path(__file__).resolve().parent / "fixtures"

def _cache_path():
    return _CACHE_DIR / "headlines.json"

def load_cache():
    """Load cached items from disk if available."""
    p = _cache_path()
    if not p.exists():
        return None
    try:
        data = json.loads(p.read_text())
        items = [tuple(i) for i in data["items"]]
        return items if items else None
    except Exception:
        return None

def save_cache(items):
    """Save fetched items to disk for fast subsequent runs."""
    try:
        _cache_path().write_text(json.dumps({"items": items}))
    except Exception:
        pass
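
# Minimal manual check (illustrative sketch, not part of the application flow):
# prefer the cached fixture when present, otherwise fetch live feeds and poetry
# and refresh the cache. Whether the app merges both pools into one cache is an
# assumption here.
if __name__ == "__main__":
    cached = load_cache()
    if cached is not None:
        print(f"cache hit: {len(cached)} items from {_cache_path()}")
    else:
        rss_items, rss_ok, rss_fail = fetch_all()
        poem_items, poem_ok, poem_fail = fetch_poetry()
        combined = rss_items + poem_items
        save_cache(combined)
        print(f"fetched {len(combined)} items "
              f"({rss_ok + poem_ok} sources linked, {rss_fail + poem_fail} failed)")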