feat: Add fast startup fetch and background caching
- Add `fetch_all_fast()` for quick startup using the first N feeds
- Add a background thread for the full fetch and caching
- Update startup to use the fast fetch
- Update docs and skills
This commit is contained in:
158
engine/fetch.py
158
engine/fetch.py
@@ -7,6 +7,7 @@ import json
|
||||
import pathlib
|
||||
import re
|
||||
import urllib.request
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
@@ -17,54 +18,98 @@ from engine.filter import skip, strip_tags
|
||||
from engine.sources import FEEDS, POETRY_SOURCES
|
||||
from engine.terminal import boot_ln
|
||||
|
||||
# A single headline item: (title, source label, HH:MM timestamp).
HeadlineTuple = tuple[str, str, str]

# Thread-pool width used for the full (non-fast) fetch passes.
DEFAULT_MAX_WORKERS = 10
# How many feeds are polled during the fast startup pass.
FAST_START_SOURCES = 5
# Per-request timeout (seconds) applied to fast-start feeds.
FAST_START_TIMEOUT = 3
||||
# ─── SINGLE FEED ──────────────────────────────────────────
def fetch_feed(url: str) -> tuple[str, Any | None]:
    """Fetch and parse a single RSS feed URL.

    Fast-start URLs (see ``_fast_start_urls``) use the shorter
    ``FAST_START_TIMEOUT`` so one slow feed cannot stall startup.

    Returns:
        ``(url, parsed_feed)`` on success, ``(url, None)`` on any failure.
        (Bug fix: the annotation previously claimed ``tuple[None, None]``,
        but the error path always returns the url with ``None``.)
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # Tighter deadline for the fast startup pass; normal configured
        # timeout otherwise.
        timeout = FAST_START_TIMEOUT if url in _fast_start_urls else config.FEED_TIMEOUT
        resp = urllib.request.urlopen(req, timeout=timeout)
        return (url, feedparser.parse(resp.read()))
    except Exception:
        # Best-effort by design: a dead or malformed feed must not crash
        # the boot sequence; callers treat a None feed as "DARK".
        return (url, None)
||||
|
||||
def _parse_feed(feed: Any, src: str) -> list[HeadlineTuple]:
    """Convert one parsed feed into (title, source, timestamp) tuples."""
    # A None feed or a bozo feed with no entries yields nothing.
    if feed is None or (feed.bozo and not feed.entries):
        return []

    headlines: list[HeadlineTuple] = []
    for entry in feed.entries:
        title = strip_tags(entry.get("title", ""))
        if not title or skip(title):
            continue
        published = entry.get("published_parsed") or entry.get("updated_parsed")
        try:
            stamp = datetime(*published[:6]).strftime("%H:%M") if published else "——:——"
        except Exception:
            # Malformed date tuples fall back to the placeholder stamp.
            stamp = "——:——"
        headlines.append((title, src, stamp))
    return headlines
||||
|
||||
def fetch_all_fast() -> list[HeadlineTuple]:
    """Fetch only the first N sources for fast startup."""
    global _fast_start_urls
    fast_pairs = list(FEEDS.items())[:FAST_START_SOURCES]
    # Mark these URLs so fetch_feed applies the short fast-start timeout.
    _fast_start_urls = {url for _, url in fast_pairs}

    results: list[HeadlineTuple] = []
    with ThreadPoolExecutor(max_workers=FAST_START_SOURCES) as pool:
        pending = {pool.submit(fetch_feed, url): src for src, url in fast_pairs}
        for done in as_completed(pending):
            src = pending[done]
            _url, feed = done.result()
            if feed is None or (feed.bozo and not feed.entries):
                boot_ln(src, "DARK", False)
                continue
            headlines = _parse_feed(feed, src)
            if headlines:
                results.extend(headlines)
                boot_ln(src, f"LINKED [{len(headlines)}]", True)
            else:
                boot_ln(src, "EMPTY", False)
    return results
||||
|
||||
# ─── ALL RSS FEEDS ────────────────────────────────────────
def fetch_all() -> tuple[list[HeadlineTuple], int, int]:
    """Fetch all RSS feeds concurrently and return items, linked count, failed count."""
    global _fast_start_urls
    # Full pass: no URL gets the short fast-start timeout.
    _fast_start_urls = set()

    collected: list[HeadlineTuple] = []
    linked = failed = 0
    with ThreadPoolExecutor(max_workers=DEFAULT_MAX_WORKERS) as pool:
        pending = {pool.submit(fetch_feed, url): src for src, url in FEEDS.items()}
        for done in as_completed(pending):
            src = pending[done]
            _url, feed = done.result()
            if feed is None or (feed.bozo and not feed.entries):
                boot_ln(src, "DARK", False)
                failed += 1
                continue
            headlines = _parse_feed(feed, src)
            if headlines:
                collected.extend(headlines)
                boot_ln(src, f"LINKED [{len(headlines)}]", True)
                linked += 1
            else:
                boot_ln(src, "EMPTY", False)
                failed += 1

    return collected, linked, failed
|
||||
# ─── PROJECT GUTENBERG ────────────────────────────────────
|
||||
def _fetch_gutenberg(url: str, label: str) -> list[HeadlineTuple]:
|
||||
"""Download and parse stanzas/passages from a Project Gutenberg text."""
|
||||
try:
|
||||
@@ -76,23 +121,21 @@ def _fetch_gutenberg(url: str, label: str) -> list[HeadlineTuple]:
|
||||
.replace("\r\n", "\n")
|
||||
.replace("\r", "\n")
|
||||
)
|
||||
# Strip PG boilerplate
|
||||
m = re.search(r"\*\*\*\s*START OF[^\n]*\n", text)
|
||||
if m:
|
||||
text = text[m.end() :]
|
||||
m = re.search(r"\*\*\*\s*END OF", text)
|
||||
if m:
|
||||
text = text[: m.start()]
|
||||
# Split on blank lines into stanzas/passages
|
||||
blocks = re.split(r"\n{2,}", text.strip())
|
||||
items = []
|
||||
for blk in blocks:
|
||||
blk = " ".join(blk.split()) # flatten to one line
|
||||
blk = " ".join(blk.split())
|
||||
if len(blk) < 20 or len(blk) > 280:
|
||||
continue
|
||||
if blk.isupper(): # skip all-caps headers
|
||||
if blk.isupper():
|
||||
continue
|
||||
if re.match(r"^[IVXLCDM]+\.?\s*$", blk): # roman numerals
|
||||
if re.match(r"^[IVXLCDM]+\.?\s*$", blk):
|
||||
continue
|
||||
items.append((blk, label, ""))
|
||||
return items
|
||||
@@ -100,29 +143,35 @@ def _fetch_gutenberg(url: str, label: str) -> list[HeadlineTuple]:
|
||||
return []
|
||||
|
||||
def fetch_poetry() -> tuple[list[HeadlineTuple], int, int]:
    """Fetch all poetry/literature sources concurrently."""
    stanzas_out: list[HeadlineTuple] = []
    linked = failed = 0

    with ThreadPoolExecutor(max_workers=DEFAULT_MAX_WORKERS) as pool:
        pending = {
            pool.submit(_fetch_gutenberg, url, label): label
            for label, url in POETRY_SOURCES.items()
        }
        for done in as_completed(pending):
            label = pending[done]
            stanzas = done.result()
            if stanzas:
                boot_ln(label, f"LOADED [{len(stanzas)}]", True)
                stanzas_out.extend(stanzas)
                linked += 1
            else:
                boot_ln(label, "DARK", False)
                failed += 1

    return stanzas_out, linked, failed
|
||||
|
||||
# ─── CACHE ────────────────────────────────────────────────
|
||||
# Cache moved to engine/fixtures/headlines.json
|
||||
_CACHE_DIR = pathlib.Path(__file__).resolve().parent / "fixtures"
|
||||
_cache_dir = pathlib.Path(__file__).resolve().parent / "fixtures"
|
||||
|
||||
|
||||
def _cache_path():
|
||||
return _CACHE_DIR / "headlines.json"
|
||||
return _cache_dir / "headlines.json"
|
||||
|
||||
|
||||
def load_cache():
|
||||
@@ -144,3 +193,6 @@ def save_cache(items):
|
||||
_cache_path().write_text(json.dumps({"items": items}))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
_fast_start_urls: set = set()
|
||||
|
||||
Reference in New Issue
Block a user