Files
sideline/engine/filter.py
David Gwilliam 757c854584 fix: apply ruff auto-fixes and add hk git hooks
- Fix pre-existing lint errors in engine/ modules using ruff --unsafe-fixes
- Add hk.pkl with pre-commit and pre-push hooks using ruff builtin
- Configure hooks to use 'uv run' prefix for tool execution
- Update mise.toml to include hk and pkl tools
- All 73 tests pass

fix: apply ruff auto-fixes and add hk git hooks

- Fix pre-existing lint errors in engine/ modules using ruff --unsafe-fixes
- Add hk.pkl with pre-commit and pre-push hooks using ruff builtin
- Configure hooks to use 'uv run' prefix for tool execution
- Update mise.toml to include hk and pkl tools
- Use 'hk install --mise' for proper mise integration
- All 73 tests pass
2026-03-15 15:16:37 -07:00

61 lines
1.8 KiB
Python

"""
HTML stripping and content filter (sports, vapid, insipid).
No internal dependencies.
"""
import re
from html import unescape
from html.parser import HTMLParser
# ─── HTML STRIPPING ───────────────────────────────────────
class _Strip(HTMLParser):
    """Parser that keeps only the character data of an HTML document.

    Only ``handle_data`` is overridden; every other handler is left at the
    ``HTMLParser`` default, so tags and comments contribute nothing.
    """

    def __init__(self):
        super().__init__()
        # Text fragments in document order; joined on demand by text().
        self._t = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self._t.append(d)

    def text(self):
        """Return the collected text with surrounding whitespace removed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return *html* as plain text with all markup removed.

    Entities are unescaped before parsing, so markup that arrives
    entity-encoded (``&lt;b&gt;``) is stripped as well.

    Args:
        html: Markup to clean; ``None`` or an empty string yields ``""``.

    Returns:
        The text content, stripped of leading and trailing whitespace.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # feed() holds back trailing text that could be an unfinished construct
    # (e.g. the "&A" in "Q&A"); close() forces it through handle_data so the
    # tail of the input is not silently dropped.
    s.close()
    return s.text()
# ─── CONTENT FILTER ───────────────────────────────────────
# Case-insensitive pattern of topics to drop. The alternation is wrapped in
# \b ... \b so terms only match as whole words or phrases ("football" does
# not match inside "footballer").
_SKIP_RE = re.compile(
    r"\b(?:"
    # ── sports ──
    r"football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|"
    r"hockey|lacrosse|volleyball|badminton|"
    r"nba|nfl|nhl|mlb|mls|fifa|uefa|"
    r"premier league|champions league|la liga|serie a|bundesliga|"
    r"world cup|super bowl|world series|stanley cup|"
    r"playoff|playoffs|touchdown|goalkeeper|striker|quarterback|"
    r"slam dunk|home run|grand slam|offside|halftime|"
    r"batting|wicket|innings|"
    r"formula 1|nascar|motogp|"
    r"boxing|ufc|mma|"
    r"marathon|tour de france|"
    r"transfer window|draft pick|relegation|"
    # ── vapid / insipid ──
    r"kardashian|jenner|reality tv|reality show|"
    r"influencer|viral video|tiktok|instagram|"
    r"best dressed|worst dressed|red carpet|"
    r"horoscope|zodiac|gossip|bikini|selfie|"
    # "won.t": the dot matches any single character, so both the ASCII
    # apostrophe and the typographic one are accepted.
    r"you won.t believe|what happened next|"
    r"celebrity couple|celebrity feud|baby bump"
    r")\b",
    re.IGNORECASE,
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    Args:
        title: Headline text; ``None`` or an empty string is treated as
            having nothing to match and yields ``False``.

    Returns:
        ``True`` when any term in ``_SKIP_RE`` occurs in *title*,
        otherwise ``False``.
    """
    # Guard against a missing title the same way strip_tags() guards its
    # input, instead of letting re raise TypeError on None.
    return bool(_SKIP_RE.search(title or ""))