- Fix pre-existing lint errors in engine/ modules using ruff --unsafe-fixes - Add hk.pkl with pre-commit and pre-push hooks using ruff builtin - Configure hooks to use 'uv run' prefix for tool execution - Update mise.toml to include hk and pkl tools - All 73 tests pass fix: apply ruff auto-fixes and add hk git hooks - Fix pre-existing lint errors in engine/ modules using ruff --unsafe-fixes - Add hk.pkl with pre-commit and pre-push hooks using ruff builtin - Configure hooks to use 'uv run' prefix for tool execution - Update mise.toml to include hk and pkl tools - Use 'hk install --mise' for proper mise integration - All 73 tests pass
61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
"""
|
|
HTML stripping and content filter (sports, vapid, insipid).
|
|
No internal dependencies.
|
|
"""
|
|
|
|
import re
|
|
from html import unescape
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# ─── HTML STRIPPING ───────────────────────────────────────
|
|
class _Strip(HTMLParser):
    """Collect the character data of an HTML document, discarding markup.

    Only ``handle_data`` is overridden, so tags, comments and declarations
    are dropped while every run of text is kept in document order.
    NOTE: the parser also reports the contents of ``<script>`` and
    ``<style>`` elements as data, so that text is kept too.
    """

    def __init__(self) -> None:
        super().__init__()
        # Text fragments, in the order the parser reports them.
        self._t: list[str] = []

    def handle_data(self, d: str) -> None:
        """Record one run of character data reported by the parser."""
        self._t.append(d)

    def text(self) -> str:
        """Return the collected text with surrounding whitespace removed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return the plain text of *html* with tags removed and entities decoded.

    Args:
        html: Markup to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The concatenated text content, stripped of leading and trailing
        whitespace.
    """
    parser = _Strip()
    # Entities are decoded before parsing, so escaped markup such as
    # "&lt;b&gt;" becomes a real tag and is removed as well.
    parser.feed(unescape(html or ""))
    # feed() may hold back trailing text (for example after a bare "&",
    # which could be the start of a character reference split across
    # chunks). close() signals end-of-input and flushes that buffer;
    # without it, input like "AT&T" would come back empty.
    parser.close()
    return parser.text()
|
|
|
|
|
|
# ─── CONTENT FILTER ───────────────────────────────────────
|
|
# One alternation of every filtered term. The surrounding \b anchors make
# each term match only as a whole word or phrase, so inflected forms must be
# listed explicitly (see "playoff|playoffs"). Matching is case-insensitive.
_SKIP_RE = re.compile(
    r"\b(?:"
    # ── sports ──
    r"football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|"
    r"hockey|lacrosse|volleyball|badminton|"
    r"nba|nfl|nhl|mlb|mls|fifa|uefa|"
    r"premier league|champions league|la liga|serie a|bundesliga|"
    r"world cup|super bowl|world series|stanley cup|"
    r"playoff|playoffs|touchdown|goalkeeper|striker|quarterback|"
    r"slam dunk|home run|grand slam|offside|halftime|"
    r"batting|wicket|innings|"
    r"formula 1|nascar|motogp|"
    r"boxing|ufc|mma|"
    r"marathon|tour de france|"
    r"transfer window|draft pick|relegation|"
    # ── vapid / insipid ──
    r"kardashian|jenner|reality tv|reality show|"
    r"influencer|viral video|tiktok|instagram|"
    r"best dressed|worst dressed|red carpet|"
    r"horoscope|zodiac|gossip|bikini|selfie|"
    # "." stands in for the apostrophe so straight and curly quotes both match.
    r"you won.t believe|what happened next|"
    r"celebrity couple|celebrity feud|baby bump"
    r")\b",
    re.IGNORECASE,
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    Args:
        title: Headline text. ``None`` or ``""`` never matches.

    Returns:
        ``True`` when any term of ``_SKIP_RE`` occurs in *title* as a whole
        word or phrase (case-insensitive), otherwise ``False``.
    """
    # Treat a missing title like an empty one instead of raising TypeError,
    # mirroring how strip_tags() accepts None.
    return _SKIP_RE.search(title or "") is not None
|