Files
sideline/engine/filter.py

61 lines
1.8 KiB
Python

"""
HTML stripping and content filter (sports, vapid, insipid).
No internal dependencies.
"""
import re
from html import unescape
from html.parser import HTMLParser
# ─── HTML STRIPPING ───────────────────────────────────────
class _Strip(HTMLParser):
def __init__(self):
super().__init__()
self._t = []
def handle_data(self, d):
self._t.append(d)
def text(self):
return "".join(self._t).strip()
def strip_tags(html):
s = _Strip()
s.feed(unescape(html or ""))
return s.text()
# ─── CONTENT FILTER ───────────────────────────────────────
_SKIP_RE = re.compile(
r'\b(?:'
# ── sports ──
r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|'
r'hockey|lacrosse|volleyball|badminton|'
r'nba|nfl|nhl|mlb|mls|fifa|uefa|'
r'premier league|champions league|la liga|serie a|bundesliga|'
r'world cup|super bowl|world series|stanley cup|'
r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|'
r'slam dunk|home run|grand slam|offside|halftime|'
r'batting|wicket|innings|'
r'formula 1|nascar|motogp|'
r'boxing|ufc|mma|'
r'marathon|tour de france|'
r'transfer window|draft pick|relegation|'
# ── vapid / insipid ──
r'kardashian|jenner|reality tv|reality show|'
r'influencer|viral video|tiktok|instagram|'
r'best dressed|worst dressed|red carpet|'
r'horoscope|zodiac|gossip|bikini|selfie|'
r'you won.t believe|what happened next|'
r'celebrity couple|celebrity feud|baby bump'
r')\b',
re.IGNORECASE
)
def skip(title):
"""Return True if headline is sports, vapid, or insipid."""
return bool(_SKIP_RE.search(title))