"""
HTML stripping and content filter (sports, vapid, insipid).
No internal dependencies.
"""
import re
from html import unescape
from html.parser import HTMLParser
from typing import Optional


# ─── HTML STRIPPING ───────────────────────────────────────

class _Strip(HTMLParser):
    """HTMLParser subclass that keeps only the text nodes of a document."""

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reports them.
        self._t = []

    def handle_data(self, d):
        """Collect one run of character data reported by the parser."""
        self._t.append(d)

    def text(self):
        """Return all collected text joined together, outer whitespace removed."""
        return "".join(self._t).strip()


def strip_tags(html: Optional[str]) -> str:
    """Return the plain text of *html* with all markup removed.

    Entities are decoded before parsing, so entity-escaped markup such as
    ``&lt;b&gt;`` is treated as a real tag and stripped as well.

    Args:
        html: Markup to clean. ``None`` and ``""`` are treated as empty input.

    Returns:
        The concatenated text content, stripped of leading and trailing
        whitespace.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # feed() may hold back trailing text (e.g. "AT&T", where the "&" could be
    # the start of an unfinished character reference). close() forces the
    # parser to flush whatever is still buffered, so no text is lost.
    s.close()
    return s.text()


# ─── CONTENT FILTER ───────────────────────────────────────

# Case-insensitive, whole-word match against a fixed keyword list.
_SKIP_RE = re.compile(
    r"\b(?:"
    # ── sports ──
    r"football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|"
    r"hockey|lacrosse|volleyball|badminton|"
    r"nba|nfl|nhl|mlb|mls|fifa|uefa|"
    r"premier league|champions league|la liga|serie a|bundesliga|"
    r"world cup|super bowl|world series|stanley cup|"
    r"playoff|playoffs|touchdown|goalkeeper|striker|quarterback|"
    r"slam dunk|home run|grand slam|offside|halftime|"
    r"batting|wicket|innings|"
    r"formula 1|nascar|motogp|"
    r"boxing|ufc|mma|"
    r"marathon|tour de france|"
    r"transfer window|draft pick|relegation|"
    # ── vapid / insipid ──
    r"kardashian|jenner|reality tv|reality show|"
    r"influencer|viral video|tiktok|instagram|"
    r"best dressed|worst dressed|red carpet|"
    r"horoscope|zodiac|gossip|bikini|selfie|"
    # "." stands in for the apostrophe so any apostrophe character matches.
    r"you won.t believe|what happened next|"
    r"celebrity couple|celebrity feud|baby bump"
    r")\b",
    re.IGNORECASE,
)


def skip(title: Optional[str]) -> bool:
    """Return True if headline is sports, vapid, or insipid.

    Args:
        title: Headline text. ``None`` and ``""`` never match.

    Returns:
        True when any keyword in ``_SKIP_RE`` occurs in *title* as a whole
        word or phrase (case-insensitive), otherwise False.
    """
    # Tolerate a missing title the same way strip_tags tolerates missing HTML.
    return bool(_SKIP_RE.search(title or ""))