""" HTML stripping and content filter (sports, vapid, insipid). No internal dependencies. """ import re from html import unescape from html.parser import HTMLParser # ─── HTML STRIPPING ─────────────────────────────────────── class _Strip(HTMLParser): def __init__(self): super().__init__() self._t = [] def handle_data(self, d): self._t.append(d) def text(self): return "".join(self._t).strip() def strip_tags(html): s = _Strip() s.feed(unescape(html or "")) return s.text() # ─── CONTENT FILTER ─────────────────────────────────────── _SKIP_RE = re.compile( r'\b(?:' # ── sports ── r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|' r'hockey|lacrosse|volleyball|badminton|' r'nba|nfl|nhl|mlb|mls|fifa|uefa|' r'premier league|champions league|la liga|serie a|bundesliga|' r'world cup|super bowl|world series|stanley cup|' r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|' r'slam dunk|home run|grand slam|offside|halftime|' r'batting|wicket|innings|' r'formula 1|nascar|motogp|' r'boxing|ufc|mma|' r'marathon|tour de france|' r'transfer window|draft pick|relegation|' # ── vapid / insipid ── r'kardashian|jenner|reality tv|reality show|' r'influencer|viral video|tiktok|instagram|' r'best dressed|worst dressed|red carpet|' r'horoscope|zodiac|gossip|bikini|selfie|' r'you won.t believe|what happened next|' r'celebrity couple|celebrity feud|baby bump' r')\b', re.IGNORECASE ) def skip(title): """Return True if headline is sports, vapid, or insipid.""" return bool(_SKIP_RE.search(title))