61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
"""
|
|
HTML stripping and content filter (sports, vapid, insipid).
|
|
No internal dependencies.
|
|
"""
|
|
|
|
import re
|
|
from html import unescape
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# ─── HTML STRIPPING ───────────────────────────────────────
|
|
class _Strip(HTMLParser):
    """HTML parser that keeps character data and discards everything else.

    Only ``handle_data`` is overridden, so start/end tags, comments and
    declarations fall through to the base-class no-op handlers.
    """

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reported them.
        self._t = []

    def handle_data(self, d):
        """Record one run of character data reported by the parser."""
        self._t.append(d)

    def text(self):
        """Return all recorded fragments joined, with outer whitespace trimmed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    The input is passed through ``html.unescape`` before parsing, so markup
    that arrives entity-escaped (``&lt;b&gt;``) is stripped as well.

    Args:
        html: Markup to clean. ``None`` and ``""`` are treated as empty input.

    Returns:
        The concatenated character data, stripped of leading and trailing
        whitespace; ``""`` for empty input.
    """
    parser = _Strip()
    parser.feed(unescape(html or ""))
    # feed() keeps trailing text buffered when it could be the start of an
    # incomplete construct (e.g. a bare "&" in "AT&T" or "Q&A" with nothing
    # after it). close() tells the parser the input is complete so that the
    # buffered remainder is delivered to handle_data instead of being lost.
    parser.close()
    return parser.text()
|
|
|
|
|
|
# ─── CONTENT FILTER ───────────────────────────────────────
|
|
# One case-insensitive alternation of every term that marks a headline as
# skippable. The surrounding \b anchors require the term to start and end on a
# word boundary, so "golf" matches "Golf news" but not "Golfing".
_SKIP_RE = re.compile(
    r"\b(?:"
    + "|".join((
        # ── sports ──
        "football", "soccer", "basketball", "baseball", "softball",
        "tennis", "golf", "cricket", "rugby",
        "hockey", "lacrosse", "volleyball", "badminton",
        "nba", "nfl", "nhl", "mlb", "mls", "fifa", "uefa",
        "premier league", "champions league", "la liga", "serie a",
        "bundesliga",
        "world cup", "super bowl", "world series", "stanley cup",
        "playoff", "playoffs", "touchdown", "goalkeeper", "striker",
        "quarterback",
        "slam dunk", "home run", "grand slam", "offside", "halftime",
        "batting", "wicket", "innings",
        "formula 1", "nascar", "motogp",
        "boxing", "ufc", "mma",
        "marathon", "tour de france",
        "transfer window", "draft pick", "relegation",
        # ── vapid / insipid ──
        "kardashian", "jenner", "reality tv", "reality show",
        "influencer", "viral video", "tiktok", "instagram",
        "best dressed", "worst dressed", "red carpet",
        "horoscope", "zodiac", "gossip", "bikini", "selfie",
        # "." stands in for the apostrophe so any single character there
        # (straight or typographic quote) is accepted.
        r"you won.t believe", "what happened next",
        "celebrity couple", "celebrity feud", "baby bump",
    ))
    + r")\b",
    re.IGNORECASE,
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    Args:
        title: Headline text to test. ``None`` or ``""`` is treated as an
            empty headline rather than raising.

    Returns:
        True when any term of ``_SKIP_RE`` occurs in *title* on word
        boundaries (case-insensitively), otherwise False.
    """
    # A missing title cannot match any term; coerce it to "" so callers do
    # not have to guard against None themselves.
    return _SKIP_RE.search(title or "") is not None
|