feat: Introduce translate module for Google Translate integration and location-based language detection, and add a new filter module.
This commit is contained in:
60
engine/filter.py
Normal file
60
engine/filter.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""
|
||||
HTML stripping and content filter (sports, vapid, insipid).
|
||||
No internal dependencies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
# ─── HTML STRIPPING ───────────────────────────────────────
|
||||
class _Strip(HTMLParser):
    """HTMLParser subclass that drops markup and keeps only text nodes."""

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reported them.
        self._t = []

    def handle_data(self, d):
        """Record one run of character data reported by the parser."""
        self._t.append(d)

    def text(self):
        """Return the collected text, concatenated and whitespace-trimmed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return *html* as plain text with all tags removed.

    Entities are decoded before parsing, so entity-escaped markup such as
    ``&lt;b&gt;`` is treated as a real tag and removed as well.

    Args:
        html: Markup to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The text content with leading/trailing whitespace stripped.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # feed() keeps trailing text buffered when it contains an '&' that might
    # be the start of a split character reference (e.g. "AT&T", "Q&A").
    # close() signals end-of-input so that text is flushed to handle_data();
    # without it such inputs come back empty.
    s.close()
    return s.text()
|
||||
|
||||
|
||||
# ─── CONTENT FILTER ───────────────────────────────────────
|
||||
# Case-insensitive, whole-word/phrase match against a fixed blocklist.
# The surrounding \b anchors mean inflected forms that are not listed
# (e.g. a plural with a trailing "s") do not match.
_SKIP_RE = re.compile(
    r'\b(?:'
    # ── sports ──
    r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|'
    r'hockey|lacrosse|volleyball|badminton|'
    r'nba|nfl|nhl|mlb|mls|fifa|uefa|'
    r'premier league|champions league|la liga|serie a|bundesliga|'
    r'world cup|super bowl|world series|stanley cup|'
    r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|'
    r'slam dunk|home run|grand slam|offside|halftime|'
    r'batting|wicket|innings|'
    r'formula 1|nascar|motogp|'
    r'boxing|ufc|mma|'
    r'marathon|tour de france|'
    r'transfer window|draft pick|relegation|'
    # ── vapid / insipid ──
    r'kardashian|jenner|reality tv|reality show|'
    r'influencer|viral video|tiktok|instagram|'
    r'best dressed|worst dressed|red carpet|'
    r'horoscope|zodiac|gossip|bikini|selfie|'
    # The "." in "won.t" matches any single character, so both the straight
    # and the typographic apostrophe are accepted.
    r'you won.t believe|what happened next|'
    r'celebrity couple|celebrity feud|baby bump'
    r')\b',
    re.IGNORECASE
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    Args:
        title: Headline text to test. ``None`` or an empty string is
            treated as having no blocked terms.

    Returns:
        True when any blocklisted word or phrase occurs in *title*
        (case-insensitive, whole-word match), otherwise False.
    """
    # `or ""` mirrors strip_tags(): a missing title must not raise TypeError.
    return bool(_SKIP_RE.search(title or ""))
|
||||
Reference in New Issue
Block a user