feat/scalability #10
60
engine/filter.py
Normal file
60
engine/filter.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""
|
||||
HTML stripping and content filter (sports, vapid, insipid).
|
||||
No internal dependencies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
# ─── HTML STRIPPING ───────────────────────────────────────
|
||||
class _Strip(HTMLParser):
    """HTML parser that keeps only text nodes and discards all markup."""

    def __init__(self):
        super().__init__()
        # Text fragments in document order; joined once in text().
        self._t = []

    def handle_data(self, d):
        """Collect one run of character data reported by the parser."""
        self._t.append(d)

    def text(self):
        """Return the collected text with surrounding whitespace removed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return *html* with all tags removed and character entities decoded.

    ``None`` or an empty string yields ``""``.  Entities are decoded before
    parsing, so markup that arrives entity-escaped (``&lt;b&gt;``) is
    stripped as well.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # HTMLParser holds back trailing text that contains a bare "&" not
    # followed by whitespace or ";" (e.g. "AT&T") in case more input is
    # coming.  close() flushes that pending text; without it the tail of
    # the input is silently dropped.
    s.close()
    return s.text()
|
||||
|
||||
|
||||
# ─── CONTENT FILTER ───────────────────────────────────────
|
||||
# Case-insensitive, whole-word match for any term that marks a headline as
# sports or as vapid/insipid filler.  The \b anchors mean inflected forms
# (e.g. "golfing") do not match unless they are listed explicitly.
_SKIP_RE = re.compile(
    r'\b(?:'
    # ── sports ──
    r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|'
    r'hockey|lacrosse|volleyball|badminton|'
    r'nba|nfl|nhl|mlb|mls|fifa|uefa|'
    r'premier league|champions league|la liga|serie a|bundesliga|'
    r'world cup|super bowl|world series|stanley cup|'
    r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|'
    r'slam dunk|home run|grand slam|offside|halftime|'
    r'batting|wicket|innings|'
    r'formula 1|nascar|motogp|'
    r'boxing|ufc|mma|'
    r'marathon|tour de france|'
    r'transfer window|draft pick|relegation|'
    # ── vapid / insipid ──
    r'kardashian|jenner|reality tv|reality show|'
    r'influencer|viral video|tiktok|instagram|'
    r'best dressed|worst dressed|red carpet|'
    r'horoscope|zodiac|gossip|bikini|selfie|'
    # "." in "won.t" accepts any single character, so both the straight
    # and the typographic apostrophe match.
    r'you won.t believe|what happened next|'
    r'celebrity couple|celebrity feud|baby bump'
    r')\b',
    re.IGNORECASE,
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    A missing headline (``None`` or ``""``) is never skipped; it simply
    contains no filtered term.
    """
    # `or ""` keeps a None title from raising TypeError inside re.search,
    # mirroring how strip_tags() treats missing input.
    return bool(_SKIP_RE.search(title or ""))
|
||||
41
engine/translate.py
Normal file
41
engine/translate.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""
|
||||
Google Translate wrapper and location→language detection.
|
||||
Depends on: sources (for LOCATION_LANGS).
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
|
||||
from engine.sources import LOCATION_LANGS
|
||||
|
||||
_TRANSLATE_CACHE = {}
|
||||
|
||||
|
||||
def detect_location_language(title):
    """Return the language mapped to the first location pattern in *title*.

    The title is lowercased, then each regex key of ``LOCATION_LANGS`` is
    searched for in the mapping's iteration order.  The value belonging to
    the first pattern that matches is returned; ``None`` if none match.
    """
    lowered = title.lower()
    hits = (
        lang
        for pattern, lang in LOCATION_LANGS.items()
        if re.search(pattern, lowered)
    )
    # Lazy generator: stops at the first matching pattern.
    return next(hits, None)
|
||||
|
||||
|
||||
def translate_headline(title, target_lang):
    """Translate headline via Google Translate API (zero dependencies).

    Results are memoised in ``_TRANSLATE_CACHE`` under the key
    ``(title, target_lang)``.  The call is best-effort: on any failure
    (network error, timeout, unexpected payload) the original title is
    returned, and that fallback is cached too, so the same headline is not
    requested again.

    Args:
        title: Headline text; the request declares it as English (``sl=en``).
        target_lang: Language code sent as the ``tl`` query parameter.

    Returns:
        The translated headline, or *title* unchanged when no translation
        could be obtained.
    """
    key = (title, target_lang)
    if key in _TRANSLATE_CACHE:
        return _TRANSLATE_CACHE[key]
    try:
        q = urllib.parse.quote(title)
        # Escape the language code as well so it cannot break the query string.
        tl = urllib.parse.quote(str(target_lang), safe="")
        url = ("https://translate.googleapis.com/translate_a/single"
               f"?client=gtx&sl=en&tl={tl}&dt=t&q={q}")
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # The context manager closes the HTTP response even when reading or
        # JSON decoding fails, so no connection is leaked.
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        # data[0] is iterated as a list of segments whose first element is
        # the translated text; empty segments are dropped.
        result = "".join(p[0] for p in data[0] if p[0]) or title
    except Exception:
        # Deliberately broad: translation is optional, so any failure falls
        # back to the untranslated headline instead of propagating.
        result = title
    _TRANSLATE_CACHE[key] = result
    return result
|
||||
Reference in New Issue
Block a user