2026-03-15 06:50:12 +00:00
2 changed files with 101 additions and 0 deletions
@@ -0,0 +1,60 @@
+"""
+HTML stripping and content filter (sports, vapid, insipid).
+No internal dependencies.
+"""
+
+import re
+from html import unescape
+from html.parser import HTMLParser
+
+
+# ─── HTML STRIPPING ───────────────────────────────────────
+class _Strip(HTMLParser):
+    """Collect the text content of an HTML fragment, discarding the tags."""
+
+    def __init__(self):
+        super().__init__()
+        self._t = []  # text chunks, in the order the parser reports them
+
+    def handle_data(self, d):
+        """Record one run of text reported by the parser."""
+        self._t.append(d)
+
+    def text(self):
+        """Return all collected text joined together, outer whitespace removed."""
+        return "".join(self._t).strip()
+
+
+def strip_tags(html):
+    """Return *html* with markup removed and character references decoded.
+
+    ``None`` and other falsy values yield an empty string.  Entities are
+    decoded before parsing, so entity-encoded markup such as ``&lt;b&gt;`` is
+    treated as a tag and removed as well.
+    """
+    s = _Strip()
+    s.feed(unescape(html or ""))
+    # HTMLParser holds back trailing text containing an unterminated "&"
+    # (e.g. "Q&A", "AT&T") in case more input follows; close() flushes it.
+    # Without this call such text was silently dropped from the result.
+    s.close()
+    return s.text()
+
+
+# ─── CONTENT FILTER ───────────────────────────────────────
+# One alternation of whole-word terms; the fragments below are concatenated
+# in order, so every fragment except the last ends with "|".
+_SKIP_RE = re.compile(
+    "".join((
+        r'\b(?:',
+        # sports: games, leagues, competitions, jargon
+        r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|',
+        r'hockey|lacrosse|volleyball|badminton|',
+        r'nba|nfl|nhl|mlb|mls|fifa|uefa|',
+        r'premier league|champions league|la liga|serie a|bundesliga|',
+        r'world cup|super bowl|world series|stanley cup|',
+        r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|',
+        r'slam dunk|home run|grand slam|offside|halftime|',
+        r'batting|wicket|innings|',
+        r'formula 1|nascar|motogp|',
+        r'boxing|ufc|mma|',
+        r'marathon|tour de france|',
+        r'transfer window|draft pick|relegation|',
+        # vapid / insipid: celebrity, social media, clickbait
+        r'kardashian|jenner|reality tv|reality show|',
+        r'influencer|viral video|tiktok|instagram|',
+        r'best dressed|worst dressed|red carpet|',
+        r'horoscope|zodiac|gossip|bikini|selfie|',
+        # the "." stands in for whichever apostrophe character the feed uses
+        r'you won.t believe|what happened next|',
+        r'celebrity couple|celebrity feud|baby bump',
+        r')\b',
+    )),
+    flags=re.IGNORECASE,
+)
+
+
+def skip(title):
+    """Tell whether a headline should be dropped as sports or vapid/insipid.
+
+    True when any term from ``_SKIP_RE`` occurs in *title* between word
+    boundaries (case-insensitive); False otherwise.
+    """
+    return _SKIP_RE.search(title) is not None
@@ -0,0 +1,41 @@
+"""
+Google Translate wrapper and location→language detection.
+Depends on: sources (for LOCATION_LANGS).
+"""
+
+import re
+import json
+import urllib.request
+import urllib.parse
+
+from engine.sources import LOCATION_LANGS
+
+# Cache of translate_headline() results, keyed by (title, target_lang).
+_TRANSLATE_CACHE = {}
+
+
+def detect_location_language(title):
+    """Return the language mapped to the first location pattern found in *title*.
+
+    The headline is lower-cased, then each ``LOCATION_LANGS`` key is tried as
+    a regular expression in the mapping's iteration order.  Returns ``None``
+    when no pattern matches.
+    """
+    lowered = title.lower()
+    return next(
+        (
+            lang
+            for pattern, lang in LOCATION_LANGS.items()
+            if re.search(pattern, lowered)
+        ),
+        None,
+    )
+
+
+def translate_headline(title, target_lang):
+    """Translate *title* into *target_lang*, falling back to *title* on failure.
+
+    Sends the headline to the ``translate.googleapis.com/translate_a/single``
+    endpoint with English (``sl=en``) as the source language, using only the
+    standard library.  Results are memoized in ``_TRANSLATE_CACHE`` under
+    ``(title, target_lang)``.
+
+    Any error (network, timeout, unexpected payload) yields the original
+    title.  That fallback is cached as well, so a headline that failed once
+    is not retried.
+    """
+    # Nothing to translate: skip the network round trip entirely.
+    if not title:
+        return title
+    key = (title, target_lang)
+    if key in _TRANSLATE_CACHE:
+        return _TRANSLATE_CACHE[key]
+    try:
+        q = urllib.parse.quote(title)
+        # Quote the language code too so it cannot alter the query string.
+        tl = urllib.parse.quote(str(target_lang), safe="")
+        url = ("https://translate.googleapis.com/translate_a/single"
+               f"?client=gtx&sl=en&tl={tl}&dt=t&q={q}")
+        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
+        # The context manager closes the HTTP response even if reading or
+        # decoding fails.
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read())
+        # data[0] is read as a sequence of segments whose first element is the
+        # translated text; empty segments are skipped.
+        result = "".join(p[0] for p in data[0] if p[0]) or title
+    except Exception:
+        # Best effort: show the untranslated headline rather than fail.
+        result = title
+    _TRANSLATE_CACHE[key] = result
+    return result