# Reconstructed from a whitespace-collapsed git patch:
#   commit  d3c403848c7dde0c4a2281c990d95c1bf2bc7f48
#   author  Gene Johnson
#   date    Sat, 14 Mar 2026 21:00:24 -0700
#   subject feat: Introduce `translate` module for Google Translate
#           integration and location-based language detection, and add a
#           new `filter` module.
# The patch creates two files; their contents appear below in patch order:
#   engine/filter.py     (HTML stripping + content filter)
#   engine/translate.py  (Google Translate wrapper + language detection)
"""
HTML stripping and content filter (sports, vapid, insipid), plus a
Google Translate wrapper and location→language detection.

The filter helpers have no internal dependencies.
The translate helpers depend on: sources (for LOCATION_LANGS).
"""

import json
import re
import urllib.parse
import urllib.request
from html import unescape
from html.parser import HTMLParser

from engine.sources import LOCATION_LANGS


# ─── HTML STRIPPING ───────────────────────────────────────
class _Strip(HTMLParser):
    """HTMLParser subclass that keeps only the text nodes it is fed."""

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reports them.
        self._t = []

    def handle_data(self, d):
        """Record one run of character data; tags are ignored."""
        self._t.append(d)

    def text(self):
        """Return all collected text joined together, outer whitespace removed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return *html* with markup removed and entities decoded.

    The input is entity-unescaped *before* parsing, so markup that arrives
    escaped (``&lt;b&gt;``) is stripped as well. ``None`` or an empty
    string yields ``""``.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # close() forces the parser to process whatever it is still buffering.
    # Without it, trailing text containing a bare "&" (e.g. "AT&T", "Q&A")
    # is held back waiting for more input and never reaches handle_data().
    s.close()
    return s.text()


# ─── CONTENT FILTER ───────────────────────────────────────
# One case-insensitive alternation, anchored on word boundaries, of every
# keyword/phrase that marks a headline as skippable.
_SKIP_RE = re.compile(
    r'\b(?:'
    # ── sports ──
    r'football|soccer|basketball|baseball|softball|tennis|golf|cricket|rugby|'
    r'hockey|lacrosse|volleyball|badminton|'
    r'nba|nfl|nhl|mlb|mls|fifa|uefa|'
    r'premier league|champions league|la liga|serie a|bundesliga|'
    r'world cup|super bowl|world series|stanley cup|'
    r'playoff|playoffs|touchdown|goalkeeper|striker|quarterback|'
    r'slam dunk|home run|grand slam|offside|halftime|'
    r'batting|wicket|innings|'
    r'formula 1|nascar|motogp|'
    r'boxing|ufc|mma|'
    r'marathon|tour de france|'
    r'transfer window|draft pick|relegation|'
    # ── vapid / insipid ──
    r'kardashian|jenner|reality tv|reality show|'
    r'influencer|viral video|tiktok|instagram|'
    r'best dressed|worst dressed|red carpet|'
    r'horoscope|zodiac|gossip|bikini|selfie|'
    # "." stands in for the apostrophe so any apostrophe character matches.
    r'you won.t believe|what happened next|'
    r'celebrity couple|celebrity feud|baby bump'
    r')\b',
    re.IGNORECASE
)


def skip(title):
    """Return True if headline is sports, vapid, or insipid.

    A ``None`` or empty *title* matches nothing and returns False.
    """
    return bool(_SKIP_RE.search(title or ""))


# ─── TRANSLATION ──────────────────────────────────────────
# (title, target_lang) -> translated text. Lives for the life of the
# process and is never evicted.
_TRANSLATE_CACHE = {}


def detect_location_language(title):
    """Detect if headline mentions a location, return target language.

    Each key of ``LOCATION_LANGS`` is used as a regex and searched for in
    the lower-cased title; the value paired with the first key that matches
    (in the mapping's iteration order) is returned. Returns ``None`` when
    nothing matches or when *title* is ``None``/empty.
    """
    title_lower = (title or "").lower()
    for pattern, lang in LOCATION_LANGS.items():
        if re.search(pattern, title_lower):
            return lang
    return None


def translate_headline(title, target_lang):
    """Translate headline via Google Translate API (zero dependencies).

    The source language is fixed to English (``sl=en``). Results are
    memoised per ``(title, target_lang)``.

    This is deliberately best-effort: on any failure (network error,
    timeout, unexpected response shape) the original *title* is returned
    unchanged, and that fallback is cached too, so a failing headline is
    not retried on every call.
    """
    key = (title, target_lang)
    if key in _TRANSLATE_CACHE:
        return _TRANSLATE_CACHE[key]
    try:
        q = urllib.parse.quote(title)
        # Percent-encode the language code as well so it cannot inject
        # extra query parameters; ordinary codes ("ja", "zh-CN") are
        # left unchanged by quote().
        tl = urllib.parse.quote(str(target_lang), safe="")
        url = ("https://translate.googleapis.com/translate_a/single"
               f"?client=gtx&sl=en&tl={tl}&dt=t&q={q}")
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # Context manager guarantees the HTTP response/socket is released.
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        # data[0] is read as a sequence of segments whose first element is
        # the translated text; empty segments are dropped. An empty result
        # falls back to the untranslated title.
        result = "".join(p[0] for p in data[0] if p[0]) or title
    except Exception:
        # Best-effort by design: never let a translation failure propagate.
        result = title
    _TRANSLATE_CACHE[key] = result
    return result