forked from genewildish/Mainline
feat: Introduce translate module for Google Translate integration and location-based language detection, and add a new filter module.
This commit is contained in:
60
engine/filter.py
Normal file
60
engine/filter.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""
|
||||||
|
HTML stripping and content filter (sports, vapid, insipid).
|
||||||
|
No internal dependencies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from html import unescape
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
|
# ─── HTML STRIPPING ───────────────────────────────────────
|
||||||
|
class _Strip(HTMLParser):
    """HTML parser that keeps only the character data it is handed.

    Tags, comments and other markup events fall through to the no-op
    handlers inherited from ``HTMLParser``; only ``handle_data`` records
    anything.
    """

    def __init__(self):
        super().__init__()
        self._t = []  # text fragments, in the order the parser reported them

    def handle_data(self, d):
        """Record one run of character data."""
        self._t.append(d)

    def text(self):
        """Return all recorded fragments joined, with outer whitespace trimmed."""
        return "".join(self._t).strip()


def strip_tags(html):
    """Return the plain text of ``html`` with markup removed.

    The input is entity-unescaped before parsing, so markup that arrives
    escaped (``&lt;b&gt;``) is stripped as well.

    Args:
        html: Markup string; ``None`` or empty is treated as ``""``.

    Returns:
        The text content with leading/trailing whitespace stripped.
    """
    s = _Strip()
    s.feed(unescape(html or ""))
    # feed() alone holds back trailing text that contains an '&' not followed
    # by whitespace or ';' (it might be a character reference cut in half),
    # which silently dropped headlines such as "AT&T". close() signals
    # end-of-input so the parser flushes whatever it is still buffering.
    s.close()
    return s.text()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CONTENT FILTER ───────────────────────────────────────
|
||||||
|
# Words and phrases that mark a headline as sports coverage.
_SPORTS_TERMS = (
    r'football', r'soccer', r'basketball', r'baseball', r'softball',
    r'tennis', r'golf', r'cricket', r'rugby',
    r'hockey', r'lacrosse', r'volleyball', r'badminton',
    r'nba', r'nfl', r'nhl', r'mlb', r'mls', r'fifa', r'uefa',
    r'premier league', r'champions league', r'la liga', r'serie a',
    r'bundesliga',
    r'world cup', r'super bowl', r'world series', r'stanley cup',
    r'playoff', r'playoffs', r'touchdown', r'goalkeeper', r'striker',
    r'quarterback',
    r'slam dunk', r'home run', r'grand slam', r'offside', r'halftime',
    r'batting', r'wicket', r'innings',
    r'formula 1', r'nascar', r'motogp',
    r'boxing', r'ufc', r'mma',
    r'marathon', r'tour de france',
    r'transfer window', r'draft pick', r'relegation',
)

# Words and phrases that mark a headline as vapid / insipid filler.
_VAPID_TERMS = (
    r'kardashian', r'jenner', r'reality tv', r'reality show',
    r'influencer', r'viral video', r'tiktok', r'instagram',
    r'best dressed', r'worst dressed', r'red carpet',
    r'horoscope', r'zodiac', r'gossip', r'bikini', r'selfie',
    # The '.' accepts any single character in the apostrophe position.
    r'you won.t believe', r'what happened next',
    r'celebrity couple', r'celebrity feud', r'baby bump',
)

# Single case-insensitive alternation over every term. The \b anchors on
# both sides stop a term from matching inside a longer word.
_SKIP_RE = re.compile(
    r'\b(?:' + '|'.join(_SPORTS_TERMS + _VAPID_TERMS) + r')\b',
    re.IGNORECASE,
)


def skip(title):
    """Tell whether a headline should be filtered out.

    Args:
        title: Headline text to scan.

    Returns:
        True when any sports or vapid/insipid term occurs in ``title`` as a
        whole word or phrase (case-insensitive), otherwise False.
    """
    return _SKIP_RE.search(title) is not None
|
||||||
41
engine/translate.py
Normal file
41
engine/translate.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""
|
||||||
|
Google Translate wrapper and location→language detection.
|
||||||
|
Depends on: sources (for LOCATION_LANGS).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
from engine.sources import LOCATION_LANGS
|
||||||
|
|
||||||
|
# In-process memo for translate_headline(): maps (title, target_lang) to the
# string that call returned. Nothing in the code shown here evicts entries.
_TRANSLATE_CACHE = {}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_location_language(title):
    """Pick a target language based on a location mentioned in the headline.

    Each key of ``LOCATION_LANGS`` is applied as a regular expression to the
    lower-cased headline, in the mapping's iteration order.

    Args:
        title: Headline text.

    Returns:
        The ``LOCATION_LANGS`` value paired with the first pattern that
        matches, or ``None`` when no pattern matches.
    """
    lowered = title.lower()
    matches = (
        lang
        for pattern, lang in LOCATION_LANGS.items()
        if re.search(pattern, lowered)
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(matches, None)
|
||||||
|
|
||||||
|
|
||||||
|
def translate_headline(title, target_lang):
    """Translate a headline via the Google Translate web endpoint.

    Only the standard library is used (``urllib`` + ``json``). The request
    declares the source language as English (``sl=en``). This is a
    best-effort call: any failure yields the untranslated headline instead
    of raising.

    Args:
        title: Headline text to translate.
        target_lang: Language code sent as the ``tl`` query parameter.

    Returns:
        The translated text, or ``title`` unchanged when there is nothing to
        translate, no target language, the request fails, or the response
        contains no translated segments.
    """
    # Nothing to translate, or nowhere to translate to: skip the round trip.
    if not title or not target_lang:
        return title

    key = (title, target_lang)
    if key in _TRANSLATE_CACHE:
        return _TRANSLATE_CACHE[key]

    try:
        q = urllib.parse.quote(title)
        # Escape the language code too so it cannot alter the query string.
        tl = urllib.parse.quote(target_lang, safe="")
        url = ("https://translate.googleapis.com/translate_a/single"
               f"?client=gtx&sl=en&tl={tl}&dt=t&q={q}")
        req = urllib.request.Request(url, headers={"User-Agent": "mainline/0.1"})
        # The context manager closes the connection even if reading or
        # decoding fails.
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        # data[0] is iterated as a list of segments whose first element is
        # the translated text; empty segments are skipped.
        result = "".join(seg[0] for seg in data[0] if seg[0]) or title
    except Exception:
        # Deliberate best-effort: network, HTTP, decoding or response-shape
        # problems all fall back to the original headline.
        result = title

    # Fallbacks are cached as well, so a failing headline is not retried
    # (and does not wait on the timeout again) on every call.
    _TRANSLATE_CACHE[key] = result
    return result
|
||||||
Reference in New Issue
Block a user