# -*- coding: utf-8 -*-
"""RSS fetching with a per-source timeout and per-source error isolation.

A failure while downloading or parsing one source yields an empty batch for
that source and does not prevent the remaining sources from being fetched.
"""
import re
import socket
from datetime import datetime, timezone

import feedparser

from config import KEYWORDS, FEED_TIMEOUT, get_feed_sources
from parser_ai import classify_and_severity

# Matches any single HTML/XML tag; compiled once because it runs per entry.
_TAG_RE = re.compile(r"<[^>]+>")


def _parse_date(entry) -> datetime:
    """Return the entry's timestamp as a timezone-aware UTC datetime.

    ``published_parsed`` is tried first, then ``updated_parsed``. The first
    six fields of the first usable value are passed to ``datetime``. When
    neither attribute is present or convertible, the current UTC time is
    returned instead.
    """
    for attr in ("published_parsed", "updated_parsed"):
        val = getattr(entry, attr, None)
        if not val:
            continue
        try:
            # NOTE(review): the tuple is labelled as UTC without conversion;
            # this assumes feedparser already normalized it to UTC.
            return datetime(*val[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # Malformed time tuple: fall through to the next attribute.
            continue
    return datetime.now(timezone.utc)


def _strip_html(s: str) -> str:
    """Remove ``<...>`` tags from *s*; return ``""`` for empty or None input.

    Only tags are removed. HTML entities such as ``&amp;`` are left as-is.
    """
    return _TAG_RE.sub("", s) if s else ""


def _matches_keywords(text: str) -> bool:
    """Return True if any entry of ``KEYWORDS`` occurs in *text*.

    The comparison is a case-insensitive substring test; empty or None text
    never matches.
    """
    haystack = (text or "").lower()
    return any(k.lower() in haystack for k in KEYWORDS)


def _fetch_one_feed(name: str, url: str, timeout: int) -> list[dict]:
    """Fetch one RSS source and return its keyword-matching entries.

    Args:
        name: Label of the source; used in log output and stored under the
            ``"source"`` key of every returned item.
        url: Feed URL handed to ``feedparser.parse``.
        timeout: Socket timeout in seconds applied while the feed is fetched.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``url``,
        ``published``, ``category``, ``severity`` and ``source``. An empty
        list is returned when the fetch raises or yields no entries.
        No de-duplication is done here.
    """
    # The timeout is set through the process-wide socket default and restored
    # in ``finally`` so later socket users keep their previous setting.
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)
    try:
        feed = feedparser.parse(
            url,
            request_headers={"User-Agent": "US-Iran-Dashboard/1.0"},
            agent="US-Iran-Dashboard/1.0",
        )
    except Exception as e:
        # Isolation boundary: one broken source must not abort the whole run.
        print(f" [rss] {name} error: {e}")
        return []
    finally:
        socket.setdefaulttimeout(old_timeout)

    # feedparser usually records download/parse failures in ``bozo`` /
    # ``bozo_exception`` instead of raising, so an unreachable source would
    # otherwise vanish silently. ``bozo`` alone is not treated as fatal,
    # because a flagged feed may still carry usable entries.
    if not feed.entries and getattr(feed, "bozo", False):
        reason = getattr(feed, "bozo_exception", "malformed or unreachable feed")
        print(f" [rss] {name} error: {reason}")
        return []

    out = []
    for entry in feed.entries:
        title = getattr(entry, "title", "") or ""
        raw_summary = (
            getattr(entry, "summary", "")
            or getattr(entry, "description", "")
            or ""
        )
        summary = _strip_html(raw_summary)
        link = getattr(entry, "link", "") or ""

        # Keyword filtering and classification both look at title + summary.
        text = f"{title} {summary}"
        if not _matches_keywords(text):
            continue

        published = _parse_date(entry)
        cat, sev = classify_and_severity(text)
        out.append({
            "title": title,
            # Summary is capped at 400 characters; the title stands in when
            # the entry has no summary text.
            "summary": summary[:400] if summary else title,
            "url": link,
            "published": published,
            "category": cat,
            "severity": sev,
            "source": name,
        })
    return out


def fetch_all() -> list[dict]:
    """Fetch every configured RSS source and return globally de-duplicated items.

    Sources come from ``get_feed_sources()`` and are fetched one after another
    with ``FEED_TIMEOUT`` as the per-source timeout. Two items are considered
    duplicates when the first 80 characters of their title and their URL are
    both equal; the first occurrence wins. The ``"source"`` key is removed
    from every returned item.
    """
    sources = get_feed_sources()
    if not sources:
        return []

    items: list[dict] = []
    seen: set[tuple[str, str]] = set()
    for name, url in sources:
        for item in _fetch_one_feed(name, url, FEED_TIMEOUT):
            key = (item["title"][:80], item["url"])
            if key in seen:
                continue
            seen.add(key)
            # The schema written to the DB has no ``source`` field yet, so it
            # is dropped here; it can be added in a later extension.
            items.append({k: v for k, v in item.items() if k != "source"})
    return items