Files
usa/crawler/scrapers/rss_scraper.py
2026-03-03 13:02:28 +08:00

95 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""RSS fetching: independent per-source timeouts and error isolation — a single failing source does not affect the others."""
import re
import socket
from datetime import datetime, timezone
import feedparser
from config import KEYWORDS, FEED_TIMEOUT, get_feed_sources
from parser_ai import classify_and_severity
def _parse_date(entry) -> datetime:
for attr in ("published_parsed", "updated_parsed"):
val = getattr(entry, attr, None)
if val:
try:
return datetime(*val[:6], tzinfo=timezone.utc)
except (TypeError, ValueError):
pass
return datetime.now(timezone.utc)
def _strip_html(s: str) -> str:
return re.sub(r"<[^>]+>", "", s) if s else ""
def _matches_keywords(text: str) -> bool:
    """True when *text* contains any configured keyword (case-insensitive)."""
    haystack = (text or "").lower()
    return any(keyword.lower() in haystack for keyword in KEYWORDS)
def _fetch_one_feed(name: str, url: str, timeout: int) -> list[dict]:
    """Fetch one RSS source; returns [] on timeout or error. No dedup here.

    Args:
        name: Human-readable source name (stamped onto each item).
        url: Feed URL passed to feedparser.
        timeout: Per-source socket timeout in seconds.

    Returns:
        A list of item dicts (title/summary/url/published/category/
        severity/source) for entries matching the configured keywords.
    """
    # feedparser.parse() takes no timeout argument, so set the global
    # socket default for the duration of this fetch and restore it after.
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)
    try:
        feed = feedparser.parse(
            url,
            request_headers={"User-Agent": "US-Iran-Dashboard/1.0"},
            agent="US-Iran-Dashboard/1.0",
        )
    except Exception as e:
        print(f" [rss] {name} error: {e}")
        return []
    finally:
        socket.setdefaulttimeout(old_timeout)
    # BUGFIX: feedparser swallows most network/parse errors internally and
    # reports them via `feed.bozo` instead of raising, so the except branch
    # above almost never fires and failing sources used to vanish silently.
    # Surface the error; only bail out when nothing usable was salvaged
    # (a bozo feed can still carry valid entries from a partial parse).
    if getattr(feed, "bozo", False):
        bozo_err = getattr(feed, "bozo_exception", "parse failed")
        print(f" [rss] {name} error: {bozo_err}")
        if not feed.entries:
            return []
    out = []
    for entry in feed.entries:
        title = getattr(entry, "title", "") or ""
        # Feeds vary: some put the body in `summary`, others in `description`.
        raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
        summary = _strip_html(raw_summary)
        link = getattr(entry, "link", "") or ""
        text = f"{title} {summary}"
        if not _matches_keywords(text):
            continue
        published = _parse_date(entry)
        cat, sev = classify_and_severity(text)
        out.append({
            "title": title,
            # Cap stored summaries at 400 chars; fall back to the title.
            "summary": summary[:400] if summary else title,
            "url": link,
            "published": published,
            "category": cat,
            "severity": sev,
            "source": name,
        })
    return out
def fetch_all() -> list[dict]:
    """Fetch every configured RSS source and return globally deduped items.

    Each source gets its own timeout and error isolation via
    ``_fetch_one_feed``; duplicates are detected across sources by a
    (truncated title, url) fingerprint.
    """
    sources = get_feed_sources()
    if not sources:
        return []
    deduped: list[dict] = []
    seen_keys: set[tuple[str, str]] = set()
    for src_name, src_url in sources:
        for item in _fetch_one_feed(src_name, src_url, FEED_TIMEOUT):
            fingerprint = (item["title"][:80], item["url"])
            if fingerprint not in seen_keys:
                seen_keys.add(fingerprint)
                # The DB schema has no `source` column yet — drop it for now.
                deduped.append({k: v for k, v in item.items() if k != "source"})
    return deduped