fix: 优化后端数据更新机制
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -1,11 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""RSS 抓取"""
|
||||
"""RSS 抓取:按源独立超时与错误隔离,单源失败不影响其他源"""
|
||||
import re
|
||||
import socket
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import feedparser
|
||||
|
||||
from config import RSS_FEEDS, KEYWORDS
|
||||
from config import KEYWORDS, FEED_TIMEOUT, get_feed_sources
|
||||
from parser_ai import classify_and_severity
|
||||
|
||||
|
||||
@@ -32,45 +33,62 @@ def _matches_keywords(text: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _fetch_one_feed(name: str, url: str, timeout: int) -> list[dict]:
    """Fetch a single RSS feed and return keyword-matching items.

    Errors are isolated per source: a timeout or parse failure is logged and
    an empty list is returned, so one bad feed never breaks the whole run.
    Deduplication is NOT done here; the caller is responsible for it.

    Args:
        name: Human-readable source name, stored under each item's "source" key.
        url: Feed URL passed to feedparser.
        timeout: Per-source socket timeout in seconds.

    Returns:
        List of item dicts with keys: title, summary, url, published,
        category, severity, source.
    """
    # feedparser.parse() has no timeout argument, so temporarily override the
    # process-wide default socket timeout and restore it afterwards.
    # NOTE(review): this mutates global socket state and is not thread-safe.
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)
    try:
        feed = feedparser.parse(
            url,
            agent="US-Iran-Dashboard/1.0",
        )
    except Exception as e:
        # Single-source failure must not abort the other feeds.
        print(f" [rss] {name} error: {e}")
        return []
    finally:
        # Always restore the previous global timeout, even on error.
        socket.setdefaulttimeout(old_timeout)

    out = []
    for entry in feed.entries:
        title = getattr(entry, "title", "") or ""
        # Some feeds use <summary>, others <description>; take whichever exists.
        raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
        summary = _strip_html(raw_summary)
        link = getattr(entry, "link", "") or ""
        text = f"{title} {summary}"
        # Keep only entries matching the configured keyword list.
        if not _matches_keywords(text):
            continue
        published = _parse_date(entry)
        cat, sev = classify_and_severity(text)
        out.append({
            "title": title,
            # Cap stored summaries at 400 chars; fall back to the title.
            "summary": summary[:400] if summary else title,
            "url": link,
            "published": published,
            "category": cat,
            "severity": sev,
            "source": name,
        })
    return out
||||
|
||||
|
||||
def fetch_all() -> list[dict]:
    """Fetch every configured RSS source with per-source timeout and error
    isolation, then return a globally de-duplicated item list.

    Returns:
        List of item dicts (title, summary, url, published, category,
        severity). The "source" key produced per feed is stripped before
        returning — see note below.
    """
    sources = get_feed_sources()
    if not sources:
        return []

    items: list[dict] = []
    # De-duplicate across ALL feeds by (truncated title, url).
    seen: set[tuple[str, str]] = set()

    for name, url in sources:
        batch = _fetch_one_feed(name, url, FEED_TIMEOUT)
        for item in batch:
            key = (item["title"][:80], item["url"])
            if key in seen:
                continue
            seen.add(key)
            # The DB schema does not include "source" yet, so drop it here;
            # can be extended later.
            items.append({k: v for k, v in item.items() if k != "source"})

    return items
|
||||
|
||||
Reference in New Issue
Block a user