fix: 优化后端数据更新机制
This commit is contained in:
90
crawler/article_fetcher.py
Normal file
90
crawler/article_fetcher.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# -*- coding: utf-8 -*-
"""
Fetch article body text from article URLs, for precise AI data extraction.

RSS feeds provide only a title and a short summary; the full body supplies
concrete numbers and facts such as casualties, unit designations, and
locations.
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Per-page fetch timeout in seconds (override via ARTICLE_FETCH_TIMEOUT).
FETCH_TIMEOUT = int(os.environ.get("ARTICLE_FETCH_TIMEOUT", "12"))
# Maximum number of body characters kept, to avoid overly long model input.
MAX_BODY_CHARS = int(os.environ.get("ARTICLE_MAX_BODY_CHARS", "6000"))
# Whether to fetch full article bodies (set to 0 to use title+summary only).
FETCH_FULL_ARTICLE = os.environ.get("FETCH_FULL_ARTICLE", "1") == "1"
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
"""简单去除 HTML 标签与多余空白"""
|
||||
if not html:
|
||||
return ""
|
||||
text = re.sub(r"<script[^>]*>[\s\S]*?</script>", " ", html, flags=re.I)
|
||||
text = re.sub(r"<style[^>]*>[\s\S]*?</style>", " ", text, flags=re.I)
|
||||
text = re.sub(r"<[^>]+>", " ", text)
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
|
||||
def fetch_article_body(url: str, timeout: int = FETCH_TIMEOUT) -> Optional[str]:
    """
    Download *url* and return its body as plain text; None on any failure.

    Prefers BeautifulSoup extraction of a main/article container (falling
    back to <body>); without bs4 installed, degrades to regex tag
    stripping. Non-HTML responses and tiny pages (< 200 chars) yield None.
    """
    if not url or not url.strip().startswith("http"):
        return None
    try:
        import requests

        headers = {"User-Agent": "US-Iran-Dashboard/1.0 (News Aggregator)"}
        # Bypass proxies unless explicitly enabled, to avoid timeouts
        # behind the firewall.
        proxies = None
        if os.environ.get("CRAWLER_USE_PROXY") != "1":
            proxies = {"http": None, "https": None}
        resp = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        resp.raise_for_status()
        content_type = (resp.headers.get("Content-Type") or "").lower()
        if "html" not in content_type and "xml" not in content_type:
            return None
        markup = resp.text
        if not markup or len(markup) < 200:
            return None
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # bs4 unavailable: fall back to plain regex stripping.
            return _strip_html(markup)[:MAX_BODY_CHARS]
        try:
            soup = BeautifulSoup(markup, "html.parser")
            # Most-specific containers first; CSS selectors start with
            # "." or "[", plain names are tag lookups.
            candidates = (
                "article",
                "main",
                "[role='main']",
                ".article-body",
                ".post-content",
                ".entry-content",
                ".content",
            )
            for sel in candidates:
                found = (
                    soup.select_one(sel)
                    if sel.startswith((".", "["))
                    else soup.find(sel)
                )
                if found:
                    text = found.get_text(separator=" ", strip=True)
                    # Accept only substantial containers; short matches are
                    # likely navigation or boilerplate.
                    if len(text) > 300:
                        return _strip_html(text)[:MAX_BODY_CHARS]
            if soup.body:
                text = soup.body.get_text(separator=" ", strip=True)
                if len(text) > 300:
                    return _strip_html(text)[:MAX_BODY_CHARS]
        except Exception:
            # Parsing problems fall through to the regex fallback below.
            pass
        return _strip_html(markup)[:MAX_BODY_CHARS]
    except Exception:
        # Best-effort fetch: any network/HTTP error just yields None.
        return None
|
||||
|
||||
|
||||
def enrich_item_with_body(item: dict, max_chars: int = MAX_BODY_CHARS) -> None:
    """
    Fetch the article body for *item* and store it as item["full_text"].

    Gives the AI extraction step more context. Mutates *item* in place;
    does nothing when fetching is disabled, when the item has no URL,
    when full_text is already present, or when the fetch fails.
    """
    if not FETCH_FULL_ARTICLE:
        return
    link = (item.get("url") or "").strip()
    if not link or item.get("full_text"):
        return
    body = fetch_article_body(link)
    if not body:
        return
    # Assemble title [+ summary] + body, newline-separated, then truncate.
    pieces = [(item.get("title") or "").strip()]
    brief = (item.get("summary") or "").strip()
    if brief:
        pieces.append(brief)
    pieces.append(body)
    item["full_text"] = "\n".join(pieces)[:max_chars]
|
||||
Reference in New Issue
Block a user