Files
usa/crawler/article_fetcher.py
2026-03-03 13:02:28 +08:00

91 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
Fetch article bodies from article URLs, for AI extraction of precise data.
RSS provides only titles and short summaries; the full body supplies concrete
numbers and facts such as casualties, unit designations, and locations.
"""
import os
import re
from typing import Optional
# Per-page fetch timeout (seconds)
FETCH_TIMEOUT = int(os.environ.get("ARTICLE_FETCH_TIMEOUT", "12"))
# Maximum number of body characters, to avoid overly long input
MAX_BODY_CHARS = int(os.environ.get("ARTICLE_MAX_BODY_CHARS", "6000"))
# Whether to enable body fetching (set to 0 to use title+summary only)
FETCH_FULL_ARTICLE = os.environ.get("FETCH_FULL_ARTICLE", "1") == "1"
def _strip_html(html: str) -> str:
"""简单去除 HTML 标签与多余空白"""
if not html:
return ""
text = re.sub(r"<script[^>]*>[\s\S]*?</script>", " ", html, flags=re.I)
text = re.sub(r"<style[^>]*>[\s\S]*?</style>", " ", text, flags=re.I)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def fetch_article_body(url: str, timeout: int = FETCH_TIMEOUT) -> Optional[str]:
    """
    Request the article URL and extract its body as plain text.
    Returns None on failure or for non-HTML responses.
    Prefers BeautifulSoup to pull main/article or body text; otherwise
    degrades to regex-based tag stripping.
    """
    # Only attempt http(s)-looking URLs
    if not url or not url.strip().startswith("http"):
        return None
    try:
        import requests
        headers = {"User-Agent": "US-Iran-Dashboard/1.0 (News Aggregator)"}
        # Do not go through a proxy, to avoid timeouts behind the firewall
        # (requests honors env proxies unless explicitly disabled per scheme)
        proxies = {"http": None, "https": None} if os.environ.get("CRAWLER_USE_PROXY") != "1" else None
        r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        ct = (r.headers.get("Content-Type") or "").lower()
        # Skip non-HTML/XML payloads (images, PDFs, JSON APIs, ...)
        if "html" not in ct and "xml" not in ct:
            return None
        html = r.text
        # Responses under 200 chars are unlikely to contain a real article
        if not html or len(html) < 200:
            return None
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # bs4 unavailable: fall back to regex tag stripping of the raw HTML
            return _strip_html(html)[:MAX_BODY_CHARS]
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Try common article containers in order; entries starting with
            # "." or "[" are CSS selectors, bare names are tag names.
            for tag in ("article", "main", "[role='main']", ".article-body", ".post-content", ".entry-content", ".content"):
                if tag.startswith((".", "[")):
                    node = soup.select_one(tag)
                else:
                    node = soup.find(tag)
                if node:
                    body = node.get_text(separator=" ", strip=True)
                    # Require >300 chars so boilerplate-only matches are skipped
                    if len(body) > 300:
                        return _strip_html(body)[:MAX_BODY_CHARS]
            # No container matched: fall back to the whole <body> text
            body = soup.body.get_text(separator=" ", strip=True) if soup.body else ""
            if len(body) > 300:
                return _strip_html(body)[:MAX_BODY_CHARS]
        except Exception:
            # Parser problems are non-fatal; fall through to raw stripping
            pass
        # Last resort: strip tags from the raw HTML
        return _strip_html(html)[:MAX_BODY_CHARS]
    except Exception:
        # Best-effort fetch: any network/HTTP error yields None for the caller
        return None
def enrich_item_with_body(item: dict, max_chars: int = MAX_BODY_CHARS) -> None:
    """
    Fetch the article body for item["url"] and store it in item["full_text"].

    Does nothing when full-text fetching is disabled, when the item has no
    URL, or when "full_text" is already set. Mutates *item* in place; the
    stored text is title + (optional summary) + body, joined by newlines
    and capped at *max_chars*. Used to give the AI extractor more context.
    """
    if not FETCH_FULL_ARTICLE:
        return
    link = (item.get("url") or "").strip()
    if not link or item.get("full_text"):
        return
    article_text = fetch_article_body(link)
    if not article_text:
        return
    # Assemble title / summary / body, skipping the summary when empty.
    pieces = [(item.get("title") or "").strip()]
    summary_text = (item.get("summary") or "").strip()
    if summary_text:
        pieces.append(summary_text)
    pieces.append(article_text)
    item["full_text"] = "\n".join(pieces)[:max_chars]