# -*- coding: utf-8 -*-
"""
Fetch the body text of an article from its URL so the AI can extract precise data.

RSS feeds only provide a title and a short summary; the article body can supply
concrete figures and facts such as casualties, unit designations and locations.
"""
import logging
import os
import re
from typing import Optional

_logger = logging.getLogger(__name__)

# Per-page request timeout (seconds).
FETCH_TIMEOUT = int(os.environ.get("ARTICLE_FETCH_TIMEOUT", "12"))
# Maximum number of body characters kept, to avoid overly long inputs.
MAX_BODY_CHARS = int(os.environ.get("ARTICLE_MAX_BODY_CHARS", "6000"))
# Whether body fetching is enabled; any value other than "1" (e.g. "0") means
# only the title + summary are used.
FETCH_FULL_ARTICLE = os.environ.get("FETCH_FULL_ARTICLE", "1") == "1"

# Raw HTML shorter than this is not treated as a usable page.
_MIN_HTML_CHARS = 200
# Extracted text must be longer than this to count as an article body.
_MIN_BODY_CHARS = 300

# <script>/<style> elements are removed together with their contents, since
# that content is code, not article text.
_SCRIPT_RE = re.compile(r"<script\b[^>]*>[\s\S]*?</script\s*>", re.I)
_STYLE_RE = re.compile(r"<style\b[^>]*>[\s\S]*?</style\s*>", re.I)
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")

# CSS selectors tried in order when looking for the main content container.
_CONTENT_SELECTORS = (
    "article",
    "main",
    "[role='main']",
    ".article-body",
    ".post-content",
    ".entry-content",
    ".content",
)


def _strip_html(html: str) -> str:
    """Strip HTML tags and collapse whitespace using plain regular expressions.

    ``<script>`` and ``<style>`` elements are dropped including their
    contents; every other tag is replaced by a space. Returns ``""`` for
    empty input.
    """
    if not html:
        return ""
    text = _SCRIPT_RE.sub(" ", html)
    text = _STYLE_RE.sub(" ", text)
    text = _TAG_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def _extract_main_text(html: str) -> Optional[str]:
    """Extract the main text with BeautifulSoup, or return None to fall back.

    Tries each selector in ``_CONTENT_SELECTORS`` in order and then ``<body>``;
    the first candidate whose text is longer than ``_MIN_BODY_CHARS`` wins.
    Returns None when BeautifulSoup is not installed, parsing fails, or no
    candidate is long enough.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None

    try:
        soup = BeautifulSoup(html, "html.parser")
        for selector in _CONTENT_SELECTORS:
            node = soup.select_one(selector)
            if node is None:
                continue
            text = node.get_text(separator=" ", strip=True)
            if len(text) > _MIN_BODY_CHARS:
                return _strip_html(text)
        text = soup.body.get_text(separator=" ", strip=True) if soup.body else ""
        if len(text) > _MIN_BODY_CHARS:
            return _strip_html(text)
    except Exception as exc:  # best effort: caller falls back to regex stripping
        _logger.debug("BeautifulSoup extraction failed: %s", exc)
    return None


def fetch_article_body(url: str, timeout: int = FETCH_TIMEOUT) -> Optional[str]:
    """Request an article URL and return its body as plain text.

    Prefers BeautifulSoup (article/main containers, then ``<body>``) and falls
    back to regex-based tag stripping. The result is truncated to
    ``MAX_BODY_CHARS``.

    Args:
        url: Article URL; must start with ``http://`` or ``https://``.
        timeout: Request timeout in seconds.

    Returns:
        The extracted text, or None when the URL is invalid, the request
        fails, the response is not HTML/XML, or the page is too short.
        This function never raises.
    """
    if not url:
        return None
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        return None

    try:
        import requests

        headers = {"User-Agent": "US-Iran-Dashboard/1.0 (News Aggregator)"}
        # Bypass proxies unless explicitly enabled, to avoid proxy-induced timeouts.
        use_proxy = os.environ.get("CRAWLER_USE_PROXY") == "1"
        proxies = None if use_proxy else {"http": None, "https": None}

        r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        ct = (r.headers.get("Content-Type") or "").lower()
        if "html" not in ct and "xml" not in ct:
            return None
        if "charset" not in ct:
            # Without a declared charset, requests decodes text/* as ISO-8859-1,
            # which garbles UTF-8 pages; use the detected encoding instead.
            r.encoding = r.apparent_encoding or r.encoding
        html = r.text
        if not html or len(html) < _MIN_HTML_CHARS:
            return None

        text = _extract_main_text(html)
        if text is None:
            text = _strip_html(html)
        return text[:MAX_BODY_CHARS]
    except Exception as exc:  # network errors, missing requests, decoding issues
        _logger.debug("Fetching article body from %s failed: %s", url, exc)
        return None


def enrich_item_with_body(item: dict, max_chars: int = MAX_BODY_CHARS) -> None:
    """Fetch the article body for ``item`` and store it in ``item["full_text"]``.

    Does nothing when fetching is disabled, the item has no ``url``, it already
    has a non-empty ``full_text``, or the fetch yields nothing. Otherwise
    ``full_text`` becomes the non-empty parts of title, summary and body joined
    by newlines, truncated to ``max_chars``. Modifies ``item`` in place.

    Args:
        item: News item dict; reads ``url``, ``title``, ``summary``, ``full_text``.
        max_chars: Maximum length of the stored ``full_text``.
    """
    if not FETCH_FULL_ARTICLE:
        return
    url = (item.get("url") or "").strip()
    if not url or item.get("full_text"):
        return
    body = fetch_article_body(url)
    if not body:
        return
    title = (item.get("title") or "").strip()
    summary = (item.get("summary") or "").strip()
    # Skip empty parts so a missing title does not waste budget on a blank line.
    combined = "\n".join(part for part in (title, summary, body) if part)
    item["full_text"] = combined[:max_chars]