fix: 修复爬虫问题
This commit is contained in:
@@ -1,4 +1,8 @@
|
|||||||
# Mapbox 地图令牌 (波斯湾区域展示)
|
# Mapbox 地图令牌 (波斯湾区域展示)
|
||||||
# 免费申请: https://account.mapbox.com/access-tokens/
|
# 免费申请: https://account.mapbox.com/access-tokens/
|
||||||
# 复制本文件为 .env 并填入你的 token
|
|
||||||
VITE_MAPBOX_ACCESS_TOKEN=your_mapbox_public_token_here
|
VITE_MAPBOX_ACCESS_TOKEN=your_mapbox_public_token_here
|
||||||
|
|
||||||
|
# 阿里云 DashScope API Key(爬虫 AI 提取用,不设则用规则或 Ollama)
|
||||||
|
# 在 crawler 目录或系统环境变量中设置,例如:
|
||||||
|
# export DASHSCOPE_API_KEY=sk-xxx
|
||||||
|
DASHSCOPE_API_KEY=
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ COPY crawler ./
|
|||||||
|
|
||||||
ENV DB_PATH=/data/data.db
|
ENV DB_PATH=/data/data.db
|
||||||
ENV API_BASE=http://api:3001
|
ENV API_BASE=http://api:3001
|
||||||
ENV CLEANER_AI_DISABLED=1
|
|
||||||
ENV GDELT_DISABLED=1
|
ENV GDELT_DISABLED=1
|
||||||
ENV RSS_INTERVAL_SEC=60
|
ENV RSS_INTERVAL_SEC=60
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
BIN
crawler/__pycache__/extractor_dashscope.cpython-39.pyc
Normal file
BIN
crawler/__pycache__/extractor_dashscope.cpython-39.pyc
Normal file
Binary file not shown.
BIN
crawler/__pycache__/news_storage.cpython-39.pyc
Normal file
BIN
crawler/__pycache__/news_storage.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
@@ -10,6 +10,9 @@ DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
|
|||||||
# Node API 地址(用于通知推送)
|
# Node API 地址(用于通知推送)
|
||||||
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
|
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
|
||||||
|
|
||||||
|
# 阿里云 DashScope API Key(用于 AI 提取面板数据,不设则回退到规则/Ollama)
|
||||||
|
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||||
|
|
||||||
# 抓取间隔(秒)
|
# 抓取间隔(秒)
|
||||||
CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300"))
|
CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300"))
|
||||||
|
|
||||||
|
|||||||
121
crawler/extractor_dashscope.py
Normal file
121
crawler/extractor_dashscope.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
阿里云 DashScope(通义千问)提取面板结构化数据
|
||||||
|
从新闻文本中提取战损、报复指数、基地状态等,供 db_merge 落库
|
||||||
|
API Key 通过环境变量 DASHSCOPE_API_KEY 配置
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from panel_schema import validate_category, validate_severity, validate_summary
|
||||||
|
|
||||||
|
|
||||||
|
def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
|
||||||
|
"""调用阿里云 DashScope 提取结构化数据"""
|
||||||
|
api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
|
||||||
|
if not api_key or not text or len(str(text).strip()) < 10:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
import dashscope
|
||||||
|
from http import HTTPStatus
|
||||||
|
|
||||||
|
dashscope.api_key = api_key
|
||||||
|
|
||||||
|
prompt = f"""从以下美伊/中东军事新闻中提取可明确推断的数值,输出 JSON。无依据的字段省略不写。
|
||||||
|
|
||||||
|
要求:
|
||||||
|
- summary: 1-2句中文事实摘要,≤80字
|
||||||
|
- category: deployment|alert|intel|diplomatic|other
|
||||||
|
- severity: low|medium|high|critical
|
||||||
|
- 战损(仅当新闻明确提及数字时填写):
|
||||||
|
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
|
||||||
|
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
|
||||||
|
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged,
|
||||||
|
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles
|
||||||
|
- retaliation_sentiment: 0-100,仅当新闻涉及伊朗报复/反击情绪时
|
||||||
|
- wall_street_value: 0-100,仅当新闻涉及美股/市场反应时
|
||||||
|
- key_location_updates: 当新闻提及具体基地遭袭时,数组 [{{"name_keywords":"阿萨德|asad|assad","side":"us","status":"attacked","damage_level":1-3}}]
|
||||||
|
|
||||||
|
原文:
|
||||||
|
{str(text)[:800]}
|
||||||
|
|
||||||
|
直接输出 JSON,不要其他解释:"""
|
||||||
|
|
||||||
|
response = dashscope.Generation.call(
|
||||||
|
model="qwen-turbo",
|
||||||
|
messages=[{"role": "user", "content": prompt}],
|
||||||
|
result_format="message",
|
||||||
|
max_tokens=512,
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != HTTPStatus.OK:
|
||||||
|
return None
|
||||||
|
raw = (response.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip()
|
||||||
|
raw = re.sub(r"^```\w*\s*|\s*```$", "", raw)
|
||||||
|
return json.loads(raw)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _json_number_to_int(value: Any) -> Optional[int]:
    """Convert a decoded JSON number to int; return None for anything else.

    Booleans are rejected even though ``bool`` subclasses ``int``, and
    non-finite floats (``NaN``/``Infinity``, which ``json.loads`` accepts) are
    rejected because ``int()`` raises on them.
    """
    import math  # local import keeps this helper self-contained

    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return int(value)


def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """Extract structured panel data from news text via DashScope.

    Args:
        text: News text passed to ``_call_dashscope_extract``.
        timestamp: Timestamp string stamped on the produced records; defaults
            to the current UTC time formatted as ``%Y-%m-%dT%H:%M:%S.000Z``.

    Returns:
        A dict containing any of the keys ``situation_update``,
        ``combat_losses_delta``, ``retaliation``, ``wall_street`` and
        ``key_location_updates``. It is empty when nothing was extracted.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    parsed = _call_dashscope_extract(text)
    # Guard against empty results and against a non-object JSON reply.
    if not parsed or not isinstance(parsed, dict):
        return out

    if parsed.get("summary"):
        out["situation_update"] = {
            "summary": validate_summary(str(parsed["summary"])[:120], 120),
            "category": validate_category(str(parsed.get("category", "other")).lower()),
            "severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
            "timestamp": ts,
        }

    # Collect per-side loss counters; negative values are clamped to 0.
    loss_us: Dict[str, int] = {}
    loss_ir: Dict[str, int] = {}
    loss_fields = (
        "personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
        "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
    )
    for k in loss_fields:
        for prefix, bucket in (("us", loss_us), ("iran", loss_ir)):
            count = _json_number_to_int(parsed.get(f"{prefix}_{k}"))
            if count is not None:
                bucket[k] = max(0, count)
    if loss_us or loss_ir:
        out["combat_losses_delta"] = {}
        if loss_us:
            out["combat_losses_delta"]["us"] = loss_us
        if loss_ir:
            out["combat_losses_delta"]["iran"] = loss_ir

    # Index values are only accepted when the raw number lies within 0..100.
    if "retaliation_sentiment" in parsed:
        v = parsed["retaliation_sentiment"]
        value = _json_number_to_int(v)
        if value is not None and 0 <= v <= 100:
            out["retaliation"] = {"value": value, "time": ts}

    if "wall_street_value" in parsed:
        v = parsed["wall_street_value"]
        value = _json_number_to_int(v)
        if value is not None and 0 <= v <= 100:
            out["wall_street"] = {"time": ts, "value": value}

    if "key_location_updates" in parsed and isinstance(parsed["key_location_updates"], list):
        valid = []
        for u in parsed["key_location_updates"]:
            if isinstance(u, dict) and u.get("name_keywords") and u.get("side") in ("us", "iran"):
                level = _json_number_to_int(u.get("damage_level"))
                valid.append({
                    "name_keywords": str(u["name_keywords"]),
                    "side": u["side"],
                    "status": str(u.get("status", "attacked"))[:20],
                    # Clamp to 1..3; fall back to 2 when no usable number is given.
                    "damage_level": 2 if level is None else min(3, max(1, level)),
                })
        if valid:
            out["key_location_updates"] = valid

    return out
|
||||||
141
crawler/news_storage.py
Normal file
141
crawler/news_storage.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
资讯内容独立存储,支持历史去重
|
||||||
|
爬虫拉回数据 → 计算 content_hash → 若已存在则跳过(去重)→ 新数据落库 news_content
|
||||||
|
"""
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from config import DB_PATH
|
||||||
|
|
||||||
|
|
||||||
|
def _to_utc_iso(dt: datetime) -> str:
|
||||||
|
if dt.tzinfo:
|
||||||
|
dt = dt.astimezone(timezone.utc)
|
||||||
|
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_for_hash(text: str) -> str:
|
||||||
|
"""归一化文本用于生成去重 hash"""
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
t = re.sub(r"\s+", " ", str(text).strip().lower())[:600]
|
||||||
|
return re.sub(r"[\x00-\x1f]", "", t)
|
||||||
|
|
||||||
|
|
||||||
|
def content_hash(title: str, summary: str, url: str) -> str:
    """Build the de-duplication hash from title, summary and URL.

    Title and summary are normalized with ``_normalize_for_hash``; the URL is
    only stripped. The three parts are joined with ``|`` and the first 32 hex
    characters of the SHA-256 digest are returned.
    """
    parts = (
        _normalize_for_hash(title),
        _normalize_for_hash(summary),
        (url or "").strip(),
    )
    digest = hashlib.sha256("|".join(parts).encode("utf-8"))
    return digest.hexdigest()[:32]
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_table(conn: sqlite3.Connection) -> None:
|
||||||
|
conn.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS news_content (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
content_hash TEXT NOT NULL UNIQUE,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
summary TEXT NOT NULL,
|
||||||
|
url TEXT NOT NULL DEFAULT '',
|
||||||
|
source TEXT NOT NULL DEFAULT '',
|
||||||
|
published_at TEXT NOT NULL,
|
||||||
|
category TEXT NOT NULL DEFAULT 'other',
|
||||||
|
severity TEXT NOT NULL DEFAULT 'medium',
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
try:
|
||||||
|
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)")
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)")
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
pass
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool:
    """Return True when ``news_content`` already holds a row with content hash *h*."""
    cursor = conn.execute(
        "SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1",
        (h,),
    )
    return cursor.fetchone() is not None
|
||||||
|
|
||||||
|
|
||||||
|
def insert_news(
    conn: sqlite3.Connection,
    *,
    title: str,
    summary: str,
    url: str = "",
    source: str = "",
    published: datetime,
    category: str = "other",
    severity: str = "medium",
) -> Optional[str]:
    """Insert one news item unless its content hash is already stored.

    Args:
        conn: Open SQLite connection; the table is created on demand.
        title, summary, url, source: Item fields, truncated to 500 / 2000 /
            500 / 100 characters when stored.
        published: Publication time, stored via ``_to_utc_iso``.
        category, severity: Stored as given.

    Returns:
        The id of the newly inserted row, or None when the item is a
        duplicate and was skipped.

    Raises:
        sqlite3.IntegrityError: For constraint violations other than a
            duplicate content hash.
    """
    _ensure_table(conn)
    h = content_hash(title, summary, url)
    if exists_by_hash(conn, h):
        return None
    # The current time only adds entropy to the id; use the timezone-aware
    # call because datetime.utcnow() is deprecated.
    now = datetime.now(timezone.utc).isoformat()
    uid = "nc_" + hashlib.sha256(f"{h}{now}".encode()).hexdigest()[:14]
    ts = _to_utc_iso(published)
    try:
        conn.execute(
            """INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (uid, h, (title or "")[:500], (summary or "")[:2000], (url or "")[:500], (source or "")[:100], ts, category, severity),
        )
    except sqlite3.IntegrityError:
        # Another writer may have stored the same hash between the existence
        # check and the INSERT. Treat that as a duplicate; re-raise any other
        # constraint violation. Nothing is pending here besides the failed
        # INSERT, because _ensure_table() committed above.
        conn.rollback()
        if exists_by_hash(conn, h):
            return None
        raise
    conn.commit()
    return uid
|
||||||
|
|
||||||
|
|
||||||
|
def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]:
    """Store news items in ``news_content``, skipping already-known content.

    Args:
        items: Dicts with the keys ``title``, ``summary``, ``url``,
            ``published``, ``category``, ``severity`` and optionally
            ``source``. ``published`` may be a ``datetime`` or an ISO-8601
            string; anything else, or an unparsable string, falls back to
            the current UTC time.
        db_path: SQLite file to write to; defaults to ``DB_PATH``.

    Returns:
        ``(new_items, count)``: the items that were actually inserted, each
        extended with a ``news_id`` key, and how many there are. Returns
        ``([], 0)`` when the database file does not exist.
    """
    path = db_path or DB_PATH
    if not os.path.exists(path):
        return [], 0
    conn = sqlite3.connect(path, timeout=10)
    try:
        _ensure_table(conn)
        new_items: List[dict] = []
        for u in items:
            title = (u.get("title") or "")[:500]
            # A missing summary falls back to the title.
            summary = (u.get("summary") or u.get("title") or "")[:2000]
            url = (u.get("url") or "")[:500]
            source = (u.get("source") or "")[:100]
            pub = u.get("published")
            if isinstance(pub, str):
                try:
                    # fromisoformat() rejects a trailing "Z" on older Pythons.
                    pub = datetime.fromisoformat(pub.replace("Z", "+00:00"))
                except ValueError:
                    pub = datetime.now(timezone.utc)
            elif not isinstance(pub, datetime):
                # Covers None and any other non-datetime value, which would
                # otherwise fail later when the timestamp is formatted.
                pub = datetime.now(timezone.utc)
            uid = insert_news(
                conn,
                title=title,
                summary=summary,
                url=url,
                source=source,
                published=pub,
                category=u.get("category", "other"),
                severity=u.get("severity", "medium"),
            )
            if uid:
                new_items.append({**u, "news_id": uid})
        return new_items, len(new_items)
    finally:
        conn.close()
|
||||||
@@ -283,7 +283,7 @@ def _rss_to_gdelt_fallback() -> None:
|
|||||||
|
|
||||||
|
|
||||||
# ==========================
|
# ==========================
|
||||||
# RSS 新闻抓取(补充 situation_update + AI 提取面板数据)
|
# RSS 新闻抓取:资讯落库(去重) → AI 提取 → 面板数据落库 → 通知前端
|
||||||
# ==========================
|
# ==========================
|
||||||
LAST_FETCH = {"items": 0, "inserted": 0, "error": None}
|
LAST_FETCH = {"items": 0, "inserted": 0, "error": None}
|
||||||
|
|
||||||
@@ -292,6 +292,7 @@ def fetch_news() -> None:
|
|||||||
try:
|
try:
|
||||||
from scrapers.rss_scraper import fetch_all
|
from scrapers.rss_scraper import fetch_all
|
||||||
from db_writer import write_updates
|
from db_writer import write_updates
|
||||||
|
from news_storage import save_and_dedup
|
||||||
from translate_utils import translate_to_chinese
|
from translate_utils import translate_to_chinese
|
||||||
from cleaner_ai import clean_news_for_panel
|
from cleaner_ai import clean_news_for_panel
|
||||||
from cleaner_ai import ensure_category, ensure_severity
|
from cleaner_ai import ensure_category, ensure_severity
|
||||||
@@ -304,36 +305,44 @@ def fetch_news() -> None:
|
|||||||
it["summary"] = clean_news_for_panel(raw_summary or raw_title, max_len=120)
|
it["summary"] = clean_news_for_panel(raw_summary or raw_title, max_len=120)
|
||||||
it["category"] = ensure_category(it.get("category", "other"))
|
it["category"] = ensure_category(it.get("category", "other"))
|
||||||
it["severity"] = ensure_severity(it.get("severity", "medium"))
|
it["severity"] = ensure_severity(it.get("severity", "medium"))
|
||||||
n = write_updates(items) if items else 0
|
it["source"] = it.get("source") or "rss"
|
||||||
|
# 1. 历史去重:资讯内容落库 news_content(独立表,便于后续消费)
|
||||||
|
new_items, n_news = save_and_dedup(items, db_path=DB_PATH)
|
||||||
|
# 2. 面板展示:新增资讯写入 situation_update(供前端 recentUpdates)
|
||||||
|
n_panel = write_updates(new_items) if new_items else 0
|
||||||
LAST_FETCH["items"] = len(items)
|
LAST_FETCH["items"] = len(items)
|
||||||
LAST_FETCH["inserted"] = n
|
LAST_FETCH["inserted"] = n_news
|
||||||
if items:
|
# 3. AI 提取 + 合并到 combat_losses / key_location 等
|
||||||
_extract_and_merge_panel_data(items)
|
if new_items:
|
||||||
|
_extract_and_merge_panel_data(new_items)
|
||||||
# GDELT 禁用时用 RSS 填充 gdelt_events,使地图有冲突点
|
# GDELT 禁用时用 RSS 填充 gdelt_events,使地图有冲突点
|
||||||
if GDELT_DISABLED:
|
if GDELT_DISABLED:
|
||||||
_rss_to_gdelt_fallback()
|
_rss_to_gdelt_fallback()
|
||||||
# 每次抓取完成都通知 Node 更新时间戳,便于「实时更新」显示
|
|
||||||
_notify_node()
|
_notify_node()
|
||||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] RSS 抓取 {len(items)} 条,新增入库 {n} 条")
|
print(f"[{datetime.now().strftime('%H:%M:%S')}] RSS 抓取 {len(items)} 条,去重后新增 {n_news} 条资讯,面板 {n_panel} 条")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LAST_FETCH["error"] = str(e)
|
LAST_FETCH["error"] = str(e)
|
||||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻抓取失败: {e}")
|
print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻抓取失败: {e}")
|
||||||
|
|
||||||
|
|
||||||
def _extract_and_merge_panel_data(items: list) -> None:
|
def _extract_and_merge_panel_data(items: list) -> None:
|
||||||
"""对新闻做 AI/规则 提取,合并到 combat_losses / retaliation / wall_street_trend 等表"""
|
"""AI 分析提取面板相关数据,清洗后落库"""
|
||||||
if not items or not os.path.exists(DB_PATH):
|
if not items or not os.path.exists(DB_PATH):
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
from db_merge import merge
|
from db_merge import merge
|
||||||
if os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
|
use_dashscope = bool(os.environ.get("DASHSCOPE_API_KEY", "").strip())
|
||||||
|
if use_dashscope:
|
||||||
|
from extractor_dashscope import extract_from_news
|
||||||
|
limit = 10
|
||||||
|
elif os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
|
||||||
from extractor_rules import extract_from_news
|
from extractor_rules import extract_from_news
|
||||||
|
limit = 25
|
||||||
else:
|
else:
|
||||||
from extractor_ai import extract_from_news
|
from extractor_ai import extract_from_news
|
||||||
|
limit = 10
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
merged_any = False
|
merged_any = False
|
||||||
# 规则模式可多处理几条(无 Ollama);AI 模式限制 5 条避免调用过多
|
|
||||||
limit = 25 if os.environ.get("CLEANER_AI_DISABLED", "0") == "1" else 10
|
|
||||||
for it in items[:limit]:
|
for it in items[:limit]:
|
||||||
text = (it.get("title", "") or "") + " " + (it.get("summary", "") or "")
|
text = (it.get("title", "") or "") + " " + (it.get("summary", "") or "")
|
||||||
if len(text.strip()) < 20:
|
if len(text.strip()) < 20:
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ feedparser>=6.0.0
|
|||||||
fastapi>=0.109.0
|
fastapi>=0.109.0
|
||||||
uvicorn>=0.27.0
|
uvicorn>=0.27.0
|
||||||
deep-translator>=1.11.0
|
deep-translator>=1.11.0
|
||||||
|
dashscope>=1.20.0
|
||||||
|
|||||||
@@ -22,9 +22,9 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- DB_PATH=/data/data.db
|
- DB_PATH=/data/data.db
|
||||||
- API_BASE=http://api:3001
|
- API_BASE=http://api:3001
|
||||||
- CLEANER_AI_DISABLED=1
|
|
||||||
- GDELT_DISABLED=1
|
- GDELT_DISABLED=1
|
||||||
- RSS_INTERVAL_SEC=60
|
- RSS_INTERVAL_SEC=60
|
||||||
|
- DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-}
|
||||||
volumes:
|
volumes:
|
||||||
- app-data:/data
|
- app-data:/data
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|||||||
65
docs/CRAWLER_PIPELINE.md
Normal file
65
docs/CRAWLER_PIPELINE.md
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# 爬虫数据流水线
|
||||||
|
|
||||||
|
## 数据流
|
||||||
|
|
||||||
|
```
|
||||||
|
RSS 抓取
|
||||||
|
↓ 翻译、清洗
|
||||||
|
↓ news_storage.save_and_dedup() → 历史去重
|
||||||
|
↓
|
||||||
|
news_content(资讯独立表,供后续消费)
|
||||||
|
↓
|
||||||
|
↓ 去重后的新数据
|
||||||
|
↓
|
||||||
|
situation_update(面板展示用)
|
||||||
|
↓
|
||||||
|
↓ AI 提取(阿里云 DashScope)
|
||||||
|
↓
|
||||||
|
combat_losses / retaliation / key_location / wall_street_trend
|
||||||
|
↓
|
||||||
|
↓ notify Node
|
||||||
|
↓
|
||||||
|
前端 WebSocket + 轮询
|
||||||
|
```
|
||||||
|
|
||||||
|
## 阿里云 DashScope API Key
|
||||||
|
|
||||||
|
设置环境变量 `DASHSCOPE_API_KEY` 后,爬虫使用阿里云通义千问进行 AI 提取。未设置时:若 `CLEANER_AI_DISABLED=1` 则回退到规则提取(`extractor_rules`),否则使用 `extractor_ai`(Ollama,若可用)。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 本地
|
||||||
|
export DASHSCOPE_API_KEY=sk-xxx
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
DASHSCOPE_API_KEY=sk-xxx docker compose up -d
|
||||||
|
# 或在 .env 中写入 DASHSCOPE_API_KEY=sk-xxx
|
||||||
|
```
|
||||||
|
|
||||||
|
## 表说明
|
||||||
|
|
||||||
|
| 表 | 用途 |
|
||||||
|
|----|------|
|
||||||
|
| `news_content` | 资讯原文,独立存储,支持去重(content_hash),供后续消费 |
|
||||||
|
| `situation_update` | 面板「近期更新」展示 |
|
||||||
|
| `combat_losses` | 战损数据(AI/规则提取) |
|
||||||
|
| `key_location` | 基地状态 |
|
||||||
|
| `gdelt_events` | 地图冲突点 |
|
||||||
|
|
||||||
|
## 去重逻辑
|
||||||
|
|
||||||
|
根据 `content_hash = sha256(normalize(title) + "|" + normalize(summary) + "|" + url)` 的前 32 位十六进制字符判断。`normalize` 会去除首尾空白、转为小写、合并连续空白并截取前 600 个字符;归一化后完全相同的内容视为重复,不入库。
|
||||||
|
|
||||||
|
## 消费资讯
|
||||||
|
|
||||||
|
- HTTP: `GET /api/news?limit=50`
|
||||||
|
- 调试: `/db` 面板查看 `news_content` 表
|
||||||
|
|
||||||
|
## 链路验证
|
||||||
|
|
||||||
|
运行脚本一键检查全链路:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/verify-pipeline.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
支持环境变量覆盖:`API_URL`、`CRAWLER_URL`
|
||||||
@@ -14,7 +14,9 @@
|
|||||||
"build": "vite build",
|
"build": "vite build",
|
||||||
"typecheck": "tsc --noEmit",
|
"typecheck": "tsc --noEmit",
|
||||||
"lint": "eslint .",
|
"lint": "eslint .",
|
||||||
"preview": "vite preview"
|
"preview": "vite preview",
|
||||||
|
"verify": "./scripts/verify-pipeline.sh",
|
||||||
|
"verify:full": "./scripts/verify-pipeline.sh --start-crawler"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"better-sqlite3": "^11.6.0",
|
"better-sqlite3": "^11.6.0",
|
||||||
|
|||||||
124
scripts/verify-pipeline.sh
Executable file
124
scripts/verify-pipeline.sh
Executable file
@@ -0,0 +1,124 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the full crawler -> database -> API -> frontend pipeline.
# Usage: ./scripts/verify-pipeline.sh [--start-crawler]
#
# Environment overrides: API_URL, CRAWLER_URL.
# Steps 1 and 2 are fatal (exit 1); steps 3-6 only print a warning or error.
set -e

API_URL="${API_URL:-http://localhost:3001}"
CRAWLER_URL="${CRAWLER_URL:-http://localhost:8000}"
START_CRAWLER=false
[[ "${1:-}" = "--start-crawler" ]] && START_CRAWLER=true

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

echo "=========================================="
echo "US-Iran 态势面板 链路验证"
echo "API: $API_URL | Crawler: $CRAWLER_URL"
echo "=========================================="
echo ""

# Optional: start the crawler when it is not already answering.
if $START_CRAWLER; then
  echo "[0/6] 启动爬虫..."
  if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
    echo " ✓ 爬虫已在运行"
  else
    cd "$PROJECT_ROOT/crawler"
    python3 -c "import uvicorn" 2>/dev/null || { echo " 需安装: pip install uvicorn"; exit 1; }
    # Launch through the same interpreter that was just checked, so a missing
    # or mismatched `uvicorn` executable on PATH cannot break the start.
    python3 -m uvicorn realtime_conflict_service:app --host 127.0.0.1 --port 8000 &
    echo " 等待爬虫就绪..."
    # Poll the status endpoint for up to 15 * 2 seconds.
    for i in $(seq 1 15); do
      sleep 2
      if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
        echo " ✓ 爬虫已启动"
        echo " 等待首次 RSS 抓取(约 70 秒)..."
        sleep 70
        break
      fi
    done
    if ! curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
      echo " ✗ 爬虫启动超时"
      exit 1
    fi
  fi
  echo ""
fi

# 1. API health check (fatal)
echo "[1/6] API 健康检查..."
if curl -sf "$API_URL/api/health" > /dev/null; then
  echo " ✓ API 正常"
else
  echo " ✗ API 无响应,请先运行: npm run api"
  exit 1
fi

# 2. Situation data (fatal)
echo "[2/6] 态势数据..."
SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}")
if echo "$SIT" | grep -q "lastUpdated"; then
  echo " ✓ 态势数据可读"
  LAST=$(echo "$SIT" | grep -o '"lastUpdated":"[^"]*"' | head -1)
  echo " $LAST"
else
  echo " ✗ 态势数据异常"
  exit 1
fi

# 3. Crawler status (optional)
echo "[3/6] 爬虫状态..."
CRAWLER=$(curl -sf "$CRAWLER_URL/crawler/status" 2>/dev/null || echo "{}")
if echo "$CRAWLER" | grep -q "db_path\|db_exists"; then
  echo " ✓ 爬虫服务可访问"
  if command -v jq &>/dev/null; then
    CNT=$(echo "$CRAWLER" | jq -r '.situation_update_count // "?"')
    echo " situation_update 条数: $CNT"
  fi
else
  echo " ⚠ 爬虫未启动或不可达(可选,需单独运行爬虫)"
fi

# 4. News table
echo "[4/6] 资讯表 news_content..."
# Fall back to "{}" (no "items" key) so an unreachable endpoint reaches the
# warning branch instead of being reported as "0 items".
NEWS=$(curl -sf "$API_URL/api/news?limit=3" 2>/dev/null || echo "{}")
if echo "$NEWS" | grep -q '"items"'; then
  if command -v jq &>/dev/null; then
    N=$(echo "$NEWS" | jq '.items | length')
    echo " ✓ 最近 $N 条资讯"
  else
    echo " ✓ 资讯接口可读"
  fi
else
  echo " ⚠ news_content 可能为空(爬虫未跑或刚启动)"
fi

# 5. Combat-loss data (reuses the /api/situation payload from step 2)
echo "[5/6] 战损数据 combat_losses..."
if echo "$SIT" | grep -q "personnelCasualties"; then
  echo " ✓ 战损字段存在"
  if command -v jq &>/dev/null; then
    US_K=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.killed // "?"')
    IR_K=$(echo "$SIT" | jq -r '.iranForces.combatLosses.personnelCasualties.killed // "?"')
    echo " 美军阵亡: $US_K | 伊朗阵亡: $IR_K"
  fi
else
  echo " ✗ 战损结构异常"
fi

# 6. Notify endpoint (only checks that it can be called)
echo "[6/6] 通知接口 POST /api/crawler/notify..."
NOTIFY=$(curl -sf -X POST "$API_URL/api/crawler/notify" 2>/dev/null || echo "{}")
if echo "$NOTIFY" | grep -q '"ok"'; then
  echo " ✓ 通知接口正常"
else
  echo " ⚠ 通知接口可能异常"
fi

echo ""
echo "=========================================="
echo "验证完成。"
echo ""
echo "建议:"
echo " - 访问 $API_URL/db 查看各表数据"
echo " - 爬虫未启动时: ./scripts/verify-pipeline.sh --start-crawler"
echo " - 或手动启动: cd crawler && uvicorn realtime_conflict_service:app --port 8000"
echo "=========================================="
|
||||||
Binary file not shown.
Binary file not shown.
15
server/db.js
15
server/db.js
@@ -112,6 +112,21 @@ db.exec(`
|
|||||||
estimated_strike_count INTEGER NOT NULL DEFAULT 0,
|
estimated_strike_count INTEGER NOT NULL DEFAULT 0,
|
||||||
updated_at TEXT NOT NULL
|
updated_at TEXT NOT NULL
|
||||||
);
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS news_content (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
content_hash TEXT NOT NULL UNIQUE,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
summary TEXT NOT NULL,
|
||||||
|
url TEXT NOT NULL DEFAULT '',
|
||||||
|
source TEXT NOT NULL DEFAULT '',
|
||||||
|
published_at TEXT NOT NULL,
|
||||||
|
category TEXT NOT NULL DEFAULT 'other',
|
||||||
|
severity TEXT NOT NULL DEFAULT 'medium',
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_news_content_published ON news_content(published_at DESC);
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// 迁移:为已有 key_location 表添加 type、region、status、damage_level 列
|
// 迁移:为已有 key_location 表添加 type、region、status、damage_level 列
|
||||||
|
|||||||
@@ -25,6 +25,15 @@ module.exports = {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
'/api/news': {
|
||||||
|
get: {
|
||||||
|
summary: '资讯内容',
|
||||||
|
description: '从 news_content 表读取,支持 ?limit=50 分页',
|
||||||
|
tags: ['资讯'],
|
||||||
|
parameters: [{ in: 'query', name: 'limit', schema: { type: 'integer', default: 50 } }],
|
||||||
|
responses: { 200: { description: 'items 数组' } },
|
||||||
|
},
|
||||||
|
},
|
||||||
'/api/db/dashboard': {
|
'/api/db/dashboard': {
|
||||||
get: {
|
get: {
|
||||||
summary: '数据库面板',
|
summary: '数据库面板',
|
||||||
@@ -130,5 +139,5 @@ module.exports = {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
tags: [{ name: '态势' }, { name: '统计' }, { name: '反馈' }, { name: '调试' }, { name: '系统' }],
|
tags: [{ name: '态势' }, { name: '资讯' }, { name: '统计' }, { name: '反馈' }, { name: '调试' }, { name: '系统' }],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ router.get('/db/dashboard', (req, res) => {
|
|||||||
'retaliation_current',
|
'retaliation_current',
|
||||||
'retaliation_history',
|
'retaliation_history',
|
||||||
'situation_update',
|
'situation_update',
|
||||||
|
'news_content',
|
||||||
'gdelt_events',
|
'gdelt_events',
|
||||||
'conflict_stats',
|
'conflict_stats',
|
||||||
]
|
]
|
||||||
@@ -27,6 +28,7 @@ router.get('/db/dashboard', (req, res) => {
|
|||||||
feedback: 'created_at DESC',
|
feedback: 'created_at DESC',
|
||||||
situation: 'updated_at DESC',
|
situation: 'updated_at DESC',
|
||||||
situation_update: 'timestamp DESC',
|
situation_update: 'timestamp DESC',
|
||||||
|
news_content: 'published_at DESC',
|
||||||
gdelt_events: 'event_time DESC',
|
gdelt_events: 'event_time DESC',
|
||||||
wall_street_trend: 'time DESC',
|
wall_street_trend: 'time DESC',
|
||||||
retaliation_history: 'time DESC',
|
retaliation_history: 'time DESC',
|
||||||
@@ -55,6 +57,17 @@ router.get('/db/dashboard', (req, res) => {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// 资讯内容(独立表,供后续消费)
|
||||||
|
router.get('/news', (req, res) => {
  try {
    // Page size: default 50, capped at 200. Zero, negative and non-numeric
    // values fall back to the default, because SQLite treats a negative
    // LIMIT as "no limit", which would bypass the cap.
    const requested = parseInt(req.query.limit, 10)
    const limit = requested > 0 ? Math.min(requested, 200) : 50
    const rows = db
      .prepare(
        'SELECT id, title, summary, url, source, published_at, category, severity, created_at FROM news_content ORDER BY published_at DESC LIMIT ?'
      )
      .all(limit)
    res.json({ items: rows })
  } catch (err) {
    res.status(500).json({ error: err.message })
  }
})
|
||||||
|
|
||||||
router.get('/situation', (req, res) => {
|
router.get('/situation', (req, res) => {
|
||||||
try {
|
try {
|
||||||
res.json(getSituation())
|
res.json(getSituation())
|
||||||
|
|||||||
@@ -8,3 +8,4 @@ export const config = {
|
|||||||
/** 是否显示滚动情报 */
|
/** 是否显示滚动情报 */
|
||||||
showNewsTicker: false,
|
showNewsTicker: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user