fix:优化数据来源

This commit is contained in:
Daniel
2026-03-02 01:00:04 +08:00
parent 91d9e48e1e
commit 4a8fff5a00
26 changed files with 1361 additions and 0 deletions

0
=1.11.0 Normal file
View File

95
crawler/README.md Normal file
View File

@@ -0,0 +1,95 @@
# GDELT 实时冲突服务 + 新闻爬虫
## 数据来源梳理
### 1. GDELT Project (gdelt_events)
| 项目 | 说明 |
|------|------|
| API | `https://api.gdeltproject.org/api/v2/doc/doc` |
| 查询 | `query=United States Iran military`(可配 `GDELT_QUERY` |
| 模式 | `mode=ArtList``format=json``maxrecords=30` |
| 时间范围 | **未指定时默认最近 3 个月**,按相关性排序,易返回较旧文章 |
| 更新频率 | GDELT 约 15 分钟级,爬虫 60 秒拉一次 |
**数据偏老原因**:未传 `timespan``sort=datedesc`API 返回 3 个月内“最相关”文章,不保证最新。
### 2. RSS 新闻 (situation_update) — 主事件脉络来源
| 项目 | 说明 |
|------|------|
| 源 | Reuters、BBC World/MiddleEast、Al Jazeera、NYT World |
| 过滤 | 标题/摘要需含 `KEYWORDS` 之一iran、usa、strike、military 等) |
| 更新 | 爬虫 45 秒拉一次(`RSS_INTERVAL_SEC`),优先保证事件脉络 |
| 优先级 | 启动时先拉 RSS再拉 GDELT |
**GDELT 无法访问时**:设置 `GDELT_DISABLED=1`,仅用 RSS 新闻即可维持事件脉络。
---
**事件脉络可实时更新**:爬虫抓取后 → 写入 SQLite → 调用 Node 通知 → WebSocket 广播 → 前端自动刷新。
## 依赖
```bash
pip install -r requirements.txt
```
新增 `deep-translator`GDELT 与 RSS 新闻入库前自动翻译为中文。
## 运行(需同时启动 3 个服务)
| 终端 | 命令 | 说明 |
|------|------|------|
| 1 | `npm run api` | Node API + WebSocket必须 |
| 2 | `npm run gdelt` | GDELT + RSS 爬虫(**事件脉络数据来源** |
| 3 | `npm run dev` | 前端开发 |
**事件脉络不更新时**:多半是未启动 `npm run gdelt`。只跑 `npm run api` 时,事件脉络会显示空或仅有缓存。
## 数据流
```
GDELT API → 抓取(60s) → SQLite (gdelt_events, conflict_stats) → POST /api/crawler/notify
Node 更新 situation.updated_at + WebSocket 广播
前端实时展示
```
## 配置
环境变量:
- `DB_PATH`: SQLite 路径,默认 `../server/data.db`
- `API_BASE`: Node API 地址,默认 `http://localhost:3001`
- `GDELT_QUERY`: 搜索关键词,默认 `United States Iran military`
- `GDELT_MAX_RECORDS`: 最大条数,默认 30
- `GDELT_TIMESPAN`: 时间范围,`1h` / `1d` / `1week`,默认 `1d`(近日资讯)
- `GDELT_DISABLED`: 设为 `1` 则跳过 GDELT仅用 RSS 新闻GDELT 无法访问时用)
- `FETCH_INTERVAL_SEC`: GDELT 抓取间隔(秒),默认 60
- `RSS_INTERVAL_SEC`: RSS 抓取间隔(秒),默认 45优先保证事件脉络
## 冲突强度 (impact_score)
| 分数 | 地图效果 |
|------|------------|
| 13 | 绿色点 |
| 46 | 橙色闪烁 |
| 710 | 红色脉冲扩散 |
## API
- `GET http://localhost:8000/events`返回事件列表与冲突统计Python 服务直连)
- `GET http://localhost:3001/api/events`:从 Node 读取(推荐,含 WebSocket 同步)
## 故障排查
| 现象 | 可能原因 | 排查 |
|------|----------|------|
| 事件脉络始终为空 | 未启动 GDELT 爬虫 | 另开终端运行 `npm run gdelt`,观察是否有 `GDELT 更新 X 条事件` 输出 |
| 事件脉络不刷新 | WebSocket 未连上 | 确认 `npm run api` 已启动,前端需通过 `npm run dev` 访问Vite 会代理 /ws |
| GDELT 抓取失败 | 系统代理超时 / ProxyError | 爬虫默认直连,不走代理;若需代理请设 `CRAWLER_USE_PROXY=1` |
| GDELT 抓取失败 | 网络 / GDELT API 限流 | 检查 Python 终端报错GDELT 在国外,国内网络可能较慢或超时 |
| 新闻条数为 0 | RSS 源被墙或关键词不匹配 | 检查 crawler/config.py 中 RSS_FEEDS、KEYWORDS国内需代理 |
| **返回数据偏老** | GDELT 默认 3 个月内按相关性 | 设置 `GDELT_TIMESPAN=1d` 限制为近日;加 `sort=datedesc` 最新优先 |

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

33
crawler/config.py Normal file
View File

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""爬虫配置"""
import os
from pathlib import Path
# 数据库路径(与 server 共用 SQLite
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
# Node API 地址(用于通知推送)
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
# 抓取间隔(秒)
CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300"))
# RSS 源(美伊/中东相关,多源保证实时事件脉络)
RSS_FEEDS = [
"https://feeds.reuters.com/reuters/topNews",
"https://feeds.bbci.co.uk/news/world/rss.xml",
"https://feeds.bbci.co.uk/news/world/middle_east/rss.xml",
"https://www.aljazeera.com/xml/rss/all.xml",
"https://www.aljazeera.com/xml/rss/middleeast.xml",
"https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
]
# 关键词过滤:至少匹配一个才会入库
KEYWORDS = [
"iran", "iranian", "tehran", "以色列", "israel",
"usa", "us ", "american", "美军", "美国",
"middle east", "中东", "persian gulf", "波斯湾",
"strike", "attack", "military", "missile", "", "nuclear",
"carrier", "航母", "houthi", "胡塞", "hamas",
]

110
crawler/db_writer.py Normal file
View File

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
"""写入 SQLite 并确保 situation_update 表存在"""
import sqlite3
import hashlib
import os
from datetime import datetime, timezone
from config import DB_PATH
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS situation_update (
id TEXT PRIMARY KEY,
timestamp TEXT NOT NULL,
category TEXT NOT NULL,
summary TEXT NOT NULL,
severity TEXT NOT NULL
)
""")
conn.commit()
def _make_id(title: str, url: str, published: str) -> str:
raw = f"{title}|{url}|{published}"
return "nw_" + hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]
def _to_utc_iso(dt: datetime) -> str:
if dt.tzinfo:
dt = dt.astimezone(timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
def insert_update(
conn: sqlite3.Connection,
title: str,
summary: str,
url: str,
published: datetime,
category: str = "other",
severity: str = "medium",
) -> bool:
"""插入一条更新,若 id 已存在则跳过。返回是否插入了新记录。"""
_ensure_table(conn)
ts = _to_utc_iso(published)
uid = _make_id(title, url, ts)
if category not in CATEGORIES:
category = "other"
if severity not in SEVERITIES:
severity = "medium"
try:
conn.execute(
"INSERT OR IGNORE INTO situation_update (id, timestamp, category, summary, severity) VALUES (?, ?, ?, ?, ?)",
(uid, ts, category, summary[:500], severity),
)
conn.commit()
return conn.total_changes > 0
except Exception:
conn.rollback()
return False
def touch_situation_updated_at(conn: sqlite3.Connection) -> None:
"""更新 situation 表的 updated_at"""
conn.execute(
"INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
(datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),),
)
conn.commit()
def write_updates(updates: list[dict]) -> int:
"""
updates: [{"title","summary","url","published","category","severity"}, ...]
返回新增条数。
"""
if not os.path.exists(DB_PATH):
return 0
conn = sqlite3.connect(DB_PATH, timeout=10)
try:
count = 0
for u in updates:
pub = u.get("published")
if isinstance(pub, str):
try:
pub = datetime.fromisoformat(pub.replace("Z", "+00:00"))
except ValueError:
pub = datetime.utcnow()
elif pub is None:
pub = datetime.utcnow()
ok = insert_update(
conn,
title=u.get("title", "")[:200],
summary=u.get("summary", "") or u.get("title", ""),
url=u.get("url", ""),
published=pub,
category=u.get("category", "other"),
severity=u.get("severity", "medium"),
)
if ok:
count += 1
if count > 0:
touch_situation_updated_at(conn)
return count
finally:
conn.close()

57
crawler/main.py Normal file
View File

@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""爬虫入口:定时抓取 → 解析 → 入库 → 通知 API"""
import time
import sys
from pathlib import Path
# 确保能导入 config
sys.path.insert(0, str(Path(__file__).resolve().parent))
from config import DB_PATH, API_BASE, CRAWL_INTERVAL
from scrapers.rss_scraper import fetch_all
from db_writer import write_updates
def notify_api() -> bool:
"""调用 Node API 触发立即广播"""
try:
import urllib.request
req = urllib.request.Request(
f"{API_BASE}/api/crawler/notify",
method="POST",
headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=5) as resp:
return resp.status == 200
except Exception as e:
print(f" [warn] notify API failed: {e}")
return False
def run_once() -> int:
items = fetch_all()
if not items:
return 0
n = write_updates(items)
if n > 0:
notify_api()
return n
def main() -> None:
print("Crawler started. DB:", DB_PATH)
print("API:", API_BASE, "| Interval:", CRAWL_INTERVAL, "s")
while True:
try:
n = run_once()
if n > 0:
print(f"[{time.strftime('%H:%M:%S')}] Inserted {n} new update(s)")
except KeyboardInterrupt:
break
except Exception as e:
print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
time.sleep(CRAWL_INTERVAL)
if __name__ == "__main__":
main()

52
crawler/parser.py Normal file
View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
"""新闻分类与严重度判定"""
import re
from typing import Literal
Category = Literal["deployment", "alert", "intel", "diplomatic", "other"]
Severity = Literal["low", "medium", "high", "critical"]
# 分类关键词
CAT_DEPLOYMENT = ["deploy", "carrier", "航母", "military build", "troop", "forces"]
CAT_ALERT = ["strike", "attack", "fire", "blast", "hit", "爆炸", "袭击", "打击"]
CAT_INTEL = ["satellite", "intel", "image", "surveillance", "卫星", "情报"]
CAT_DIPLOMATIC = ["talk", "negotiation", "diplomat", "sanction", "谈判", "制裁"]
def _match(text: str, words: list[str]) -> bool:
t = (text or "").lower()
for w in words:
if w.lower() in t:
return True
return False
def classify(text: str) -> Category:
if _match(text, CAT_ALERT):
return "alert"
if _match(text, CAT_DEPLOYMENT):
return "deployment"
if _match(text, CAT_INTEL):
return "intel"
if _match(text, CAT_DIPLOMATIC):
return "diplomatic"
return "other"
def severity(text: str, category: Category) -> Severity:
t = (text or "").lower()
critical = [
"nuclear", "", "strike", "attack", "killed", "dead", "casualty",
"war", "invasion", "袭击", "打击", "死亡",
]
high = [
"missile", "drone", "bomb", "explosion", "blasted", "fire",
"导弹", "无人机", "爆炸", "轰炸",
]
if _match(t, critical):
return "critical"
if _match(t, high) or category == "alert":
return "high"
if category == "deployment":
return "medium"
return "low"

View File

@@ -0,0 +1,286 @@
# -*- coding: utf-8 -*-
"""
GDELT 实时冲突抓取 + API 服务
核心数据源GDELT Project约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度
"""
import os
# 直连外网,避免系统代理导致 ProxyError / 超时(需代理时设置 CRAWLER_USE_PROXY=1
if os.environ.get("CRAWLER_USE_PROXY") != "1":
os.environ.setdefault("NO_PROXY", "*")
import hashlib
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import List, Optional
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from apscheduler.schedulers.background import BackgroundScheduler
app = FastAPI(title="GDELT Conflict Service")
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
# 配置
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
QUERY = os.environ.get("GDELT_QUERY", "United States Iran military")
MAX_RECORDS = int(os.environ.get("GDELT_MAX_RECORDS", "30"))
FETCH_INTERVAL_SEC = int(os.environ.get("FETCH_INTERVAL_SEC", "60"))
RSS_INTERVAL_SEC = int(os.environ.get("RSS_INTERVAL_SEC", "45")) # 新闻抓取更频繁,优先保证事件脉络
# 时间范围1h=1小时 1d=1天 1week=1周不设则默认 3 个月(易返回旧文)
GDELT_TIMESPAN = os.environ.get("GDELT_TIMESPAN", "1d")
# 设为 1 则跳过 GDELT仅用 RSS 新闻作为事件脉络GDELT 国外可能无法访问)
GDELT_DISABLED = os.environ.get("GDELT_DISABLED", "0") == "1"
# 伊朗攻击源(无经纬度时默认)
IRAN_COORD = [51.3890, 35.6892] # Tehran [lng, lat]
# 请求直连,不经过系统代理(避免 ProxyError / 代理超时)
_REQ_KW = {"timeout": 15, "headers": {"User-Agent": "US-Iran-Dashboard/1.0"}}
if os.environ.get("CRAWLER_USE_PROXY") != "1":
_REQ_KW["proxies"] = {"http": None, "https": None}
EVENT_CACHE: List[dict] = []
# ==========================
# 冲突强度评分 (110)
# ==========================
def calculate_impact_score(title: str) -> int:
score = 1
t = (title or "").lower()
if "missile" in t:
score += 3
if "strike" in t:
score += 2
if "killed" in t or "death" in t or "casualt" in t:
score += 4
if "troops" in t or "soldier" in t:
score += 2
if "attack" in t or "attacked" in t:
score += 3
if "nuclear" in t or "" in t:
score += 4
if "explosion" in t or "blast" in t or "bomb" in t:
score += 2
return min(score, 10)
# ==========================
# 获取 GDELT 实时事件
# ==========================
def _parse_article(article: dict) -> Optional[dict]:
title_raw = article.get("title") or article.get("seendate") or ""
if not title_raw:
return None
from translate_utils import translate_to_chinese
title = translate_to_chinese(str(title_raw)[:500])
url = article.get("url") or article.get("socialimage") or ""
seendate = article.get("seendate") or datetime.utcnow().isoformat()
lat = article.get("lat")
lng = article.get("lng")
# 无经纬度时使用伊朗坐标(攻击源)
if lat is None or lng is None:
lat, lng = IRAN_COORD[1], IRAN_COORD[0]
try:
lat, lng = float(lat), float(lng)
except (TypeError, ValueError):
lat, lng = IRAN_COORD[1], IRAN_COORD[0]
impact = calculate_impact_score(title_raw)
event_id = hashlib.sha256(f"{url}{seendate}".encode()).hexdigest()[:24]
return {
"event_id": event_id,
"event_time": seendate,
"title": title[:500],
"lat": lat,
"lng": lng,
"impact_score": impact,
"url": url,
}
def fetch_gdelt_events() -> None:
if GDELT_DISABLED:
return
url = (
"https://api.gdeltproject.org/api/v2/doc/doc"
f"?query={QUERY}"
"&mode=ArtList"
"&format=json"
f"&maxrecords={MAX_RECORDS}"
f"&timespan={GDELT_TIMESPAN}"
"&sort=datedesc"
)
try:
resp = requests.get(url, **_REQ_KW)
resp.raise_for_status()
data = resp.json()
articles = data.get("articles", data) if isinstance(data, dict) else (data if isinstance(data, list) else [])
if not isinstance(articles, list):
articles = []
new_events = []
for a in articles:
ev = _parse_article(a) if isinstance(a, dict) else None
if ev:
new_events.append(ev)
# 按 event_time 排序,最新在前
new_events.sort(key=lambda e: e.get("event_time", ""), reverse=True)
global EVENT_CACHE
EVENT_CACHE = new_events
# 写入 SQLite 并通知 Node
_write_to_db(new_events)
_notify_node()
print(f"[{datetime.now().strftime('%H:%M:%S')}] GDELT 更新 {len(new_events)} 条事件")
except Exception as e:
print(f"GDELT 抓取失败: {e}")
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS gdelt_events (
event_id TEXT PRIMARY KEY,
event_time TEXT NOT NULL,
title TEXT NOT NULL,
lat REAL NOT NULL,
lng REAL NOT NULL,
impact_score INTEGER NOT NULL,
url TEXT,
created_at TEXT DEFAULT (datetime('now'))
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS conflict_stats (
id INTEGER PRIMARY KEY CHECK (id = 1),
total_events INTEGER NOT NULL,
high_impact_events INTEGER NOT NULL,
estimated_casualties INTEGER NOT NULL,
estimated_strike_count INTEGER NOT NULL,
updated_at TEXT NOT NULL
)
""")
conn.commit()
def _write_to_db(events: List[dict]) -> None:
if not os.path.exists(DB_PATH):
return
conn = sqlite3.connect(DB_PATH, timeout=10)
try:
_ensure_table(conn)
for e in events:
conn.execute(
"INSERT OR REPLACE INTO gdelt_events (event_id, event_time, title, lat, lng, impact_score, url) VALUES (?, ?, ?, ?, ?, ?, ?)",
(
e["event_id"],
e.get("event_time", ""),
e.get("title", ""),
e.get("lat", 0),
e.get("lng", 0),
e.get("impact_score", 1),
e.get("url", ""),
),
)
# 战损统计模型(展示用)
high = sum(1 for x in events if x.get("impact_score", 0) >= 7)
strikes = sum(1 for x in events if "strike" in (x.get("title") or "").lower() or "attack" in (x.get("title") or "").lower())
casualties = min(5000, high * 80 + len(events) * 10) # 估算
conn.execute(
"INSERT OR REPLACE INTO conflict_stats (id, total_events, high_impact_events, estimated_casualties, estimated_strike_count, updated_at) VALUES (1, ?, ?, ?, ?, ?)",
(len(events), high, casualties, strikes, datetime.utcnow().isoformat()),
)
conn.execute(
"INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
(datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),),
)
conn.commit()
except Exception as e:
print(f"写入 DB 失败: {e}")
conn.rollback()
finally:
conn.close()
def _notify_node() -> None:
try:
r = requests.post(f"{API_BASE}/api/crawler/notify", timeout=5, proxies={"http": None, "https": None})
if r.status_code != 200:
print(" [warn] notify API 失败")
except Exception as e:
print(f" [warn] notify API: {e}")
# ==========================
# RSS 新闻抓取(补充 situation_update
# ==========================
def fetch_news() -> None:
try:
from scrapers.rss_scraper import fetch_all
from db_writer import write_updates
from translate_utils import translate_to_chinese
items = fetch_all()
for it in items:
it["title"] = translate_to_chinese(it.get("title", "") or "")
it["summary"] = translate_to_chinese(it.get("summary", "") or it.get("title", ""))
if items:
n = write_updates(items)
if n > 0:
_notify_node()
print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻入库 {n}")
except Exception as e:
print(f"新闻抓取失败: {e}")
# ==========================
# 定时任务RSS 更频繁,优先保证事件脉络实时)
# ==========================
scheduler = BackgroundScheduler()
scheduler.add_job(fetch_news, "interval", seconds=RSS_INTERVAL_SEC)
scheduler.add_job(fetch_gdelt_events, "interval", seconds=FETCH_INTERVAL_SEC)
scheduler.start()
# ==========================
# API 接口
# ==========================
@app.get("/events")
def get_events():
return {
"updated_at": datetime.utcnow().isoformat(),
"count": len(EVENT_CACHE),
"events": EVENT_CACHE,
"conflict_stats": _get_conflict_stats(),
}
def _get_conflict_stats() -> dict:
if not os.path.exists(DB_PATH):
return {"total_events": 0, "high_impact_events": 0, "estimated_casualties": 0, "estimated_strike_count": 0}
try:
conn = sqlite3.connect(DB_PATH, timeout=5)
row = conn.execute("SELECT total_events, high_impact_events, estimated_casualties, estimated_strike_count FROM conflict_stats WHERE id = 1").fetchone()
conn.close()
if row:
return {
"total_events": row[0],
"high_impact_events": row[1],
"estimated_casualties": row[2],
"estimated_strike_count": row[3],
}
except Exception:
pass
return {"total_events": 0, "high_impact_events": 0, "estimated_casualties": 0, "estimated_strike_count": 0}
@app.on_event("startup")
def startup():
# 新闻优先启动,确保事件脉络有数据
fetch_news()
fetch_gdelt_events()
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)

6
crawler/requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
requests>=2.31.0
feedparser>=6.0.0
fastapi>=0.109.0
uvicorn>=0.27.0
apscheduler>=3.10.0
deep-translator>=1.11.0

View File

@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""RSS 抓取"""
import re
from datetime import datetime, timezone
import feedparser
from config import RSS_FEEDS, KEYWORDS
from parser import classify, severity
def _parse_date(entry) -> datetime:
for attr in ("published_parsed", "updated_parsed"):
val = getattr(entry, attr, None)
if val:
try:
return datetime(*val[:6], tzinfo=timezone.utc)
except (TypeError, ValueError):
pass
return datetime.now(timezone.utc)
def _strip_html(s: str) -> str:
return re.sub(r"<[^>]+>", "", s) if s else ""
def _matches_keywords(text: str) -> bool:
t = (text or "").lower()
for k in KEYWORDS:
if k.lower() in t:
return True
return False
def fetch_all() -> list[dict]:
import socket
items: list[dict] = []
seen: set[str] = set()
# 单源超时 10 秒,避免某源卡住
old_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(10)
try:
for url in RSS_FEEDS:
try:
feed = feedparser.parse(
url,
request_headers={"User-Agent": "US-Iran-Dashboard/1.0"},
agent="US-Iran-Dashboard/1.0",
)
except Exception:
continue
for entry in feed.entries:
title = getattr(entry, "title", "") or ""
raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
summary = _strip_html(raw_summary)
link = getattr(entry, "link", "") or ""
text = f"{title} {summary}"
if not _matches_keywords(text):
continue
key = (title[:80], link)
if key in seen:
continue
seen.add(key)
published = _parse_date(entry)
cat = classify(text)
sev = severity(text, cat)
items.append({
"title": title,
"summary": summary[:400] if summary else title,
"url": link,
"published": _parse_date(entry),
"category": cat,
"severity": sev,
})
finally:
socket.setdefaulttimeout(old_timeout)
return items

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
"""英译中,入库前统一翻译"""
import re
from typing import Optional
def _is_mostly_chinese(text: str) -> bool:
if not text or len(text.strip()) < 2:
return False
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
return chinese / max(len(text), 1) > 0.3
def translate_to_chinese(text: str) -> str:
"""将文本翻译成中文,失败或已是中文则返回原文。"""
if not text or not text.strip():
return text
s = str(text).strip()
if len(s) > 2000:
s = s[:2000]
if _is_mostly_chinese(s):
return text
try:
from deep_translator import GoogleTranslator
out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
return out if out else text
except Exception:
return text

91
docs/BACKEND_MODULES.md Normal file
View File

@@ -0,0 +1,91 @@
# 后端模块说明
## 一、现有模块结构
```
server/
├── index.js # HTTP + WebSocket 入口
├── routes.js # REST API 路由
├── db.js # SQLite schema 与连接
├── situationData.js # 态势数据聚合 (从 DB 读取)
├── seed.js # 初始数据填充
├── data.db # SQLite 数据库
└── package.json
crawler/
├── realtime_conflict_service.py # GDELT 实时冲突服务 (核心)
├── requirements.txt
├── config.py, db_writer.py # 旧 RSS 爬虫(可保留)
├── main.py
└── README.md
```
### 1. server/index.js
- Express + CORS
- WebSocket (`/ws`),每 5 秒广播 `situation`
- `POST /api/crawler/notify`:爬虫写入后触发立即广播
### 2. server/routes.js
- `GET /api/situation`:完整态势
- `GET /api/events`GDELT 事件 + 冲突统计
- `GET /api/health`:健康检查
### 3. server/db.js
- 表:`situation``force_summary``power_index``force_asset`
`key_location``combat_losses``wall_street_trend`
`retaliation_current``retaliation_history``situation_update`
**`gdelt_events`**、**`conflict_stats`**
---
## 二、GDELT 核心数据源
**GDELT Project**:全球冲突数据库,约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度。
### realtime_conflict_service.py
- 定时(默认 60 秒)从 GDELT API 抓取
- 冲突强度评分missile +3, strike +2, killed +4 等
- 无经纬度时默认攻击源:`IRAN_COORD = [51.3890, 35.6892]`
- 写入 `gdelt_events``conflict_stats`
- 调用 `POST /api/crawler/notify` 触发 Node 广播
### 冲突强度 → 地图效果
| impact_score | 效果 |
|--------------|------------|
| 13 | 绿色点 |
| 46 | 橙色闪烁 |
| 710 | 红色脉冲扩散 |
### 战损统计模型(展示用)
- `total_events`
- `high_impact_events` (impact ≥ 7)
- `estimated_casualties`
- `estimated_strike_count`
---
## 三、数据流
```
GDELT API → Python 服务(60s) → gdelt_events, conflict_stats
POST /api/crawler/notify → situation.updated_at
WebSocket 广播 getSituation() → 前端
```
---
## 四、运行方式
```bash
# 1. 启动 Node API
npm run api
# 2. 启动 GDELT 服务
npm run gdelt
# 或: cd crawler && uvicorn realtime_conflict_service:app --port 8000
```

View File

@@ -0,0 +1,89 @@
import type { SituationUpdate, ConflictEvent } from '@/data/mockData'
import { History } from 'lucide-react'
interface EventTimelinePanelProps {
updates: SituationUpdate[]
conflictEvents?: ConflictEvent[]
className?: string
}
const CAT_LABELS: Record<string, string> = {
deployment: '部署',
alert: '警报',
intel: '情报',
diplomatic: '外交',
other: '其他',
}
function formatTime(iso: string): string {
const d = new Date(iso)
return d.toLocaleString('zh-CN', { month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: false })
}
type TimelineItem = {
id: string
summary: string
timestamp: string
source: 'gdelt' | 'rss'
category?: string
severity?: string
}
export function EventTimelinePanel({ updates = [], conflictEvents = [], className = '' }: EventTimelinePanelProps) {
// 合并 GDELT + RSS按时间倒序最新在前
const merged: TimelineItem[] = [
...(conflictEvents || []).map((e) => ({
id: e.event_id,
summary: e.title,
timestamp: e.event_time,
source: 'gdelt' as const,
category: 'alert',
severity: e.impact_score >= 7 ? 'high' : e.impact_score >= 4 ? 'medium' : 'low',
})),
...(updates || []).map((u) => ({
id: u.id,
summary: u.summary,
timestamp: u.timestamp,
source: 'rss' as const,
category: u.category,
severity: u.severity,
})),
]
.sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime())
.slice(0, 6)
return (
<div className={`flex min-w-0 max-w-[280px] shrink flex-col overflow-hidden rounded border border-military-border bg-military-panel/80 ${className}`}>
<div className="flex shrink-0 items-center justify-between border-b border-military-border px-3 py-1.5">
<span className="flex items-center gap-1.5 text-[10px] font-semibold uppercase tracking-wider text-military-text-secondary">
<History className="h-3.5 w-3.5 shrink-0 text-military-accent" />
</span>
<span className="text-[8px] text-military-text-secondary/80">GDELT · Reuters · BBC · Al Jazeera · NYT</span>
</div>
<div className="min-h-0 max-h-[140px] flex-1 overflow-y-auto overflow-x-hidden px-2 py-1">
{merged.length === 0 ? (
<p className="py-4 text-center text-[11px] text-military-text-secondary"></p>
) : (
<ul className="space-y-2">
{merged.map((item) => (
<li key={item.id} className="flex gap-2 border-b border-military-border/50 pb-2 last:border-0 last:pb-0">
<span className="shrink-0 pt-0.5">
<span className={`inline-block h-1.5 w-1.5 rounded-full ${item.source === 'gdelt' ? 'bg-cyan-500' : 'bg-amber-500'}`} />
</span>
<div className="min-w-0 flex-1 text-right">
<p className="text-[11px] leading-tight text-military-text-primary line-clamp-2">{item.summary}</p>
<span className="mt-0.5 flex items-center justify-end gap-1 text-[9px] text-military-text-secondary">
{formatTime(item.timestamp)}
<span className="text-military-text-secondary/60">
{item.source === 'gdelt' ? 'GDELT' : CAT_LABELS[item.category ?? ''] ?? '新闻'}
</span>
</span>
</div>
</li>
))}
</ul>
)}
</div>
</div>
)
}

View File

@@ -0,0 +1,71 @@
import type { SituationUpdate, ConflictEvent } from '@/data/mockData'
import { Newspaper, AlertTriangle } from 'lucide-react'
interface RecentUpdatesPanelProps {
updates: SituationUpdate[]
conflictEvents?: ConflictEvent[]
className?: string
}
const CAT_LABELS: Record<string, string> = {
deployment: '部署',
alert: '警报',
intel: '情报',
diplomatic: '外交',
other: '其他',
}
const SEV_COLORS: Record<string, string> = {
low: 'text-military-text-secondary',
medium: 'text-amber-400',
high: 'text-orange-500',
critical: 'text-red-500',
}
function formatTime(iso: string): string {
const d = new Date(iso)
const now = new Date()
const diff = now.getTime() - d.getTime()
if (diff < 60000) return '刚刚'
if (diff < 3600000) return `${Math.floor(diff / 60000)}分钟前`
if (diff < 86400000) return `${Math.floor(diff / 3600000)}小时前`
return d.toLocaleString('zh-CN', { month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit' })
}
export function RecentUpdatesPanel({ updates, conflictEvents = [], className = '' }: RecentUpdatesPanelProps) {
// 优先展示 GDELT 冲突事件(最新 10 条),无则用 updates
const fromConflict = (conflictEvents || [])
.slice(0, 10)
.map((e) => ({ id: e.event_id, summary: e.title, timestamp: e.event_time, category: 'alert' as const, severity: (e.impact_score >= 7 ? 'high' : e.impact_score >= 4 ? 'medium' : 'low') as const }))
const list = fromConflict.length > 0 ? fromConflict : (updates || []).slice(0, 8)
return (
<div className={`flex min-w-0 flex-1 flex-col overflow-hidden rounded border border-military-border bg-military-panel/80 ${className}`}>
<div className="flex shrink-0 items-center gap-1.5 border-b border-military-border px-3 py-1.5">
<Newspaper className="h-3.5 w-3.5 shrink-0 text-military-accent" />
<span className="truncate text-[10px] font-semibold uppercase tracking-wider text-military-text-secondary">
</span>
</div>
<div className="min-h-0 flex-1 overflow-y-auto overflow-x-hidden px-2 py-1">
{list.length === 0 ? (
<p className="py-4 text-center text-[11px] text-military-text-secondary"></p>
) : (
<ul className="space-y-2">
{list.map((u) => (
<li key={u.id} className="flex gap-2 border-b border-military-border/50 pb-2 last:border-0 last:pb-0">
<span className={`shrink-0 text-[9px] ${SEV_COLORS[u.severity] ?? 'text-military-text-secondary'}`}>
{u.severity === 'critical' && <AlertTriangle className="inline h-2.5 w-2.5" />}
{CAT_LABELS[u.category] ?? u.category}
</span>
<div className="min-w-0 flex-1 text-right">
<p className="text-[11px] leading-tight text-military-text-primary line-clamp-2">{u.summary}</p>
<span className="mt-0.5 block text-[9px] text-military-text-secondary">{formatTime(u.timestamp)}</span>
</div>
</li>
))}
</ul>
)}
</div>
</div>
)
}

View File

@@ -0,0 +1,142 @@
import { useEffect, useRef } from 'react'
import { Play, Pause, SkipBack, SkipForward, History } from 'lucide-react'
import { usePlaybackStore, REPLAY_TICKS, REPLAY_START, REPLAY_END } from '@/store/playbackStore'
function formatTick(iso: string): string {
const d = new Date(iso)
return d.toLocaleString('zh-CN', {
month: '2-digit',
day: '2-digit',
hour: '2-digit',
minute: '2-digit',
hour12: false,
})
}
export function TimelinePanel() {
const {
isReplayMode,
playbackTime,
isPlaying,
speedSecPerTick,
setReplayMode,
setPlaybackTime,
setIsPlaying,
stepForward,
stepBack,
setSpeed,
} = usePlaybackStore()
const timerRef = useRef<ReturnType<typeof setInterval> | null>(null)
useEffect(() => {
if (!isPlaying || !isReplayMode) {
if (timerRef.current) {
clearInterval(timerRef.current)
timerRef.current = null
}
return
}
timerRef.current = setInterval(() => {
const current = usePlaybackStore.getState().playbackTime
const i = REPLAY_TICKS.indexOf(current)
if (i >= REPLAY_TICKS.length - 1) {
setIsPlaying(false)
return
}
setPlaybackTime(REPLAY_TICKS[i + 1])
}, speedSecPerTick * 1000)
return () => {
if (timerRef.current) clearInterval(timerRef.current)
}
}, [isPlaying, isReplayMode, speedSecPerTick, setPlaybackTime, setIsPlaying])
const index = REPLAY_TICKS.indexOf(playbackTime)
const value = index >= 0 ? index : REPLAY_TICKS.length - 1
const handleSliderChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const i = parseInt(e.target.value, 10)
setPlaybackTime(REPLAY_TICKS[i])
}
return (
<div className="shrink-0 border-b border-military-border bg-military-panel/95 px-3 py-2">
<div className="flex flex-wrap items-center gap-3">
<button
type="button"
onClick={() => setReplayMode(!isReplayMode)}
className={`flex items-center gap-1.5 rounded px-2 py-1 text-xs font-medium transition-colors ${
isReplayMode
? 'bg-military-accent/30 text-military-accent'
: 'bg-military-border/50 text-military-text-secondary hover:bg-military-border hover:text-military-text-primary'
}`}
>
<History className="h-3.5 w-3.5" />
</button>
{isReplayMode && (
<>
<div className="flex items-center gap-1">
<button
type="button"
onClick={stepBack}
disabled={index <= 0}
className="rounded p-1 text-military-text-secondary hover:bg-military-border hover:text-military-text-primary disabled:opacity-40 disabled:hover:bg-transparent disabled:hover:text-military-text-secondary"
title="上一步"
>
<SkipBack className="h-4 w-4" />
</button>
<button
type="button"
onClick={() => setIsPlaying(!isPlaying)}
className="rounded p-1 text-military-text-secondary hover:bg-military-border hover:text-military-text-primary"
title={isPlaying ? '暂停' : '播放'}
>
{isPlaying ? <Pause className="h-4 w-4" /> : <Play className="h-4 w-4" />}
</button>
<button
type="button"
onClick={stepForward}
disabled={index >= REPLAY_TICKS.length - 1}
className="rounded p-1 text-military-text-secondary hover:bg-military-border hover:text-military-text-primary disabled:opacity-40 disabled:hover:bg-transparent disabled:hover:text-military-text-secondary"
title="下一步"
>
<SkipForward className="h-4 w-4" />
</button>
</div>
<div className="flex min-w-0 flex-1 items-center gap-2 lg:min-w-[320px]">
<input
type="range"
min={0}
max={REPLAY_TICKS.length - 1}
value={value}
onChange={handleSliderChange}
className="h-1.5 flex-1 cursor-pointer appearance-none rounded-full bg-military-border [&::-webkit-slider-thumb]:h-3 [&::-webkit-slider-thumb]:w-3 [&::-webkit-slider-thumb]:appearance-none [&::-webkit-slider-thumb]:cursor-pointer [&::-webkit-slider-thumb]:rounded-full [&::-webkit-slider-thumb]:bg-military-accent"
/>
</div>
<div className="flex items-center gap-2 text-[11px] tabular-nums text-military-text-secondary">
<span>{formatTick(REPLAY_START)}</span>
<span className="font-medium text-military-accent">{formatTick(playbackTime)}</span>
<span>{formatTick(REPLAY_END)}</span>
</div>
<select
value={speedSecPerTick}
onChange={(e) => setSpeed(Number(e.target.value))}
className="rounded border border-military-border bg-military-dark/80 px-2 py-1 text-[11px] text-military-text-secondary focus:border-military-accent focus:outline-none"
>
<option value={0.5}>0.5 /</option>
<option value={1}>1 /</option>
<option value={2}>2 /</option>
<option value={3}>3 /</option>
<option value={5}>5 /</option>
</select>
</>
)}
</div>
</div>
)
}

View File

@@ -0,0 +1,143 @@
import { useMemo } from 'react'
import type { MilitarySituation } from '@/data/mockData'
import { useSituationStore } from '@/store/situationStore'
import { usePlaybackStore } from '@/store/playbackStore'
/** 将系列时间映射到回放日 (2026-03-01) 以便按当天时刻插值 */
function toReplayDay(iso: string, baseDay: string): string {
const d = new Date(iso)
const [y, m, day] = baseDay.slice(0, 10).split('-').map(Number)
return new Date(y, (m || 1) - 1, day || 1, d.getUTCHours(), d.getUTCMinutes(), 0, 0).toISOString()
}
function interpolateAt(
series: { time: string; value: number }[],
at: string,
baseDay = '2026-03-01'
): number {
if (series.length === 0) return 0
const t = new Date(at).getTime()
const mapped = series.map((p) => ({
time: toReplayDay(p.time, baseDay),
value: p.value,
}))
const sorted = [...mapped].sort((a, b) => new Date(a.time).getTime() - new Date(b.time).getTime())
const before = sorted.filter((p) => new Date(p.time).getTime() <= t)
const after = sorted.filter((p) => new Date(p.time).getTime() > t)
if (before.length === 0) return sorted[0].value
if (after.length === 0) return sorted[sorted.length - 1].value
const a = before[before.length - 1]
const b = after[0]
const ta = new Date(a.time).getTime()
const tb = new Date(b.time).getTime()
const f = tb === ta ? 1 : (t - ta) / (tb - ta)
return a.value + f * (b.value - a.value)
}
function linearProgress(start: string, end: string, at: string): number {
const ts = new Date(start).getTime()
const te = new Date(end).getTime()
const ta = new Date(at).getTime()
if (ta <= ts) return 0
if (ta >= te) return 1
return (ta - ts) / (te - ts)
}
/** 根据回放时刻派生态势数据 */
export function useReplaySituation(): MilitarySituation {
const situation = useSituationStore((s) => s.situation)
const { isReplayMode, playbackTime } = usePlaybackStore()
return useMemo(() => {
if (!isReplayMode) return situation
const progress = linearProgress('2026-03-01T02:00:00.000Z', '2026-03-01T11:45:00.000Z', playbackTime)
// 华尔街趋势、反击情绪:按时间插值
const wsValue = interpolateAt(situation.usForces.wallStreetInvestmentTrend, playbackTime)
const retValue = interpolateAt(situation.iranForces.retaliationSentimentHistory, playbackTime)
// 战斗损失:从 0 线性增长到当前值
const lerp = (a: number, b: number) => Math.round(a + progress * (b - a))
const usLoss = situation.usForces.combatLosses
const irLoss = situation.iranForces.combatLosses
const civUs = usLoss.civilianCasualties ?? { killed: 0, wounded: 0 }
const civIr = irLoss.civilianCasualties ?? { killed: 0, wounded: 0 }
const usLossesAt = {
bases: {
destroyed: lerp(0, usLoss.bases.destroyed),
damaged: lerp(0, usLoss.bases.damaged),
},
personnelCasualties: {
killed: lerp(0, usLoss.personnelCasualties.killed),
wounded: lerp(0, usLoss.personnelCasualties.wounded),
},
civilianCasualties: { killed: lerp(0, civUs.killed), wounded: lerp(0, civUs.wounded) },
aircraft: lerp(0, usLoss.aircraft),
warships: lerp(0, usLoss.warships),
armor: lerp(0, usLoss.armor),
vehicles: lerp(0, usLoss.vehicles),
}
const irLossesAt = {
bases: {
destroyed: lerp(0, irLoss.bases.destroyed),
damaged: lerp(0, irLoss.bases.damaged),
},
personnelCasualties: {
killed: lerp(0, irLoss.personnelCasualties.killed),
wounded: lerp(0, irLoss.personnelCasualties.wounded),
},
civilianCasualties: { killed: lerp(0, civIr.killed), wounded: lerp(0, civIr.wounded) },
aircraft: lerp(0, irLoss.aircraft),
warships: lerp(0, irLoss.warships),
armor: lerp(0, irLoss.armor),
vehicles: lerp(0, irLoss.vehicles),
}
// 被袭基地:按 damage_level 排序,高损毁先出现;根据 progress 决定显示哪些为 attacked
const usLocs = situation.usForces.keyLocations || []
const attackedBases = usLocs
.filter((loc) => loc.status === 'attacked')
.sort((a, b) => (b.damage_level ?? 0) - (a.damage_level ?? 0))
const totalAttacked = attackedBases.length
const shownAttackedCount = Math.round(progress * totalAttacked)
const attackedNames = new Set(
attackedBases.slice(0, shownAttackedCount).map((l) => l.name)
)
const usLocsAt = usLocs.map((loc) => {
if (loc.status === 'attacked' && !attackedNames.has(loc.name)) {
return { ...loc, status: 'operational' as const }
}
return { ...loc }
})
return {
...situation,
lastUpdated: playbackTime,
usForces: {
...situation.usForces,
keyLocations: usLocsAt,
combatLosses: usLossesAt,
wallStreetInvestmentTrend: [
...situation.usForces.wallStreetInvestmentTrend.filter((p) => new Date(p.time).getTime() <= new Date(playbackTime).getTime()),
{ time: playbackTime, value: wsValue },
].slice(-20),
},
iranForces: {
...situation.iranForces,
combatLosses: irLossesAt,
retaliationSentiment: retValue,
retaliationSentimentHistory: [
...situation.iranForces.retaliationSentimentHistory.filter((p) => new Date(p.time).getTime() <= new Date(playbackTime).getTime()),
{ time: playbackTime, value: retValue },
].slice(-20),
},
recentUpdates: (situation.recentUpdates || []).filter(
(u) => new Date(u.timestamp).getTime() <= new Date(playbackTime).getTime()
),
conflictEvents: situation.conflictEvents || [],
conflictStats: situation.conflictStats || { total_events: 0, high_impact_events: 0, estimated_casualties: 0, estimated_strike_count: 0 },
}
}, [situation, isReplayMode, playbackTime])
}

View File

@@ -0,0 +1,80 @@
import { create } from 'zustand'
const REPLAY_DAY = '2026-03-01'
const TICK_MS = 30 * 60 * 1000 // 30 minutes
export const REPLAY_START = `${REPLAY_DAY}T00:00:00.000Z`
export const REPLAY_END = `${REPLAY_DAY}T23:30:00.000Z`
function parseTime(iso: string): number {
return new Date(iso).getTime()
}
export function getTicks(): string[] {
const ticks: string[] = []
let t = parseTime(REPLAY_START)
const end = parseTime(REPLAY_END)
while (t <= end) {
ticks.push(new Date(t).toISOString())
t += TICK_MS
}
return ticks
}
export const REPLAY_TICKS = getTicks()
export interface PlaybackState {
/** 是否开启回放模式 */
isReplayMode: boolean
/** 当前回放时刻 (ISO) */
playbackTime: string
/** 是否正在自动播放 */
isPlaying: boolean
/** 播放速度 (秒/刻度) */
speedSecPerTick: number
setReplayMode: (v: boolean) => void
setPlaybackTime: (iso: string) => void
setIsPlaying: (v: boolean) => void
stepForward: () => void
stepBack: () => void
setSpeed: (sec: number) => void
}
export const usePlaybackStore = create<PlaybackState>((set, get) => ({
isReplayMode: false,
playbackTime: REPLAY_END,
isPlaying: false,
speedSecPerTick: 2,
setReplayMode: (v) => set({ isReplayMode: v, isPlaying: false }),
setPlaybackTime: (iso) => {
const ticks = REPLAY_TICKS
if (ticks.includes(iso)) {
set({ playbackTime: iso })
return
}
const idx = ticks.findIndex((t) => t >= iso)
const clamp = Math.max(0, Math.min(idx < 0 ? ticks.length - 1 : idx, ticks.length - 1))
set({ playbackTime: ticks[clamp] })
},
setIsPlaying: (v) => set({ isPlaying: v }),
stepForward: () => {
const { playbackTime } = get()
const ticks = REPLAY_TICKS
const i = ticks.indexOf(playbackTime)
if (i < ticks.length - 1) set({ playbackTime: ticks[i + 1] })
else set({ isPlaying: false })
},
stepBack: () => {
const { playbackTime } = get()
const ticks = REPLAY_TICKS
const i = ticks.indexOf(playbackTime)
if (i > 0) set({ playbackTime: ticks[i - 1] })
},
setSpeed: (sec) => set({ speedSecPerTick: sec }),
}))