diff --git a/=1.11.0 b/=1.11.0 new file mode 100644 index 0000000..e69de29 diff --git a/crawler/README.md b/crawler/README.md new file mode 100644 index 0000000..d392455 --- /dev/null +++ b/crawler/README.md @@ -0,0 +1,95 @@ +# GDELT 实时冲突服务 + 新闻爬虫 + +## 数据来源梳理 + +### 1. GDELT Project (gdelt_events) + +| 项目 | 说明 | +|------|------| +| API | `https://api.gdeltproject.org/api/v2/doc/doc` | +| 查询 | `query=United States Iran military`(可配 `GDELT_QUERY`) | +| 模式 | `mode=ArtList`,`format=json`,`maxrecords=30` | +| 时间范围 | **未指定时默认最近 3 个月**,按相关性排序,易返回较旧文章 | +| 更新频率 | GDELT 约 15 分钟级,爬虫 60 秒拉一次 | + +**数据偏老原因**:未传 `timespan` 和 `sort=datedesc`,API 返回 3 个月内“最相关”文章,不保证最新。 + +### 2. RSS 新闻 (situation_update) — 主事件脉络来源 + +| 项目 | 说明 | +|------|------| +| 源 | Reuters、BBC World/MiddleEast、Al Jazeera、NYT World | +| 过滤 | 标题/摘要需含 `KEYWORDS` 之一(iran、usa、strike、military 等) | +| 更新 | 爬虫 45 秒拉一次(`RSS_INTERVAL_SEC`),优先保证事件脉络 | +| 优先级 | 启动时先拉 RSS,再拉 GDELT | + +**GDELT 无法访问时**:设置 `GDELT_DISABLED=1`,仅用 RSS 新闻即可维持事件脉络。 + +--- + +**事件脉络可实时更新**:爬虫抓取后 → 写入 SQLite → 调用 Node 通知 → WebSocket 广播 → 前端自动刷新。 + +## 依赖 + +```bash +pip install -r requirements.txt +``` + +新增 `deep-translator`:GDELT 与 RSS 新闻入库前自动翻译为中文。 + +## 运行(需同时启动 3 个服务) + +| 终端 | 命令 | 说明 | +|------|------|------| +| 1 | `npm run api` | Node API + WebSocket(必须) | +| 2 | `npm run gdelt` | GDELT + RSS 爬虫(**事件脉络数据来源**) | +| 3 | `npm run dev` | 前端开发 | + +**事件脉络不更新时**:多半是未启动 `npm run gdelt`。只跑 `npm run api` 时,事件脉络会显示空或仅有缓存。 + +## 数据流 + +``` +GDELT API → 抓取(60s) → SQLite (gdelt_events, conflict_stats) → POST /api/crawler/notify + ↓ + Node 更新 situation.updated_at + WebSocket 广播 + ↓ + 前端实时展示 +``` + +## 配置 + +环境变量: + +- `DB_PATH`: SQLite 路径,默认 `../server/data.db` +- `API_BASE`: Node API 地址,默认 `http://localhost:3001` +- `GDELT_QUERY`: 搜索关键词,默认 `United States Iran military` +- `GDELT_MAX_RECORDS`: 最大条数,默认 30 +- `GDELT_TIMESPAN`: 时间范围,`1h` / `1d` / `1week`,默认 `1d`(近日资讯) +- `GDELT_DISABLED`: 设为 `1` 则跳过 GDELT,仅用 RSS 新闻(GDELT 无法访问时用) +- `FETCH_INTERVAL_SEC`: 
GDELT 抓取间隔(秒),默认 60 +- `RSS_INTERVAL_SEC`: RSS 抓取间隔(秒),默认 45(优先保证事件脉络) + +## 冲突强度 (impact_score) + +| 分数 | 地图效果 | +|------|------------| +| 1–3 | 绿色点 | +| 4–6 | 橙色闪烁 | +| 7–10 | 红色脉冲扩散 | + +## API + +- `GET http://localhost:8000/events`:返回事件列表与冲突统计(Python 服务直连) +- `GET http://localhost:3001/api/events`:从 Node 读取(推荐,含 WebSocket 同步) + +## 故障排查 + +| 现象 | 可能原因 | 排查 | +|------|----------|------| +| 事件脉络始终为空 | 未启动 GDELT 爬虫 | 另开终端运行 `npm run gdelt`,观察是否有 `GDELT 更新 X 条事件` 输出 | +| 事件脉络不刷新 | WebSocket 未连上 | 确认 `npm run api` 已启动,前端需通过 `npm run dev` 访问(Vite 会代理 /ws) | +| GDELT 抓取失败 | 系统代理超时 / ProxyError | 爬虫默认直连,不走代理;若需代理请设 `CRAWLER_USE_PROXY=1` | +| GDELT 抓取失败 | 网络 / GDELT API 限流 | 检查 Python 终端报错;GDELT 在国外,国内网络可能较慢或超时 | +| 新闻条数为 0 | RSS 源被墙或关键词不匹配 | 检查 crawler/config.py 中 RSS_FEEDS、KEYWORDS;国内需代理 | +| **返回数据偏老** | GDELT 默认 3 个月内按相关性 | 设置 `GDELT_TIMESPAN=1d` 限制为近日;加 `sort=datedesc` 最新优先 | diff --git a/crawler/__pycache__/config.cpython-39.pyc b/crawler/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000..b48687b Binary files /dev/null and b/crawler/__pycache__/config.cpython-39.pyc differ diff --git a/crawler/__pycache__/db_writer.cpython-39.pyc b/crawler/__pycache__/db_writer.cpython-39.pyc new file mode 100644 index 0000000..f0ac9bc Binary files /dev/null and b/crawler/__pycache__/db_writer.cpython-39.pyc differ diff --git a/crawler/__pycache__/parser.cpython-39.pyc b/crawler/__pycache__/parser.cpython-39.pyc new file mode 100644 index 0000000..d6e3d64 Binary files /dev/null and b/crawler/__pycache__/parser.cpython-39.pyc differ diff --git a/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc b/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc new file mode 100644 index 0000000..c22b35a Binary files /dev/null and b/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc differ diff --git a/crawler/__pycache__/translate_utils.cpython-39.pyc b/crawler/__pycache__/translate_utils.cpython-39.pyc new file mode 100644 index 
# -*- coding: utf-8 -*-
"""Write crawler updates into SQLite and keep the `situation_update` table present."""
import sqlite3
import hashlib
import os
from datetime import datetime, timezone
from pathlib import Path

try:
    from config import DB_PATH
except ImportError:
    # Fallback mirrors config.py so this module is importable standalone (e.g. tests).
    _PROJECT_ROOT = Path(__file__).resolve().parent.parent
    DB_PATH = os.environ.get("DB_PATH", str(_PROJECT_ROOT / "server" / "data.db"))

# Closed sets accepted by the Node side; anything else is coerced below.
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")


def _ensure_table(conn: sqlite3.Connection) -> None:
    """Create the situation_update table if this is a fresh database."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS situation_update (
            id TEXT PRIMARY KEY,
            timestamp TEXT NOT NULL,
            category TEXT NOT NULL,
            summary TEXT NOT NULL,
            severity TEXT NOT NULL
        )
    """)
    conn.commit()


def _make_id(title: str, url: str, published: str) -> str:
    """Deterministic id from (title, url, published) so re-crawls dedupe naturally."""
    raw = f"{title}|{url}|{published}"
    return "nw_" + hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]


def _to_utc_iso(dt: datetime) -> str:
    """Format a datetime as the `...T...Z` ISO string the frontend expects (UTC)."""
    if dt.tzinfo:
        dt = dt.astimezone(timezone.utc)
    return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")


def insert_update(
    conn: sqlite3.Connection,
    title: str,
    summary: str,
    url: str,
    published: datetime,
    category: str = "other",
    severity: str = "medium",
) -> bool:
    """Insert one update; skip when the id already exists.

    Returns True only when a NEW row was actually written.
    """
    _ensure_table(conn)
    ts = _to_utc_iso(published)
    uid = _make_id(title, url, ts)
    if category not in CATEGORIES:
        category = "other"
    if severity not in SEVERITIES:
        severity = "medium"
    try:
        cur = conn.execute(
            "INSERT OR IGNORE INTO situation_update (id, timestamp, category, summary, severity) VALUES (?, ?, ?, ?, ?)",
            (uid, ts, category, summary[:500], severity),
        )
        conn.commit()
        # BUG FIX: the old check `conn.total_changes > 0` used a CUMULATIVE
        # per-connection counter, so once anything had been inserted on this
        # connection every duplicate also reported True (inflating counts and
        # triggering spurious notifications). Cursor.rowcount reflects only
        # this statement: 1 when inserted, 0 when OR IGNORE skipped it.
        return cur.rowcount > 0
    except Exception:
        conn.rollback()
        return False


def touch_situation_updated_at(conn: sqlite3.Connection) -> None:
    """Bump situation.updated_at so WebSocket clients refresh.

    NOTE(review): this REPLACEs situation.data with '{}' — confirm the Node
    side rebuilds `data` and does not rely on what was stored here.
    """
    conn.execute(
        "INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
        (datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"),),
    )
    conn.commit()


def write_updates(updates: list[dict]) -> int:
    """Persist a batch of updates.

    updates: [{"title","summary","url","published","category","severity"}, ...]
    Returns the number of newly inserted rows (0 when the DB file is absent).
    """
    if not os.path.exists(DB_PATH):
        return 0
    conn = sqlite3.connect(DB_PATH, timeout=10)
    try:
        count = 0
        for u in updates:
            pub = u.get("published")
            if isinstance(pub, str):
                try:
                    pub = datetime.fromisoformat(pub.replace("Z", "+00:00"))
                except ValueError:
                    pub = datetime.now(timezone.utc)
            elif pub is None:
                pub = datetime.now(timezone.utc)
            ok = insert_update(
                conn,
                title=u.get("title", "")[:200],
                summary=u.get("summary", "") or u.get("title", ""),
                url=u.get("url", ""),
                published=pub,
                category=u.get("category", "other"),
                severity=u.get("severity", "medium"),
            )
            if ok:
                count += 1
        if count > 0:
            touch_situation_updated_at(conn)
        return count
    finally:
        conn.close()
# -*- coding: utf-8 -*-
"""Keyword-driven news categorization and severity rating."""
import re
from typing import Literal

Category = Literal["deployment", "alert", "intel", "diplomatic", "other"]
Severity = Literal["low", "medium", "high", "critical"]

# Category keyword lists (English + Chinese).
CAT_DEPLOYMENT = ["deploy", "carrier", "航母", "military build", "troop", "forces"]
CAT_ALERT = ["strike", "attack", "fire", "blast", "hit", "爆炸", "袭击", "打击"]
CAT_INTEL = ["satellite", "intel", "image", "surveillance", "卫星", "情报"]
CAT_DIPLOMATIC = ["talk", "negotiation", "diplomat", "sanction", "谈判", "制裁"]

# Severity keyword lists, checked from most to least severe.
_CRITICAL_WORDS = [
    "nuclear", "核", "strike", "attack", "killed", "dead", "casualty",
    "war", "invasion", "袭击", "打击", "死亡",
]
_HIGH_WORDS = [
    "missile", "drone", "bomb", "explosion", "blasted", "fire",
    "导弹", "无人机", "爆炸", "轰炸",
]


def _match(text: str, words: list[str]) -> bool:
    """Case-insensitive substring test: does any word appear in the text?"""
    haystack = (text or "").lower()
    return any(w.lower() in haystack for w in words)


def classify(text: str) -> Category:
    """Return the first matching category; alerts take precedence, then
    deployments, intel and diplomatic news; everything else is "other"."""
    ordered = [
        (CAT_ALERT, "alert"),
        (CAT_DEPLOYMENT, "deployment"),
        (CAT_INTEL, "intel"),
        (CAT_DIPLOMATIC, "diplomatic"),
    ]
    for words, cat in ordered:
        if _match(text, words):
            return cat
    return "other"


def severity(text: str, category: Category) -> Severity:
    """Rate severity: critical keywords win, then high keywords or any alert,
    then deployments are medium and everything else low."""
    lowered = (text or "").lower()
    if _match(lowered, _CRITICAL_WORDS):
        return "critical"
    if _match(lowered, _HIGH_WORDS) or category == "alert":
        return "high"
    return "medium" if category == "deployment" else "low"
a/crawler/realtime_conflict_service.py b/crawler/realtime_conflict_service.py new file mode 100644 index 0000000..de2b11c --- /dev/null +++ b/crawler/realtime_conflict_service.py @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +""" +GDELT 实时冲突抓取 + API 服务 +核心数据源:GDELT Project,约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度 +""" +import os +# 直连外网,避免系统代理导致 ProxyError / 超时(需代理时设置 CRAWLER_USE_PROXY=1) +if os.environ.get("CRAWLER_USE_PROXY") != "1": + os.environ.setdefault("NO_PROXY", "*") + +import hashlib +import sqlite3 +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +import requests +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from apscheduler.schedulers.background import BackgroundScheduler + +app = FastAPI(title="GDELT Conflict Service") +app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"]) + +# 配置 +PROJECT_ROOT = Path(__file__).resolve().parent.parent +DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db")) +API_BASE = os.environ.get("API_BASE", "http://localhost:3001") +QUERY = os.environ.get("GDELT_QUERY", "United States Iran military") +MAX_RECORDS = int(os.environ.get("GDELT_MAX_RECORDS", "30")) +FETCH_INTERVAL_SEC = int(os.environ.get("FETCH_INTERVAL_SEC", "60")) +RSS_INTERVAL_SEC = int(os.environ.get("RSS_INTERVAL_SEC", "45")) # 新闻抓取更频繁,优先保证事件脉络 +# 时间范围:1h=1小时 1d=1天 1week=1周;不设则默认 3 个月(易返回旧文) +GDELT_TIMESPAN = os.environ.get("GDELT_TIMESPAN", "1d") +# 设为 1 则跳过 GDELT,仅用 RSS 新闻作为事件脉络(GDELT 国外可能无法访问) +GDELT_DISABLED = os.environ.get("GDELT_DISABLED", "0") == "1" + +# 伊朗攻击源(无经纬度时默认) +IRAN_COORD = [51.3890, 35.6892] # Tehran [lng, lat] + +# 请求直连,不经过系统代理(避免 ProxyError / 代理超时) +_REQ_KW = {"timeout": 15, "headers": {"User-Agent": "US-Iran-Dashboard/1.0"}} +if os.environ.get("CRAWLER_USE_PROXY") != "1": + _REQ_KW["proxies"] = {"http": None, "https": None} + +EVENT_CACHE: List[dict] = [] + + +# ========================== +# 冲突强度评分 (1–10) +# 
# ==========================
# Conflict intensity scoring (1-10)
# ==========================
# Additive keyword weights; a hit on any word in a group adds that group's
# weight once. Base score is 1, capped at 10.
_IMPACT_RULES = [
    (("missile",), 3),
    (("strike",), 2),
    (("killed", "death", "casualt"), 4),
    (("troops", "soldier"), 2),
    (("attack",), 3),
    (("nuclear", "核"), 4),
    (("explosion", "blast", "bomb"), 2),
]


def calculate_impact_score(title: str) -> int:
    """Score a headline's conflict intensity on a 1-10 scale via keyword weights."""
    t = (title or "").lower()
    score = 1 + sum(w for words, w in _IMPACT_RULES if any(k in t for k in words))
    return min(score, 10)


# ==========================
# Fetch live GDELT events
# ==========================
def _parse_article(article: dict) -> Optional[dict]:
    """Normalize one GDELT article dict into an internal event record.

    Returns None when the article has no usable title. Articles without
    coordinates default to Tehran (IRAN_COORD, the assumed attack origin).
    """
    title_raw = article.get("title") or article.get("seendate") or ""
    if not title_raw:
        return None
    from translate_utils import translate_to_chinese  # lazy: optional translator chain
    title = translate_to_chinese(str(title_raw)[:500])
    url = article.get("url") or article.get("socialimage") or ""
    seendate = article.get("seendate") or datetime.utcnow().isoformat()
    lat = article.get("lat")
    lng = article.get("lng")
    # NOTE(review): GDELT ArtList responses may not carry lat/lng at all, in
    # which case every event lands on Tehran — confirm against live payloads.
    if lat is None or lng is None:
        lat, lng = IRAN_COORD[1], IRAN_COORD[0]
    try:
        lat, lng = float(lat), float(lng)
    except (TypeError, ValueError):
        lat, lng = IRAN_COORD[1], IRAN_COORD[0]
    # Score on the ORIGINAL (untranslated) title so English keywords match.
    impact = calculate_impact_score(title_raw)
    event_id = hashlib.sha256(f"{url}{seendate}".encode()).hexdigest()[:24]
    return {
        "event_id": event_id,
        "event_time": seendate,
        "title": title[:500],
        "lat": lat,
        "lng": lng,
        "impact_score": impact,
        "url": url,
    }


def fetch_gdelt_events() -> None:
    """Poll the GDELT DOC API, refresh EVENT_CACHE, persist to SQLite, notify Node."""
    if GDELT_DISABLED:
        return
    # BUG FIX: the URL was hand-concatenated and the "&timespan=" fragment was
    # garbled into "×pan=", so GDELT never saw the timespan parameter and
    # fell back to its ~3-month default window — the documented "stale data"
    # symptom. The query also contained raw spaces. Passing `params=` lets
    # requests URL-encode the query and emit every parameter correctly.
    params = {
        "query": QUERY,
        "mode": "ArtList",
        "format": "json",
        "maxrecords": MAX_RECORDS,
        "timespan": GDELT_TIMESPAN,
        "sort": "datedesc",
    }
    try:
        resp = requests.get(
            "https://api.gdeltproject.org/api/v2/doc/doc",
            params=params,
            **_REQ_KW,
        )
        resp.raise_for_status()
        data = resp.json()
        articles = data.get("articles", data) if isinstance(data, dict) else (data if isinstance(data, list) else [])
        if not isinstance(articles, list):
            articles = []
        new_events = []
        for a in articles:
            ev = _parse_article(a) if isinstance(a, dict) else None
            if ev:
                new_events.append(ev)
        # Newest first for the cache consumers.
        new_events.sort(key=lambda e: e.get("event_time", ""), reverse=True)
        global EVENT_CACHE
        EVENT_CACHE = new_events
        # Persist and push a WebSocket refresh through Node.
        _write_to_db(new_events)
        _notify_node()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] GDELT 更新 {len(new_events)} 条事件")
    except Exception as e:
        print(f"GDELT 抓取失败: {e}")


def _ensure_table(conn: sqlite3.Connection) -> None:
    """Create the gdelt_events and conflict_stats tables if missing."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS gdelt_events (
            event_id TEXT PRIMARY KEY,
            event_time TEXT NOT NULL,
            title TEXT NOT NULL,
            lat REAL NOT NULL,
            lng REAL NOT NULL,
            impact_score INTEGER NOT NULL,
            url TEXT,
            created_at TEXT DEFAULT (datetime('now'))
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS conflict_stats (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            total_events INTEGER NOT NULL,
            high_impact_events INTEGER NOT NULL,
            estimated_casualties INTEGER NOT NULL,
            estimated_strike_count INTEGER NOT NULL,
            updated_at TEXT NOT NULL
        )
    """)
    conn.commit()


def _write_to_db(events: List[dict]) -> None:
    """Upsert events and the derived (display-only) conflict statistics."""
    if not os.path.exists(DB_PATH):
        return
    conn = sqlite3.connect(DB_PATH, timeout=10)
    try:
        _ensure_table(conn)
        for e in events:
            conn.execute(
                "INSERT OR REPLACE INTO gdelt_events (event_id, event_time, title, lat, lng, impact_score, url) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    e["event_id"],
                    e.get("event_time", ""),
                    e.get("title", ""),
                    e.get("lat", 0),
                    e.get("lng", 0),
                    e.get("impact_score", 1),
                    e.get("url", ""),
                ),
            )
        # Display-only casualty/strike model — NOT real intelligence figures.
        high = sum(1 for x in events if x.get("impact_score", 0) >= 7)
        strikes = sum(1 for x in events if "strike" in (x.get("title") or "").lower() or "attack" in (x.get("title") or "").lower())
        casualties = min(5000, high * 80 + len(events) * 10)
        conn.execute(
            "INSERT OR REPLACE INTO conflict_stats (id, total_events, high_impact_events, estimated_casualties, estimated_strike_count, updated_at) VALUES (1, ?, ?, ?, ?, ?)",
            (len(events), high, casualties, strikes, datetime.utcnow().isoformat()),
        )
        # NOTE(review): this REPLACEs situation.data with '{}' to bump
        # updated_at — confirm Node rebuilds `data` rather than reading it.
        conn.execute(
            "INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
            (datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),),
        )
        conn.commit()
    except Exception as e:
        print(f"写入 DB 失败: {e}")
        conn.rollback()
    finally:
        conn.close()
conflict_stats (id, total_events, high_impact_events, estimated_casualties, estimated_strike_count, updated_at) VALUES (1, ?, ?, ?, ?, ?)", + (len(events), high, casualties, strikes, datetime.utcnow().isoformat()), + ) + conn.execute( + "INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)", + (datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),), + ) + conn.commit() + except Exception as e: + print(f"写入 DB 失败: {e}") + conn.rollback() + finally: + conn.close() + + +def _notify_node() -> None: + try: + r = requests.post(f"{API_BASE}/api/crawler/notify", timeout=5, proxies={"http": None, "https": None}) + if r.status_code != 200: + print(" [warn] notify API 失败") + except Exception as e: + print(f" [warn] notify API: {e}") + + +# ========================== +# RSS 新闻抓取(补充 situation_update) +# ========================== +def fetch_news() -> None: + try: + from scrapers.rss_scraper import fetch_all + from db_writer import write_updates + from translate_utils import translate_to_chinese + items = fetch_all() + for it in items: + it["title"] = translate_to_chinese(it.get("title", "") or "") + it["summary"] = translate_to_chinese(it.get("summary", "") or it.get("title", "")) + if items: + n = write_updates(items) + if n > 0: + _notify_node() + print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻入库 {n} 条") + except Exception as e: + print(f"新闻抓取失败: {e}") + + +# ========================== +# 定时任务(RSS 更频繁,优先保证事件脉络实时) +# ========================== +scheduler = BackgroundScheduler() +scheduler.add_job(fetch_news, "interval", seconds=RSS_INTERVAL_SEC) +scheduler.add_job(fetch_gdelt_events, "interval", seconds=FETCH_INTERVAL_SEC) +scheduler.start() + + +# ========================== +# API 接口 +# ========================== +@app.get("/events") +def get_events(): + return { + "updated_at": datetime.utcnow().isoformat(), + "count": len(EVENT_CACHE), + "events": EVENT_CACHE, + "conflict_stats": _get_conflict_stats(), + } + + +def _get_conflict_stats() -> dict: + 
if not os.path.exists(DB_PATH): + return {"total_events": 0, "high_impact_events": 0, "estimated_casualties": 0, "estimated_strike_count": 0} + try: + conn = sqlite3.connect(DB_PATH, timeout=5) + row = conn.execute("SELECT total_events, high_impact_events, estimated_casualties, estimated_strike_count FROM conflict_stats WHERE id = 1").fetchone() + conn.close() + if row: + return { + "total_events": row[0], + "high_impact_events": row[1], + "estimated_casualties": row[2], + "estimated_strike_count": row[3], + } + except Exception: + pass + return {"total_events": 0, "high_impact_events": 0, "estimated_casualties": 0, "estimated_strike_count": 0} + + +@app.on_event("startup") +def startup(): + # 新闻优先启动,确保事件脉络有数据 + fetch_news() + fetch_gdelt_events() + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/crawler/requirements.txt b/crawler/requirements.txt new file mode 100644 index 0000000..0268df8 --- /dev/null +++ b/crawler/requirements.txt @@ -0,0 +1,6 @@ +requests>=2.31.0 +feedparser>=6.0.0 +fastapi>=0.109.0 +uvicorn>=0.27.0 +apscheduler>=3.10.0 +deep-translator>=1.11.0 diff --git a/crawler/scrapers/__init__.py b/crawler/scrapers/__init__.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/crawler/scrapers/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/crawler/scrapers/__pycache__/__init__.cpython-311.pyc b/crawler/scrapers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..ff0a033 Binary files /dev/null and b/crawler/scrapers/__pycache__/__init__.cpython-311.pyc differ diff --git a/crawler/scrapers/__pycache__/__init__.cpython-39.pyc b/crawler/scrapers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..1412ab9 Binary files /dev/null and b/crawler/scrapers/__pycache__/__init__.cpython-39.pyc differ diff --git a/crawler/scrapers/__pycache__/rss_scraper.cpython-311.pyc b/crawler/scrapers/__pycache__/rss_scraper.cpython-311.pyc new file mode 
# -*- coding: utf-8 -*-
"""RSS fetching plus the English-to-Chinese translation helper used on ingest."""
import re
from datetime import datetime, timezone


def _parse_date(entry) -> datetime:
    """Best-effort published time of a feed entry, defaulting to now (UTC)."""
    for attr in ("published_parsed", "updated_parsed"):
        val = getattr(entry, attr, None)
        if val:
            try:
                return datetime(*val[:6], tzinfo=timezone.utc)
            except (TypeError, ValueError):
                pass
    return datetime.now(timezone.utc)


def _strip_html(s: str) -> str:
    """Drop HTML tags with a naive regex; feed summaries are short snippets."""
    return re.sub(r"<[^>]+>", "", s) if s else ""


def _matches_keywords(text: str) -> bool:
    """True when the text contains at least one configured keyword."""
    from config import KEYWORDS  # lazy: keeps the pure helpers importable standalone
    t = (text or "").lower()
    return any(k.lower() in t for k in KEYWORDS)


def fetch_all() -> list[dict]:
    """Fetch every configured RSS feed and return keyword-matched, deduplicated items.

    Each item: {"title","summary","url","published","category","severity"}.
    """
    import socket
    import feedparser  # lazy imports: see _matches_keywords
    from config import RSS_FEEDS
    from parser import classify, severity

    items: list[dict] = []
    seen: set = set()
    # Per-feed socket timeout (10 s) so one stalled host cannot block the cycle.
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(10)
    try:
        for url in RSS_FEEDS:
            try:
                feed = feedparser.parse(
                    url,
                    request_headers={"User-Agent": "US-Iran-Dashboard/1.0"},
                    agent="US-Iran-Dashboard/1.0",
                )
            except Exception:
                continue
            for entry in feed.entries:
                title = getattr(entry, "title", "") or ""
                raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
                summary = _strip_html(raw_summary)
                link = getattr(entry, "link", "") or ""
                text = f"{title} {summary}"
                if not _matches_keywords(text):
                    continue
                key = (title[:80], link)
                if key in seen:
                    continue
                seen.add(key)
                # FIX (minor): the date was parsed twice per entry; parse once, reuse.
                published = _parse_date(entry)
                cat = classify(text)
                sev = severity(text, cat)
                items.append({
                    "title": title,
                    "summary": summary[:400] if summary else title,
                    "url": link,
                    "published": published,
                    "category": cat,
                    "severity": sev,
                })
    finally:
        socket.setdefaulttimeout(old_timeout)
    return items


# --- translate_utils ---------------------------------------------------------

def _is_mostly_chinese(text: str) -> bool:
    """Heuristic: more than 30% CJK characters means the text is already Chinese."""
    if not text or len(text.strip()) < 2:
        return False
    chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
    return chinese / max(len(text), 1) > 0.3


def translate_to_chinese(text: str) -> str:
    """Translate text to Chinese; return it unchanged on failure or if already Chinese."""
    if not text or not text.strip():
        return text
    s = str(text).strip()
    if len(s) > 2000:
        s = s[:2000]  # keep translator input bounded
    if _is_mostly_chinese(s):
        return text
    try:
        from deep_translator import GoogleTranslator
        out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
        return out if out else text
    except Exception:
        # Best-effort: missing deep-translator or no network falls back to original.
        return text
server/routes.js +- `GET /api/situation`:完整态势 +- `GET /api/events`:GDELT 事件 + 冲突统计 +- `GET /api/health`:健康检查 + +### 3. server/db.js +- 表:`situation`、`force_summary`、`power_index`、`force_asset`、 + `key_location`、`combat_losses`、`wall_street_trend`、 + `retaliation_current`、`retaliation_history`、`situation_update`、 + **`gdelt_events`**、**`conflict_stats`** + +--- + +## 二、GDELT 核心数据源 + +**GDELT Project**:全球冲突数据库,约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度。 + +### realtime_conflict_service.py + +- 定时(默认 60 秒)从 GDELT API 抓取 +- 冲突强度评分:missile +3, strike +2, killed +4 等 +- 无经纬度时默认攻击源:`IRAN_COORD = [51.3890, 35.6892]` +- 写入 `gdelt_events`、`conflict_stats` +- 调用 `POST /api/crawler/notify` 触发 Node 广播 + +### 冲突强度 → 地图效果 + +| impact_score | 效果 | +|--------------|------------| +| 1–3 | 绿色点 | +| 4–6 | 橙色闪烁 | +| 7–10 | 红色脉冲扩散 | + +### 战损统计模型(展示用) + +- `total_events` +- `high_impact_events` (impact ≥ 7) +- `estimated_casualties` +- `estimated_strike_count` + +--- + +## 三、数据流 + +``` +GDELT API → Python 服务(60s) → gdelt_events, conflict_stats + ↓ + POST /api/crawler/notify → situation.updated_at + ↓ + WebSocket 广播 getSituation() → 前端 +``` + +--- + +## 四、运行方式 + +```bash +# 1. 启动 Node API +npm run api + +# 2. 
启动 GDELT 服务 +npm run gdelt +# 或: cd crawler && uvicorn realtime_conflict_service:app --port 8000 +``` diff --git a/src/components/EventTimelinePanel.tsx b/src/components/EventTimelinePanel.tsx new file mode 100644 index 0000000..26533a5 --- /dev/null +++ b/src/components/EventTimelinePanel.tsx @@ -0,0 +1,89 @@ +import type { SituationUpdate, ConflictEvent } from '@/data/mockData' +import { History } from 'lucide-react' + +interface EventTimelinePanelProps { + updates: SituationUpdate[] + conflictEvents?: ConflictEvent[] + className?: string +} + +const CAT_LABELS: Record = { + deployment: '部署', + alert: '警报', + intel: '情报', + diplomatic: '外交', + other: '其他', +} +function formatTime(iso: string): string { + const d = new Date(iso) + return d.toLocaleString('zh-CN', { month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: false }) +} + +type TimelineItem = { + id: string + summary: string + timestamp: string + source: 'gdelt' | 'rss' + category?: string + severity?: string +} + +export function EventTimelinePanel({ updates = [], conflictEvents = [], className = '' }: EventTimelinePanelProps) { + // 合并 GDELT + RSS,按时间倒序(最新在前) + const merged: TimelineItem[] = [ + ...(conflictEvents || []).map((e) => ({ + id: e.event_id, + summary: e.title, + timestamp: e.event_time, + source: 'gdelt' as const, + category: 'alert', + severity: e.impact_score >= 7 ? 'high' : e.impact_score >= 4 ? 'medium' : 'low', + })), + ...(updates || []).map((u) => ({ + id: u.id, + summary: u.summary, + timestamp: u.timestamp, + source: 'rss' as const, + category: u.category, + severity: u.severity, + })), + ] + .sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime()) + .slice(0, 6) + + return ( +
+
+ + + 事件脉络 + + GDELT · Reuters · BBC · Al Jazeera · NYT +
+
+ {merged.length === 0 ? ( +

暂无事件

+ ) : ( +
    + {merged.map((item) => ( +
  • + + + +
    +

    {item.summary}

    + + {formatTime(item.timestamp)} + + {item.source === 'gdelt' ? 'GDELT' : CAT_LABELS[item.category ?? ''] ?? '新闻'} + + +
    +
  • + ))} +
+ )} +
+
+ ) +} diff --git a/src/components/RecentUpdatesPanel.tsx b/src/components/RecentUpdatesPanel.tsx new file mode 100644 index 0000000..fa0cab1 --- /dev/null +++ b/src/components/RecentUpdatesPanel.tsx @@ -0,0 +1,71 @@ +import type { SituationUpdate, ConflictEvent } from '@/data/mockData' +import { Newspaper, AlertTriangle } from 'lucide-react' + +interface RecentUpdatesPanelProps { + updates: SituationUpdate[] + conflictEvents?: ConflictEvent[] + className?: string +} + +const CAT_LABELS: Record = { + deployment: '部署', + alert: '警报', + intel: '情报', + diplomatic: '外交', + other: '其他', +} +const SEV_COLORS: Record = { + low: 'text-military-text-secondary', + medium: 'text-amber-400', + high: 'text-orange-500', + critical: 'text-red-500', +} + +function formatTime(iso: string): string { + const d = new Date(iso) + const now = new Date() + const diff = now.getTime() - d.getTime() + if (diff < 60000) return '刚刚' + if (diff < 3600000) return `${Math.floor(diff / 60000)}分钟前` + if (diff < 86400000) return `${Math.floor(diff / 3600000)}小时前` + return d.toLocaleString('zh-CN', { month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit' }) +} + +export function RecentUpdatesPanel({ updates, conflictEvents = [], className = '' }: RecentUpdatesPanelProps) { + // 优先展示 GDELT 冲突事件(最新 10 条),无则用 updates + const fromConflict = (conflictEvents || []) + .slice(0, 10) + .map((e) => ({ id: e.event_id, summary: e.title, timestamp: e.event_time, category: 'alert' as const, severity: (e.impact_score >= 7 ? 'high' : e.impact_score >= 4 ? 'medium' : 'low') as const })) + const list = fromConflict.length > 0 ? fromConflict : (updates || []).slice(0, 8) + + return ( +
+
+ + + 近期动态 + +
+
+ {list.length === 0 ? ( +

暂无动态

+ ) : ( +
    + {list.map((u) => ( +
  • + + {u.severity === 'critical' && } + {CAT_LABELS[u.category] ?? u.category} + +
    +

    {u.summary}

    + {formatTime(u.timestamp)} +
    +
  • + ))} +
+ )} +
+
// NOTE(review): git-diff residue of src/components/TimelinePanel.tsx. The
// extraction stripped everything inside angle brackets, so all JSX tags and
// TS generic arguments (e.g. `useRef<…>` and `React.ChangeEvent<…>`) are
// missing below — this text is NOT compilable as-is; recover the real file
// from the repository before editing it.
//
// What the surviving code establishes:
//  - formatTick(iso): zh-CN "MM/DD HH:mm" 24-hour label for an ISO timestamp.
//  - TimelinePanel(): reads playback state/actions from usePlaybackStore.
//  - While isPlaying && isReplayMode, a setInterval of speedSecPerTick*1000 ms
//    advances playbackTime to the next REPLAY_TICKS entry (reading the latest
//    value via usePlaybackStore.getState() to avoid a stale closure) and calls
//    setIsPlaying(false) at the last tick; the interval is cleared both on the
//    deps changing and on unmount.
//  - value: slider position = index of playbackTime in REPLAY_TICKS, falling
//    back to the last index when the time is not an exact tick.
//  - handleSliderChange: parses the range input's integer value and maps it
//    back to a tick via setPlaybackTime(REPLAY_TICKS[i]).
+ ) +} diff --git a/src/components/TimelinePanel.tsx b/src/components/TimelinePanel.tsx new file mode 100644 index 0000000..8705ae1 --- /dev/null +++ b/src/components/TimelinePanel.tsx @@ -0,0 +1,142 @@ +import { useEffect, useRef } from 'react' +import { Play, Pause, SkipBack, SkipForward, History } from 'lucide-react' +import { usePlaybackStore, REPLAY_TICKS, REPLAY_START, REPLAY_END } from '@/store/playbackStore' + +function formatTick(iso: string): string { + const d = new Date(iso) + return d.toLocaleString('zh-CN', { + month: '2-digit', + day: '2-digit', + hour: '2-digit', + minute: '2-digit', + hour12: false, + }) +} + +export function TimelinePanel() { + const { + isReplayMode, + playbackTime, + isPlaying, + speedSecPerTick, + setReplayMode, + setPlaybackTime, + setIsPlaying, + stepForward, + stepBack, + setSpeed, + } = usePlaybackStore() + + const timerRef = useRef | null>(null) + + useEffect(() => { + if (!isPlaying || !isReplayMode) { + if (timerRef.current) { + clearInterval(timerRef.current) + timerRef.current = null + } + return + } + timerRef.current = setInterval(() => { + const current = usePlaybackStore.getState().playbackTime + const i = REPLAY_TICKS.indexOf(current) + if (i >= REPLAY_TICKS.length - 1) { + setIsPlaying(false) + return + } + setPlaybackTime(REPLAY_TICKS[i + 1]) + }, speedSecPerTick * 1000) + return () => { + if (timerRef.current) clearInterval(timerRef.current) + } + }, [isPlaying, isReplayMode, speedSecPerTick, setPlaybackTime, setIsPlaying]) + + const index = REPLAY_TICKS.indexOf(playbackTime) + const value = index >= 0 ? index : REPLAY_TICKS.length - 1 + + const handleSliderChange = (e: React.ChangeEvent) => { + const i = parseInt(e.target.value, 10) + setPlaybackTime(REPLAY_TICKS[i]) + } + + return ( +
// NOTE(review): the lines below are the component's returned JSX with every
// tag removed by the extraction. The surviving expressions suggest — TODO
// confirm against the repo — a replay-mode toggle, then (when isReplayMode)
// transport buttons (Play/Pause/SkipBack/SkipForward icons are imported), a
// range slider wired to `value`/handleSliderChange, a speed control wired to
// setSpeed, and formatTick labels for REPLAY_START / playbackTime / REPLAY_END.
+
+ + + {isReplayMode && ( + <> +
+ + + +
+ +
+ +
+ +
+ {formatTick(REPLAY_START)} + {formatTick(playbackTime)} + {formatTick(REPLAY_END)} +
+ + + + )} +
+
+ ) +} diff --git a/src/hooks/useReplaySituation.ts b/src/hooks/useReplaySituation.ts new file mode 100644 index 0000000..5ab9095 --- /dev/null +++ b/src/hooks/useReplaySituation.ts @@ -0,0 +1,143 @@ +import { useMemo } from 'react' +import type { MilitarySituation } from '@/data/mockData' +import { useSituationStore } from '@/store/situationStore' +import { usePlaybackStore } from '@/store/playbackStore' + +/** 将系列时间映射到回放日 (2026-03-01) 以便按当天时刻插值 */ +function toReplayDay(iso: string, baseDay: string): string { + const d = new Date(iso) + const [y, m, day] = baseDay.slice(0, 10).split('-').map(Number) + return new Date(y, (m || 1) - 1, day || 1, d.getUTCHours(), d.getUTCMinutes(), 0, 0).toISOString() +} + +function interpolateAt( + series: { time: string; value: number }[], + at: string, + baseDay = '2026-03-01' +): number { + if (series.length === 0) return 0 + const t = new Date(at).getTime() + const mapped = series.map((p) => ({ + time: toReplayDay(p.time, baseDay), + value: p.value, + })) + const sorted = [...mapped].sort((a, b) => new Date(a.time).getTime() - new Date(b.time).getTime()) + const before = sorted.filter((p) => new Date(p.time).getTime() <= t) + const after = sorted.filter((p) => new Date(p.time).getTime() > t) + if (before.length === 0) return sorted[0].value + if (after.length === 0) return sorted[sorted.length - 1].value + const a = before[before.length - 1] + const b = after[0] + const ta = new Date(a.time).getTime() + const tb = new Date(b.time).getTime() + const f = tb === ta ? 
1 : (t - ta) / (tb - ta) + return a.value + f * (b.value - a.value) +} + +function linearProgress(start: string, end: string, at: string): number { + const ts = new Date(start).getTime() + const te = new Date(end).getTime() + const ta = new Date(at).getTime() + if (ta <= ts) return 0 + if (ta >= te) return 1 + return (ta - ts) / (te - ts) +} + +/** 根据回放时刻派生态势数据 */ +export function useReplaySituation(): MilitarySituation { + const situation = useSituationStore((s) => s.situation) + const { isReplayMode, playbackTime } = usePlaybackStore() + + return useMemo(() => { + if (!isReplayMode) return situation + + const progress = linearProgress('2026-03-01T02:00:00.000Z', '2026-03-01T11:45:00.000Z', playbackTime) + + // 华尔街趋势、反击情绪:按时间插值 + const wsValue = interpolateAt(situation.usForces.wallStreetInvestmentTrend, playbackTime) + const retValue = interpolateAt(situation.iranForces.retaliationSentimentHistory, playbackTime) + + // 战斗损失:从 0 线性增长到当前值 + const lerp = (a: number, b: number) => Math.round(a + progress * (b - a)) + const usLoss = situation.usForces.combatLosses + const irLoss = situation.iranForces.combatLosses + const civUs = usLoss.civilianCasualties ?? { killed: 0, wounded: 0 } + const civIr = irLoss.civilianCasualties ?? 
{ killed: 0, wounded: 0 } + const usLossesAt = { + bases: { + destroyed: lerp(0, usLoss.bases.destroyed), + damaged: lerp(0, usLoss.bases.damaged), + }, + personnelCasualties: { + killed: lerp(0, usLoss.personnelCasualties.killed), + wounded: lerp(0, usLoss.personnelCasualties.wounded), + }, + civilianCasualties: { killed: lerp(0, civUs.killed), wounded: lerp(0, civUs.wounded) }, + aircraft: lerp(0, usLoss.aircraft), + warships: lerp(0, usLoss.warships), + armor: lerp(0, usLoss.armor), + vehicles: lerp(0, usLoss.vehicles), + } + const irLossesAt = { + bases: { + destroyed: lerp(0, irLoss.bases.destroyed), + damaged: lerp(0, irLoss.bases.damaged), + }, + personnelCasualties: { + killed: lerp(0, irLoss.personnelCasualties.killed), + wounded: lerp(0, irLoss.personnelCasualties.wounded), + }, + civilianCasualties: { killed: lerp(0, civIr.killed), wounded: lerp(0, civIr.wounded) }, + aircraft: lerp(0, irLoss.aircraft), + warships: lerp(0, irLoss.warships), + armor: lerp(0, irLoss.armor), + vehicles: lerp(0, irLoss.vehicles), + } + + // 被袭基地:按 damage_level 排序,高损毁先出现;根据 progress 决定显示哪些为 attacked + const usLocs = situation.usForces.keyLocations || [] + const attackedBases = usLocs + .filter((loc) => loc.status === 'attacked') + .sort((a, b) => (b.damage_level ?? 0) - (a.damage_level ?? 
0)) + const totalAttacked = attackedBases.length + const shownAttackedCount = Math.round(progress * totalAttacked) + const attackedNames = new Set( + attackedBases.slice(0, shownAttackedCount).map((l) => l.name) + ) + + const usLocsAt = usLocs.map((loc) => { + if (loc.status === 'attacked' && !attackedNames.has(loc.name)) { + return { ...loc, status: 'operational' as const } + } + return { ...loc } + }) + + return { + ...situation, + lastUpdated: playbackTime, + usForces: { + ...situation.usForces, + keyLocations: usLocsAt, + combatLosses: usLossesAt, + wallStreetInvestmentTrend: [ + ...situation.usForces.wallStreetInvestmentTrend.filter((p) => new Date(p.time).getTime() <= new Date(playbackTime).getTime()), + { time: playbackTime, value: wsValue }, + ].slice(-20), + }, + iranForces: { + ...situation.iranForces, + combatLosses: irLossesAt, + retaliationSentiment: retValue, + retaliationSentimentHistory: [ + ...situation.iranForces.retaliationSentimentHistory.filter((p) => new Date(p.time).getTime() <= new Date(playbackTime).getTime()), + { time: playbackTime, value: retValue }, + ].slice(-20), + }, + recentUpdates: (situation.recentUpdates || []).filter( + (u) => new Date(u.timestamp).getTime() <= new Date(playbackTime).getTime() + ), + conflictEvents: situation.conflictEvents || [], + conflictStats: situation.conflictStats || { total_events: 0, high_impact_events: 0, estimated_casualties: 0, estimated_strike_count: 0 }, + } + }, [situation, isReplayMode, playbackTime]) +} diff --git a/src/store/playbackStore.ts b/src/store/playbackStore.ts new file mode 100644 index 0000000..6b49811 --- /dev/null +++ b/src/store/playbackStore.ts @@ -0,0 +1,80 @@ +import { create } from 'zustand' + +const REPLAY_DAY = '2026-03-01' +const TICK_MS = 30 * 60 * 1000 // 30 minutes + +export const REPLAY_START = `${REPLAY_DAY}T00:00:00.000Z` +export const REPLAY_END = `${REPLAY_DAY}T23:30:00.000Z` + +function parseTime(iso: string): number { + return new Date(iso).getTime() +} + +export 
function getTicks(): string[] { + const ticks: string[] = [] + let t = parseTime(REPLAY_START) + const end = parseTime(REPLAY_END) + while (t <= end) { + ticks.push(new Date(t).toISOString()) + t += TICK_MS + } + return ticks +} + +export const REPLAY_TICKS = getTicks() + +export interface PlaybackState { + /** 是否开启回放模式 */ + isReplayMode: boolean + /** 当前回放时刻 (ISO) */ + playbackTime: string + /** 是否正在自动播放 */ + isPlaying: boolean + /** 播放速度 (秒/刻度) */ + speedSecPerTick: number + setReplayMode: (v: boolean) => void + setPlaybackTime: (iso: string) => void + setIsPlaying: (v: boolean) => void + stepForward: () => void + stepBack: () => void + setSpeed: (sec: number) => void +} + +export const usePlaybackStore = create((set, get) => ({ + isReplayMode: false, + playbackTime: REPLAY_END, + isPlaying: false, + speedSecPerTick: 2, + + setReplayMode: (v) => set({ isReplayMode: v, isPlaying: false }), + + setPlaybackTime: (iso) => { + const ticks = REPLAY_TICKS + if (ticks.includes(iso)) { + set({ playbackTime: iso }) + return + } + const idx = ticks.findIndex((t) => t >= iso) + const clamp = Math.max(0, Math.min(idx < 0 ? ticks.length - 1 : idx, ticks.length - 1)) + set({ playbackTime: ticks[clamp] }) + }, + + setIsPlaying: (v) => set({ isPlaying: v }), + + stepForward: () => { + const { playbackTime } = get() + const ticks = REPLAY_TICKS + const i = ticks.indexOf(playbackTime) + if (i < ticks.length - 1) set({ playbackTime: ticks[i + 1] }) + else set({ isPlaying: false }) + }, + + stepBack: () => { + const { playbackTime } = get() + const ticks = REPLAY_TICKS + const i = ticks.indexOf(playbackTime) + if (i > 0) set({ playbackTime: ticks[i - 1] }) + }, + + setSpeed: (sec) => set({ speedSecPerTick: sec }), +}))