diff --git a/.env.example b/.env.example index 1c1ca53..c6f33ba 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,8 @@ # Mapbox 地图令牌 (波斯湾区域展示) # 免费申请: https://account.mapbox.com/access-tokens/ -# 复制本文件为 .env 并填入你的 token VITE_MAPBOX_ACCESS_TOKEN=your_mapbox_public_token_here + +# 阿里云 DashScope API Key(爬虫 AI 提取用,不设则用规则或 Ollama) +# 在 crawler 目录或系统环境变量中设置,例如: +# export DASHSCOPE_API_KEY=sk-xxx +DASHSCOPE_API_KEY= diff --git a/Dockerfile.crawler b/Dockerfile.crawler index c1907fc..936f3cf 100644 --- a/Dockerfile.crawler +++ b/Dockerfile.crawler @@ -10,7 +10,6 @@ COPY crawler ./ ENV DB_PATH=/data/data.db ENV API_BASE=http://api:3001 -ENV CLEANER_AI_DISABLED=1 ENV GDELT_DISABLED=1 ENV RSS_INTERVAL_SEC=60 diff --git a/crawler/__pycache__/config.cpython-39.pyc b/crawler/__pycache__/config.cpython-39.pyc index cad6fbd..d85e50a 100644 Binary files a/crawler/__pycache__/config.cpython-39.pyc and b/crawler/__pycache__/config.cpython-39.pyc differ diff --git a/crawler/__pycache__/extractor_dashscope.cpython-39.pyc b/crawler/__pycache__/extractor_dashscope.cpython-39.pyc new file mode 100644 index 0000000..5036d66 Binary files /dev/null and b/crawler/__pycache__/extractor_dashscope.cpython-39.pyc differ diff --git a/crawler/__pycache__/news_storage.cpython-39.pyc b/crawler/__pycache__/news_storage.cpython-39.pyc new file mode 100644 index 0000000..99c8ef7 Binary files /dev/null and b/crawler/__pycache__/news_storage.cpython-39.pyc differ diff --git a/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc b/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc index 438cbaf..fc861f5 100644 Binary files a/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc and b/crawler/__pycache__/realtime_conflict_service.cpython-39.pyc differ diff --git a/crawler/config.py b/crawler/config.py index 79ebb29..f5bc435 100644 --- a/crawler/config.py +++ b/crawler/config.py @@ -10,6 +10,9 @@ DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db")) # 
Node API 地址(用于通知推送) API_BASE = os.environ.get("API_BASE", "http://localhost:3001") +# 阿里云 DashScope API Key(用于 AI 提取面板数据,不设则回退到规则/Ollama) +DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "") + # 抓取间隔(秒) CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300")) diff --git a/crawler/extractor_dashscope.py b/crawler/extractor_dashscope.py new file mode 100644 index 0000000..8d92bbf --- /dev/null +++ b/crawler/extractor_dashscope.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +""" +阿里云 DashScope(通义千问)提取面板结构化数据 +从新闻文本中提取战损、报复指数、基地状态等,供 db_merge 落库 +API Key 通过环境变量 DASHSCOPE_API_KEY 配置 +""" +import json +import os +import re +from datetime import datetime, timezone +from typing import Any, Dict, Optional + +from panel_schema import validate_category, validate_severity, validate_summary + + +def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]: + """调用阿里云 DashScope 提取结构化数据""" + api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip() + if not api_key or not text or len(str(text).strip()) < 10: + return None + try: + import dashscope + from http import HTTPStatus + + dashscope.api_key = api_key + + prompt = f"""从以下美伊/中东军事新闻中提取可明确推断的数值,输出 JSON。无依据的字段省略不写。 + +要求: +- summary: 1-2句中文事实摘要,≤80字 +- category: deployment|alert|intel|diplomatic|other +- severity: low|medium|high|critical +- 战损(仅当新闻明确提及数字时填写): + us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded, + us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded, + us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged, + us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles +- retaliation_sentiment: 0-100,仅当新闻涉及伊朗报复/反击情绪时 +- wall_street_value: 0-100,仅当新闻涉及美股/市场反应时 +- key_location_updates: 当新闻提及具体基地遭袭时,数组 [{{"name_keywords":"阿萨德|asad|assad","side":"us","status":"attacked","damage_level":1-3}}] + +原文: +{str(text)[:800]} + +直接输出 JSON,不要其他解释:""" 
+ + response = dashscope.Generation.call( + model="qwen-turbo", + messages=[{"role": "user", "content": prompt}], + result_format="message", + max_tokens=512, + ) + + if response.status_code != HTTPStatus.OK: + return None + raw = (response.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip() + raw = re.sub(r"^```\w*\s*|\s*```$", "", raw) + return json.loads(raw) + except Exception: + return None + + +def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]: + """ + 从新闻文本提取结构化数据,符合面板 schema + 返回: { situation_update?, combat_losses_delta?, retaliation?, wall_street?, key_location_updates? } + """ + ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z") + out: Dict[str, Any] = {} + parsed = _call_dashscope_extract(text) + if not parsed: + return out + + if parsed.get("summary"): + out["situation_update"] = { + "summary": validate_summary(str(parsed["summary"])[:120], 120), + "category": validate_category(str(parsed.get("category", "other")).lower()), + "severity": validate_severity(str(parsed.get("severity", "medium")).lower()), + "timestamp": ts, + } + + loss_us = {} + loss_ir = {} + for k in ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", + "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles"]: + uk, ik = f"us_{k}", f"iran_{k}" + if uk in parsed and isinstance(parsed[uk], (int, float)): + loss_us[k] = max(0, int(parsed[uk])) + if ik in parsed and isinstance(parsed[ik], (int, float)): + loss_ir[k] = max(0, int(parsed[ik])) + if loss_us or loss_ir: + out["combat_losses_delta"] = {} + if loss_us: + out["combat_losses_delta"]["us"] = loss_us + if loss_ir: + out["combat_losses_delta"]["iran"] = loss_ir + + if "retaliation_sentiment" in parsed: + v = parsed["retaliation_sentiment"] + if isinstance(v, (int, float)) and 0 <= v <= 100: + out["retaliation"] = {"value": int(v), "time": ts} + + if "wall_street_value" in parsed: + v = 
parsed["wall_street_value"] + if isinstance(v, (int, float)) and 0 <= v <= 100: + out["wall_street"] = {"time": ts, "value": int(v)} + + if "key_location_updates" in parsed and isinstance(parsed["key_location_updates"], list): + valid = [] + for u in parsed["key_location_updates"]: + if isinstance(u, dict) and u.get("name_keywords") and u.get("side") in ("us", "iran"): + valid.append({ + "name_keywords": str(u["name_keywords"]), + "side": u["side"], + "status": str(u.get("status", "attacked"))[:20], + "damage_level": min(3, max(1, int(u["damage_level"]))) if isinstance(u.get("damage_level"), (int, float)) else 2, + }) + if valid: + out["key_location_updates"] = valid + + return out diff --git a/crawler/news_storage.py b/crawler/news_storage.py new file mode 100644 index 0000000..9a5e101 --- /dev/null +++ b/crawler/news_storage.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +""" +资讯内容独立存储,支持历史去重 +爬虫拉回数据 → 计算 content_hash → 若已存在则跳过(去重)→ 新数据落库 news_content +""" +import hashlib +import os +import re +import sqlite3 +from datetime import datetime, timezone +from typing import List, Optional, Tuple + +from config import DB_PATH + + +def _to_utc_iso(dt: datetime) -> str: + if dt.tzinfo: + dt = dt.astimezone(timezone.utc) + return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z") + + +def _normalize_for_hash(text: str) -> str: + """归一化文本用于生成去重 hash""" + if not text: + return "" + t = re.sub(r"\s+", " ", str(text).strip().lower())[:600] + return re.sub(r"[\x00-\x1f]", "", t) + + +def content_hash(title: str, summary: str, url: str) -> str: + """根据标题、摘要、URL 生成去重 hash,相似内容视为重复""" + raw = _normalize_for_hash(title) + "|" + _normalize_for_hash(summary) + "|" + (url or "").strip() + return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:32] + + +def _ensure_table(conn: sqlite3.Connection) -> None: + conn.execute(""" + CREATE TABLE IF NOT EXISTS news_content ( + id TEXT PRIMARY KEY, + content_hash TEXT NOT NULL UNIQUE, + title TEXT NOT NULL, + summary TEXT NOT NULL, + url TEXT NOT NULL 
DEFAULT '', + source TEXT NOT NULL DEFAULT '', + published_at TEXT NOT NULL, + category TEXT NOT NULL DEFAULT 'other', + severity TEXT NOT NULL DEFAULT 'medium', + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + try: + conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)") + except sqlite3.OperationalError: + pass + try: + conn.execute("CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)") + except sqlite3.OperationalError: + pass + conn.commit() + + +def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool: + row = conn.execute("SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1", (h,)).fetchone() + return row is not None + + +def insert_news( + conn: sqlite3.Connection, + *, + title: str, + summary: str, + url: str = "", + source: str = "", + published: datetime, + category: str = "other", + severity: str = "medium", +) -> Optional[str]: + """ + 插入资讯,若 content_hash 已存在则跳过(去重) + 返回: 新插入的 id,或 None 表示重复跳过 + """ + _ensure_table(conn) + h = content_hash(title, summary, url) + if exists_by_hash(conn, h): + return None + uid = "nc_" + hashlib.sha256(f"{h}{datetime.utcnow().isoformat()}".encode()).hexdigest()[:14] + ts = _to_utc_iso(published) + conn.execute( + """INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (uid, h, (title or "")[:500], (summary or "")[:2000], (url or "")[:500], (source or "")[:100], ts, category, severity), + ) + conn.commit() + return uid + + +def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]: + """ + 去重后落库 news_content + items: [{"title","summary","url","published","category","severity","source"?}, ...] 
+ 返回: (通过去重的新项列表, 实际新增条数) + """ + path = db_path or DB_PATH + if not os.path.exists(path): + return [], 0 + conn = sqlite3.connect(path, timeout=10) + try: + _ensure_table(conn) + new_items: List[dict] = [] + count = 0 + for u in items: + title = (u.get("title") or "")[:500] + summary = (u.get("summary") or u.get("title") or "")[:2000] + url = (u.get("url") or "")[:500] + source = (u.get("source") or "")[:100] + pub = u.get("published") + if isinstance(pub, str): + try: + pub = datetime.fromisoformat(pub.replace("Z", "+00:00")) + except ValueError: + pub = datetime.now(timezone.utc) + elif pub is None: + pub = datetime.now(timezone.utc) + cat = u.get("category", "other") + sev = u.get("severity", "medium") + uid = insert_news( + conn, + title=title, + summary=summary, + url=url, + source=source, + published=pub, + category=cat, + severity=sev, + ) + if uid: + count += 1 + new_items.append({**u, "news_id": uid}) + return new_items, count + finally: + conn.close() diff --git a/crawler/realtime_conflict_service.py b/crawler/realtime_conflict_service.py index 76b87c4..5e4e626 100644 --- a/crawler/realtime_conflict_service.py +++ b/crawler/realtime_conflict_service.py @@ -283,7 +283,7 @@ def _rss_to_gdelt_fallback() -> None: # ========================== -# RSS 新闻抓取(补充 situation_update + AI 提取面板数据) +# RSS 新闻抓取:资讯落库(去重) → AI 提取 → 面板数据落库 → 通知前端 # ========================== LAST_FETCH = {"items": 0, "inserted": 0, "error": None} @@ -292,6 +292,7 @@ def fetch_news() -> None: try: from scrapers.rss_scraper import fetch_all from db_writer import write_updates + from news_storage import save_and_dedup from translate_utils import translate_to_chinese from cleaner_ai import clean_news_for_panel from cleaner_ai import ensure_category, ensure_severity @@ -304,36 +305,44 @@ def fetch_news() -> None: it["summary"] = clean_news_for_panel(raw_summary or raw_title, max_len=120) it["category"] = ensure_category(it.get("category", "other")) it["severity"] = 
ensure_severity(it.get("severity", "medium")) - n = write_updates(items) if items else 0 + it["source"] = it.get("source") or "rss" + # 1. 历史去重:资讯内容落库 news_content(独立表,便于后续消费) + new_items, n_news = save_and_dedup(items, db_path=DB_PATH) + # 2. 面板展示:新增资讯写入 situation_update(供前端 recentUpdates) + n_panel = write_updates(new_items) if new_items else 0 LAST_FETCH["items"] = len(items) - LAST_FETCH["inserted"] = n - if items: - _extract_and_merge_panel_data(items) + LAST_FETCH["inserted"] = n_news + # 3. AI 提取 + 合并到 combat_losses / key_location 等 + if new_items: + _extract_and_merge_panel_data(new_items) # GDELT 禁用时用 RSS 填充 gdelt_events,使地图有冲突点 if GDELT_DISABLED: _rss_to_gdelt_fallback() - # 每次抓取完成都通知 Node 更新时间戳,便于「实时更新」显示 _notify_node() - print(f"[{datetime.now().strftime('%H:%M:%S')}] RSS 抓取 {len(items)} 条,新增入库 {n} 条") + print(f"[{datetime.now().strftime('%H:%M:%S')}] RSS 抓取 {len(items)} 条,去重后新增 {n_news} 条资讯,面板 {n_panel} 条") except Exception as e: LAST_FETCH["error"] = str(e) print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻抓取失败: {e}") def _extract_and_merge_panel_data(items: list) -> None: - """对新闻做 AI/规则 提取,合并到 combat_losses / retaliation / wall_street_trend 等表""" + """AI 分析提取面板相关数据,清洗后落库""" if not items or not os.path.exists(DB_PATH): return try: from db_merge import merge - if os.environ.get("CLEANER_AI_DISABLED", "0") == "1": + use_dashscope = bool(os.environ.get("DASHSCOPE_API_KEY", "").strip()) + if use_dashscope: + from extractor_dashscope import extract_from_news + limit = 10 + elif os.environ.get("CLEANER_AI_DISABLED", "0") == "1": from extractor_rules import extract_from_news + limit = 25 else: from extractor_ai import extract_from_news + limit = 10 from datetime import timezone merged_any = False - # 规则模式可多处理几条(无 Ollama);AI 模式限制 5 条避免调用过多 - limit = 25 if os.environ.get("CLEANER_AI_DISABLED", "0") == "1" else 10 for it in items[:limit]: text = (it.get("title", "") or "") + " " + (it.get("summary", "") or "") if len(text.strip()) < 20: diff --git 
a/crawler/requirements.txt b/crawler/requirements.txt index e1a5d6e..5facd77 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -3,3 +3,4 @@ feedparser>=6.0.0 fastapi>=0.109.0 uvicorn>=0.27.0 deep-translator>=1.11.0 +dashscope>=1.20.0 diff --git a/docker-compose.yml b/docker-compose.yml index 9d0dbc9..10bdb59 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,9 +22,9 @@ services: environment: - DB_PATH=/data/data.db - API_BASE=http://api:3001 - - CLEANER_AI_DISABLED=1 - GDELT_DISABLED=1 - RSS_INTERVAL_SEC=60 + - DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-} volumes: - app-data:/data depends_on: diff --git a/docs/CRAWLER_PIPELINE.md b/docs/CRAWLER_PIPELINE.md new file mode 100644 index 0000000..30064c0 --- /dev/null +++ b/docs/CRAWLER_PIPELINE.md @@ -0,0 +1,65 @@ +# 爬虫数据流水线 + +## 数据流 + +``` +RSS 抓取 + ↓ 翻译、清洗 + ↓ news_storage.save_and_dedup() → 历史去重 + ↓ +news_content(资讯独立表,供后续消费) + ↓ + ↓ 去重后的新数据 + ↓ +situation_update(面板展示用) + ↓ + ↓ AI 提取(阿里云 DashScope) + ↓ +combat_losses / retaliation / key_location / wall_street_trend + ↓ + ↓ notify Node + ↓ +前端 WebSocket + 轮询 +``` + +## 阿里云 DashScope API Key + +设置环境变量 `DASHSCOPE_API_KEY` 后,爬虫使用阿里云通义千问进行 AI 提取。不设置时,若 `CLEANER_AI_DISABLED=1` 则回退到规则提取(`extractor_rules`),否则回退到 Ollama(`extractor_ai`)。 + +```bash +# 本地 +export DASHSCOPE_API_KEY=sk-xxx + +# Docker +DASHSCOPE_API_KEY=sk-xxx docker compose up -d +# 或在 .env 中写入 DASHSCOPE_API_KEY=sk-xxx +``` + +## 表说明 + +| 表 | 用途 | +|----|------| +| `news_content` | 资讯原文,独立存储,支持去重(content_hash),供后续消费 | +| `situation_update` | 面板「近期更新」展示 | +| `combat_losses` | 战损数据(AI/规则提取) | +| `key_location` | 基地状态 | +| `gdelt_events` | 地图冲突点 | + +## 去重逻辑 + +根据 `content_hash = sha256(normalize(title) + "|" + normalize(summary) + "|" + url)[:32]` 判断;标题、摘要经归一化(合并空白、转小写、截取前 600 字符)后与 URL 均相同的内容视为重复,不入库。 + +## 消费资讯 + +- HTTP: `GET /api/news?limit=50` +- 调试: `/db` 面板查看 `news_content` 表 + +## 链路验证 + +运行脚本一键检查全链路: + +```bash +./scripts/verify-pipeline.sh +``` + +支持环境变量覆盖:`API_URL`、`CRAWLER_URL` diff --git a/package.json b/package.json index bf8d36a..6548247 100644 --- 
a/package.json +++ b/package.json @@ -14,7 +14,9 @@ "build": "vite build", "typecheck": "tsc --noEmit", "lint": "eslint .", - "preview": "vite preview" + "preview": "vite preview", + "verify": "./scripts/verify-pipeline.sh", + "verify:full": "./scripts/verify-pipeline.sh --start-crawler" }, "dependencies": { "better-sqlite3": "^11.6.0", diff --git a/scripts/verify-pipeline.sh b/scripts/verify-pipeline.sh new file mode 100755 index 0000000..5098f89 --- /dev/null +++ b/scripts/verify-pipeline.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# 验证爬虫 → 数据库 → API → 前端 全链路 +# 用法: ./scripts/verify-pipeline.sh [--start-crawler] +set -e + +API_URL="${API_URL:-http://localhost:3001}" +CRAWLER_URL="${CRAWLER_URL:-http://localhost:8000}" +START_CRAWLER=false +[[ "${1:-}" = "--start-crawler" ]] && START_CRAWLER=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" + +echo "==========================================" +echo "US-Iran 态势面板 链路验证" +echo "API: $API_URL | Crawler: $CRAWLER_URL" +echo "==========================================" +echo "" + +# 可选:启动爬虫 +if $START_CRAWLER; then + echo "[0/6] 启动爬虫..." + if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then + echo " ✓ 爬虫已在运行" + else + cd "$PROJECT_ROOT/crawler" + python3 -c "import uvicorn" 2>/dev/null || { echo " 需安装: pip install uvicorn"; exit 1; } + uvicorn realtime_conflict_service:app --host 127.0.0.1 --port 8000 & + echo " 等待爬虫就绪..." + for i in $(seq 1 15); do + sleep 2 + if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then + echo " ✓ 爬虫已启动" + echo " 等待首次 RSS 抓取(约 70 秒)..." + sleep 70 + break + fi + done + if ! curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then + echo " ✗ 爬虫启动超时" + exit 1 + fi + fi + echo "" +fi + +# 1. API 健康检查 +echo "[1/6] API 健康检查..." +if curl -sf "$API_URL/api/health" > /dev/null; then + echo " ✓ API 正常" +else + echo " ✗ API 无响应,请先运行: npm run api" + exit 1 +fi + +# 2. 态势数据 +echo "[2/6] 态势数据..." 
+SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}") +if echo "$SIT" | grep -q "lastUpdated"; then + echo " ✓ 态势数据可读" + LAST=$(echo "$SIT" | grep -o '"lastUpdated":"[^"]*"' | head -1) + echo " $LAST" +else + echo " ✗ 态势数据异常" + exit 1 +fi + +# 3. 爬虫状态 +echo "[3/6] 爬虫状态..." +CRAWLER=$(curl -sf "$CRAWLER_URL/crawler/status" 2>/dev/null || echo "{}") +if echo "$CRAWLER" | grep -q "db_path\|db_exists"; then + echo " ✓ 爬虫服务可访问" + if command -v jq &>/dev/null; then + CNT=$(echo "$CRAWLER" | jq -r '.situation_update_count // "?"') + echo " situation_update 条数: $CNT" + fi +else + echo " ⚠ 爬虫未启动或不可达(可选,需单独运行爬虫)" +fi + +# 4. 资讯表 +echo "[4/6] 资讯表 news_content..." +NEWS=$(curl -sf "$API_URL/api/news?limit=3" 2>/dev/null || echo '{"items":[]}') +if echo "$NEWS" | grep -q '"items"'; then + if command -v jq &>/dev/null; then + N=$(echo "$NEWS" | jq '.items | length') + echo " ✓ 最近 $N 条资讯" + else + echo " ✓ 资讯接口可读" + fi +else + echo " ⚠ news_content 可能为空(爬虫未跑或刚启动)" +fi + +# 5. 战损数据 +echo "[5/6] 战损数据 combat_losses..." +if echo "$SIT" | grep -q "personnelCasualties"; then + echo " ✓ 战损字段存在" + if command -v jq &>/dev/null; then + US_K=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.killed // "?"') + IR_K=$(echo "$SIT" | jq -r '.iranForces.combatLosses.personnelCasualties.killed // "?"') + echo " 美军阵亡: $US_K | 伊朗阵亡: $IR_K" + fi +else + echo " ✗ 战损结构异常" +fi + +# 6. 通知接口(仅验证可调用) +echo "[6/6] 通知接口 POST /api/crawler/notify..." 
+NOTIFY=$(curl -sf -X POST "$API_URL/api/crawler/notify" 2>/dev/null || echo "{}") +if echo "$NOTIFY" | grep -q '"ok"'; then + echo " ✓ 通知接口正常" +else + echo " ⚠ 通知接口可能异常" +fi + +echo "" +echo "==========================================" +echo "验证完成。" +echo "" +echo "建议:" +echo " - 访问 $API_URL/db 查看各表数据" +echo " - 爬虫未启动时: ./scripts/verify-pipeline.sh --start-crawler" +echo " - 或手动启动: cd crawler && uvicorn realtime_conflict_service:app --port 8000" +echo "==========================================" diff --git a/server/data.db-shm b/server/data.db-shm index 4829ca3..54018ee 100644 Binary files a/server/data.db-shm and b/server/data.db-shm differ diff --git a/server/data.db-wal b/server/data.db-wal index 628ee38..1fc0997 100644 Binary files a/server/data.db-wal and b/server/data.db-wal differ diff --git a/server/db.js b/server/db.js index b55a414..70eb356 100644 --- a/server/db.js +++ b/server/db.js @@ -112,6 +112,21 @@ db.exec(` estimated_strike_count INTEGER NOT NULL DEFAULT 0, updated_at TEXT NOT NULL ); + + CREATE TABLE IF NOT EXISTS news_content ( + id TEXT PRIMARY KEY, + content_hash TEXT NOT NULL UNIQUE, + title TEXT NOT NULL, + summary TEXT NOT NULL, + url TEXT NOT NULL DEFAULT '', + source TEXT NOT NULL DEFAULT '', + published_at TEXT NOT NULL, + category TEXT NOT NULL DEFAULT 'other', + severity TEXT NOT NULL DEFAULT 'medium', + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash); + CREATE INDEX IF NOT EXISTS idx_news_content_published ON news_content(published_at DESC); `) // 迁移:为已有 key_location 表添加 type、region、status、damage_level 列 diff --git a/server/openapi.js b/server/openapi.js index 7f61366..e46b3d8 100644 --- a/server/openapi.js +++ b/server/openapi.js @@ -25,6 +25,15 @@ module.exports = { }, }, }, + '/api/news': { + get: { + summary: '资讯内容', + description: '从 news_content 表读取,支持 ?limit=50 分页', + tags: ['资讯'], + parameters: [{ in: 'query', name: 'limit', schema: { 
type: 'integer', default: 50 } }], + responses: { 200: { description: 'items 数组' } }, + }, + }, '/api/db/dashboard': { get: { summary: '数据库面板', @@ -130,5 +139,5 @@ module.exports = { }, }, }, - tags: [{ name: '态势' }, { name: '统计' }, { name: '反馈' }, { name: '调试' }, { name: '系统' }], + tags: [{ name: '态势' }, { name: '资讯' }, { name: '统计' }, { name: '反馈' }, { name: '调试' }, { name: '系统' }], } diff --git a/server/routes.js b/server/routes.js index d8427e3..1b65869 100644 --- a/server/routes.js +++ b/server/routes.js @@ -19,6 +19,7 @@ router.get('/db/dashboard', (req, res) => { 'retaliation_current', 'retaliation_history', 'situation_update', + 'news_content', 'gdelt_events', 'conflict_stats', ] @@ -27,6 +28,7 @@ router.get('/db/dashboard', (req, res) => { feedback: 'created_at DESC', situation: 'updated_at DESC', situation_update: 'timestamp DESC', + news_content: 'published_at DESC', gdelt_events: 'event_time DESC', wall_street_trend: 'time DESC', retaliation_history: 'time DESC', @@ -55,6 +57,17 @@ router.get('/db/dashboard', (req, res) => { } }) +// 资讯内容(独立表,供后续消费) +router.get('/news', (req, res) => { + try { + const limit = Math.min(parseInt(req.query.limit, 10) || 50, 200) + const rows = db.prepare('SELECT id, title, summary, url, source, published_at, category, severity, created_at FROM news_content ORDER BY published_at DESC LIMIT ?').all(limit) + res.json({ items: rows }) + } catch (err) { + res.status(500).json({ error: err.message }) + } +}) + router.get('/situation', (req, res) => { try { res.json(getSituation()) diff --git a/src/config.ts b/src/config.ts index 7f2ba45..3221625 100644 --- a/src/config.ts +++ b/src/config.ts @@ -8,3 +8,4 @@ export const config = { /** 是否显示滚动情报 */ showNewsTicker: false, } +