fix: 优化数据
This commit is contained in:
@@ -18,12 +18,19 @@
|
||||
|
||||
| 项目 | 说明 |
|
||||
|------|------|
|
||||
| 源 | Reuters、BBC World/MiddleEast、Al Jazeera、NYT World |
|
||||
| 源 | 多国主流媒体:美(Reuters/NYT)、英(BBC/The Guardian)、法(France 24)、德(DW)、俄(TASS/RT)、中(Xinhua/CGTN)、伊(Press TV)、卡塔尔(Al Jazeera) |
|
||||
| 过滤 | 标题/摘要需含 `KEYWORDS` 之一(iran、usa、strike、military 等) |
|
||||
| 更新 | 爬虫默认 60 秒拉一次(`RSS_INTERVAL_SEC`),优先保证事件脉络 |
|
||||
| 优先级 | 启动时先拉 RSS,再拉 GDELT |
|
||||
|
||||
**GDELT 无法访问时**:设置 `GDELT_DISABLED=1`,仅用 RSS 新闻即可维持事件脉络。
|
||||
**GDELT 无法访问时**:设置 `GDELT_DISABLED=1`,仅用 RSS 新闻即可维持事件脉络。部分境外源可能受网络限制。
|
||||
|
||||
### 3. AI 新闻清洗与分类(可选)
|
||||
|
||||
- **清洗**:`cleaner_ai.py` 用 Ollama 提炼新闻为简洁摘要,供面板展示
|
||||
- **分类**:`parser_ai.py` 用 Ollama 替代规则做 category/severity 判定
|
||||
- 需先安装并运行 Ollama:`ollama run llama3.1`
|
||||
- 环境变量:`OLLAMA_MODEL=llama3.1`、`PARSER_AI_DISABLED=1`、`CLEANER_AI_DISABLED=1`(禁用对应 AI)
|
||||
|
||||
---
|
||||
|
||||
@@ -69,6 +76,9 @@ GDELT API → 抓取(60s) → SQLite (gdelt_events, conflict_stats) → POST /ap
|
||||
- `GDELT_DISABLED`: 设为 `1` 则跳过 GDELT,仅用 RSS 新闻(GDELT 无法访问时用)
|
||||
- `FETCH_INTERVAL_SEC`: GDELT 抓取间隔(秒),默认 60
|
||||
- `RSS_INTERVAL_SEC`: RSS 抓取间隔(秒),默认 60(优先保证事件脉络)
|
||||
- `OLLAMA_MODEL`: AI 分类模型,默认 `llama3.1`
|
||||
- `PARSER_AI_DISABLED`: 设为 `1` 则禁用 AI 分类,仅用规则
|
||||
- `CLEANER_AI_DISABLED`: 设为 `1` 则禁用 AI 清洗,仅用规则截断;同时面板数据提取改用规则版 `extractor_rules.py`(不再调用 Ollama)
|
||||
|
||||
## 冲突强度 (impact_score)
|
||||
|
||||
|
||||
BIN
crawler/__pycache__/cleaner_ai.cpython-39.pyc
Normal file
BIN
crawler/__pycache__/cleaner_ai.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
crawler/__pycache__/parser_ai.cpython-39.pyc
Normal file
BIN
crawler/__pycache__/parser_ai.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
87
crawler/cleaner_ai.py
Normal file
87
crawler/cleaner_ai.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
AI 清洗新闻数据,严格按面板字段约束输出
|
||||
面板 EventTimelinePanel 所需:summary(≤120字)、category(枚举)、severity(枚举)
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
|
||||
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
|
||||
|
||||
# 面板 schema:必须与 EventTimelinePanel / SituationUpdate 一致
|
||||
SUMMARY_MAX_LEN = 120 # 面板 line-clamp-2 展示
|
||||
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
|
||||
SEVERITIES = ("low", "medium", "high", "critical")
|
||||
|
||||
|
||||
def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Normalize text into a single-line, plain-text summary for the panel.

    Non-whitespace control characters are removed first, then every run of
    whitespace (including newlines and tabs) is collapsed to one space, and
    finally the result is cut to ``max_len`` characters.

    Args:
        text: Raw summary text. Empty or non-string values yield "".
        max_len: Maximum number of characters to keep.

    Returns:
        The cleaned summary without leading/trailing whitespace.
    """
    if not text or not isinstance(text, str):
        return ""
    # Remove control characters that are NOT whitespace before collapsing.
    # Doing it afterwards (as before) left doubled or leading spaces behind,
    # e.g. "a \x00 b" -> "a  b". The whitespace-like controls (\t, \n, \r,
    # \x0b, \x0c, \x1c-\x1f) are handled by the \s+ collapse below.
    s = re.sub(r"[\x00-\x08\x0e-\x1b]", "", text)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:max_len].rstrip()
|
||||
|
||||
|
||||
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Rule-based cleaning fallback.

    Delegates to ``_sanitize_summary``, which normalizes whitespace, drops
    control characters and truncates to ``max_len``.
    """
    cleaned = _sanitize_summary(text, max_len)
    return cleaned
|
||||
|
||||
|
||||
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Ask the local Ollama chat endpoint for a short plain-text summary.

    Args:
        text: News text to condense; only the first 350 characters are sent.
        max_len: Character budget passed to the prompt and enforced on the reply.
        timeout: HTTP timeout in seconds for the Ollama request.

    Returns:
        The sanitized summary, or None when AI cleaning is disabled, the input
        is shorter than 5 characters, the HTTP status is not 200, the reply is
        3 characters or shorter, or any exception occurs.
    """
    too_short = not text or len(str(text).strip()) < 5
    if CLEANER_AI_DISABLED or too_short:
        return None
    try:
        # Imported lazily inside the try so a missing `requests` package
        # simply makes the caller fall back to rule-based cleaning.
        import requests

        prompt = f"""将新闻提炼为1-2句简洁中文事实,直接输出纯文本,不要标号、引号、解释。限{max_len}字内。

原文:{str(text)[:350]}

输出:"""
        payload = {
            "model": OLLAMA_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "options": {"num_predict": 150},
        }
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=payload,
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        content = response.json().get("message", {}).get("content", "") or ""
        summary = content.strip()
        # First strip leading list numbering/bullets, then surrounding quotes.
        for pattern in (r"^[\d\.\-\*\s]+", r"^['\"\s]+|['\"\s]+$"):
            summary = re.sub(pattern, "", summary)
        summary = _sanitize_summary(summary, max_len)
        return summary if summary and len(summary) > 3 else None
    except Exception:
        # Best effort: any failure means "no AI summary available".
        return None
|
||||
|
||||
|
||||
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Produce the summary text for the panel.

    Tries the Ollama summary first and falls back to rule-based cleaning when
    no AI summary is returned.

    Args:
        text: Raw news text.
        max_len: Maximum length of the returned summary.

    Returns:
        "" for empty, whitespace-only or non-string input; otherwise the AI
        summary if one was produced, else the rule-cleaned text.
    """
    if not (text and isinstance(text, str)):
        return ""
    stripped = str(text).strip()
    if stripped == "":
        return ""
    ai_summary = _call_ollama_summary(stripped, max_len, timeout=6)
    return ai_summary if ai_summary else _rule_clean(stripped, max_len)
|
||||
|
||||
|
||||
def ensure_category(cat: str) -> str:
    """Return ``cat`` if it is one of ``CATEGORIES``, otherwise "other"."""
    if cat in CATEGORIES:
        return cat
    return "other"
|
||||
|
||||
|
||||
def ensure_severity(sev: str) -> str:
    """Return ``sev`` if it is one of ``SEVERITIES``, otherwise "medium"."""
    if sev in SEVERITIES:
        return sev
    return "medium"
|
||||
@@ -13,14 +13,30 @@ API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
|
||||
# 抓取间隔(秒)
|
||||
CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300"))
|
||||
|
||||
# RSS 源(美伊/中东相关,多源保证实时事件脉络)
|
||||
# RSS 源:世界主流媒体,覆盖美伊/中东多视角
|
||||
RSS_FEEDS = [
|
||||
# 美国
|
||||
"https://feeds.reuters.com/reuters/topNews",
|
||||
"https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
|
||||
# 英国
|
||||
"https://feeds.bbci.co.uk/news/world/rss.xml",
|
||||
"https://feeds.bbci.co.uk/news/world/middle_east/rss.xml",
|
||||
"https://www.theguardian.com/world/rss",
|
||||
# 法国
|
||||
"https://www.france24.com/en/rss",
|
||||
# 德国
|
||||
"https://rss.dw.com/xml/rss-en-world",
|
||||
# 俄罗斯
|
||||
"https://tass.com/rss/v2.xml",
|
||||
"https://www.rt.com/rss/",
|
||||
# 中国
|
||||
"https://english.news.cn/rss/world.xml",
|
||||
"https://www.cgtn.com/rss/world",
|
||||
# 伊朗
|
||||
"https://www.presstv.ir/rss",
|
||||
# 卡塔尔(中东)
|
||||
"https://www.aljazeera.com/xml/rss/all.xml",
|
||||
"https://www.aljazeera.com/xml/rss/middleeast.xml",
|
||||
"https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
|
||||
]
|
||||
|
||||
# 关键词过滤:至少匹配一个才会入库
|
||||
|
||||
126
crawler/db_merge.py
Normal file
126
crawler/db_merge.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
将 AI 提取的结构化数据合并到 SQLite
|
||||
与 panel schema 及 situationData.getSituation 对齐,支持回放
|
||||
"""
|
||||
import os
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
|
||||
|
||||
|
||||
# Numeric combat_losses columns, in the order used by the SELECT/UPDATE below.
_LOSS_COLUMNS = (
    "personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
    "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
)


def _ensure_tables(conn: sqlite3.Connection) -> None:
    """Create the tables this module writes to and add missing columns.

    The original note says the layout is meant to match ``db.js``; that file
    is not visible here, so treat the schema below as the local contract.

    Args:
        conn: Open SQLite connection. The function commits before returning.
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS situation_update (
            id TEXT PRIMARY KEY, timestamp TEXT NOT NULL, category TEXT NOT NULL,
            summary TEXT NOT NULL, severity TEXT NOT NULL
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS combat_losses (
            side TEXT PRIMARY KEY CHECK (side IN ('us', 'iran')),
            bases_destroyed INTEGER NOT NULL, bases_damaged INTEGER NOT NULL,
            personnel_killed INTEGER NOT NULL, personnel_wounded INTEGER NOT NULL,
            aircraft INTEGER NOT NULL, warships INTEGER NOT NULL, armor INTEGER NOT NULL, vehicles INTEGER NOT NULL
        )
    """)
    # Add later columns only when absent, instead of running ALTER TABLE and
    # swallowing every OperationalError (which also hid real failures).
    existing = {row[1] for row in conn.execute("PRAGMA table_info(combat_losses)")}
    migrations = (
        ("civilian_killed", "INTEGER NOT NULL DEFAULT 0"),
        ("civilian_wounded", "INTEGER NOT NULL DEFAULT 0"),
        # SQLite refuses ADD COLUMN with an expression default such as
        # (datetime('now')) on a table that already has rows, so the column
        # was silently never created. It is added without a default; merge()
        # always writes updated_at explicitly.
        ("updated_at", "TEXT"),
    )
    for column, ddl in migrations:
        if column not in existing:
            conn.execute(f"ALTER TABLE combat_losses ADD COLUMN {column} {ddl}")
    conn.execute("CREATE TABLE IF NOT EXISTS wall_street_trend (id INTEGER PRIMARY KEY AUTOINCREMENT, time TEXT NOT NULL, value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS retaliation_current (id INTEGER PRIMARY KEY CHECK (id = 1), value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS retaliation_history (id INTEGER PRIMARY KEY AUTOINCREMENT, time TEXT NOT NULL, value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS situation (id INTEGER PRIMARY KEY CHECK (id = 1), data TEXT NOT NULL, updated_at TEXT NOT NULL)")
    conn.commit()


def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
    """Merge extracted news data into the SQLite database.

    Recognized keys of ``extracted``:
      * ``situation_update``: dict with summary/timestamp/category/severity,
        inserted once per (summary, timestamp) pair.
      * ``combat_losses_delta``: ``{"us"|"iran": {column: increment}}``; each
        increment is added to the stored value and clamped at 0. Sides without
        an existing row are skipped.
      * ``retaliation``: ``{"value", "time"}``; replaces the current value and
        appends a history row.
      * ``wall_street``: ``{"time", "value"}``; appended to the trend table.

    Args:
        extracted: Output of one of the extractors.
        db_path: Database file; defaults to the module-level ``DB_PATH``.

    Returns:
        True if at least one row was inserted or updated, False otherwise
        (including when the database file does not exist).

    Raises:
        Exception: Any error outside the per-side combat-loss update is
            re-raised after the transaction is rolled back.
    """
    import hashlib
    import logging
    from datetime import timezone

    path = db_path or DB_PATH
    if not os.path.exists(path):
        return False
    log = logging.getLogger(__name__)
    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    conn = sqlite3.connect(path, timeout=10)
    try:
        _ensure_tables(conn)
        updated = False

        # situation_update
        if "situation_update" in extracted:
            u = extracted["situation_update"]
            summary = str(u.get("summary") or "")
            timestamp = str(u.get("timestamp") or "")
            # The built-in hash() of a str is salted per process, so it cannot
            # serve as a dedup key across restarts; use a stable digest.
            digest = hashlib.sha256((summary + timestamp).encode("utf-8")).hexdigest()
            uid = f"ai_{int(digest, 16) % 10**10}"
            cur = conn.execute(
                "INSERT OR IGNORE INTO situation_update (id, timestamp, category, summary, severity) VALUES (?, ?, ?, ?, ?)",
                (uid, timestamp, u.get("category", "other"), summary[:500], u.get("severity", "medium")),
            )
            # rowcount is per statement; total_changes is cumulative for the
            # whole connection and would report earlier writes as well.
            if cur.rowcount > 0:
                updated = True

        # combat_losses: add the increments to the stored values
        select_sql = f"SELECT {', '.join(_LOSS_COLUMNS)} FROM combat_losses WHERE side = ?"
        update_sql = (
            "UPDATE combat_losses SET "
            + ", ".join(f"{col}=?" for col in _LOSS_COLUMNS)
            + ", updated_at=? WHERE side=?"
        )
        for side, delta in (extracted.get("combat_losses_delta") or {}).items():
            if side not in ("us", "iran"):
                continue
            try:
                row = conn.execute(select_sql, (side,)).fetchone()
                if not row:
                    continue
                new_values = [
                    max(0, int(current or 0) + int(delta.get(col) or 0))
                    for col, current in zip(_LOSS_COLUMNS, row)
                ]
                cur = conn.execute(update_sql, (*new_values, now_iso, side))
                if cur.rowcount > 0:
                    updated = True
            except (sqlite3.Error, AttributeError, TypeError, ValueError) as exc:
                # Still best effort per side, but no longer silent.
                log.warning("combat_losses update skipped for %s: %s", side, exc)

        # retaliation
        if "retaliation" in extracted:
            r = extracted["retaliation"]
            conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (r["value"],))
            conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (r["time"], r["value"]))
            updated = True

        # wall_street_trend
        if "wall_street" in extracted:
            w = extracted["wall_street"]
            conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (w["time"], w["value"]))
            updated = True

        if updated:
            # Touch the single `situation` row so its updated_at reflects this merge.
            conn.execute("INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)", (now_iso,))
        conn.commit()
        return updated
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||
100
crawler/extractor_ai.py
Normal file
100
crawler/extractor_ai.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
从新闻文本中 AI 提取结构化数据,映射到面板 schema
|
||||
输出符合 panel_schema 的字段,供 db_merge 写入
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from panel_schema import validate_category, validate_severity, validate_summary
|
||||
|
||||
CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
|
||||
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
|
||||
|
||||
|
||||
def _parse_json_object(raw: str) -> Optional[Dict[str, Any]]:
    """Return the JSON object contained in a model reply, or None.

    Accepts a bare object, an object wrapped in a Markdown code fence, or an
    object surrounded by explanatory prose. Replies that are valid JSON but
    not an object (list, number, string) are rejected.
    """
    cleaned = re.sub(r"^```\w*\s*|\s*```$", "", (raw or "").strip())
    candidates = [cleaned]
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        # Fallback: the outermost {...} span, for replies with extra prose.
        candidates.append(cleaned[start:end + 1])
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _call_ollama_extract(text: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """Ask the local Ollama chat endpoint to extract structured fields.

    Args:
        text: News text; only the first 500 characters are sent.
        timeout: HTTP timeout in seconds.

    Returns:
        The parsed JSON object, or None when ``CLEANER_AI_DISABLED`` is set,
        the text is shorter than 10 characters, the HTTP status is not 200,
        the reply contains no JSON object, or any exception occurs.
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 10:
        return None
    try:
        # Imported lazily inside the try so a missing `requests` package just
        # disables AI extraction.
        import requests

        prompt = f"""从以下美伊/中东新闻中提取可推断的数值,输出 JSON,仅包含有明确依据的字段。无依据则省略该字段。

要求:
- summary: 1-2句中文事实,≤80字
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- us_personnel_killed, iran_personnel_killed 等:仅当新闻明确提及具体数字时填写
- retaliation_sentiment: 0-100,仅当新闻涉及伊朗报复情绪时
- wall_street_value: 0-100,仅当新闻涉及美股/市场反应时

原文:{str(text)[:500]}

直接输出 JSON,不要解释:"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 256},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        raw = (r.json().get("message", {}).get("content", "") or "").strip()
        return _parse_json_object(raw)
    except Exception:
        # Best effort: any failure means "nothing extracted".
        return None
|
||||
|
||||
|
||||
# Per-side loss counters the model may report as "us_<field>" / "iran_<field>".
_LOSS_FIELDS = (
    "personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
    "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
)


def _finite_number(value: Any) -> Optional[float]:
    """Return ``value`` if it is a finite int/float (and not a bool), else None."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    # json.loads accepts NaN/Infinity; int() on those raises, so reject them.
    if value != value or value in (float("inf"), float("-inf")):
        return None
    return value


def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """Extract panel data from a news text via the Ollama extractor.

    Args:
        text: News text passed to ``_call_ollama_extract``.
        timestamp: ISO timestamp to stamp the results with; defaults to the
            current UTC time formatted as ``%Y-%m-%dT%H:%M:%S.000Z``.

    Returns:
        A dict with any of the keys ``situation_update``,
        ``combat_losses_delta``, ``retaliation`` and ``wall_street``; empty
        when the model returned nothing usable.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    parsed = _call_ollama_extract(text)
    if not parsed or not isinstance(parsed, dict):
        return out

    # situation_update
    if parsed.get("summary"):
        # Normalize first and let validate_summary apply the length limit;
        # slicing beforehand counted whitespace that is collapsed anyway.
        summary = validate_summary(str(parsed["summary"]), 120)
        if summary:  # skip summaries that are empty after cleaning
            out["situation_update"] = {
                "summary": summary,
                "category": validate_category(str(parsed.get("category", "other")).lower()),
                "severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
                "timestamp": ts,
            }

    # combat_losses increments (numeric fields only)
    losses: Dict[str, Dict[str, int]] = {"us": {}, "iran": {}}
    for field in _LOSS_FIELDS:
        for side, values in losses.items():
            number = _finite_number(parsed.get(f"{side}_{field}"))
            if number is not None:
                values[field] = max(0, int(number))
    delta = {side: values for side, values in losses.items() if values}
    if delta:
        out["combat_losses_delta"] = delta

    # retaliation
    v = _finite_number(parsed.get("retaliation_sentiment"))
    if v is not None and 0 <= v <= 100:
        out["retaliation"] = {"value": int(v), "time": ts}

    # wall_street
    v = _finite_number(parsed.get("wall_street_value"))
    if v is not None and 0 <= v <= 100:
        out["wall_street"] = {"time": ts, "value": int(v)}
    return out
|
||||
49
crawler/extractor_rules.py
Normal file
49
crawler/extractor_rules.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
基于规则的新闻数据提取(无需 Ollama)
|
||||
从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入
|
||||
"""
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
# Building blocks for the casualty patterns (applied to lower-cased text).
_NUMBER = r"\b(\d[\d,]*)"          # whole number, thousands separators allowed
_FILLER = r"[\s\w]*?"              # lazy, so it cannot eat the number's leading digits
_CASUALTY = r"(?:troop|soldier|killed|dead)"
_SIDE_PATTERNS = {
    # Word boundaries stop "us" matching inside "status" or "houses".
    "us": r"\b(?:u\.?s\.?|americans?)(?!\w)",
    "iran": r"\biran(?:ians?)?(?!\w)",
}


def _first_int(text: str, pattern: str) -> Optional[int]:
    """Return capture group 1 of the first match as a non-negative int.

    Commas in the captured text are ignored ("1,200" -> 1200). Returns None
    when the pattern does not match or the capture is not purely digits.
    """
    m = re.search(pattern, text, re.I)
    if not m:
        return None
    digits = (m.group(1) or "").replace(",", "")
    return int(digits) if digits.isdecimal() else None


def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """Rule-based extraction of panel data from a news text (no Ollama).

    For each side two phrasings are tried: "<side> ... say/report ... <N> ...
    troop/soldier/killed/dead" and "<N> ... <side> ... troop/soldier/killed/
    dead". The second overrides the first when both match.

    Args:
        text: News text; None is treated as empty.
        timestamp: ISO timestamp to stamp the results with; defaults to the
            current UTC time formatted as ``%Y-%m-%dT%H:%M:%S.000Z``.

    Returns:
        A dict with any of ``combat_losses_delta`` (per-side
        ``personnel_killed``), ``retaliation`` (fixed value 75) and
        ``wall_street`` (fixed value 55).
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    t = (text or "").lower()

    for side, who in _SIDE_PATTERNS.items():
        reported = rf"{who}{_FILLER}(?:say|report){_FILLER}{_NUMBER}{_FILLER}{_CASUALTY}"
        counted = rf"{_NUMBER}{_FILLER}{who}{_FILLER}{_CASUALTY}"
        for pattern in (reported, counted):
            v = _first_int(t, pattern)
            if v is not None:
                out.setdefault("combat_losses_delta", {}).setdefault(side, {})["personnel_killed"] = v

    if "retaliat" in t or "revenge" in t or "报复" in t:
        out["retaliation"] = {"value": 75, "time": ts}
    if "wall street" in t or " dow " in t or "s&p" in t or "market slump" in t or "stock fall" in t or "美股" in t:
        out["wall_street"] = {"time": ts, "value": 55}

    return out
|
||||
42
crawler/panel_schema.py
Normal file
42
crawler/panel_schema.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
前端面板完整数据 schema,与 DB / situationData / useReplaySituation 对齐
|
||||
爬虫 + AI 清洗后的数据必须符合此 schema 才能正确更新前端
|
||||
"""
|
||||
from typing import Any, Dict, List, Literal, Optional, Tuple
|
||||
|
||||
# 事件脉络
|
||||
SITUATION_UPDATE_CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
|
||||
SITUATION_UPDATE_SEVERITIES = ("low", "medium", "high", "critical")
|
||||
SUMMARY_MAX_LEN = 120
|
||||
|
||||
# 战损
|
||||
CombatLossesRow = Dict[str, Any] # bases_destroyed, bases_damaged, personnel_killed, ...
|
||||
|
||||
# 时间序列(回放用)
|
||||
TimeSeriesPoint = Tuple[str, int] # (ISO time, value)
|
||||
|
||||
# AI 可从新闻中提取的字段
|
||||
EXTRACTABLE_FIELDS = {
|
||||
"situation_update": ["summary", "category", "severity", "timestamp"],
|
||||
"combat_losses": ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles"],
|
||||
"retaliation": ["value"], # 0-100
|
||||
"wall_street_trend": ["time", "value"], # 0-100
|
||||
"conflict_stats": ["estimated_casualties", "estimated_strike_count"],
|
||||
}
|
||||
|
||||
|
||||
def validate_category(cat: str) -> str:
    """Return ``cat`` if it is a known situation-update category, else "other"."""
    if cat in SITUATION_UPDATE_CATEGORIES:
        return cat
    return "other"
|
||||
|
||||
|
||||
def validate_severity(sev: str) -> str:
    """Return ``sev`` if it is a known situation-update severity, else "medium"."""
    if sev in SITUATION_UPDATE_SEVERITIES:
        return sev
    return "medium"
|
||||
|
||||
|
||||
def validate_summary(s: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Normalize a summary to single-line plain text of at most ``max_len`` chars.

    Args:
        s: Raw summary; empty or non-string values yield "".
        max_len: Maximum number of characters to keep.

    Returns:
        The summary with control characters removed, whitespace runs
        collapsed to single spaces, and no leading/trailing whitespace.
    """
    import re

    if not s or not isinstance(s, str):
        return ""
    # Clean first, truncate last: truncating before removing control
    # characters spent the length budget on characters that were then
    # dropped, and could leave doubled or leading spaces behind.
    t = re.sub(r"[\x00-\x08\x0e-\x1b]", "", s)
    t = re.sub(r"\s+", " ", t).strip()
    return t[:max_len].rstrip()
|
||||
101
crawler/parser_ai.py
Normal file
101
crawler/parser_ai.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
AI 新闻分类与严重度判定
|
||||
优先使用 Ollama 本地模型(免费),失败则回退到规则
|
||||
设置 PARSER_AI_DISABLED=1 可只用规则(更快)
|
||||
"""
|
||||
import os
|
||||
from typing import Literal, Optional, Tuple
|
||||
|
||||
Category = Literal["deployment", "alert", "intel", "diplomatic", "other"]
|
||||
Severity = Literal["low", "medium", "high", "critical"]
|
||||
|
||||
PARSER_AI_DISABLED = os.environ.get("PARSER_AI_DISABLED", "0") == "1"
|
||||
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1") # 或 qwen2.5:7b
|
||||
|
||||
_CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
|
||||
_SEVERITIES = ("low", "medium", "high", "critical")
|
||||
|
||||
|
||||
def _parse_ai_response(text: str) -> Tuple[Category, Severity]:
    """Parse a ``category:severity`` reply from the model.

    An explicit ``category:severity`` pair is preferred. Otherwise the
    category and severity mentioned earliest in the reply are used. Category
    names match as word prefixes (so "intelligence" still maps to "intel");
    severities must be whole words (so "following" is not read as "low").

    Args:
        text: Raw model reply; None is treated as empty.

    Returns:
        ``(category, severity)``, defaulting to ``("other", "low")`` for
        whichever part is not found.
    """
    import re

    t = (text or "").strip().lower()
    cats = "|".join(re.escape(c) for c in _CATEGORIES)
    sevs = "|".join(re.escape(s) for s in _SEVERITIES)

    pair = re.search(rf"\b({cats})\w*\s*[:：]\s*({sevs})\b", t)
    if pair:
        return pair.group(1), pair.group(2)  # type: ignore

    def _earliest(tokens, suffix, default):
        # Pick the token that occurs first in the reply, not first in the tuple.
        hits = []
        for token in tokens:
            m = re.search(rf"\b{re.escape(token)}{suffix}", t)
            if m:
                hits.append((m.start(), token))
        return min(hits)[1] if hits else default

    cat = _earliest(_CATEGORIES, r"\w*", "other")
    sev = _earliest(_SEVERITIES, r"\b", "low")
    return cat, sev  # type: ignore
|
||||
|
||||
|
||||
def _call_ollama(text: str, timeout: int = 5) -> Optional[Tuple[Category, Severity]]:
    """Classify a news text with the local Ollama chat endpoint.

    Args:
        text: News text; only the first 300 characters are sent.
        timeout: HTTP timeout in seconds.

    Returns:
        The ``(category, severity)`` parsed from the reply, or None when
        ``PARSER_AI_DISABLED`` is set, the HTTP status is not 200, or any
        exception occurs.
    """
    if PARSER_AI_DISABLED:
        return None
    try:
        # Imported lazily inside the try so a missing `requests` package just
        # triggers the rule-based fallback in the callers.
        import requests

        prompt = f"""Classify this news about US-Iran/middle east (one line only):
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical

News: {text[:300]}

Reply format: category:severity (e.g. alert:high)"""
        payload = {
            "model": OLLAMA_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "options": {"num_predict": 32},
        }
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=payload,
            timeout=timeout,
        )
        if response.status_code == 200:
            reply = response.json().get("message", {}).get("content", "")
            return _parse_ai_response(reply)
        return None
    except Exception:
        # Best effort: any failure means "no AI verdict".
        return None
|
||||
|
||||
|
||||
def _rule_classify(text: str) -> Category:
    """Classify ``text`` with the rule-based ``parser.classify``."""
    # Imported at call time, as in the original.
    from parser import classify as rule_classify

    category = rule_classify(text)
    return category
|
||||
|
||||
|
||||
def _rule_severity(text: str, category: Category) -> Severity:
    """Rate ``text`` with the rule-based ``parser.severity`` for ``category``."""
    # Imported at call time, as in the original.
    from parser import severity as rule_severity

    level = rule_severity(text, category)
    return level
|
||||
|
||||
|
||||
def classify(text: str) -> Category:
    """Return the category for ``text``: AI verdict if available, else rules."""
    ai_result = _call_ollama(text)
    if not ai_result:
        return _rule_classify(text)
    return ai_result[0]
|
||||
|
||||
|
||||
def severity(text: str, category: Category) -> Severity:
    """Return the severity for ``text``: AI verdict if available, else rules."""
    ai_result = _call_ollama(text)
    if not ai_result:
        return _rule_severity(text, category)
    return ai_result[1]
|
||||
|
||||
|
||||
def classify_and_severity(text: str) -> Tuple[Category, Severity]:
    """Return ``(category, severity)`` for ``text`` with at most one AI call.

    The Ollama verdict is used when available. When AI is disabled or the
    call fails, the rule-based classifier is used instead.
    """
    res = None if PARSER_AI_DISABLED else _call_ollama(text)
    if res:
        return res
    # Classify once and reuse the result; the previous fallback ran the rule
    # classifier twice for the same text.
    category = _rule_classify(text)
    return category, _rule_severity(text, category)
|
||||
@@ -14,11 +14,13 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import logging
|
||||
import requests
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
|
||||
logging.getLogger("apscheduler.scheduler").setLevel(logging.ERROR)
|
||||
app = FastAPI(title="GDELT Conflict Service")
|
||||
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
|
||||
|
||||
@@ -29,7 +31,7 @@ API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
|
||||
QUERY = os.environ.get("GDELT_QUERY", "United States Iran military")
|
||||
MAX_RECORDS = int(os.environ.get("GDELT_MAX_RECORDS", "30"))
|
||||
FETCH_INTERVAL_SEC = int(os.environ.get("FETCH_INTERVAL_SEC", "60"))
|
||||
RSS_INTERVAL_SEC = int(os.environ.get("RSS_INTERVAL_SEC", "45")) # 新闻抓取更频繁,优先保证事件脉络
|
||||
RSS_INTERVAL_SEC = int(os.environ.get("RSS_INTERVAL_SEC", "60")) # 每分钟抓取世界主流媒体
|
||||
# 时间范围:1h=1小时 1d=1天 1week=1周;不设则默认 3 个月(易返回旧文)
|
||||
GDELT_TIMESPAN = os.environ.get("GDELT_TIMESPAN", "1d")
|
||||
# 设为 1 则跳过 GDELT,仅用 RSS 新闻作为事件脉络(GDELT 国外可能无法访问)
|
||||
@@ -77,7 +79,9 @@ def _parse_article(article: dict) -> Optional[dict]:
|
||||
if not title_raw:
|
||||
return None
|
||||
from translate_utils import translate_to_chinese
|
||||
from cleaner_ai import clean_news_for_panel
|
||||
title = translate_to_chinese(str(title_raw)[:500])
|
||||
title = clean_news_for_panel(title, max_len=150)
|
||||
url = article.get("url") or article.get("socialimage") or ""
|
||||
seendate = article.get("seendate") or datetime.utcnow().isoformat()
|
||||
lat = article.get("lat")
|
||||
@@ -134,8 +138,8 @@ def fetch_gdelt_events() -> None:
|
||||
_write_to_db(new_events)
|
||||
_notify_node()
|
||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] GDELT 更新 {len(new_events)} 条事件")
|
||||
except Exception as e:
|
||||
print(f"GDELT 抓取失败: {e}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _ensure_table(conn: sqlite3.Connection) -> None:
|
||||
@@ -213,38 +217,115 @@ def _notify_node() -> None:
|
||||
|
||||
|
||||
# ==========================
|
||||
# RSS 新闻抓取(补充 situation_update)
|
||||
# RSS 新闻抓取(补充 situation_update + AI 提取面板数据)
|
||||
# ==========================
|
||||
LAST_FETCH = {"items": 0, "inserted": 0, "error": None}
|
||||
|
||||
|
||||
def fetch_news() -> None:
|
||||
try:
|
||||
from scrapers.rss_scraper import fetch_all
|
||||
from db_writer import write_updates
|
||||
from translate_utils import translate_to_chinese
|
||||
from cleaner_ai import clean_news_for_panel
|
||||
from cleaner_ai import ensure_category, ensure_severity
|
||||
LAST_FETCH["error"] = None
|
||||
items = fetch_all()
|
||||
for it in items:
|
||||
it["title"] = translate_to_chinese(it.get("title", "") or "")
|
||||
it["summary"] = translate_to_chinese(it.get("summary", "") or it.get("title", ""))
|
||||
raw_title = translate_to_chinese(it.get("title", "") or "")
|
||||
raw_summary = translate_to_chinese(it.get("summary", "") or it.get("title", ""))
|
||||
it["title"] = clean_news_for_panel(raw_title, max_len=80)
|
||||
it["summary"] = clean_news_for_panel(raw_summary or raw_title, max_len=120)
|
||||
it["category"] = ensure_category(it.get("category", "other"))
|
||||
it["severity"] = ensure_severity(it.get("severity", "medium"))
|
||||
n = write_updates(items) if items else 0
|
||||
LAST_FETCH["items"] = len(items)
|
||||
LAST_FETCH["inserted"] = n
|
||||
if items:
|
||||
n = write_updates(items)
|
||||
_extract_and_merge_panel_data(items)
|
||||
if n > 0:
|
||||
_notify_node()
|
||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻入库 {n} 条")
|
||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] RSS 抓取 {len(items)} 条,新增入库 {n} 条")
|
||||
except Exception as e:
|
||||
print(f"新闻抓取失败: {e}")
|
||||
LAST_FETCH["error"] = str(e)
|
||||
print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻抓取失败: {e}")
|
||||
|
||||
|
||||
def _extract_and_merge_panel_data(items: list) -> None:
    """Extract panel data from the leading news items and merge it into the DB.

    Uses the rule-based extractor when ``CLEANER_AI_DISABLED=1`` and the
    Ollama-based one otherwise. Only the first five items are considered, to
    limit the number of Ollama calls. Items whose extraction was already
    merged during this process lifetime are skipped, so a later fetch of the
    same feed does not re-apply loss increments or append duplicate history
    rows. Errors are reported via ``print`` and never propagate.

    Args:
        items: News dicts; ``title``, ``summary`` and ``published`` are read.
    """
    if not items or not os.path.exists(DB_PATH):
        return
    try:
        from db_merge import merge
        if os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
            from extractor_rules import extract_from_news
        else:
            from extractor_ai import extract_from_news
        from datetime import timezone

        # Keys of items already merged, kept on the function object so no new
        # module-level state is needed. Cleared when large to stay bounded.
        merged_keys = _extract_and_merge_panel_data.__dict__.setdefault("_merged_keys", set())
        if len(merged_keys) > 5000:
            merged_keys.clear()

        merged_any = False
        for it in items[:5]:
            title = it.get("title", "") or ""
            text = title + " " + (it.get("summary", "") or "")
            if len(text.strip()) < 20:
                continue
            pub = it.get("published")
            # NOTE(review): assumes items may carry a stable "url"/"link";
            # falls back to the title, which may differ between runs when it
            # was rewritten by the AI cleaner — confirm the item schema.
            key = (it.get("url") or it.get("link") or title, str(pub))
            if key in merged_keys:
                continue
            ts = None
            if pub:
                try:
                    if isinstance(pub, str):
                        pub_dt = datetime.fromisoformat(pub.replace("Z", "+00:00"))
                    else:
                        pub_dt = pub
                    if pub_dt.tzinfo:
                        pub_dt = pub_dt.astimezone(timezone.utc)
                    ts = pub_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
                except Exception:
                    # Unparseable date: let the extractor stamp "now" instead.
                    pass
            extracted = extract_from_news(text, timestamp=ts)
            if extracted:
                if merge(extracted, db_path=DB_PATH):
                    merged_any = True
                # Only remember items that were actually merged, so items
                # that yielded nothing are retried on later fetches.
                merged_keys.add(key)
        if merged_any:
            _notify_node()
    except Exception as e:
        print(f" [warn] AI 面板数据提取/合并: {e}")
|
||||
|
||||
|
||||
# ==========================
|
||||
# 定时任务(RSS 更频繁,优先保证事件脉络实时)
|
||||
# ==========================
|
||||
scheduler = BackgroundScheduler()
|
||||
scheduler.add_job(fetch_news, "interval", seconds=RSS_INTERVAL_SEC)
|
||||
scheduler.add_job(fetch_gdelt_events, "interval", seconds=FETCH_INTERVAL_SEC)
|
||||
scheduler.add_job(fetch_news, "interval", seconds=RSS_INTERVAL_SEC, max_instances=2, coalesce=True)
|
||||
scheduler.add_job(fetch_gdelt_events, "interval", seconds=FETCH_INTERVAL_SEC, max_instances=2, coalesce=True)
|
||||
scheduler.start()
|
||||
|
||||
|
||||
# ==========================
|
||||
# API 接口
|
||||
# ==========================
|
||||
@app.get("/crawler/status")
def crawler_status():
    """Report crawler state for troubleshooting the data update chain.

    Returns:
        A dict with the DB path, whether the file exists, the row count of
        ``situation_update`` (0 when the DB is missing or the query fails)
        and the item count, inserted count and error of the last RSS fetch.
    """
    db_ok = os.path.exists(DB_PATH)
    total = 0
    if db_ok:
        conn = None
        try:
            conn = sqlite3.connect(DB_PATH, timeout=3)
            total = conn.execute("SELECT COUNT(*) FROM situation_update").fetchone()[0]
        except sqlite3.Error:
            # Best effort: a missing table or locked DB must not break the
            # status endpoint; the count simply stays 0.
            pass
        finally:
            # Close even when the query fails; previously the connection
            # leaked on error.
            if conn is not None:
                conn.close()
    return {
        "db_path": DB_PATH,
        "db_exists": db_ok,
        "situation_update_count": total,
        "last_fetch_items": LAST_FETCH.get("items", 0),
        "last_fetch_inserted": LAST_FETCH.get("inserted", 0),
        "last_fetch_error": LAST_FETCH.get("error"),
    }
|
||||
|
||||
|
||||
@app.get("/events")
|
||||
def get_events():
|
||||
return {
|
||||
|
||||
Binary file not shown.
@@ -6,7 +6,7 @@ from datetime import datetime, timezone
|
||||
import feedparser
|
||||
|
||||
from config import RSS_FEEDS, KEYWORDS
|
||||
from parser import classify, severity
|
||||
from parser_ai import classify_and_severity
|
||||
|
||||
|
||||
def _parse_date(entry) -> datetime:
|
||||
@@ -62,8 +62,7 @@ def fetch_all() -> list[dict]:
|
||||
continue
|
||||
seen.add(key)
|
||||
published = _parse_date(entry)
|
||||
cat = classify(text)
|
||||
sev = severity(text, cat)
|
||||
cat, sev = classify_and_severity(text)
|
||||
items.append({
|
||||
"title": title,
|
||||
"summary": summary[:400] if summary else title,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""英译中,入库前统一翻译"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
@@ -12,17 +13,26 @@ def _is_mostly_chinese(text: str) -> bool:
|
||||
|
||||
|
||||
def translate_to_chinese(text: str) -> str:
|
||||
"""将文本翻译成中文,失败或已是中文则返回原文。"""
|
||||
"""将文本翻译成中文,失败或已是中文则返回原文。Google 失败时尝试 MyMemory。"""
|
||||
if not text or not text.strip():
|
||||
return text
|
||||
if os.environ.get("TRANSLATE_DISABLED", "0") == "1":
|
||||
return text
|
||||
s = str(text).strip()
|
||||
if len(s) > 2000:
|
||||
s = s[:2000]
|
||||
if _is_mostly_chinese(s):
|
||||
return text
|
||||
try:
|
||||
from deep_translator import GoogleTranslator
|
||||
out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
|
||||
return out if out else text
|
||||
except Exception:
|
||||
return text
|
||||
for translator in ["google", "mymemory"]:
|
||||
try:
|
||||
if translator == "google":
|
||||
from deep_translator import GoogleTranslator
|
||||
out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
|
||||
else:
|
||||
from deep_translator import MyMemoryTranslator
|
||||
out = MyMemoryTranslator(source="auto", target="zh-CN").translate(s)
|
||||
if out and out.strip() and out != s:
|
||||
return out
|
||||
except Exception:
|
||||
continue
|
||||
return text
|
||||
|
||||
Reference in New Issue
Block a user