fix: 优化爬虫机制,新增伊朗支援

This commit is contained in:
Daniel
2026-03-06 10:34:52 +08:00
parent 89145a6743
commit 9f2442f2e3
20 changed files with 411 additions and 62 deletions

View File

@@ -26,6 +26,12 @@ MAX_DELTA_PER_MERGE = {
"civilian_ships": 20, "airport_port": 10,
}
# Retaliation sentiment / Wall Street: keep values in a sane range so that a
# single crawler extraction of 0 or 100 cannot zero out or max out the indicator.
RETALIATION_SMOOTH_WEIGHT = 0.6 # weight of the current stored value; (1 - this) is the weight of the new value, which smooths updates
RETALIATION_HISTORY_MAX_ROWS = 300 # cap on retaliation history rows, kept for the frontend curve and replay
WALL_STREET_TREND_MAX_ROWS = 200 # trend table keeps only the most recent rows to avoid unbounded growth
VALUE_CLAMP_MIN, VALUE_CLAMP_MAX = 1, 99 # 0/100 are treated as anomalies; values are clamped to [1,99] before writing
def _clamp_delta(key: str, value: int) -> int:
"""单次增量上限,避免误提「累计」导致波动"""
@@ -200,38 +206,69 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
updated = True
except Exception:
pass
# retaliation
# retaliation:平滑更新,避免单条新闻 0/100 导致指标归零或打满
if "retaliation" in extracted:
r = extracted["retaliation"]
conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (r["value"],))
conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (r["time"], r["value"]))
raw = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, int(r.get("value", 50))))
row = conn.execute("SELECT value FROM retaliation_current WHERE id = 1").fetchone()
current = int(row[0]) if row else 50
current = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, current))
new_val = round(
RETALIATION_SMOOTH_WEIGHT * current + (1 - RETALIATION_SMOOTH_WEIGHT) * raw
)
new_val = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, new_val))
ts = (r.get("time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:25]
conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (new_val,))
conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (ts, new_val))
n_ret = conn.execute("SELECT COUNT(*) FROM retaliation_history").fetchone()[0]
if n_ret > RETALIATION_HISTORY_MAX_ROWS:
conn.execute(
"DELETE FROM retaliation_history WHERE id IN (SELECT id FROM retaliation_history ORDER BY time ASC LIMIT ?)",
(n_ret - RETALIATION_HISTORY_MAX_ROWS,),
)
updated = True
# wall_street_trend
# wall_street_trend:限幅后写入,并保留最近 N 条避免表无限增长
if "wall_street" in extracted:
w = extracted["wall_street"]
conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (w["time"], w["value"]))
raw = int(w.get("value", 50))
val = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, raw))
ts = (w.get("time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:25]
conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (ts, val))
n = conn.execute("SELECT COUNT(*) FROM wall_street_trend").fetchone()[0]
if n > WALL_STREET_TREND_MAX_ROWS:
conn.execute(
"DELETE FROM wall_street_trend WHERE id IN (SELECT id FROM wall_street_trend ORDER BY time ASC LIMIT ?)",
(n - WALL_STREET_TREND_MAX_ROWS,),
)
updated = True
# key_location更新双方攻击地点美军基地被打击 side=us伊朗设施被打击 side=iran的 status/damage_level
# key_location更新双方攻击地点美军基地被打击 side=us伊朗设施被打击 side=iran的 status/damage_level/attacked_at
event_time = extracted.get("_event_time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
if "key_location_updates" in extracted:
try:
for u in extracted["key_location_updates"]:
kw_raw = (u.get("name_keywords") or "").strip()
if not kw_raw:
continue
# 支持 "a|b|c" 或 "a b c" 分隔
kw = [k.strip() for k in kw_raw.replace("|", " ").split() if k.strip()]
side = u.get("side")
status = (u.get("status") or "attacked")[:20]
dmg = u.get("damage_level", 2)
if not kw or side not in ("us", "iran"):
continue
# 简化name LIKE '%kw%' 对每个关键词 OR 连接,支持中英文
attacked_at = (u.get("attacked_at") or event_time)[:25]
conditions = " OR ".join("name LIKE ?" for _ in kw)
params = [status, dmg, side] + [f"%{k}%" for k in kw]
cur = conn.execute(
f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})",
params,
)
params_with_at = [status, dmg, attacked_at, side] + [f"%{k}%" for k in kw]
try:
cur = conn.execute(
f"UPDATE key_location SET status=?, damage_level=?, attacked_at=? WHERE side=? AND ({conditions})",
params_with_at,
)
except sqlite3.OperationalError:
params_no_at = [status, dmg, side] + [f"%{k}%" for k in kw]
cur = conn.execute(
f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})",
params_no_at,
)
if cur.rowcount > 0:
updated = True
except Exception:

View File

@@ -51,6 +51,7 @@ def _call_ollama_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any
- retaliation_sentiment: 0-100仅当报道涉及伊朗报复/反击情绪时
- wall_street_value: 0-100仅当报道涉及美股/市场时
- key_location_updates: **双方攻击地点**。每项 {{ "name_keywords": "阿萨德|asad|al-asad", "side": "us或iran被打击方", "status": "attacked", "damage_level": 1-3 }}。美军基地例:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗例德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz
- map_strike_lines仅当报道为**美/以盟军打击伊朗目标**时): 数组,每项 {{ "source_id": "israel或lincoln或ford", "target_lng": 经度, "target_lat": 纬度, "target_name": "目标名", "struck_at": "ISO时间" }}。目标坐标例纳坦兹51.92,33.67伊斯法罕51.67,32.65德黑兰51.39,35.69布什尔50.83,28.97阿巴斯港56.27,27.18
- **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填,用于看板导弹消耗累计): us_missile_consumed_delta, iran_missile_consumed_delta本则报道中该方新增消耗枚数整数
原文:
@@ -133,6 +134,31 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
})
if valid:
out["key_location_updates"] = valid
# map_strike_lines盟军打击伊朗目标
if "map_strike_lines" in parsed and isinstance(parsed["map_strike_lines"], list):
valid_lines = []
for line in parsed["map_strike_lines"]:
if not isinstance(line, dict):
continue
sid = str(line.get("source_id") or "").strip().lower()
if sid not in ("israel", "lincoln", "ford"):
continue
try:
lng = float(line.get("target_lng", 0))
lat = float(line.get("target_lat", 0))
except (TypeError, ValueError):
continue
name = str(line.get("target_name") or "")[:200]
struck_at = str(line.get("struck_at") or ts)[:25]
valid_lines.append({
"source_id": sid,
"target_lng": lng,
"target_lat": lat,
"target_name": name or None,
"struck_at": struck_at,
})
if valid_lines:
out["map_strike_lines"] = valid_lines
# force_summary 增量:导弹消耗(看板「导弹消耗」由 force_summary.missile_consumed 提供)
fs_delta = {}
for side_key, side_val in [("us_missile_consumed_delta", "us"), ("iran_missile_consumed_delta", "iran")]:

View File

@@ -42,6 +42,7 @@ def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str,
- retaliation_sentiment: 0-100仅当报道涉及伊朗报复情绪时
- wall_street_value: 0-100仅当报道涉及美股/市场时)
- key_location_updates: **双方攻击地点**。每项 {{"name_keywords":"阿萨德|asad","side":"us或iran被打击方","status":"attacked","damage_level":1-3}}。美军基地:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz
- **map_strike_lines**(仅当报道明确为**美/以盟军打击伊朗或伊朗目标**时): 数组,每项 {{"source_id":"israel或lincoln或ford","target_lng":经度,"target_lat":纬度,"target_name":"目标名如纳坦兹","struck_at":"ISO时间"}}。以色列打击→source_id=israel林肯号→lincoln福特号→ford。目标坐标纳坦兹51.92,33.67伊斯法罕51.67,32.65德黑兰51.39,35.69布什尔50.83,28.97阿巴斯港56.27,27.18
- **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填): us_missile_consumed_delta, iran_missile_consumed_delta本则该方新增消耗枚数整数
原文:
@@ -133,4 +134,29 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
if valid:
out["key_location_updates"] = valid
if "map_strike_lines" in parsed and isinstance(parsed["map_strike_lines"], list):
valid_lines = []
for line in parsed["map_strike_lines"]:
if not isinstance(line, dict):
continue
sid = str(line.get("source_id") or "").strip().lower()
if sid not in ("israel", "lincoln", "ford"):
continue
try:
lng = float(line.get("target_lng", 0))
lat = float(line.get("target_lat", 0))
except (TypeError, ValueError):
continue
name = str(line.get("target_name") or "")[:200]
struck_at = str(line.get("struck_at") or ts)[:25]
valid_lines.append({
"source_id": sid,
"target_lng": lng,
"target_lat": lat,
"target_name": name or None,
"struck_at": struck_at,
})
if valid_lines:
out["map_strike_lines"] = valid_lines
return out

View File

@@ -1,11 +1,24 @@
# -*- coding: utf-8 -*-
"""
基于规则的新闻数据提取(无需 Ollama)
从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入
从新闻文本中提取战损、报复情绪、攻击地点与盟军打击线,供 db_merge 写入
"""
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional, Tuple
# Common strike targets inside Iran, one tuple per target:
# (display name, longitude, latitude, "|"-separated match keywords).
# Each keyword is tested as a plain substring of the news text, so short or
# generic keywords widen what counts as a match.
IRAN_STRIKE_TARGETS: List[Tuple[str, float, float, str]] = [
("纳坦兹", 51.916, 33.666, "natanz|纳坦兹"),
("伊斯法罕", 51.67, 32.65, "isfahan|esfahan|伊斯法罕"),
("德黑兰", 51.389, 35.689, "tehran|德黑兰"),
("布什尔", 50.83, 28.97, "bushehr|布什尔"),
("阿巴斯港", 56.27, 27.18, "bandar abbas|abbas|阿巴斯|霍尔木兹"),
("克尔曼沙赫", 47.06, 34.31, "kermanshah|克尔曼沙赫"),
("大不里士", 46.29, 38.08, "tabriz|大不里士"),
("卡拉季", 50.99, 35.83, "karaj|卡拉季"),
("米纳布", 57.08, 27.13, "minab|米纳布"),
]
def _first_int(text: str, pattern: str) -> Optional[int]:
@@ -251,4 +264,30 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
if updates:
out["key_location_updates"] = updates
# map_strike_lines盟军以色列/林肯/福特)打击伊朗目标,供地图攻击动画更新
strike_verbs = ("strike" in t or "struck" in t or "strikes" in t or "hit" in t or "attack" in t
or "打击" in (text or "") or "空袭" in (text or "") or "袭击" in (text or ""))
if strike_verbs and ("iran" in t or "伊朗" in (text or "") or any(
any(p in t for p in kw.split("|")) for _n, _lng, _lat, kw in IRAN_STRIKE_TARGETS
)):
source_id = "israel"
if "lincoln" in t or "林肯" in (text or ""):
source_id = "lincoln"
elif "ford" in t or "福特" in (text or ""):
source_id = "ford"
elif ("israel" in t or "idf" in t or "以色列" in (text or "")) and ("us " in t or "american" in t or "pentagon" in t):
source_id = "israel" # 多国时优先以色列
lines = []
for name, lng, lat, kw in IRAN_STRIKE_TARGETS:
if any(p in t for p in kw.split("|")):
lines.append({
"source_id": source_id,
"target_lng": lng,
"target_lat": lat,
"target_name": name,
"struck_at": ts,
})
if lines:
out["map_strike_lines"] = lines
return out

View File

@@ -67,6 +67,8 @@ def _extract_and_merge(items: list, db_path: str) -> bool:
except Exception:
pass
extracted = extract_from_news(text, timestamp=ts)
if ts:
extracted["_event_time"] = ts
if extracted and merge(extracted, db_path=db_path):
merged_any = True
return merged_any