diff --git a/crawler/__pycache__/db_merge.cpython-39.pyc b/crawler/__pycache__/db_merge.cpython-39.pyc index 6acc48f..a827d5f 100644 Binary files a/crawler/__pycache__/db_merge.cpython-39.pyc and b/crawler/__pycache__/db_merge.cpython-39.pyc differ diff --git a/crawler/__pycache__/db_writer.cpython-39.pyc b/crawler/__pycache__/db_writer.cpython-39.pyc index 71c6865..1e716da 100644 Binary files a/crawler/__pycache__/db_writer.cpython-39.pyc and b/crawler/__pycache__/db_writer.cpython-39.pyc differ diff --git a/crawler/__pycache__/extractor_dashscope.cpython-39.pyc b/crawler/__pycache__/extractor_dashscope.cpython-39.pyc index 250b3eb..4387d7f 100644 Binary files a/crawler/__pycache__/extractor_dashscope.cpython-39.pyc and b/crawler/__pycache__/extractor_dashscope.cpython-39.pyc differ diff --git a/crawler/__pycache__/panel_schema.cpython-39.pyc b/crawler/__pycache__/panel_schema.cpython-39.pyc index 0d16f03..8423120 100644 Binary files a/crawler/__pycache__/panel_schema.cpython-39.pyc and b/crawler/__pycache__/panel_schema.cpython-39.pyc differ diff --git a/crawler/__pycache__/parser_ai.cpython-39.pyc b/crawler/__pycache__/parser_ai.cpython-39.pyc index cd6053c..cca1678 100644 Binary files a/crawler/__pycache__/parser_ai.cpython-39.pyc and b/crawler/__pycache__/parser_ai.cpython-39.pyc differ diff --git a/crawler/__pycache__/pipeline.cpython-39.pyc b/crawler/__pycache__/pipeline.cpython-39.pyc index 463c6ea..4264447 100644 Binary files a/crawler/__pycache__/pipeline.cpython-39.pyc and b/crawler/__pycache__/pipeline.cpython-39.pyc differ diff --git a/crawler/db_merge.py b/crawler/db_merge.py index 30b6279..2bc7958 100644 --- a/crawler/db_merge.py +++ b/crawler/db_merge.py @@ -26,6 +26,12 @@ MAX_DELTA_PER_MERGE = { "civilian_ships": 20, "airport_port": 10, } +# 反击情绪 / 华尔街:合理区间,避免爬虫单条提取 0 或 100 导致指标归零或打满 +RETALIATION_SMOOTH_WEIGHT = 0.6 # 当前值权重,1 - 此值为新值权重,使更新平滑 +RETALIATION_HISTORY_MAX_ROWS = 300 # 反击历史条数上限,供前端曲线与回放使用 +WALL_STREET_TREND_MAX_ROWS = 200 # 趋势表保留最近条数,避免无限增长 +VALUE_CLAMP_MIN, VALUE_CLAMP_MAX = 1, 99 # 0/100 视为异常,写入前夹在 [1,99] + def _clamp_delta(key: str, value: int) -> int: """单次增量上限,避免误提「累计」导致波动""" @@ -200,38 +206,69 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool: updated = True except Exception: pass - # retaliation + # retaliation:平滑更新,避免单条新闻 0/100 导致指标归零或打满 if "retaliation" in extracted: r = extracted["retaliation"] - conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (r["value"],)) - conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (r["time"], r["value"])) + raw = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, int(r.get("value", 50)))) + row = conn.execute("SELECT value FROM retaliation_current WHERE id = 1").fetchone() + current = int(row[0]) if row else 50 + current = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, current)) + new_val = round( + RETALIATION_SMOOTH_WEIGHT * current + (1 - RETALIATION_SMOOTH_WEIGHT) * raw + ) + new_val = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, new_val)) + ts = (r.get("time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:25] + conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (new_val,)) + conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (ts, new_val)) + n_ret = conn.execute("SELECT COUNT(*) FROM retaliation_history").fetchone()[0] + if n_ret > RETALIATION_HISTORY_MAX_ROWS: + conn.execute( + "DELETE FROM retaliation_history WHERE id IN (SELECT id FROM retaliation_history ORDER BY time ASC LIMIT ?)", + (n_ret - RETALIATION_HISTORY_MAX_ROWS,), + ) updated = True - # wall_street_trend + # wall_street_trend:限幅后写入,并保留最近 N 条避免表无限增长 if "wall_street" in extracted: w = extracted["wall_street"] - conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (w["time"], w["value"])) + raw = int(w.get("value", 50)) + val = max(VALUE_CLAMP_MIN, min(VALUE_CLAMP_MAX, raw)) + ts = (w.get("time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:25] + conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (ts, val)) + n = conn.execute("SELECT COUNT(*) FROM wall_street_trend").fetchone()[0] + if n > WALL_STREET_TREND_MAX_ROWS: + conn.execute( + "DELETE FROM wall_street_trend WHERE id IN (SELECT id FROM wall_street_trend ORDER BY time ASC LIMIT ?)", + (n - WALL_STREET_TREND_MAX_ROWS,), + ) updated = True - # key_location:更新双方攻击地点(美军基地被打击 side=us,伊朗设施被打击 side=iran)的 status/damage_level + # key_location:更新双方攻击地点(美军基地被打击 side=us,伊朗设施被打击 side=iran)的 status/damage_level/attacked_at + event_time = extracted.get("_event_time") or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z") if "key_location_updates" in extracted: try: for u in extracted["key_location_updates"]: kw_raw = (u.get("name_keywords") or "").strip() if not kw_raw: continue - # 支持 "a|b|c" 或 "a b c" 分隔 kw = [k.strip() for k in kw_raw.replace("|", " ").split() if k.strip()] side = u.get("side") status = (u.get("status") or "attacked")[:20] dmg = u.get("damage_level", 2) if not kw or side not in ("us", "iran"): continue - # 简化:name LIKE '%kw%' 对每个关键词 OR 连接,支持中英文 + attacked_at = (u.get("attacked_at") or event_time)[:25] conditions = " OR ".join("name LIKE ?" for _ in kw) - params = [status, dmg, side] + [f"%{k}%" for k in kw] - cur = conn.execute( - f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})", - params, - ) + params_with_at = [status, dmg, attacked_at, side] + [f"%{k}%" for k in kw] + try: + cur = conn.execute( + f"UPDATE key_location SET status=?, damage_level=?, attacked_at=? WHERE side=? AND ({conditions})", + params_with_at, + ) + except sqlite3.OperationalError: + params_no_at = [status, dmg, side] + [f"%{k}%" for k in kw] + cur = conn.execute( + f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})", + params_no_at, + ) if cur.rowcount > 0: updated = True except Exception: diff --git a/crawler/extractor_ai.py b/crawler/extractor_ai.py index 0b26d89..d339b32 100644 --- a/crawler/extractor_ai.py +++ b/crawler/extractor_ai.py @@ -51,6 +51,7 @@ def _call_ollama_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any - retaliation_sentiment: 0-100,仅当报道涉及伊朗报复/反击情绪时 - wall_street_value: 0-100,仅当报道涉及美股/市场时 - key_location_updates: **双方攻击地点**。每项 {{ "name_keywords": "阿萨德|asad|al-asad", "side": "us或iran(被打击方)", "status": "attacked", "damage_level": 1-3 }}。美军基地例:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗例:德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz +- map_strike_lines(仅当报道为**美/以盟军打击伊朗目标**时): 数组,每项 {{ "source_id": "israel或lincoln或ford", "target_lng": 经度, "target_lat": 纬度, "target_name": "目标名", "struck_at": "ISO时间" }}。目标坐标例:纳坦兹51.92,33.67;伊斯法罕51.67,32.65;德黑兰51.39,35.69;布什尔50.83,28.97;阿巴斯港56.27,27.18 - **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填,用于看板导弹消耗累计): us_missile_consumed_delta, iran_missile_consumed_delta(本则报道中该方新增消耗枚数,整数) 原文: @@ -133,6 +134,31 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A }) if valid: out["key_location_updates"] = valid + # map_strike_lines:盟军打击伊朗目标 + if "map_strike_lines" in parsed and isinstance(parsed["map_strike_lines"], list): + valid_lines = [] + for line in parsed["map_strike_lines"]: + if not isinstance(line, dict): + continue + sid = str(line.get("source_id") or "").strip().lower() + if sid not in ("israel", "lincoln", "ford"): + continue + try: + lng = float(line.get("target_lng", 0)) + lat = float(line.get("target_lat", 0)) + except (TypeError, ValueError): + continue + name = str(line.get("target_name") or "")[:200] + struck_at = str(line.get("struck_at") or ts)[:25] + valid_lines.append({ + "source_id": sid, + "target_lng": lng, + "target_lat": lat, + "target_name": name or None, + "struck_at": struck_at, + }) + if valid_lines: + out["map_strike_lines"] = valid_lines # force_summary 增量:导弹消耗(看板「导弹消耗」由 force_summary.missile_consumed 提供) fs_delta = {} for side_key, side_val in [("us_missile_consumed_delta", "us"), ("iran_missile_consumed_delta", "iran")]: diff --git a/crawler/extractor_dashscope.py b/crawler/extractor_dashscope.py index cdd2835..6288350 100644 --- a/crawler/extractor_dashscope.py +++ b/crawler/extractor_dashscope.py @@ -42,6 +42,7 @@ def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, - retaliation_sentiment: 0-100(仅当报道涉及伊朗报复情绪时) - wall_street_value: 0-100(仅当报道涉及美股/市场时) - key_location_updates: **双方攻击地点**。每项 {{"name_keywords":"阿萨德|asad","side":"us或iran(被打击方)","status":"attacked","damage_level":1-3}}。美军基地:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗:德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz +- **map_strike_lines**(仅当报道明确为**美/以盟军打击伊朗或伊朗目标**时): 数组,每项 {{"source_id":"israel或lincoln或ford","target_lng":经度,"target_lat":纬度,"target_name":"目标名如纳坦兹","struck_at":"ISO时间"}}。以色列打击→source_id=israel;林肯号→lincoln;福特号→ford。目标坐标:纳坦兹51.92,33.67;伊斯法罕51.67,32.65;德黑兰51.39,35.69;布什尔50.83,28.97;阿巴斯港56.27,27.18 - **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填): us_missile_consumed_delta, iran_missile_consumed_delta(本则该方新增消耗枚数,整数) 原文: @@ -133,4 +134,29 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A if valid: out["key_location_updates"] = valid + if "map_strike_lines" in parsed and isinstance(parsed["map_strike_lines"], list): + valid_lines = [] + for line in parsed["map_strike_lines"]: + if not isinstance(line, dict): + continue + sid = str(line.get("source_id") or "").strip().lower() + if sid not in ("israel", "lincoln", "ford"): + continue + try: + lng = float(line.get("target_lng", 0)) + lat = float(line.get("target_lat", 0)) + except (TypeError, ValueError): + continue + name = str(line.get("target_name") or "")[:200] + struck_at = str(line.get("struck_at") or ts)[:25] + valid_lines.append({ + "source_id": sid, + "target_lng": lng, + "target_lat": lat, + "target_name": name or None, + "struck_at": struck_at, + }) + if valid_lines: + out["map_strike_lines"] = valid_lines + return out diff --git a/crawler/extractor_rules.py b/crawler/extractor_rules.py index 4414897..7301eab 100644 --- a/crawler/extractor_rules.py +++ b/crawler/extractor_rules.py @@ -1,11 +1,24 @@ # -*- coding: utf-8 -*- """ 基于规则的新闻数据提取(无需 Ollama) -从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入 +从新闻文本中提取战损、报复情绪、攻击地点与盟军打击线,供 db_merge 写入 """ import re from datetime import datetime, timezone -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Tuple + +# 伊朗境内常见打击目标: (显示名, 经度, 纬度, 匹配关键词) +IRAN_STRIKE_TARGETS: List[Tuple[str, float, float, str]] = [ + ("纳坦兹", 51.916, 33.666, "natanz|纳坦兹"), + ("伊斯法罕", 51.67, 32.65, "isfahan|esfahan|伊斯法罕"), + ("德黑兰", 51.389, 35.689, "tehran|德黑兰"), + ("布什尔", 50.83, 28.97, "bushehr|布什尔"), + ("阿巴斯港", 56.27, 27.18, "bandar abbas|abbas|阿巴斯|霍尔木兹"), + ("克尔曼沙赫", 47.06, 34.31, "kermanshah|克尔曼沙赫"), + ("大不里士", 46.29, 38.08, "tabriz|大不里士"), + ("卡拉季", 50.99, 35.83, "karaj|卡拉季"), + ("米纳布", 57.08, 27.13, "minab|米纳布"), +] def _first_int(text: str, pattern: str) -> Optional[int]: @@ -251,4 +264,30 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A if updates: out["key_location_updates"] = updates + # map_strike_lines:盟军(以色列/林肯/福特)打击伊朗目标,供地图攻击动画更新 + strike_verbs = ("strike" in t or "struck" in t or "strikes" in t or "hit" in t or "attack" in t + or "打击" in (text or "") or "空袭" in (text or "") or "袭击" in (text or "")) + if strike_verbs and ("iran" in t or "伊朗" in (text or "") or any( + any(p in t for p in kw.split("|")) for _n, _lng, _lat, kw in IRAN_STRIKE_TARGETS + )): + source_id = "israel" + if "lincoln" in t or "林肯" in (text or ""): + source_id = "lincoln" + elif "ford" in t or "福特" in (text or ""): + source_id = "ford" + elif ("israel" in t or "idf" in t or "以色列" in (text or "")) and ("us " in t or "american" in t or "pentagon" in t): + source_id = "israel" # 多国时优先以色列 + lines = [] + for name, lng, lat, kw in IRAN_STRIKE_TARGETS: + if any(p in t for p in kw.split("|")): + lines.append({ + "source_id": source_id, + "target_lng": lng, + "target_lat": lat, + "target_name": name, + "struck_at": ts, + }) + if lines: + out["map_strike_lines"] = lines + return out diff --git a/crawler/pipeline.py b/crawler/pipeline.py index a99414f..f0b63b2 100644 --- a/crawler/pipeline.py +++ b/crawler/pipeline.py @@ -67,6 +67,8 @@ def _extract_and_merge(items: list, db_path: str) -> bool: except Exception: pass extracted = extract_from_news(text, timestamp=ts) + if ts: + extracted["_event_time"] = ts if extracted and merge(extracted, db_path=db_path): merged_any = True return merged_any diff --git a/crawler/scrapers/__pycache__/rss_scraper.cpython-39.pyc b/crawler/scrapers/__pycache__/rss_scraper.cpython-39.pyc index 77cdb31..3b666b8 100644 Binary files a/crawler/scrapers/__pycache__/rss_scraper.cpython-39.pyc and b/crawler/scrapers/__pycache__/rss_scraper.cpython-39.pyc differ diff --git a/index.html b/index.html index e79ad3a..f85f1f8 100644 --- a/index.html +++ b/index.html @@ -2,9 +2,15 @@
- + +