fix: 优化后端数据更新机制

This commit is contained in:
Daniel
2026-03-03 13:02:28 +08:00
parent 7284a1a60d
commit fa6f7407f0
20 changed files with 592 additions and 201 deletions

View File

@@ -15,31 +15,40 @@ CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
def _call_ollama_extract(text: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
"""调用 Ollama 提取结构化数据。输出 JSON仅包含新闻中可明确推断的字段"""
# 用于 AI 提取的原文最大长度(有正文时取更长以提取精确数据)
EXTRACT_TEXT_MAX_LEN = int(os.environ.get("EXTRACT_TEXT_MAX_LEN", "4000"))
def _call_ollama_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
"""调用 Ollama 从新闻全文/摘要中提取精确结构化数据,仅填写报道中明确给出的数字与事实。"""
if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 10:
return None
try:
import requests
prompt = f"""从以下美伊/中东新闻中提取可推断的数值,输出 JSON仅包含有明确依据的字段。无依据则省略该字段。
raw = str(text).strip()[:EXTRACT_TEXT_MAX_LEN]
prompt = f"""从以下美伊/中东新闻**全文或摘要**中,提取**报道明确给出的数字与事实**,输出 JSON。规则
1. 仅填写报道中**直接出现、可核对**的数据,不要推测或估算。
2. 无明确依据的字段**必须省略**,不要填 0 或猜。
3. **战损一律按增量**:只填本则报道中「本次/此次/今日/本轮」**新增**的伤亡或损毁数量。若报道只给「累计总数」「迄今共」「total so far」等**不要填写**该字段(避免与库内已有累计值重复叠加)。
4. **攻击地点**:提取双方遭袭的具体地点。美军/盟军基地被打击 → side=us伊朗/亲伊设施被打击 → side=iran。name_keywords 用「中文名|英文名」便于匹配,可填多处。
要求
- summary: 1-2句中文事实≤80字
字段说明
- summary: 1-2 句中文事实概括≤80
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- 战损(仅当新闻明确提及数字时填写,格式 us_XXX / iran_XXX:
- 战损(**仅填本则报道的新增增量**,如「此次 5 人丧生」「今日又损 2 架」:
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged.
重要bases_* 仅指已确认损毁/受损的基地数量;"军事目标"/targets 等泛指不是基地,若报道只说"X个军事目标遭袭"而无具体基地名,不填写 bases_*
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged,
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles,
us_drones, iran_drones, us_missiles, iran_missiles, us_helicopters, iran_helicopters, us_submarines, iran_submarines,
us_tanks, iran_tanks, us_civilian_ships, iran_civilian_ships, us_airport_port, iran_airport_port
- retaliation_sentiment: 0-100仅当新闻涉及伊朗报复情绪时
- wall_street_value: 0-100仅当新闻涉及美股/市场反应
- key_location_updates: 当新闻提及具体基地/地点遭袭时,数组项 { "name_keywords": "asad|阿萨德|assad", "side": "us", "status": "attacked", "damage_level": 1-3 }
us_carriers, iran_carriers, us_civilian_ships, iran_civilian_ships, us_airport_port, iran_airport_port
- retaliation_sentiment: 0-100仅当报道涉及伊朗报复/反击情绪时
- wall_street_value: 0-100仅当报道涉及美股/市场时
- key_location_updates: **双方攻击地点**。每项 {{ "name_keywords": "阿萨德|asad|al-asad", "side": "us或iran被打击方", "status": "attacked", "damage_level": 1-3 }}。美军基地例:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗例德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz
原文:{str(text)[:800]}
原文:
{raw}
直接输出 JSON不要解释"""
r = requests.post(
@@ -48,7 +57,7 @@ def _call_ollama_extract(text: str, timeout: int = 10) -> Optional[Dict[str, Any
"model": OLLAMA_MODEL,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"options": {"num_predict": 256},
"options": {"num_predict": 384},
},
timeout=timeout,
)
@@ -82,7 +91,7 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
# combat_losses 增量(仅数字字段)
loss_us = {}
loss_ir = {}
for k in ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles", "drones", "missiles", "helicopters", "submarines", "tanks", "civilian_ships", "airport_port"]:
for k in ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles", "drones", "missiles", "helicopters", "submarines", "carriers", "civilian_ships", "airport_port"]:
uk = f"us_{k}"
ik = f"iran_{k}"
if uk in parsed and isinstance(parsed[uk], (int, float)):