Files
usa/crawler/extractor_dashscope.py
2026-03-02 23:21:07 +08:00

125 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
阿里云 DashScope通义千问提取面板结构化数据
从新闻文本中提取战损、报复指数、基地状态等,供 db_merge 落库
API Key 通过环境变量 DASHSCOPE_API_KEY 配置
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from panel_schema import validate_category, validate_severity, validate_summary
def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
"""调用阿里云 DashScope 提取结构化数据"""
api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
if not api_key or not text or len(str(text).strip()) < 10:
return None
try:
import dashscope
from http import HTTPStatus
dashscope.api_key = api_key
prompt = f"""从以下美伊/中东军事新闻中提取可明确推断的数值,输出 JSON。无依据的字段省略不写。
要求:
- summary: 1-2句中文事实摘要≤80字
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- 战损(仅当新闻明确提及数字时填写):
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged.
重要bases_* 仅指已确认损毁/受损的基地数量;"军事目标"/"targets"等泛指不是基地,若报道只说"X个军事目标遭袭"而无具体基地名,不填写 bases_*
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles,
us_drones, iran_drones, us_missiles, iran_missiles, us_helicopters, iran_helicopters, us_submarines, iran_submarines
- retaliation_sentiment: 0-100仅当新闻涉及伊朗报复/反击情绪时
- wall_street_value: 0-100仅当新闻涉及美股/市场反应时
- key_location_updates: 当新闻提及具体基地/设施遭袭时必填,数组 [{{"name_keywords":"阿萨德|asad|assad|阿因","side":"us","status":"attacked","damage_level":1-3}}]。常用关键词:阿萨德|asad|巴格达|baghdad|乌代德|udeid|埃尔比勒|erbil|因吉尔利克|incirlik|德黑兰|tehran|阿巴斯|abbas|布什尔|bushehr|伊斯法罕|isfahan|纳坦兹|natanz
原文:
{str(text)[:800]}
直接输出 JSON不要其他解释"""
response = dashscope.Generation.call(
model="qwen-turbo",
messages=[{"role": "user", "content": prompt}],
result_format="message",
max_tokens=512,
)
if response.status_code != HTTPStatus.OK:
return None
raw = (response.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip()
raw = re.sub(r"^```\w*\s*|\s*```$", "", raw)
return json.loads(raw)
except Exception:
return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
"""
从新闻文本提取结构化数据,符合面板 schema
返回: { situation_update?, combat_losses_delta?, retaliation?, wall_street?, key_location_updates? }
"""
ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
out: Dict[str, Any] = {}
parsed = _call_dashscope_extract(text)
if not parsed:
return out
if parsed.get("summary"):
out["situation_update"] = {
"summary": validate_summary(str(parsed["summary"])[:120], 120),
"category": validate_category(str(parsed.get("category", "other")).lower()),
"severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
"timestamp": ts,
}
loss_us = {}
loss_ir = {}
for k in ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
"bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
"drones", "missiles", "helicopters", "submarines"]:
uk, ik = f"us_{k}", f"iran_{k}"
if uk in parsed and isinstance(parsed[uk], (int, float)):
loss_us[k] = max(0, int(parsed[uk]))
if ik in parsed and isinstance(parsed[ik], (int, float)):
loss_ir[k] = max(0, int(parsed[ik]))
if loss_us or loss_ir:
out["combat_losses_delta"] = {}
if loss_us:
out["combat_losses_delta"]["us"] = loss_us
if loss_ir:
out["combat_losses_delta"]["iran"] = loss_ir
if "retaliation_sentiment" in parsed:
v = parsed["retaliation_sentiment"]
if isinstance(v, (int, float)) and 0 <= v <= 100:
out["retaliation"] = {"value": int(v), "time": ts}
if "wall_street_value" in parsed:
v = parsed["wall_street_value"]
if isinstance(v, (int, float)) and 0 <= v <= 100:
out["wall_street"] = {"time": ts, "value": int(v)}
if "key_location_updates" in parsed and isinstance(parsed["key_location_updates"], list):
valid = []
for u in parsed["key_location_updates"]:
if isinstance(u, dict) and u.get("name_keywords") and u.get("side") in ("us", "iran"):
valid.append({
"name_keywords": str(u["name_keywords"]),
"side": u["side"],
"status": str(u.get("status", "attacked"))[:20],
"damage_level": min(3, max(1, int(u["damage_level"]))) if isinstance(u.get("damage_level"), (int, float)) else 2,
})
if valid:
out["key_location_updates"] = valid
return out