Files
usa/crawler/extractor_dashscope.py
2026-03-02 17:20:31 +08:00

122 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
阿里云 DashScope通义千问提取面板结构化数据
从新闻文本中提取战损、报复指数、基地状态等,供 db_merge 落库
API Key 通过环境变量 DASHSCOPE_API_KEY 配置
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from panel_schema import validate_category, validate_severity, validate_summary
def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
    """Ask the DashScope ``qwen-turbo`` model to extract structured fields from news text.

    Args:
        text: News text; only its first 800 characters are embedded in the prompt.
        timeout: Kept for interface compatibility.
            NOTE(review): this value is not forwarded to the SDK call — confirm
            the SDK's timeout option before wiring it in.

    Returns:
        The JSON object (a dict) parsed from the model reply, or ``None`` when
        ``DASHSCOPE_API_KEY`` is empty, ``text`` has fewer than 10 characters
        after stripping, the response status is not HTTP 200, the reply holds
        no JSON object, or any exception is raised along the way.
    """
    api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
    if not api_key or not text or len(str(text).strip()) < 10:
        return None
    try:
        # Imported lazily so the module can be loaded without the SDK installed.
        import dashscope
        from http import HTTPStatus

        dashscope.api_key = api_key
        prompt = f"""从以下美伊/中东军事新闻中提取可明确推断的数值,输出 JSON。无依据的字段省略不写。
要求:
- summary: 1-2句中文事实摘要≤80字
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- 战损(仅当新闻明确提及数字时填写):
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged,
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles
- retaliation_sentiment: 0-100仅当新闻涉及伊朗报复/反击情绪时
- wall_street_value: 0-100仅当新闻涉及美股/市场反应时
- key_location_updates: 当新闻提及具体基地遭袭时,数组 [{{"name_keywords":"阿萨德|asad|assad","side":"us","status":"attacked","damage_level":1-3}}]
原文:
{str(text)[:800]}
直接输出 JSON不要其他解释"""
        response = dashscope.Generation.call(
            model="qwen-turbo",
            messages=[{"role": "user", "content": prompt}],
            result_format="message",
            max_tokens=512,
        )
        if response.status_code != HTTPStatus.OK:
            return None
        raw = (response.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip()
        return _parse_json_object(raw)
    except Exception:
        # Deliberate best-effort: any SDK, network or response-shape failure
        # means "nothing extracted" rather than a crash in the crawler.
        return None


def _parse_json_object(raw: str) -> Optional[Dict[str, Any]]:
    """Return the JSON object found in a model reply, or ``None`` if there is none.

    Markdown code fences are stripped first. If the whole reply is not a JSON
    object, the span from the first ``{`` to the last ``}`` is tried, so a
    reply with explanatory text around the object is still usable. Non-object
    JSON (list, string, number) is rejected because callers use ``dict`` methods.
    """
    cleaned = re.sub(r"^```\w*\s*|\s*```$", "", raw.strip())
    candidates = [cleaned]
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        candidates.append(cleaned[start:end + 1])
    for candidate in candidates:
        try:
            value = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(value, dict):
            return value
    return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """Extract panel-schema structured data from a piece of news text.

    Args:
        text: News text handed to ``_call_dashscope_extract``.
        timestamp: Timestamp string stamped on the produced records. When
            falsy, the current UTC time formatted as
            ``%Y-%m-%dT%H:%M:%S.000Z`` is used.

    Returns:
        A dict holding any of the optional keys ``situation_update``,
        ``combat_losses_delta``, ``retaliation``, ``wall_street`` and
        ``key_location_updates``. It is empty when the extractor yields
        nothing usable.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    parsed = _call_dashscope_extract(text)
    # A non-object JSON reply (e.g. a list) has no .get(); treat it as "nothing extracted".
    if not parsed or not isinstance(parsed, dict):
        return out

    if parsed.get("summary"):
        out["situation_update"] = {
            "summary": validate_summary(str(parsed["summary"])[:120], 120),
            "category": validate_category(str(parsed.get("category", "other")).lower()),
            "severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
            "timestamp": ts,
        }

    loss_keys = (
        "personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
        "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
    )
    losses: Dict[str, Dict[str, int]] = {}
    for side in ("us", "iran"):
        side_losses: Dict[str, int] = {}
        for key in loss_keys:
            count = _finite_number(parsed.get(f"{side}_{key}"))
            if count is not None:
                # Negative counts are clamped to zero.
                side_losses[key] = max(0, int(count))
        if side_losses:
            losses[side] = side_losses
    if losses:
        out["combat_losses_delta"] = losses

    # Both indices are accepted only inside the 0-100 range the prompt asks for.
    retaliation = _finite_number(parsed.get("retaliation_sentiment"))
    if retaliation is not None and 0 <= retaliation <= 100:
        out["retaliation"] = {"value": int(retaliation), "time": ts}

    wall_street = _finite_number(parsed.get("wall_street_value"))
    if wall_street is not None and 0 <= wall_street <= 100:
        out["wall_street"] = {"time": ts, "value": int(wall_street)}

    updates = parsed.get("key_location_updates")
    if isinstance(updates, list):
        valid = []
        for update in updates:
            if not isinstance(update, dict):
                continue
            if not update.get("name_keywords") or update.get("side") not in ("us", "iran"):
                continue
            level = _finite_number(update.get("damage_level"))
            valid.append({
                "name_keywords": str(update["name_keywords"]),
                "side": update["side"],
                # A null/empty status falls back to "attacked" instead of the text "None".
                "status": str(update.get("status") or "attacked")[:20],
                # Clamp to 1-3; default to 2 when no usable number is given.
                "damage_level": 2 if level is None else min(3, max(1, int(level))),
            })
        if valid:
            out["key_location_updates"] = valid
    return out


def _finite_number(value: Any) -> Optional[float]:
    """Return ``value`` if it is a finite, non-bool int/float, else ``None``.

    ``bool`` is excluded because it is an ``int`` subclass and a JSON ``true``
    would otherwise count as 1. NaN and infinities are excluded because
    ``int()`` raises on them.
    """
    import math

    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value