Files
usa/crawler/extractor_rules.py
2026-03-02 11:28:13 +08:00

50 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
基于规则的新闻数据提取(无需 Ollama
从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入
"""
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
def _first_int(text: str, pattern: str) -> Optional[int]:
m = re.search(pattern, text, re.I)
if m and m.group(1) and m.group(1).replace(",", "").isdigit():
return max(0, int(m.group(1).replace(",", "")))
return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
"""
规则提取:匹配数字+关键词,输出符合 panel schema 的字段(无需 Ollama
"""
ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
out: Dict[str, Any] = {}
t = (text or "").lower()
loss_us, loss_ir = {}, {}
v = _first_int(t, r"(?:us|american|u\.?s\.?)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
if v is not None:
loss_us["personnel_killed"] = v
v = _first_int(t, r"(\d+)[\s\w]*(?:us|american)[\s\w]*(?:troop|soldier|killed|dead)")
if v is not None:
loss_us["personnel_killed"] = v
v = _first_int(t, r"(?:iran|iranian)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
if v is not None:
loss_ir["personnel_killed"] = v
v = _first_int(t, r"(\d+)[\s\w]*(?:iranian|iran)[\s\w]*(?:troop|soldier|killed|dead)")
if v is not None:
loss_ir["personnel_killed"] = v
if loss_us:
out.setdefault("combat_losses_delta", {})["us"] = loss_us
if loss_ir:
out.setdefault("combat_losses_delta", {})["iran"] = loss_ir
if "retaliat" in t or "revenge" in t or "报复" in t:
out["retaliation"] = {"value": 75, "time": ts}
if "wall street" in t or " dow " in t or "s&p" in t or "market slump" in t or "stock fall" in t or "美股" in t:
out["wall_street"] = {"time": ts, "value": 55}
return out