fix: 优化数据
This commit is contained in:
49
crawler/extractor_rules.py
Normal file
49
crawler/extractor_rules.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
基于规则的新闻数据提取(无需 Ollama)
|
||||
从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入
|
||||
"""
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
def _first_int(text: str, pattern: str) -> Optional[int]:
|
||||
m = re.search(pattern, text, re.I)
|
||||
if m and m.group(1) and m.group(1).replace(",", "").isdigit():
|
||||
return max(0, int(m.group(1).replace(",", "")))
|
||||
return None
|
||||
|
||||
|
||||
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
规则提取:匹配数字+关键词,输出符合 panel schema 的字段(无需 Ollama)
|
||||
"""
|
||||
ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||
out: Dict[str, Any] = {}
|
||||
t = (text or "").lower()
|
||||
|
||||
loss_us, loss_ir = {}, {}
|
||||
v = _first_int(t, r"(?:us|american|u\.?s\.?)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
if v is not None:
|
||||
loss_us["personnel_killed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:us|american)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
if v is not None:
|
||||
loss_us["personnel_killed"] = v
|
||||
v = _first_int(t, r"(?:iran|iranian)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
if v is not None:
|
||||
loss_ir["personnel_killed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:iranian|iran)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
if v is not None:
|
||||
loss_ir["personnel_killed"] = v
|
||||
|
||||
if loss_us:
|
||||
out.setdefault("combat_losses_delta", {})["us"] = loss_us
|
||||
if loss_ir:
|
||||
out.setdefault("combat_losses_delta", {})["iran"] = loss_ir
|
||||
if "retaliat" in t or "revenge" in t or "报复" in t:
|
||||
out["retaliation"] = {"value": 75, "time": ts}
|
||||
if "wall street" in t or " dow " in t or "s&p" in t or "market slump" in t or "stock fall" in t or "美股" in t:
|
||||
out["wall_street"] = {"time": ts, "value": 55}
|
||||
|
||||
return out
|
||||
Reference in New Issue
Block a user