fix: 优化docker 镜像
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
crawler/__pycache__/extractor_rules.cpython-39.pyc
Normal file
BIN
crawler/__pycache__/extractor_rules.cpython-39.pyc
Normal file
Binary file not shown.
Binary file not shown.
@@ -39,11 +39,37 @@ RSS_FEEDS = [
|
||||
"https://www.aljazeera.com/xml/rss/middleeast.xml",
|
||||
]
|
||||
|
||||
# 关键词过滤:至少匹配一个才会入库
|
||||
# 关键词过滤:至少匹配一个才会入库(与地图区域对应:伊拉克/叙利亚/海湾/红海/地中海等)
|
||||
KEYWORDS = [
|
||||
"iran", "iranian", "tehran", "以色列", "israel",
|
||||
"usa", "us ", "american", "美军", "美国",
|
||||
"middle east", "中东", "persian gulf", "波斯湾",
|
||||
# 伊朗
|
||||
"iran", "iranian", "tehran", "德黑兰", "bushehr", "布什尔", "abbas", "阿巴斯",
|
||||
# 以色列 / 巴勒斯坦
|
||||
"israel", "以色列", "hamas", "gaza", "加沙", "hezbollah", "真主党",
|
||||
# 美国
|
||||
"usa", "us ", "american", "美军", "美国", "pentagon",
|
||||
# 区域(地图覆盖)
|
||||
"middle east", "中东", "persian gulf", "波斯湾", "gulf of oman", "阿曼湾",
|
||||
"arabian sea", "阿拉伯海", "red sea", "红海", "mediterranean", "地中海",
|
||||
"strait of hormuz", "霍尔木兹",
|
||||
# 伊拉克 / 叙利亚
|
||||
"iraq", "伊拉克", "baghdad", "巴格达", "erbil", "埃尔比勒", "basra", "巴士拉",
|
||||
"syria", "叙利亚", "damascus", "大马士革", "deir", "代尔祖尔",
|
||||
# 海湾国家
|
||||
"saudi", "沙特", "riyadh", "利雅得", "qatar", "卡塔尔", "doha", "多哈",
|
||||
"uae", "emirates", "阿联酋", "dubai", "迪拜", "abu dhabi",
|
||||
"bahrain", "巴林", "kuwait", "科威特", "oman", "阿曼", "yemen", "也门",
|
||||
# 约旦 / 土耳其 / 埃及 / 吉布提 / 黎巴嫩
|
||||
"jordan", "约旦", "amman", "安曼",
|
||||
"lebanon", "黎巴嫩",
|
||||
"turkey", "土耳其", "incirlik", "因吉尔利克",
|
||||
"egypt", "埃及", "cairo", "开罗", "sinai", "西奈",
|
||||
"djibouti", "吉布提",
|
||||
# 军事 / 基地
|
||||
"al-asad", "al asad", "阿萨德", "al udeid", "乌代德", "incirlik",
|
||||
"strike", "attack", "military", "missile", "核", "nuclear",
|
||||
"carrier", "航母", "houthi", "胡塞", "hamas",
|
||||
"carrier", "航母", "drone", "uav", "无人机", "retaliation", "报复",
|
||||
"base", "基地", "troops", "troop", "soldier", "personnel",
|
||||
# 胡塞 / 武装 / 军力
|
||||
"houthi", "胡塞", "houthis",
|
||||
"idf", "irgc", "革命卫队", "qassem soleimani", "苏莱曼尼",
|
||||
]
|
||||
|
||||
@@ -67,7 +67,7 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
|
||||
)
|
||||
if conn.total_changes > 0:
|
||||
updated = True
|
||||
# combat_losses:增量叠加到当前值
|
||||
# combat_losses:增量叠加到当前值,无行则先插入初始行
|
||||
if "combat_losses_delta" in extracted:
|
||||
for side, delta in extracted["combat_losses_delta"].items():
|
||||
if side not in ("us", "iran"):
|
||||
@@ -77,13 +77,14 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
|
||||
"SELECT personnel_killed,personnel_wounded,civilian_killed,civilian_wounded,bases_destroyed,bases_damaged,aircraft,warships,armor,vehicles FROM combat_losses WHERE side = ?",
|
||||
(side,),
|
||||
).fetchone()
|
||||
if not row:
|
||||
continue
|
||||
cur = {
|
||||
"personnel_killed": row[0], "personnel_wounded": row[1], "civilian_killed": row[2] or 0,
|
||||
"civilian_wounded": row[3] or 0, "bases_destroyed": row[4], "bases_damaged": row[5],
|
||||
"aircraft": row[6], "warships": row[7], "armor": row[8], "vehicles": row[9],
|
||||
}
|
||||
cur = {"personnel_killed": 0, "personnel_wounded": 0, "civilian_killed": 0, "civilian_wounded": 0,
|
||||
"bases_destroyed": 0, "bases_damaged": 0, "aircraft": 0, "warships": 0, "armor": 0, "vehicles": 0}
|
||||
if row:
|
||||
cur = {
|
||||
"personnel_killed": row[0], "personnel_wounded": row[1], "civilian_killed": row[2] or 0,
|
||||
"civilian_wounded": row[3] or 0, "bases_destroyed": row[4], "bases_damaged": row[5],
|
||||
"aircraft": row[6], "warships": row[7], "armor": row[8], "vehicles": row[9],
|
||||
}
|
||||
pk = max(0, (cur["personnel_killed"] or 0) + delta.get("personnel_killed", 0))
|
||||
pw = max(0, (cur["personnel_wounded"] or 0) + delta.get("personnel_wounded", 0))
|
||||
ck = max(0, (cur["civilian_killed"] or 0) + delta.get("civilian_killed", 0))
|
||||
@@ -95,11 +96,18 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
|
||||
ar = max(0, (cur["armor"] or 0) + delta.get("armor", 0))
|
||||
vh = max(0, (cur["vehicles"] or 0) + delta.get("vehicles", 0))
|
||||
ts = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||
conn.execute(
|
||||
"""UPDATE combat_losses SET personnel_killed=?, personnel_wounded=?, civilian_killed=?, civilian_wounded=?,
|
||||
bases_destroyed=?, bases_damaged=?, aircraft=?, warships=?, armor=?, vehicles=?, updated_at=? WHERE side=?""",
|
||||
(pk, pw, ck, cw, bd, bm, ac, ws, ar, vh, ts, side),
|
||||
)
|
||||
if row:
|
||||
conn.execute(
|
||||
"""UPDATE combat_losses SET personnel_killed=?, personnel_wounded=?, civilian_killed=?, civilian_wounded=?,
|
||||
bases_destroyed=?, bases_damaged=?, aircraft=?, warships=?, armor=?, vehicles=?, updated_at=? WHERE side=?""",
|
||||
(pk, pw, ck, cw, bd, bm, ac, ws, ar, vh, ts, side),
|
||||
)
|
||||
else:
|
||||
conn.execute(
|
||||
"""INSERT OR REPLACE INTO combat_losses (side, personnel_killed, personnel_wounded, civilian_killed, civilian_wounded,
|
||||
bases_destroyed, bases_damaged, aircraft, warships, armor, vehicles, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(side, pk, pw, ck, cw, bd, bm, ac, ws, ar, vh, ts),
|
||||
)
|
||||
if conn.total_changes > 0:
|
||||
updated = True
|
||||
except Exception:
|
||||
@@ -115,6 +123,30 @@ def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
|
||||
w = extracted["wall_street"]
|
||||
conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (w["time"], w["value"]))
|
||||
updated = True
|
||||
# key_location:更新受袭基地 status/damage_level
|
||||
if "key_location_updates" in extracted:
|
||||
try:
|
||||
for u in extracted["key_location_updates"]:
|
||||
kw = (u.get("name_keywords") or "").replace("|", " ").split()
|
||||
side = u.get("side")
|
||||
status = u.get("status", "attacked")[:20]
|
||||
dmg = u.get("damage_level", 2)
|
||||
if not kw or side not in ("us", "iran"):
|
||||
continue
|
||||
conditions = " OR ".join(
|
||||
"(LOWER(name) LIKE ? OR name LIKE ?)" for _ in kw
|
||||
)
|
||||
params = [status, dmg, side]
|
||||
for k in kw:
|
||||
params.extend([f"%{k}%", f"%{k}%"])
|
||||
cur = conn.execute(
|
||||
f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})",
|
||||
params,
|
||||
)
|
||||
if cur.rowcount > 0:
|
||||
updated = True
|
||||
except Exception:
|
||||
pass
|
||||
if updated:
|
||||
conn.execute("INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)", (datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),))
|
||||
conn.commit()
|
||||
|
||||
@@ -27,11 +27,16 @@ def _call_ollama_extract(text: str, timeout: int = 10) -> Optional[Dict[str, Any
|
||||
- summary: 1-2句中文事实,≤80字
|
||||
- category: deployment|alert|intel|diplomatic|other
|
||||
- severity: low|medium|high|critical
|
||||
- us_personnel_killed, iran_personnel_killed 等:仅当新闻明确提及具体数字时填写
|
||||
- 战损(仅当新闻明确提及数字时填写,格式 us_XXX / iran_XXX):
|
||||
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
|
||||
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
|
||||
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged,
|
||||
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles
|
||||
- retaliation_sentiment: 0-100,仅当新闻涉及伊朗报复情绪时
|
||||
- wall_street_value: 0-100,仅当新闻涉及美股/市场反应时
|
||||
- key_location_updates: 当新闻提及具体基地/地点遭袭时,数组项 { "name_keywords": "asad|阿萨德|assad", "side": "us", "status": "attacked", "damage_level": 1-3 }
|
||||
|
||||
原文:{str(text)[:500]}
|
||||
原文:{str(text)[:800]}
|
||||
|
||||
直接输出 JSON,不要解释:"""
|
||||
r = requests.post(
|
||||
@@ -97,4 +102,17 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
|
||||
v = parsed["wall_street_value"]
|
||||
if isinstance(v, (int, float)) and 0 <= v <= 100:
|
||||
out["wall_street"] = {"time": ts, "value": int(v)}
|
||||
# key_location_updates:受袭基地
|
||||
if "key_location_updates" in parsed and isinstance(parsed["key_location_updates"], list):
|
||||
valid = []
|
||||
for u in parsed["key_location_updates"]:
|
||||
if isinstance(u, dict) and u.get("name_keywords") and u.get("side") in ("us", "iran"):
|
||||
valid.append({
|
||||
"name_keywords": str(u["name_keywords"]),
|
||||
"side": u["side"],
|
||||
"status": str(u.get("status", "attacked"))[:20],
|
||||
"damage_level": min(3, max(1, int(u["damage_level"]))) if isinstance(u.get("damage_level"), (int, float)) else 2,
|
||||
})
|
||||
if valid:
|
||||
out["key_location_updates"] = valid
|
||||
return out
|
||||
|
||||
@@ -24,18 +24,64 @@ def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, A
|
||||
t = (text or "").lower()
|
||||
|
||||
loss_us, loss_ir = {}, {}
|
||||
v = _first_int(t, r"(?:us|american|u\.?s\.?)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
|
||||
# 美军人员伤亡
|
||||
v = _first_int(t, r"(?:us|american|u\.?s\.?)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|military)[\s\w]*(?:killed|dead)")
|
||||
if v is not None:
|
||||
loss_us["personnel_killed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:us|american)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:us|american)[\s\w]*(?:troop|soldier|military)[\s\w]*(?:killed|dead)")
|
||||
if v is not None:
|
||||
loss_us["personnel_killed"] = v
|
||||
v = _first_int(t, r"(?:iran|iranian)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
v = _first_int(t, r"(?:us|american)[\s\w]*(\d+)[\s\w]*(?:wounded|injured)")
|
||||
if v is not None:
|
||||
loss_us["personnel_wounded"] = v
|
||||
|
||||
# 伊朗人员伤亡
|
||||
v = _first_int(t, r"(?:iran|iranian)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|guard|killed|dead)")
|
||||
if v is not None:
|
||||
loss_ir["personnel_killed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:iranian|iran)[\s\w]*(?:troop|soldier|killed|dead)")
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:iranian|iran)[\s\w]*(?:troop|soldier|guard|killed|dead)")
|
||||
if v is not None:
|
||||
loss_ir["personnel_killed"] = v
|
||||
v = _first_int(t, r"(?:iran|iranian)[\s\w]*(\d+)[\s\w]*(?:wounded|injured)")
|
||||
if v is not None:
|
||||
loss_ir["personnel_wounded"] = v
|
||||
|
||||
# 平民伤亡(多不区分阵营,计入双方或仅 us 因多为美国基地周边)
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:civilian|civil)[\s\w]*(?:killed|dead)")
|
||||
if v is not None:
|
||||
loss_us["civilian_killed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:civilian|civil)[\s\w]*(?:wounded|injured)")
|
||||
if v is not None:
|
||||
loss_us["civilian_wounded"] = v
|
||||
|
||||
# 基地损毁(美方基地居多)
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:base)[\s\w]*(?:destroyed|leveled)")
|
||||
if v is not None:
|
||||
loss_us["bases_destroyed"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:base)[\s\w]*(?:damaged|hit|struck)")
|
||||
if v is not None:
|
||||
loss_us["bases_damaged"] = v
|
||||
if "base" in t and ("destroy" in t or "level" in t) and not loss_us.get("bases_destroyed"):
|
||||
loss_us["bases_destroyed"] = 1
|
||||
if "base" in t and ("damage" in t or "hit" in t or "struck" in t or "strike" in t) and not loss_us.get("bases_damaged"):
|
||||
loss_us["bases_damaged"] = 1
|
||||
|
||||
# 战机 / 舰船(根据上下文判断阵营)
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:aircraft|plane|jet|fighter|f-?16|f-?35|f-?18)[\s\w]*(?:down|destroyed|lost|shot)")
|
||||
if v is not None:
|
||||
if "us" in t or "american" in t or "u.s" in t:
|
||||
loss_us["aircraft"] = v
|
||||
elif "iran" in t:
|
||||
loss_ir["aircraft"] = v
|
||||
else:
|
||||
loss_us["aircraft"] = v
|
||||
v = _first_int(t, r"(\d+)[\s\w]*(?:ship|destroyer|warship|vessel)[\s\w]*(?:hit|damaged|sunk)")
|
||||
if v is not None:
|
||||
if "iran" in t:
|
||||
loss_ir["warships"] = v
|
||||
else:
|
||||
loss_us["warships"] = v
|
||||
|
||||
if loss_us:
|
||||
out.setdefault("combat_losses_delta", {})["us"] = loss_us
|
||||
|
||||
@@ -14,13 +14,13 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import requests
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
|
||||
logging.getLogger("apscheduler.scheduler").setLevel(logging.ERROR)
|
||||
logging.getLogger("uvicorn").setLevel(logging.INFO)
|
||||
app = FastAPI(title="GDELT Conflict Service")
|
||||
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
|
||||
|
||||
@@ -263,8 +263,9 @@ def _extract_and_merge_panel_data(items: list) -> None:
|
||||
from extractor_ai import extract_from_news
|
||||
from datetime import timezone
|
||||
merged_any = False
|
||||
# 只对前几条有足够文本的新闻做提取,避免 Ollama 调用过多
|
||||
for it in items[:5]:
|
||||
# 规则模式可多处理几条(无 Ollama);AI 模式限制 5 条避免调用过多
|
||||
limit = 10 if os.environ.get("CLEANER_AI_DISABLED", "0") == "1" else 5
|
||||
for it in items[:limit]:
|
||||
text = (it.get("title", "") or "") + " " + (it.get("summary", "") or "")
|
||||
if len(text.strip()) < 20:
|
||||
continue
|
||||
@@ -292,12 +293,22 @@ def _extract_and_merge_panel_data(items: list) -> None:
|
||||
|
||||
|
||||
# ==========================
|
||||
# 定时任务(RSS 更频繁,优先保证事件脉络实时)
|
||||
# 定时任务(asyncio 后台任务,避免 APScheduler executor 关闭竞态)
|
||||
# ==========================
|
||||
scheduler = BackgroundScheduler()
|
||||
scheduler.add_job(fetch_news, "interval", seconds=RSS_INTERVAL_SEC, max_instances=2, coalesce=True)
|
||||
scheduler.add_job(fetch_gdelt_events, "interval", seconds=FETCH_INTERVAL_SEC, max_instances=2, coalesce=True)
|
||||
scheduler.start()
|
||||
_bg_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def _periodic_fetch() -> None:
|
||||
loop = asyncio.get_event_loop()
|
||||
while True:
|
||||
try:
|
||||
await loop.run_in_executor(None, fetch_news)
|
||||
await loop.run_in_executor(None, fetch_gdelt_events)
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f" [warn] 定时抓取: {e}")
|
||||
await asyncio.sleep(min(RSS_INTERVAL_SEC, FETCH_INTERVAL_SEC))
|
||||
|
||||
|
||||
# ==========================
|
||||
@@ -356,18 +367,23 @@ def _get_conflict_stats() -> dict:
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
def startup():
|
||||
# 新闻优先启动,确保事件脉络有数据
|
||||
fetch_news()
|
||||
fetch_gdelt_events()
|
||||
async def startup():
|
||||
global _bg_task
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, fetch_news)
|
||||
await loop.run_in_executor(None, fetch_gdelt_events)
|
||||
_bg_task = asyncio.create_task(_periodic_fetch())
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
def shutdown():
|
||||
try:
|
||||
scheduler.shutdown(wait=False)
|
||||
except Exception:
|
||||
pass
|
||||
async def shutdown():
|
||||
global _bg_task
|
||||
if _bg_task and not _bg_task.done():
|
||||
_bg_task.cancel()
|
||||
try:
|
||||
await _bg_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -2,5 +2,4 @@ requests>=2.31.0
|
||||
feedparser>=6.0.0
|
||||
fastapi>=0.109.0
|
||||
uvicorn>=0.27.0
|
||||
apscheduler>=3.10.0
|
||||
deep-translator>=1.11.0
|
||||
|
||||
Reference in New Issue
Block a user