Files
usa/crawler/cleaner_ai.py
2026-03-02 11:28:13 +08:00

88 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
AI 清洗新闻数据,严格按面板字段约束输出
面板 EventTimelinePanel 所需summary(≤120字)、category(枚举)、severity(枚举)
"""
import os
import re
from typing import Optional
CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
# 面板 schema必须与 EventTimelinePanel / SituationUpdate 一致
SUMMARY_MAX_LEN = 120 # 面板 line-clamp-2 展示
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")
def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
"""确保 summary 符合面板:纯文本、无换行、限制长度"""
if not text or not isinstance(text, str):
return ""
s = re.sub(r"\s+", " ", str(text).strip())
s = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", s) # 去除控制字符
return s[:max_len].rstrip()
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
"""规则清洗:去空白、去控制符、截断"""
return _sanitize_summary(text, max_len)
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
"""调用 Ollama 提炼摘要输出须为纯文本、≤max_len 字"""
if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
return None
try:
import requests
prompt = f"""将新闻提炼为1-2句简洁中文事实直接输出纯文本不要标号、引号、解释。限{max_len}字内。
原文:{str(text)[:350]}
输出:"""
r = requests.post(
"http://localhost:11434/api/chat",
json={
"model": OLLAMA_MODEL,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"options": {"num_predict": 150},
},
timeout=timeout,
)
if r.status_code != 200:
return None
out = (r.json().get("message", {}).get("content", "") or "").strip()
out = re.sub(r"^[\d\.\-\*\s]+", "", out) # 去编号
out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)
out = _sanitize_summary(out, max_len)
if out and len(out) > 3:
return out
return None
except Exception:
return None
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
"""清洗 summary 字段,供 EventTimelinePanel 展示。输出必为≤max_len 的纯文本"""
if not text or not isinstance(text, str):
return ""
t = str(text).strip()
if not t:
return ""
res = _call_ollama_summary(t, max_len, timeout=6)
if res:
return res
return _rule_clean(t, max_len)
def ensure_category(cat: str) -> str:
"""确保 category 在面板枚举内"""
return cat if cat in CATEGORIES else "other"
def ensure_severity(sev: str) -> str:
"""确保 severity 在面板枚举内"""
return sev if sev in SEVERITIES else "medium"