88 lines
3.0 KiB
Python
88 lines
3.0 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
Clean news data with AI, emitting output that strictly follows the panel's field constraints.
The EventTimelinePanel requires: summary (<=120 characters), category (enum), severity (enum).
|
||
"""
|
||
import os
|
||
import re
|
||
from typing import Optional
|
||
|
||
# Runtime switches, read once from the environment when the module is imported.
# CLEANER_AI_DISABLED is True only when the variable is exactly "1".
CLEANER_AI_DISABLED = os.getenv("CLEANER_AI_DISABLED", "0") == "1"
# Model name; falls back to "llama3.1" when the variable is unset.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1")
|
||
|
||
# Panel schema: must stay consistent with EventTimelinePanel / SituationUpdate.
SUMMARY_MAX_LEN = 120  # the panel shows the summary with line-clamp-2
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")


def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Return *text* as single-line plain text of at most *max_len* characters.

    Control characters are removed, every run of whitespace (including
    newlines and tabs) becomes one space, and the result is trimmed and
    truncated.

    Args:
        text: Raw summary text. Anything falsy or not a ``str`` yields ``""``.
        max_len: Maximum number of characters kept.

    Returns:
        The cleaned string, possibly empty.
    """
    if not text or not isinstance(text, str):
        return ""
    # Drop the non-whitespace C0 control characters *before* collapsing
    # whitespace. Doing it afterwards (as the code used to) left doubled or
    # leading spaces behind, e.g. "a \x00 b" -> "a  b".  The characters
    # \x1c-\x1f are left to the \s+ pass below, which turns them into a space.
    without_controls = re.sub(r"[\x00-\x08\x0e-\x1b]", "", text)
    single_line = re.sub(r"\s+", " ", without_controls).strip()
    # Truncation may cut right after a space, so trim the tail once more.
    return single_line[:max_len].rstrip()
|
||
|
||
|
||
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Rule-based cleaning: normalise whitespace, drop control characters, truncate.

    Thin wrapper that forwards both arguments to ``_sanitize_summary`` and
    returns its result unchanged.
    """
    cleaned = _sanitize_summary(text, max_len)
    return cleaned
|
||
|
||
|
||
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Ask a local Ollama server to condense *text* into a short plain-text summary.

    Only the first 350 characters of *text* are sent. The reply is stripped of
    leading list markers and surrounding ASCII quotes, then passed through
    ``_sanitize_summary``.

    Args:
        text: News text to summarise.
        max_len: Character limit, stated in the prompt and enforced on the reply.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        The cleaned summary, or ``None`` when AI cleaning is disabled, the
        stripped input is shorter than 5 characters, the server does not answer
        with HTTP 200, any exception occurs, or the cleaned reply is 3
        characters or fewer.
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
        return None
    try:
        # Imported inside the try block so that a missing `requests` package
        # simply makes this function return None.
        import requests

        prompt = f"""将新闻提炼为1-2句简洁中文事实,直接输出纯文本,不要标号、引号、解释。限{max_len}字内。

原文:{str(text)[:350]}

输出:"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 150},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        out = (r.json().get("message", {}).get("content", "") or "").strip()
        # Remove leading list markers only: "1." / "1)" / "1、" / "1." or
        # bullet characters.  The previous pattern stripped *any* leading
        # digits, dots and dashes, which ate real content such as "2024年…",
        # "3名…" or "-5…".  The (?!\d) guards keep decimals ("1.5万") and
        # negative numbers ("-5") intact.
        out = re.sub(r"^(?:(?:\d+[.)、.](?!\d)|[-*•]+(?!\d))\s*)+", "", out)
        # Strip ASCII quotes / whitespace wrapped around the reply.
        out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)
        out = _sanitize_summary(out, max_len)
        if out and len(out) > 3:
            return out
        return None
    except Exception:
        # Deliberately best-effort: on any failure (missing dependency,
        # connection error, malformed JSON, ...) return None.
        return None
|
||
|
||
|
||
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Produce the ``summary`` field shown by EventTimelinePanel.

    Tries ``_call_ollama_summary`` first (6 second timeout) and uses
    ``_rule_clean`` when that yields nothing.

    Args:
        text: Raw news text. Falsy or non-``str`` input yields ``""``.
        max_len: Character limit passed to both cleaning paths.

    Returns:
        The AI summary when one is returned, otherwise the rule-cleaned text;
        ``""`` for empty or whitespace-only input.
    """
    if not (text and isinstance(text, str)):
        return ""
    stripped = str(text).strip()
    if not stripped:
        return ""
    ai_summary = _call_ollama_summary(stripped, max_len, timeout=6)
    return ai_summary if ai_summary else _rule_clean(stripped, max_len)
|
||
|
||
|
||
def ensure_category(cat: str) -> str:
    """Return *cat* if it is one of ``CATEGORIES``, otherwise ``"other"``."""
    if cat in CATEGORIES:
        return cat
    return "other"
|
||
|
||
|
||
def ensure_severity(sev: str) -> str:
    """Return *sev* if it is one of ``SEVERITIES``, otherwise ``"medium"``."""
    if sev in SEVERITIES:
        return sev
    return "medium"
|