fix: 优化数据
This commit is contained in:
87
crawler/cleaner_ai.py
Normal file
87
crawler/cleaner_ai.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
AI 清洗新闻数据,严格按面板字段约束输出
|
||||
面板 EventTimelinePanel 所需:summary(≤120字)、category(枚举)、severity(枚举)
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Runtime switches, read once from the environment at import time.
# The AI step is disabled only when CLEANER_AI_DISABLED is exactly "1".
CLEANER_AI_DISABLED = os.getenv("CLEANER_AI_DISABLED", "0") == "1"
# Name of the Ollama model to query; falls back to "llama3.1" when unset.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1")

# Panel schema: must stay consistent with EventTimelinePanel / SituationUpdate.
# Maximum summary length; the panel shows the summary with line-clamp-2.
SUMMARY_MAX_LEN = 120
# Allowed values for the ``category`` field.
CATEGORIES = (
    "deployment",
    "alert",
    "intel",
    "diplomatic",
    "other",
)
# Allowed values for the ``severity`` field.
SEVERITIES = (
    "low",
    "medium",
    "high",
    "critical",
)
|
||||
|
||||
|
||||
def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Normalize *text* into a single-line plain-text summary for the panel.

    Whitespace runs (newlines and tabs included) collapse to one space,
    ASCII control characters are removed, and the result is cut to at most
    ``max_len`` characters with no leading or trailing whitespace.

    Args:
        text: Raw summary text. Anything that is not a non-empty ``str``
            yields ``""``.
        max_len: Maximum number of characters to keep. Values of zero or
            less yield ``""``.

    Returns:
        The cleaned summary, possibly empty.
    """
    if not text or not isinstance(text, str):
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    # Drop control characters that ``\s`` did not already turn into a space.
    without_ctrl = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", collapsed)
    # Removing a control character can leave two spaces side by side
    # ("a \x00 b") or expose a space at either end ("\x00 abc"), so the
    # spacing is normalised once more after the removal.
    tidy = re.sub(r" {2,}", " ", without_ctrl).strip()
    # Clamp at zero: a negative slice bound would cut from the end instead
    # of limiting the length.
    limit = max(max_len, 0)
    return tidy[:limit].rstrip()
|
||||
|
||||
|
||||
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Rule-based cleanup of *text*; delegates entirely to ``_sanitize_summary``.

    Args:
        text: Raw text to clean.
        max_len: Maximum length passed through to ``_sanitize_summary``.

    Returns:
        Whatever ``_sanitize_summary(text, max_len)`` returns.
    """
    cleaned = _sanitize_summary(text, max_len)
    return cleaned
|
||||
|
||||
|
||||
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Ask the local Ollama chat endpoint for a short plain-text summary.

    This is a best-effort call: any failure (missing ``requests``, network
    error, non-200 status, unexpected response body) yields ``None`` so the
    caller can fall back to another cleaning path.

    Args:
        text: Source news text. Only its first 350 characters are sent.
        max_len: Character limit stated in the prompt and enforced on the
            reply via ``_sanitize_summary``.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The sanitized summary when it is longer than 3 characters, else
        ``None``. Also ``None`` when ``CLEANER_AI_DISABLED`` is set or the
        stripped input is shorter than 5 characters.
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
        return None
    try:
        # Imported inside the ``try`` so that a missing ``requests`` package
        # is caught below like any other failure.
        import requests

        prompt = f"""将新闻提炼为1-2句简洁中文事实,直接输出纯文本,不要标号、引号、解释。限{max_len}字内。

原文:{str(text)[:350]}

输出:"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 150},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        out = (r.json().get("message", {}).get("content", "") or "").strip()
        # Strip leading list markers. Bullets, dots, dashes, asterisks and
        # whitespace are always removed. Digits are removed only when they
        # form an enumerator ("1.", "2)", "3、") that is not followed by
        # another digit, so summaries that start with a date or a quantity
        # ("2024年…", "1.5万…") keep their leading number.
        out = re.sub(r"^(?:[.\-*\s]|\d+[.)、](?!\d))+", "", out)
        # Strip ASCII quotes and whitespace wrapped around the reply.
        out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)
        out = _sanitize_summary(out, max_len)
        if len(out) > 3:
            return out
        return None
    except Exception:
        # Deliberate catch-all: summarisation is optional and must never
        # raise into the caller.
        return None
|
||||
|
||||
|
||||
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Produce the ``summary`` string shown by EventTimelinePanel.

    The AI summary from ``_call_ollama_summary`` is used when it returns a
    non-empty result; otherwise the text goes through ``_rule_clean``.

    Args:
        text: Raw news text. Non-string or blank input yields ``""``.
        max_len: Maximum summary length passed to both cleaning paths.

    Returns:
        The cleaned summary, or ``""`` when there is nothing to clean.
    """
    if not isinstance(text, str):
        return ""
    stripped = text.strip()
    if not stripped:
        return ""
    ai_summary = _call_ollama_summary(stripped, max_len, timeout=6)
    return ai_summary if ai_summary else _rule_clean(stripped, max_len)
|
||||
|
||||
|
||||
def ensure_category(cat: str) -> str:
    """Return *cat* when it is a member of ``CATEGORIES``, else ``"other"``."""
    if cat in CATEGORIES:
        return cat
    return "other"
|
||||
|
||||
|
||||
def ensure_severity(sev: str) -> str:
    """Return *sev* when it is a member of ``SEVERITIES``, else ``"medium"``."""
    if sev in SEVERITIES:
        return sev
    return "medium"
|
||||
Reference in New Issue
Block a user