fix: 优化数据

This commit is contained in:
Daniel
2026-03-02 11:28:13 +08:00
parent 4a8fff5a00
commit 004d10b283
39 changed files with 1106 additions and 56 deletions

87
crawler/cleaner_ai.py Normal file
View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
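Clean news text with AI, emitting output that strictly obeys the panel's field constraints.
The EventTimelinePanel requires: summary (at most 120 characters), category (enum), severity (enum).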
"""
import os
import re
from typing import Optional
# Setting CLEANER_AI_DISABLED=1 in the environment turns this flag on.
CLEANER_AI_DISABLED = os.getenv("CLEANER_AI_DISABLED", "0") == "1"
# Ollama model name; overridable through the OLLAMA_MODEL environment variable.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1")

# Panel schema: must stay consistent with EventTimelinePanel / SituationUpdate.
SUMMARY_MAX_LEN = 120  # the panel renders the summary with line-clamp-2
CATEGORIES = (
    "deployment",
    "alert",
    "intel",
    "diplomatic",
    "other",
)
SEVERITIES = (
    "low",
    "medium",
    "high",
    "critical",
)
def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Normalize *text* into a single-line, plain-text summary.

    Control characters are removed, every run of whitespace (including
    newlines and tabs) is collapsed into one space, the result is stripped
    and then truncated to *max_len* characters.

    Args:
        text: Candidate summary text.
        max_len: Maximum number of characters kept.

    Returns:
        The cleaned summary, or ``""`` when *text* is empty or not a ``str``.
    """
    if not text or not isinstance(text, str):
        return ""
    # Remove non-whitespace control characters BEFORE collapsing whitespace.
    # Doing it afterwards leaves doubled spaces ("a \x00 b" -> "a  b") or a
    # stray leading space ("\x00 abc" -> " abc").  The whitespace-like
    # controls (\x0b, \x0c, \x1c-\x1f) are left for the \s+ pass so they
    # still act as word separators.
    s = re.sub(r"[\x00-\x08\x0e-\x1b]", "", text)
    s = re.sub(r"\s+", " ", s).strip()
    # rstrip again: the cut may land right after a space.
    return s[:max_len].rstrip()
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Rule-based cleaning: whitespace collapse, control-char removal, truncation.

    Delegates entirely to ``_sanitize_summary``.
    """
    cleaned = _sanitize_summary(text, max_len)
    return cleaned
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Ask the local Ollama chat endpoint for a short plain-text summary.

    This is a best-effort call: if AI cleaning is disabled, the input is
    too short, ``requests`` is unavailable, the request fails, the status
    is not 200, or the reply is unusable, ``None`` is returned.

    Args:
        text: Raw news text; only its first 350 characters are sent.
        max_len: Character limit stated in the prompt and enforced on the
            returned summary.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The sanitized summary (more than 3 characters long), or ``None``.
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
        return None
    try:
        # Imported lazily so a missing ``requests`` only disables the AI path.
        import requests

        prompt = f"""将新闻提炼为1-2句简洁中文事实直接输出纯文本不要标号、引号、解释。限{max_len}字内。
原文:{str(text)[:350]}
输出:"""
        resp = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 150},
            },
            timeout=timeout,
        )
        if resp.status_code != 200:
            return None
        out = (resp.json().get("message", {}).get("content", "") or "").strip()
        # Strip only a genuine leading list marker ("1. ", "2、", "3)", "- ",
        # "* ").  Blindly removing every leading digit/dot/dash would mangle
        # summaries that start with a date or quantity, e.g. "2026年3月…"
        # or "1.5万人…"; the (?!\d) lookahead keeps decimals intact.
        out = re.sub(r"^\s*(?:\d{1,2}[\.\)、](?!\d)\s*|[\-\*]+\s*)", "", out)
        # Drop ASCII quotes / whitespace wrapped around the answer.
        out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)
        out = _sanitize_summary(out, max_len)
        # Replies of 3 characters or fewer are treated as unusable.
        if len(out) > 3:
            return out
        return None
    except Exception:
        # Deliberate catch-all: any failure here means "no AI summary".
        return None
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Build the ``summary`` string displayed by EventTimelinePanel.

    The Ollama summarizer is tried first; when it yields nothing, the
    rule-based cleaner is used instead. Empty, whitespace-only, or
    non-``str`` input gives ``""``.
    """
    if not text or not isinstance(text, str):
        return ""
    stripped = str(text).strip()
    if not stripped:
        return ""
    ai_summary = _call_ollama_summary(stripped, max_len, timeout=6)
    # Fall back to deterministic cleaning when the AI path returned None/"".
    return ai_summary or _rule_clean(stripped, max_len)
def ensure_category(cat: str) -> str:
    """Return *cat* if it is one of ``CATEGORIES``; otherwise ``"other"``."""
    if cat in CATEGORIES:
        return cat
    return "other"
def ensure_severity(sev: str) -> str:
    """Return *sev* if it is one of ``SEVERITIES``; otherwise ``"medium"``."""
    if sev in SEVERITIES:
        return sev
    return "medium"