# -*- coding: utf-8 -*-
"""Clean news text with AI so it fits the EventTimelinePanel field constraints.

The panel needs ``summary`` (at most 120 characters), ``category`` (enum) and
``severity`` (enum).

Summaries come from DashScope (Tongyi) when ``DASHSCOPE_API_KEY`` is set,
otherwise from a local Ollama server.  Whenever the selected backend yields
nothing, a rule-based cleanup of the original text is used instead.
"""
import logging
import os
import re
from typing import Optional

logger = logging.getLogger(__name__)

CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "").strip()

# Panel schema: must stay consistent with EventTimelinePanel / SituationUpdate.
SUMMARY_MAX_LEN = 120  # the panel renders the summary with line-clamp-2
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")

# Texts shorter than this (after stripping) are never sent to a model.
_MIN_AI_INPUT_LEN = 5
# Only this many leading characters of the article are put into the prompt.
_PROMPT_TEXT_LIMIT = 350

_WHITESPACE_RE = re.compile(r"\s+")
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
_LEADING_MARKER_RE = re.compile(r"^[\d\.\-\*\s]+")
_EDGE_QUOTES_RE = re.compile(r"^['\"\s]+|['\"\s]+$")


def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Return *text* as single-line plain text of at most *max_len* characters.

    Whitespace runs become one space, control characters are removed and the
    result is truncated.  Non-string or empty input, or a non-positive
    *max_len*, yields ``""``.
    """
    if not text or not isinstance(text, str) or max_len <= 0:
        return ""
    # Collapse first so whitespace-like control characters (vertical tab,
    # form feed, ...) still turn into a space rather than gluing words together.
    collapsed = _WHITESPACE_RE.sub(" ", text)
    without_controls = _CONTROL_CHARS_RE.sub("", collapsed)
    # Deleting a control character can leave a leading, trailing or doubled
    # space behind, so collapse and strip once more after the removal.
    tidy = _WHITESPACE_RE.sub(" ", without_controls).strip()
    return tidy[:max_len].rstrip()


def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Rule-based cleanup: normalise whitespace, drop control characters, truncate."""
    return _sanitize_summary(text, max_len)


def _skip_ai(text) -> bool:
    """Return True when no model should be called for *text*."""
    return (
        CLEANER_AI_DISABLED
        or not text
        or len(str(text).strip()) < _MIN_AI_INPUT_LEN
    )


def _build_summary_prompt(text: str, max_len: int) -> str:
    """Build the summarisation prompt that is sent to either backend."""
    # NOTE(review): the separators before "原文" and "输出" are single spaces,
    # exactly as in the source as received; if the original prompt used line
    # breaks there, restore them.
    return (
        f"将新闻提炼为1-2句简洁中文事实,直接输出纯文本,不要标号、引号、解释。限{max_len}字内。"
        f" 原文:{str(text)[:_PROMPT_TEXT_LIMIT]}"
        " 输出:"
    )


def _postprocess_model_output(raw, max_len: int) -> Optional[str]:
    """Turn a raw model reply into a panel-ready summary.

    Strips leading list markers and surrounding quotes, sanitises the text,
    and returns ``None`` when 3 or fewer characters remain.
    """
    reply = (raw or "").strip()
    reply = _LEADING_MARKER_RE.sub("", reply)  # drop "1.", "-", "*" prefixes
    reply = _EDGE_QUOTES_RE.sub("", reply)
    reply = _sanitize_summary(reply, max_len)
    return reply if len(reply) > 3 else None


def _call_dashscope_summary(text: str, max_len: int, timeout: int = 8) -> Optional[str]:
    """Summarise *text* with Alibaba Cloud DashScope (Tongyi, ``qwen-turbo``).

    Requires ``DASHSCOPE_API_KEY``.  Returns the cleaned summary, or ``None``
    when AI is disabled, the key is missing, the text is too short, the
    service answers with a non-OK status, or any error occurs.

    NOTE(review): *timeout* is accepted but not forwarded to the SDK call.
    Confirm the SDK's request-timeout keyword before wiring it through.
    """
    if not DASHSCOPE_API_KEY or _skip_ai(text):
        return None
    try:
        # Imported lazily: a missing SDK is treated like any other failure.
        import dashscope
        from http import HTTPStatus

        dashscope.api_key = DASHSCOPE_API_KEY
        response = dashscope.Generation.call(
            model="qwen-turbo",
            messages=[
                {"role": "user", "content": _build_summary_prompt(text, max_len)}
            ],
            result_format="message",
            max_tokens=150,
        )
        if response.status_code != HTTPStatus.OK:
            return None
        first_choice = response.output.get("choices", [{}])[0]
        content = first_choice.get("message", {}).get("content", "")
        return _postprocess_model_output(content, max_len)
    except Exception:
        # Best effort by design: the caller falls back to rule-based cleaning.
        logger.debug("DashScope summary failed", exc_info=True)
        return None


def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Summarise *text* with the Ollama server at ``localhost:11434``.

    The model is taken from ``OLLAMA_MODEL`` and *timeout* (seconds) is
    passed to the HTTP request.  Returns the cleaned summary, or ``None``
    when AI is disabled, the text is too short, the server answers with a
    non-200 status, or any error occurs.
    """
    if _skip_ai(text):
        return None
    try:
        # Imported lazily: a missing package is treated like any other failure.
        import requests

        response = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [
                    {"role": "user", "content": _build_summary_prompt(text, max_len)}
                ],
                "stream": False,
                "options": {"num_predict": 150},
            },
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        content = response.json().get("message", {}).get("content", "")
        return _postprocess_model_output(content, max_len)
    except Exception:
        # Best effort by design: the caller falls back to rule-based cleaning.
        logger.debug("Ollama summary failed", exc_info=True)
        return None


def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Clean the ``summary`` field for display in EventTimelinePanel.

    Returns plain text of at most *max_len* characters; non-string or blank
    input yields ``""``.

    Backend choice: DashScope when ``DASHSCOPE_API_KEY`` is set, otherwise
    Ollama.  Only one backend is tried per call; when it returns nothing the
    rule-based cleanup of the original text is used.
    """
    if not text or not isinstance(text, str):
        return ""
    stripped = text.strip()
    if not stripped:
        return ""
    if DASHSCOPE_API_KEY:
        summary = _call_dashscope_summary(stripped, max_len, timeout=8)
    else:
        summary = _call_ollama_summary(stripped, max_len, timeout=6)
    if summary:
        return summary
    return _rule_clean(stripped, max_len)


def ensure_category(cat: str) -> str:
    """Return *cat* if it is a panel category, else ``"other"``.

    String input is trimmed and lower-cased before the check, so
    ``" Alert "`` maps to ``"alert"``; non-string input yields ``"other"``.
    """
    if not isinstance(cat, str):
        return "other"
    normalized = cat.strip().lower()
    return normalized if normalized in CATEGORIES else "other"


def ensure_severity(sev: str) -> str:
    """Return *sev* if it is a panel severity, else ``"medium"``.

    String input is trimmed and lower-cased before the check, so ``"HIGH"``
    maps to ``"high"``; non-string input yields ``"medium"``.
    """
    if not isinstance(sev, str):
        return "medium"
    normalized = sev.strip().lower()
    return normalized if normalized in SEVERITIES else "medium"