fix
This commit is contained in:
@@ -1,6 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from textwrap import shorten
|
||||
|
||||
from openai import OpenAI
|
||||
@@ -8,73 +12,434 @@ from openai import OpenAI
|
||||
from app.config import settings
|
||||
from app.schemas import RewriteRequest, RewriteResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """
|
||||
你是中文内容编辑与合规顾问。请把输入内容进行“原创改写”,要求:
|
||||
1) 保留核心事实,但避免逐句复述;
|
||||
2) 结构清晰:导语、3-5个小节、结尾行动建议;
|
||||
3) 风格适合微信公众号,表达自然,避免AI腔;
|
||||
4) 如果原文存在未经核实结论,请使用“可能/有待验证”等措辞;
|
||||
5) 输出必须是 JSON,字段:title, summary, body_markdown。
|
||||
你是顶级中文公众号主编,擅长把 X/Twitter 的观点型内容改写成高质量公众号文章。
|
||||
你的目标不是“同义替换”,而是“重构表达”,保证可读性、逻辑性和可发布性。
|
||||
|
||||
硬性规则:
|
||||
1) 保留核心事实与关键观点,不编造数据,不夸大结论;
|
||||
2) 文章结构必须完整:导语 -> 核心观点 -> 深度分析 -> 落地建议 -> 结语;
|
||||
3) 风格自然,避免 AI 套话(如“首先其次最后”“赋能”“闭环”等空话);
|
||||
4) 每节都要有信息增量,不要重复原文句式;
|
||||
5) 输出必须是合法 JSON,字段:title, summary, body_markdown。
|
||||
""".strip()
|
||||
|
||||
|
||||
REWRITE_SCHEMA_HINT = """
|
||||
请输出 JSON:
|
||||
{
|
||||
"title": "20字内中文标题,明确价值点",
|
||||
"summary": "80-120字中文摘要,说明读者收获",
|
||||
"body_markdown": "完整Markdown正文"
|
||||
}
|
||||
|
||||
正文格式要求(必须遵循):
|
||||
## 导语
|
||||
2-3段,交代背景、冲突与阅读价值。
|
||||
|
||||
## 核心观点
|
||||
- 3~5条要点,每条是完整信息句,不要口号。
|
||||
|
||||
## 深度分析
|
||||
### 1) 现象背后的原因
|
||||
2-3段
|
||||
### 2) 对行业/团队的影响
|
||||
2-3段
|
||||
### 3) 关键风险与边界
|
||||
2-3段
|
||||
|
||||
## 落地建议
|
||||
1. 三到五条可执行动作,尽量包含“谁在什么场景做什么”。
|
||||
|
||||
## 结语
|
||||
1段,收束观点并给出下一步建议。
|
||||
""".strip()
|
||||
|
||||
|
||||
class AIRewriter:
    """Rewrites source posts into publish-ready WeChat (公众号) articles.

    Uses an OpenAI-compatible API when an API key is configured; otherwise
    every request is served by a deterministic structured fallback, so the
    service always returns content.
    """

    def __init__(self) -> None:
        self._client: OpenAI | None = None
        # DashScope's OpenAI-compatible endpoint is more reliable via
        # chat.completions, so prefer that path when its base URL is used.
        self._prefer_chat_first: bool = False
        if settings.openai_api_key:
            base_url = settings.openai_base_url or ""
            self._prefer_chat_first = "dashscope.aliyuncs.com" in base_url
            self._client = OpenAI(
                api_key=settings.openai_api_key,
                base_url=settings.openai_base_url,
                timeout=settings.openai_timeout,
                max_retries=1,
            )
|
||||
|
||||
def rewrite(self, req: RewriteRequest) -> RewriteResponse:
|
||||
if not self._client:
|
||||
return self._fallback_rewrite(req)
|
||||
cleaned_source = self._clean_source(req.source_text)
|
||||
started = time.monotonic()
|
||||
|
||||
# Primary: model rewrite + quality gate + optional second-pass polish.
|
||||
if self._client:
|
||||
# DashScope/Qwen works better with a single stable call.
|
||||
if self._prefer_chat_first:
|
||||
first_pass_timeout = max(18.0, min(30.0, settings.openai_timeout))
|
||||
else:
|
||||
first_pass_timeout = max(20.0, min(50.0, settings.openai_timeout))
|
||||
draft = self._model_rewrite(req, cleaned_source, timeout_sec=first_pass_timeout)
|
||||
if draft:
|
||||
normalized = self._normalize_result(draft)
|
||||
issues = self._quality_issues(req, cleaned_source, normalized)
|
||||
elapsed = time.monotonic() - started
|
||||
remaining_budget = max(0.0, (first_pass_timeout + 20.0) - elapsed)
|
||||
if issues and (not self._prefer_chat_first) and remaining_budget >= 10.0:
|
||||
polished = self._model_polish(
|
||||
req,
|
||||
cleaned_source,
|
||||
normalized,
|
||||
issues,
|
||||
timeout_sec=min(30.0, remaining_budget),
|
||||
)
|
||||
if polished:
|
||||
normalized = self._normalize_result(polished)
|
||||
final_issues = self._quality_issues(req, cleaned_source, normalized)
|
||||
if not final_issues:
|
||||
return RewriteResponse(**normalized, mode="ai", quality_notes=[])
|
||||
logger.warning("rewrite quality gate fallback triggered: %s", final_issues)
|
||||
|
||||
# Secondary: deterministic fallback with publishable structure.
|
||||
return self._fallback_rewrite(req, cleaned_source, reason="模型超时或质量未达标,已使用结构化保底稿")
|
||||
|
||||
def _model_rewrite(self, req: RewriteRequest, cleaned_source: str, timeout_sec: float) -> dict | None:
|
||||
user_prompt = self._build_user_prompt(req, cleaned_source)
|
||||
return self._call_model_json(user_prompt, timeout_sec=timeout_sec)
|
||||
|
||||
def _model_polish(
|
||||
self,
|
||||
req: RewriteRequest,
|
||||
cleaned_source: str,
|
||||
normalized: dict,
|
||||
issues: list[str],
|
||||
timeout_sec: float,
|
||||
) -> dict | None:
|
||||
issue_text = "\n".join([f"- {i}" for i in issues])
|
||||
user_prompt = f"""
|
||||
原始内容:
|
||||
{req.source_text}
|
||||
你上一次的改写稿质量未达标,请基于下面问题做彻底重写,不要只改几个词:
|
||||
{issue_text}
|
||||
|
||||
改写约束:
|
||||
原始内容:
|
||||
{cleaned_source}
|
||||
|
||||
上一次草稿:
|
||||
标题:{normalized.get('title', '')}
|
||||
摘要:{normalized.get('summary', '')}
|
||||
正文:
|
||||
{normalized.get('body_markdown', '')}
|
||||
|
||||
用户改写偏好:
|
||||
- 标题参考:{req.title_hint or '自动生成'}
|
||||
- 目标语气:{req.tone}
|
||||
- 语气风格:{req.tone}
|
||||
- 目标读者:{req.audience}
|
||||
- 必须保留观点:{req.keep_points or '无'}
|
||||
- 避免词汇:{req.avoid_words or '无'}
|
||||
|
||||
请输出一个全新且高质量版本。{REWRITE_SCHEMA_HINT}
|
||||
""".strip()
|
||||
return self._call_model_json(user_prompt, timeout_sec=timeout_sec)
|
||||
|
||||
def _build_user_prompt(self, req: RewriteRequest, cleaned_source: str) -> str:
|
||||
return f"""
|
||||
原始内容(已清洗):
|
||||
{cleaned_source}
|
||||
|
||||
用户改写偏好:
|
||||
- 标题参考:{req.title_hint or '自动生成'}
|
||||
- 语气风格:{req.tone}
|
||||
- 目标读者:{req.audience}
|
||||
- 必须保留观点:{req.keep_points or '无'}
|
||||
- 避免词汇:{req.avoid_words or '无'}
|
||||
|
||||
任务:请输出可直接用于公众号发布的文章。{REWRITE_SCHEMA_HINT}
|
||||
""".strip()
|
||||
|
||||
completion = self._client.responses.create(
|
||||
model=settings.openai_model,
|
||||
input=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
text={"format": {"type": "json_object"}},
|
||||
)
|
||||
def _fallback_rewrite(self, req: RewriteRequest, cleaned_source: str, reason: str) -> RewriteResponse:
|
||||
sentences = self._extract_sentences(cleaned_source)
|
||||
points = self._pick_key_points(sentences, limit=5)
|
||||
title = req.title_hint.strip() or self._build_fallback_title(sentences)
|
||||
|
||||
text = completion.output_text
|
||||
import json
|
||||
summary = self._build_fallback_summary(points, cleaned_source)
|
||||
intro = self._build_intro(points, cleaned_source)
|
||||
analysis = self._build_analysis(points)
|
||||
actions = self._build_actions(points)
|
||||
conclusion = "如果你准备把这类内容持续做成栏目,建议建立固定模板:观点来源、关键证据、执行建议、复盘结论。"
|
||||
|
||||
data = json.loads(text)
|
||||
return RewriteResponse(**data)
|
||||
|
||||
def _fallback_rewrite(self, req: RewriteRequest) -> RewriteResponse:
|
||||
clean_text = re.sub(r"\n{2,}", "\n", req.source_text.strip())
|
||||
lines = [line.strip() for line in clean_text.split("\n") if line.strip()]
|
||||
head = lines[0] if lines else clean_text[:50]
|
||||
title = req.title_hint.strip() or f"{shorten(head, width=26, placeholder='')}:可执行解读"
|
||||
summary = shorten(clean_text, width=90, placeholder="...")
|
||||
body = (
|
||||
f"## 导语\n"
|
||||
f"这篇内容值得关注的核心在于:{summary}\n\n"
|
||||
f"## 重点拆解\n"
|
||||
f"1. 背景与问题:从原文可以看到关键矛盾已出现。\n"
|
||||
f"2. 方法与动作:建议按“目标-路径-验证”三步推进。\n"
|
||||
f"3. 风险与边界:避免绝对化表述,必要时补充数据来源。\n\n"
|
||||
f"## 公众号改写正文\n"
|
||||
f"{clean_text}\n\n"
|
||||
f"## 结尾\n"
|
||||
f"以上为原创重组版本,可继续补充案例与数据后发布。"
|
||||
"## 导语\n"
|
||||
f"{intro}\n\n"
|
||||
"## 核心观点\n"
|
||||
+ "\n".join([f"- {p}" for p in points])
|
||||
+ "\n\n"
|
||||
"## 深度分析\n"
|
||||
"### 1) 现象背后的原因\n"
|
||||
f"{analysis['cause']}\n\n"
|
||||
"### 2) 对行业/团队的影响\n"
|
||||
f"{analysis['impact']}\n\n"
|
||||
"### 3) 关键风险与边界\n"
|
||||
f"{analysis['risk']}\n\n"
|
||||
"## 落地建议\n"
|
||||
+ "\n".join([f"{i + 1}. {a}" for i, a in enumerate(actions)])
|
||||
+ "\n\n"
|
||||
"## 结语\n"
|
||||
f"{conclusion}"
|
||||
)
|
||||
return RewriteResponse(title=title, summary=summary, body_markdown=body)
|
||||
|
||||
normalized = {
|
||||
"title": title,
|
||||
"summary": summary,
|
||||
"body_markdown": self._format_markdown(body),
|
||||
}
|
||||
return RewriteResponse(**normalized, mode="fallback", quality_notes=[reason])
|
||||
|
||||
def _build_fallback_title(self, sentences: list[str]) -> str:
|
||||
seed = sentences[0] if sentences else "内容改写"
|
||||
seed = shorten(seed, width=16, placeholder="")
|
||||
return f"{seed}:给内容创作者的实战拆解"
|
||||
|
||||
def _build_fallback_summary(self, points: list[str], source: str) -> str:
|
||||
if len(points) >= 2:
|
||||
return f"本文提炼了{points[0]},并进一步分析{points[1]},最后给出可直接执行的发布建议,帮助你把观点内容做成高质量公众号文章。"
|
||||
return shorten(re.sub(r"\s+", " ", source), width=110, placeholder="...")
|
||||
|
||||
def _build_intro(self, points: list[str], source: str) -> str:
|
||||
focus = points[0] if points else shorten(source, width=42, placeholder="...")
|
||||
return (
|
||||
f"这篇内容的价值不在“信息多”,而在于它点出了一个真正值得关注的问题:{focus}。\n\n"
|
||||
"对公众号读者来说,最关心的是这件事会带来什么变化、现在能做什么。"
|
||||
"因此本文不做逐句复述,而是按“观点-影响-动作”重组,方便直接落地。"
|
||||
)
|
||||
|
||||
def _build_analysis(self, points: list[str]) -> dict[str, str]:
|
||||
p1 = points[0] if points else "行业正在从信息堆叠转向结果导向"
|
||||
p2 = points[1] if len(points) > 1 else "团队协作方式被自动化流程重塑"
|
||||
p3 = points[2] if len(points) > 2 else "内容质量会成为真正分水岭"
|
||||
return {
|
||||
"cause": (
|
||||
f"从表面看是工具迭代,实质是生产逻辑变化。{p1},意味着过去依赖经验的环节,正在被标准化流程替代。"
|
||||
"谁先完成流程化改造,谁就更容易稳定产出。"
|
||||
),
|
||||
"impact": (
|
||||
f"短期影响体现在效率,中长期影响体现在品牌认知。{p2}。"
|
||||
"如果只追求发布速度,内容会快速同质化;如果把洞察和表达打磨成体系,内容资产会持续增值。"
|
||||
),
|
||||
"risk": (
|
||||
f"最大的风险不是‘不用 AI’,而是‘只用 AI’。{p3}。"
|
||||
"没有事实校对与人工观点把关,文章容易出现空泛表达、错误引用和结论过度。"
|
||||
),
|
||||
}
|
||||
|
||||
def _build_actions(self, points: list[str]) -> list[str]:
|
||||
anchor = points[0] if points else "核心观点"
|
||||
return [
|
||||
f"先确定本篇唯一主线:围绕“{anchor}”展开,删除与主线无关的段落。",
|
||||
"按“导语-观点-分析-建议-结语”五段式重排正文,每段只解决一个问题。",
|
||||
"为每个核心观点补一条可验证依据(数据、案例或公开来源),提升可信度。",
|
||||
"发布前做一次反 AI 味检查:删掉空话,替换为具体动作和明确对象。",
|
||||
"将高表现文章沉淀为模板,下次复用同样结构提高稳定性。",
|
||||
]
|
||||
|
||||
def _clean_source(self, text: str) -> str:
|
||||
src = (text or "").replace("\r\n", "\n").strip()
|
||||
src = re.sub(r"https?://\S+", "", src)
|
||||
src = re.sub(r"(?m)^\s*>+\s*", "", src)
|
||||
src = re.sub(r"(?m)^\s*[@#][^\s]+\s*$", "", src)
|
||||
src = re.sub(r"\n{3,}", "\n\n", src)
|
||||
src = re.sub(r"\s+", " ", src)
|
||||
src = src.strip()
|
||||
max_chars = max(1200, settings.openai_source_max_chars)
|
||||
if len(src) > max_chars:
|
||||
src = src[:max_chars] + " ...(原文过长,已截断后改写)"
|
||||
return src
|
||||
|
||||
def _extract_sentences(self, text: str) -> list[str]:
|
||||
parts = re.split(r"[。!?;;.!?\n]+", text)
|
||||
cleaned = [p.strip(" ,,;;::。") for p in parts if p.strip()]
|
||||
return cleaned
|
||||
|
||||
def _pick_key_points(self, sentences: list[str], limit: int) -> list[str]:
|
||||
points: list[str] = []
|
||||
templates = [
|
||||
"核心变化:{}",
|
||||
"关键问题:{}",
|
||||
"方法调整:{}",
|
||||
"结果反馈:{}",
|
||||
"结论启示:{}",
|
||||
]
|
||||
for s in sentences:
|
||||
if len(s) < 12:
|
||||
continue
|
||||
if len(points) >= limit:
|
||||
break
|
||||
normalized = re.sub(r"^(第一|第二|第三|第四|第五)[,,::]?", "", s).strip()
|
||||
normalized = re.sub(r"^[-•\\d\\.\\)\\s]+", "", normalized)
|
||||
text = shorten(normalized, width=50, placeholder="...")
|
||||
points.append(templates[len(points) % len(templates)].format(text))
|
||||
if not points:
|
||||
points = ["原始内容信息密度较高,建议先聚焦一个核心问题再展开"]
|
||||
return points
|
||||
|
||||
def _parse_response_json(self, text: str) -> dict:
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
raise ValueError("empty model output")
|
||||
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
fenced = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.IGNORECASE).strip()
|
||||
if fenced != raw:
|
||||
try:
|
||||
return json.loads(fenced)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
start = raw.find("{")
|
||||
end = raw.rfind("}")
|
||||
if start != -1 and end != -1 and end > start:
|
||||
return json.loads(raw[start : end + 1])
|
||||
|
||||
raise ValueError("model output is not valid JSON")
|
||||
|
||||
def _call_model_json(self, user_prompt: str, timeout_sec: float) -> dict | None:
|
||||
logger.info(
|
||||
"AI request start model=%s timeout=%.1fs prefer_chat_first=%s prompt_chars=%d",
|
||||
settings.openai_model,
|
||||
timeout_sec,
|
||||
self._prefer_chat_first,
|
||||
len(user_prompt),
|
||||
)
|
||||
methods = ["chat", "responses"] if self._prefer_chat_first else ["responses", "chat"]
|
||||
for method in methods:
|
||||
if method == "responses":
|
||||
try:
|
||||
completion = self._client.responses.create(
|
||||
model=settings.openai_model,
|
||||
input=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
text={"format": {"type": "json_object"}},
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
output_text = completion.output_text or ""
|
||||
logger.info("AI raw output (responses): %s", output_text)
|
||||
return self._parse_response_json(output_text)
|
||||
except Exception as exc:
|
||||
logger.warning("responses API failed: %s", exc)
|
||||
continue
|
||||
|
||||
if method == "chat":
|
||||
try:
|
||||
completion = self._client.chat.completions.create(
|
||||
model=settings.openai_model,
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
response_format={"type": "json_object"},
|
||||
max_tokens=1800,
|
||||
temperature=0.4,
|
||||
extra_body={"enable_thinking": False},
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
msg = completion.choices[0].message.content if completion.choices else ""
|
||||
logger.info("AI raw output (chat.completions): %s", msg or "")
|
||||
return self._parse_response_json(msg or "")
|
||||
except Exception as exc:
|
||||
logger.warning("chat.completions API failed: %s", exc)
|
||||
# DashScope compatibility path: don't spend extra time on responses fallback.
|
||||
if self._prefer_chat_first:
|
||||
return None
|
||||
continue
|
||||
return None
|
||||
|
||||
def _normalize_result(self, data: dict) -> dict:
|
||||
title = str(data.get("title", "")).strip()
|
||||
summary = str(data.get("summary", "")).strip()
|
||||
body = str(data.get("body_markdown", "")).strip()
|
||||
|
||||
if not title:
|
||||
title = "公众号改写稿"
|
||||
if not summary:
|
||||
summary = shorten(re.sub(r"\s+", " ", body), width=110, placeholder="...")
|
||||
|
||||
body = self._ensure_sections(body)
|
||||
body = self._format_markdown(body)
|
||||
|
||||
return {"title": title, "summary": summary, "body_markdown": body}
|
||||
|
||||
def _ensure_sections(self, body: str) -> str:
|
||||
text = (body or "").strip()
|
||||
required = ["## 导语", "## 核心观点", "## 深度分析", "## 落地建议", "## 结语"]
|
||||
missing = [h for h in required if h not in text]
|
||||
if not text:
|
||||
text = "## 导语\n\n内容生成失败,请重试。\n"
|
||||
if missing:
|
||||
# Light touch: append missing sections to keep publish structure stable.
|
||||
pads = "\n\n".join([f"{h}\n\n(待补充)" for h in missing])
|
||||
text = f"{text}\n\n{pads}"
|
||||
return text
|
||||
|
||||
def _quality_issues(self, req: RewriteRequest, source: str, normalized: dict) -> list[str]:
|
||||
issues: list[str] = []
|
||||
title = normalized.get("title", "")
|
||||
summary = normalized.get("summary", "")
|
||||
body = normalized.get("body_markdown", "")
|
||||
|
||||
if len(title) < 8 or len(title) > 34:
|
||||
issues.append("标题长度不理想(建议 8-34 字)")
|
||||
|
||||
if len(summary) < 60:
|
||||
issues.append("摘要过短,信息量不足")
|
||||
|
||||
headings = re.findall(r"(?m)^##\s+.+$", body)
|
||||
if len(headings) < 5:
|
||||
issues.append("二级标题不足,结构不完整")
|
||||
|
||||
paragraphs = [p.strip() for p in body.split("\n\n") if p.strip()]
|
||||
if len(paragraphs) < 10:
|
||||
issues.append("正文段落偏少,展开不充分")
|
||||
|
||||
if len(body) < 900:
|
||||
issues.append("正文过短,无法支撑公众号发布")
|
||||
|
||||
if self._looks_like_raw_copy(source, body):
|
||||
issues.append("改写与原文相似度过高,疑似未充分重写")
|
||||
|
||||
if req.avoid_words:
|
||||
bad_words = [w.strip() for w in re.split(r"[,,]\s*", req.avoid_words) if w.strip()]
|
||||
hit = [w for w in bad_words if w in body or w in summary or w in title]
|
||||
if hit:
|
||||
issues.append(f"命中禁用词: {', '.join(hit)}")
|
||||
|
||||
ai_phrases = ["首先", "其次", "最后", "总而言之", "赋能", "闭环", "颠覆"]
|
||||
hit_ai = [w for w in ai_phrases if body.count(w) >= 3]
|
||||
if hit_ai:
|
||||
issues.append("存在明显 AI 套话堆叠")
|
||||
|
||||
return issues
|
||||
|
||||
def _looks_like_raw_copy(self, source: str, rewritten: str) -> bool:
|
||||
src = re.sub(r"\s+", "", source or "")
|
||||
dst = re.sub(r"\s+", "", rewritten or "")
|
||||
if not src or not dst:
|
||||
return True
|
||||
if dst in src or src in dst:
|
||||
return True
|
||||
ratio = difflib.SequenceMatcher(a=src[:3500], b=dst[:3500]).ratio()
|
||||
return ratio >= 0.80
|
||||
|
||||
def _format_markdown(self, text: str) -> str:
|
||||
body = text.replace("\r\n", "\n").strip()
|
||||
body = re.sub(r"\n{3,}", "\n\n", body)
|
||||
body = re.sub(r"(?m)^(#{1,3}\s[^\n]+)\n(?!\n)", r"\1\n\n", body)
|
||||
return body.strip() + "\n"
|
||||
|
||||
Reference in New Issue
Block a user