fix:优化项目内容

This commit is contained in:
Daniel
2026-03-18 17:01:10 +08:00
parent da63282a10
commit 27dc89e251
64 changed files with 3421 additions and 4982 deletions

View File

@@ -3,7 +3,7 @@ import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Tuple
from typing import Any, Dict, Tuple, List
from openai import AsyncOpenAI
from openai import NotFoundError as OpenAINotFoundError
@@ -197,6 +197,65 @@ async def extract_invoice_metadata(image_bytes: bytes, mime: str = "image/jpeg")
api_key = (config.get("api_key") or "").strip()
if not api_key:
return (None, None)
async def extract_finance_tags(
    content_text: str,
    doc_type: str,
    filename: str = "",
) -> Tuple[List[str], Dict[str, Any]]:
    """
    Extract searchable tags and structured fields from attachment text.

    Builds a prompt from the document type, the file name and the first
    12000 characters of ``content_text``, asks the configured chat model
    for a JSON object, and normalizes the reply.

    Args:
        content_text: Text content of the attachment; only the first
            12000 characters are sent to the model.
        doc_type: Document type label, embedded verbatim in the prompt.
        filename: Original file name, embedded verbatim in the prompt.

    Returns:
        A ``(tags, meta)`` tuple. ``tags`` holds at most 12 non-empty,
        stripped strings. ``meta`` is the ``"meta"`` object from the model
        reply, or ``{}`` when it is missing or not an object. When the
        reply is not valid JSON, returns
        ``([], {"summary": "", "raw": <reply text>})``.

    Note:
        Exceptions raised by the config/client helpers or by the chat
        completion request itself are not caught here.
    """
    config = _load_ai_config()
    client = _client_from_config(config)
    model = config.get("model_name") or "gpt-4o-mini"
    # An explicit null "temperature" in the config must not reach float();
    # a configured 0 is a valid value and is kept as-is.
    raw_temperature = config.get("temperature")
    temperature = 0.2 if raw_temperature is None else float(raw_temperature)
    prompt = (
        "你是一名财务助理。请根据附件的文本内容,为它生成可检索的标签,并抽取关键字段。\n"
        "只返回 JSON不要任何解释文字。\n"
        "输入信息:\n"
        f"- 类型 doc_type: {doc_type}\n"
        f"- 文件名 filename: {filename}\n"
        "- 附件文本 content_text: (见下)\n\n"
        "返回 JSON 格式:\n"
        "{\n"
        ' "tags": ["标签1","标签2"],\n'
        ' "meta": {\n'
        ' "counterparty": "对方单位/收款方/付款方(如能识别)或 null",\n'
        ' "account": "账户/卡号后四位(如能识别)或 null",\n'
        ' "amount": "金额数字字符串或 null",\n'
        ' "date": "YYYY-MM-DD 或 null",\n'
        ' "summary": "一句话摘要"\n'
        " }\n"
        "}\n\n"
        "content_text:\n"
        f"{content_text[:12000]}\n"
    )
    completion = await client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=500,
    )
    # Treat an empty choices list like an empty reply instead of raising
    # IndexError.
    choices = completion.choices or []
    content = (choices[0].message.content if choices else None) or "{}"
    try:
        data: Any = json.loads(content)
    except (ValueError, RecursionError):
        # Not parseable as JSON (e.g. truncated by max_tokens): hand the raw
        # text back so the caller can still inspect or store it.
        return ([], {"summary": "", "raw": content})
    if not isinstance(data, dict):
        return ([], {})

    raw_tags = data.get("tags")
    if not isinstance(raw_tags, list):
        raw_tags = []
    # Skip JSON nulls so they do not turn into a literal "None" tag; drop
    # blank entries, then cap the result at 12 tags.
    stripped = (str(tag).strip() for tag in raw_tags if tag is not None)
    tags = [tag for tag in stripped if tag][:12]

    meta = data.get("meta")
    if not isinstance(meta, dict):
        meta = {}
    return (tags, meta)
try:
client = _client_from_config(config)
model = config.get("model_name") or "gpt-4o-mini"