fix:优化项目内容

This commit is contained in:
Daniel
2026-03-18 17:01:10 +08:00
parent da63282a10
commit 27dc89e251
64 changed files with 3421 additions and 4982 deletions

View File

@@ -3,7 +3,7 @@ import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Tuple
from typing import Any, Dict, Tuple, List
from openai import AsyncOpenAI
from openai import NotFoundError as OpenAINotFoundError
@@ -197,6 +197,65 @@ async def extract_invoice_metadata(image_bytes: bytes, mime: str = "image/jpeg")
api_key = (config.get("api_key") or "").strip()
if not api_key:
return (None, None)
async def extract_finance_tags(
    content_text: str,
    doc_type: str,
    filename: str = "",
) -> Tuple[List[str], Dict[str, Any]]:
    """
    Ask the configured chat model for search tags and structured metadata.

    Args:
        content_text: Text extracted from the attachment. May be empty; only
            the first 12000 characters are sent to the model.
        doc_type: Document category label that is echoed into the prompt.
        filename: Attachment file name that is echoed into the prompt.

    Returns:
        A ``(tags, meta)`` tuple. ``tags`` holds at most 12 unique, non-empty,
        stripped strings. ``meta`` is the model's ``meta`` object, or ``{}``
        when it is missing or not a JSON object. If the reply is not valid
        JSON the result is ``([], {"summary": "", "raw": <reply text>})``.

    Raises:
        Whatever the client raises for a failed completion request; request
        errors are deliberately not swallowed here.
    """
    config = _load_ai_config()
    client = _client_from_config(config)
    model = config.get("model_name") or "gpt-4o-mini"
    # A null or malformed temperature in the config must not abort tagging.
    try:
        temperature = float(config.get("temperature", 0.2))
    except (TypeError, ValueError):
        temperature = 0.2
    # Tolerate a missing text so the slice below cannot raise.
    text = content_text or ""
    prompt = (
        "你是一名财务助理。请根据附件的文本内容,为它生成可检索的标签,并抽取关键字段。\n"
        "只返回 JSON不要任何解释文字。\n"
        "输入信息:\n"
        f"- 类型 doc_type: {doc_type}\n"
        f"- 文件名 filename: {filename}\n"
        "- 附件文本 content_text: (见下)\n\n"
        "返回 JSON 格式:\n"
        "{\n"
        ' "tags": ["标签1","标签2"],\n'
        ' "meta": {\n'
        ' "counterparty": "对方单位/收款方/付款方(如能识别)或 null",\n'
        ' "account": "账户/卡号后四位(如能识别)或 null",\n'
        ' "amount": "金额数字字符串或 null",\n'
        ' "date": "YYYY-MM-DD 或 null",\n'
        ' "summary": "一句话摘要"\n'
        " }\n"
        "}\n\n"
        "content_text:\n"
        f"{text[:12000]}\n"
    )
    completion = await client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=500,
    )
    # An empty ``choices`` list is treated like an empty JSON object.
    choices = getattr(completion, "choices", None) or []
    content = (choices[0].message.content if choices else None) or "{}"
    try:
        data: Any = json.loads(content)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; keep the raw reply for debugging.
        return ([], {"summary": "", "raw": content})
    if not isinstance(data, dict):
        return ([], {})

    raw_tags = data.get("tags")
    tags: List[str] = []
    seen: set = set()
    if isinstance(raw_tags, list):
        for item in raw_tags:
            # Skip nulls and nested structures instead of turning them into
            # bogus tags such as "None" or "{'a': 1}".
            if item is None or isinstance(item, (dict, list)):
                continue
            tag = str(item).strip()
            if not tag or tag in seen:
                continue
            seen.add(tag)
            tags.append(tag)
            if len(tags) == 12:
                break

    meta = data.get("meta")
    if not isinstance(meta, dict):
        meta = {}
    return (tags, meta)
try:
client = _client_from_config(config)
model = config.get("model_name") or "gpt-4o-mini"

View File

@@ -7,7 +7,7 @@ import os
import re
import sqlite3
import ssl
from datetime import date, datetime
from datetime import date, datetime, timedelta
from email.header import decode_header
from pathlib import Path
from typing import Any, Dict, List, Tuple
@@ -109,6 +109,29 @@ def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[
loop.close()
def _extract_text_for_tagging(file_path: str, mime: str, raw_bytes: bytes) -> str:
"""
Extract best-effort text from PDF/image/xlsx for tagging.
- PDF: extract text via fitz; fallback to first page OCR image (handled elsewhere if needed)
- Image: no local OCR here; return empty and let AI decide (optional)
- XLSX: not parsed currently
"""
p = Path(file_path)
suf = p.suffix.lower()
if suf == ".pdf" or "pdf" in (mime or "").lower():
try:
import fitz # PyMuPDF
doc = fitz.open(stream=raw_bytes, filetype="pdf")
texts: list[str] = []
for i in range(min(5, doc.page_count)):
texts.append(doc.load_page(i).get_text("text") or "")
doc.close()
return "\n".join(texts).strip()
except Exception:
return ""
return ""
def _rename_invoice_file(
file_path: str,
amount: float | None,
@@ -173,6 +196,7 @@ def _has_sync_history() -> bool:
def _save_attachment(
msg: email.message.Message,
month_str: str,
allowed_doc_types: set[str] | None = None,
) -> List[Tuple[str, str, str, bytes, str]]:
"""
Save PDF/image attachments.
@@ -193,17 +217,20 @@ def _save_attachment(
_ensure_sync_history_table(conn)
for part in msg.walk():
content_disposition = part.get("Content-Disposition", "")
if "attachment" not in content_disposition:
continue
# 许多邮件附件会以 inline 或缺失 Content-Disposition 的形式出现,
# 只要存在 filename 且扩展名符合,就视为可下载附件。
content_disposition = (part.get("Content-Disposition", "") or "").lower()
filename = part.get_filename()
filename = _decode_header_value(filename)
if not filename:
continue
if content_disposition and ("attachment" not in content_disposition and "inline" not in content_disposition):
# 明确的非附件 disposition跳过
continue
ext = Path(filename).suffix.lower()
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".xlsx"):
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".xlsx", ".xls"):
continue
maintype = part.get_content_maintype()
@@ -216,6 +243,8 @@ def _save_attachment(
# 分类:基于主题 + 文件名
doc_type = _classify_type(subject, filename)
if allowed_doc_types is not None and doc_type not in allowed_doc_types:
continue
base_dir = _ensure_month_dir(month_str, doc_type)
# 增量去重:根据 (message_id, md5) 判断是否已同步过
@@ -421,7 +450,56 @@ def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
return False
def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
def _imap_date(d: date) -> str:
# IMAP date format: 16-Mar-2026 (English month)
import calendar
return f"{d.day:02d}-{calendar.month_abbr[d.month]}-{d.year}"
def _pick_latest_msg_id(imap: imaplib.IMAP4_SSL, msg_ids: List[bytes]) -> bytes | None:
"""从一批 msg_id 中按 INTERNALDATE 选择最新的一封。"""
latest_id: bytes | None = None
latest_ts: float = -1.0
for mid in msg_ids:
try:
typ, data = imap.fetch(mid, "(INTERNALDATE)")
if typ != "OK" or not data or not data[0]:
continue
# imaplib.Internaldate2tuple expects a bytes response line
raw = data[0]
if isinstance(raw, tuple):
raw = raw[0]
if not isinstance(raw, (bytes, bytearray)):
raw = str(raw).encode("utf-8", errors="ignore")
t = imaplib.Internaldate2tuple(raw)
if not t:
continue
import time
ts = time.mktime(t)
if ts > latest_ts:
latest_ts = ts
latest_id = mid
except Exception:
continue
return latest_id
def _sync_one_account(
config: Dict[str, Any],
db: Session,
results: List[Dict[str, Any]],
*,
mode: str = "incremental",
start_date: date | None = None,
end_date: date | None = None,
doc_types: list[str] | None = None,
) -> None:
allowed: set[str] | None = None
if doc_types:
allowed = {d.strip().lower() for d in doc_types if d and d.strip()}
allowed = {d for d in allowed if d in ("invoices", "receipts", "statements")}
if not allowed:
allowed = None
host = config.get("host")
user = config.get("user")
password = config.get("password")
@@ -461,20 +539,53 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
f"无法选择邮箱「{mailbox}」,请检查该账户的 Mailbox 配置(如 163 使用 INBOX"
)
# 首次同步(历史库无记录):拉取全部邮件中的附件,由 attachment_history 去重
# 已有历史:只拉取未读邮件,避免重复拉取
# 支持:
# - mode=incremental: 首次全量,否则 UNSEEN
# - mode=all: 全量(可加时间范围)
# - mode=latest: 仅最新一封(可加时间范围)
mode = (mode or "incremental").strip().lower()
if mode not in ("incremental", "all", "latest"):
mode = "incremental"
is_first_sync = not _has_sync_history()
search_criterion = "ALL" if is_first_sync else "UNSEEN"
base_criterion = "ALL"
if mode == "incremental":
base_criterion = "ALL" if is_first_sync else "UNSEEN"
elif mode == "all":
base_criterion = "ALL"
elif mode == "latest":
base_criterion = "ALL"
criteria: List[str] = [base_criterion]
if start_date:
criteria += ["SINCE", _imap_date(start_date)]
if end_date:
# BEFORE is exclusive; add one day to make end_date inclusive
criteria += ["BEFORE", _imap_date(end_date + timedelta(days=1))]
logging.getLogger(__name__).info(
"Finance sync: %s (criterion=%s)",
"全量" if is_first_sync else "增量",
search_criterion,
"Finance sync: mode=%s criterion=%s range=%s~%s",
mode,
base_criterion,
start_date,
end_date,
)
status, data = imap.search(None, search_criterion)
status, data = imap.search(None, *criteria)
if status != "OK":
return
id_list = data[0].split()
id_list: List[bytes] = data[0].split() if data and data[0] else []
logging.getLogger(__name__).info(
"Finance sync: matched messages=%d (mode=%s)", len(id_list), mode
)
if not id_list:
return
if mode == "latest":
latest = _pick_latest_msg_id(imap, id_list)
id_list = [latest] if latest else []
for msg_id in id_list:
status, msg_data = imap.fetch(msg_id, "(RFC822)")
if status != "OK":
@@ -485,7 +596,7 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
dt = _parse_email_date(msg)
month_str = dt.strftime("%Y-%m")
saved = _save_attachment(msg, month_str)
saved = _save_attachment(msg, month_str, allowed_doc_types=allowed)
for file_name, file_path, mime, raw_bytes, doc_type in saved:
final_name = file_name
final_path = file_path
@@ -510,11 +621,28 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
type=doc_type,
file_name=final_name,
file_path=final_path,
tags=None,
meta_json=None,
amount=amount,
billing_date=billing_date,
)
db.add(record)
db.flush()
# 自动识别打标签(同步后自动跑)
try:
from backend.app.services.ai_service import extract_finance_tags
content_text = _extract_text_for_tagging(final_path, mime, raw_bytes)
tags, meta = asyncio.run(extract_finance_tags(content_text, doc_type, final_name)) # type: ignore[arg-type]
if tags:
record.tags = ",".join(tags)
if meta:
import json as _json
record.meta_json = _json.dumps(meta, ensure_ascii=False)
db.flush()
except Exception:
pass
results.append({
"id": record.id,
"month": record.month,
@@ -526,7 +654,13 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
async def sync_finance_emails() -> List[Dict[str, Any]]:
async def sync_finance_emails(
*,
mode: str = "incremental",
start_date: date | None = None,
end_date: date | None = None,
doc_types: list[str] | None = None,
) -> List[Dict[str, Any]]:
"""
Sync from all active email configs (data/email_configs.json).
Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
@@ -546,7 +680,15 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
try:
for config in configs:
try:
_sync_one_account(config, db, results)
_sync_one_account(
config,
db,
results,
mode=mode,
start_date=start_date,
end_date=end_date,
doc_types=doc_types,
)
except Exception as e:
# 不让单个账户的异常中断全部同步,记录错误并继续其他账户。
user = config.get("user", "") or config.get("id", "")