fix:优化项目内容
This commit is contained in:
@@ -3,7 +3,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Tuple
|
||||
from typing import Any, Dict, Tuple, List
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
from openai import NotFoundError as OpenAINotFoundError
|
||||
@@ -197,6 +197,65 @@ async def extract_invoice_metadata(image_bytes: bytes, mime: str = "image/jpeg")
|
||||
api_key = (config.get("api_key") or "").strip()
|
||||
if not api_key:
|
||||
return (None, None)
|
||||
|
||||
|
||||
async def extract_finance_tags(
    content_text: str,
    doc_type: str,
    filename: str = "",
) -> Tuple[List[str], Dict[str, Any]]:
    """
    Extract searchable tags and structured metadata from attachment text.

    Sends a single chat-completion request (JSON response format) built from
    ``doc_type``, ``filename`` and at most the first 12000 characters of
    ``content_text``, then normalizes the model's answer.

    Args:
        content_text: Plain text extracted from the attachment; may be empty.
        doc_type: Document category label that is interpolated into the prompt.
        filename: Attachment file name that is interpolated into the prompt.

    Returns:
        ``(tags, meta)`` where ``tags`` holds at most 12 unique, non-empty,
        stripped strings in the order the model returned them, and ``meta`` is
        the ``"meta"`` object of the response (``{}`` when missing or not an
        object). When the response is not valid JSON, returns
        ``([], {"summary": "", "raw": <response text>})``. When the response
        carries no choices, returns ``([], {})``.

    Raises:
        Whatever the configured client raises for the request itself; such
        errors are not handled here.
    """
    config = _load_ai_config()
    client = _client_from_config(config)
    model = config.get("model_name") or "gpt-4o-mini"
    # A present-but-null or malformed temperature must not abort tagging.
    try:
        temperature = float(config.get("temperature", 0.2))
    except (TypeError, ValueError):
        temperature = 0.2

    prompt = (
        "你是一名财务助理。请根据附件的文本内容,为它生成可检索的标签,并抽取关键字段。\n"
        "只返回 JSON,不要任何解释文字。\n"
        "输入信息:\n"
        f"- 类型 doc_type: {doc_type}\n"
        f"- 文件名 filename: {filename}\n"
        "- 附件文本 content_text: (见下)\n\n"
        "返回 JSON 格式:\n"
        "{\n"
        ' "tags": ["标签1","标签2"],\n'
        ' "meta": {\n'
        ' "counterparty": "对方单位/收款方/付款方(如能识别)或 null",\n'
        ' "account": "账户/卡号后四位(如能识别)或 null",\n'
        ' "amount": "金额数字字符串或 null",\n'
        ' "date": "YYYY-MM-DD 或 null",\n'
        ' "summary": "一句话摘要"\n'
        " }\n"
        "}\n\n"
        "content_text:\n"
        # Cap the prompt size; tolerate a None text from best-effort extraction.
        f"{(content_text or '')[:12000]}\n"
    )

    completion = await client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=500,
    )
    # An empty choices list would otherwise surface as an IndexError.
    if not completion.choices:
        return ([], {})
    content = completion.choices[0].message.content or "{}"
    try:
        data: Any = json.loads(content)
    except (TypeError, ValueError):  # json.JSONDecodeError subclasses ValueError
        return ([], {"summary": "", "raw": content})

    raw_tags = data.get("tags") if isinstance(data, dict) else None
    meta = data.get("meta") if isinstance(data, dict) else None
    if not isinstance(raw_tags, list):
        raw_tags = []

    tags: List[str] = []
    seen: set = set()
    for item in raw_tags:
        if item is None:
            # A JSON null must not turn into the literal tag "None".
            continue
        label = str(item).strip()
        if not label or label in seen:
            continue
        seen.add(label)
        tags.append(label)
        if len(tags) == 12:
            break

    if not isinstance(meta, dict):
        meta = {}
    return (tags, meta)
|
||||
try:
|
||||
client = _client_from_config(config)
|
||||
model = config.get("model_name") or "gpt-4o-mini"
|
||||
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
import re
|
||||
import sqlite3
|
||||
import ssl
|
||||
from datetime import date, datetime
|
||||
from datetime import date, datetime, timedelta
|
||||
from email.header import decode_header
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
@@ -109,6 +109,29 @@ def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[
|
||||
loop.close()
|
||||
|
||||
|
||||
def _extract_text_for_tagging(
    file_path: str,
    mime: str,
    raw_bytes: bytes,
    *,
    max_pages: int = 5,
) -> str:
    """
    Extract best-effort plain text from an attachment for tagging.

    Only PDFs are handled: text is read from the first ``max_pages`` pages
    with PyMuPDF. Every other file type yields an empty string, as does any
    failure while reading the PDF.

    Args:
        file_path: Attachment path; only its suffix is inspected.
        mime: MIME type of the attachment; may be empty or None.
        raw_bytes: Attachment content, parsed in memory.
        max_pages: Upper bound on the number of leading pages to read.

    Returns:
        The page texts joined with newlines and stripped, or ``""``.
    """
    is_pdf = Path(file_path).suffix.lower() == ".pdf" or "pdf" in (mime or "").lower()
    if not is_pdf or not raw_bytes:
        return ""

    try:
        import fitz  # PyMuPDF; imported lazily so a missing package only disables tagging text

        doc = fitz.open(stream=raw_bytes, filetype="pdf")
        try:
            page_total = min(max_pages, doc.page_count)
            texts = [doc.load_page(i).get_text("text") or "" for i in range(page_total)]
        finally:
            # Release the document even when a page fails to load.
            doc.close()
        return "\n".join(texts).strip()
    except Exception:
        # Deliberately best-effort: tagging must never break the sync.
        logging.getLogger(__name__).debug(
            "PDF text extraction failed for %s", file_path, exc_info=True
        )
        return ""
|
||||
|
||||
|
||||
def _rename_invoice_file(
|
||||
file_path: str,
|
||||
amount: float | None,
|
||||
@@ -173,6 +196,7 @@ def _has_sync_history() -> bool:
|
||||
def _save_attachment(
|
||||
msg: email.message.Message,
|
||||
month_str: str,
|
||||
allowed_doc_types: set[str] | None = None,
|
||||
) -> List[Tuple[str, str, str, bytes, str]]:
|
||||
"""
|
||||
Save PDF/image attachments.
|
||||
@@ -193,17 +217,20 @@ def _save_attachment(
|
||||
_ensure_sync_history_table(conn)
|
||||
|
||||
for part in msg.walk():
|
||||
content_disposition = part.get("Content-Disposition", "")
|
||||
if "attachment" not in content_disposition:
|
||||
continue
|
||||
# 许多邮件附件会以 inline 或缺失 Content-Disposition 的形式出现,
|
||||
# 只要存在 filename 且扩展名符合,就视为可下载附件。
|
||||
content_disposition = (part.get("Content-Disposition", "") or "").lower()
|
||||
|
||||
filename = part.get_filename()
|
||||
filename = _decode_header_value(filename)
|
||||
if not filename:
|
||||
continue
|
||||
if content_disposition and ("attachment" not in content_disposition and "inline" not in content_disposition):
|
||||
# 明确的非附件 disposition,跳过
|
||||
continue
|
||||
|
||||
ext = Path(filename).suffix.lower()
|
||||
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".xlsx"):
|
||||
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".xlsx", ".xls"):
|
||||
continue
|
||||
|
||||
maintype = part.get_content_maintype()
|
||||
@@ -216,6 +243,8 @@ def _save_attachment(
|
||||
|
||||
# 分类:基于主题 + 文件名
|
||||
doc_type = _classify_type(subject, filename)
|
||||
if allowed_doc_types is not None and doc_type not in allowed_doc_types:
|
||||
continue
|
||||
base_dir = _ensure_month_dir(month_str, doc_type)
|
||||
|
||||
# 增量去重:根据 (message_id, md5) 判断是否已同步过
|
||||
@@ -421,7 +450,56 @@ def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
|
||||
def _imap_date(d: date) -> str:
    """
    Format a date for IMAP SEARCH criteria, e.g. ``16-Mar-2026``.

    The month abbreviation is always English. ``calendar.month_abbr`` is not
    used because it follows the process locale and would produce names the
    IMAP server cannot parse under a non-English locale.

    Args:
        d: The date to format.

    Returns:
        ``DD-Mon-YYYY`` with a zero-padded day and a four-digit year.
    """
    months = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    return f"{d.day:02d}-{months[d.month - 1]}-{d.year:04d}"
|
||||
|
||||
|
||||
def _pick_latest_msg_id(imap: imaplib.IMAP4_SSL, msg_ids: List[bytes]) -> bytes | None:
|
||||
"""从一批 msg_id 中按 INTERNALDATE 选择最新的一封。"""
|
||||
latest_id: bytes | None = None
|
||||
latest_ts: float = -1.0
|
||||
for mid in msg_ids:
|
||||
try:
|
||||
typ, data = imap.fetch(mid, "(INTERNALDATE)")
|
||||
if typ != "OK" or not data or not data[0]:
|
||||
continue
|
||||
# imaplib.Internaldate2tuple expects a bytes response line
|
||||
raw = data[0]
|
||||
if isinstance(raw, tuple):
|
||||
raw = raw[0]
|
||||
if not isinstance(raw, (bytes, bytearray)):
|
||||
raw = str(raw).encode("utf-8", errors="ignore")
|
||||
t = imaplib.Internaldate2tuple(raw)
|
||||
if not t:
|
||||
continue
|
||||
import time
|
||||
ts = time.mktime(t)
|
||||
if ts > latest_ts:
|
||||
latest_ts = ts
|
||||
latest_id = mid
|
||||
except Exception:
|
||||
continue
|
||||
return latest_id
|
||||
|
||||
|
||||
def _sync_one_account(
|
||||
config: Dict[str, Any],
|
||||
db: Session,
|
||||
results: List[Dict[str, Any]],
|
||||
*,
|
||||
mode: str = "incremental",
|
||||
start_date: date | None = None,
|
||||
end_date: date | None = None,
|
||||
doc_types: list[str] | None = None,
|
||||
) -> None:
|
||||
allowed: set[str] | None = None
|
||||
if doc_types:
|
||||
allowed = {d.strip().lower() for d in doc_types if d and d.strip()}
|
||||
allowed = {d for d in allowed if d in ("invoices", "receipts", "statements")}
|
||||
if not allowed:
|
||||
allowed = None
|
||||
host = config.get("host")
|
||||
user = config.get("user")
|
||||
password = config.get("password")
|
||||
@@ -461,20 +539,53 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
|
||||
f"无法选择邮箱「{mailbox}」,请检查该账户的 Mailbox 配置(如 163 使用 INBOX)"
|
||||
)
|
||||
|
||||
# 首次同步(历史库无记录):拉取全部邮件中的附件,由 attachment_history 去重
|
||||
# 已有历史:只拉取未读邮件,避免重复拉取
|
||||
# 支持:
|
||||
# - mode=incremental: 首次全量,否则 UNSEEN
|
||||
# - mode=all: 全量(可加时间范围)
|
||||
# - mode=latest: 仅最新一封(可加时间范围)
|
||||
mode = (mode or "incremental").strip().lower()
|
||||
if mode not in ("incremental", "all", "latest"):
|
||||
mode = "incremental"
|
||||
|
||||
is_first_sync = not _has_sync_history()
|
||||
search_criterion = "ALL" if is_first_sync else "UNSEEN"
|
||||
base_criterion = "ALL"
|
||||
if mode == "incremental":
|
||||
base_criterion = "ALL" if is_first_sync else "UNSEEN"
|
||||
elif mode == "all":
|
||||
base_criterion = "ALL"
|
||||
elif mode == "latest":
|
||||
base_criterion = "ALL"
|
||||
|
||||
criteria: List[str] = [base_criterion]
|
||||
if start_date:
|
||||
criteria += ["SINCE", _imap_date(start_date)]
|
||||
if end_date:
|
||||
# BEFORE is exclusive; add one day to make end_date inclusive
|
||||
criteria += ["BEFORE", _imap_date(end_date + timedelta(days=1))]
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
"Finance sync: %s (criterion=%s)",
|
||||
"全量" if is_first_sync else "增量",
|
||||
search_criterion,
|
||||
"Finance sync: mode=%s criterion=%s range=%s~%s",
|
||||
mode,
|
||||
base_criterion,
|
||||
start_date,
|
||||
end_date,
|
||||
)
|
||||
status, data = imap.search(None, search_criterion)
|
||||
|
||||
status, data = imap.search(None, *criteria)
|
||||
if status != "OK":
|
||||
return
|
||||
|
||||
id_list = data[0].split()
|
||||
id_list: List[bytes] = data[0].split() if data and data[0] else []
|
||||
logging.getLogger(__name__).info(
|
||||
"Finance sync: matched messages=%d (mode=%s)", len(id_list), mode
|
||||
)
|
||||
if not id_list:
|
||||
return
|
||||
|
||||
if mode == "latest":
|
||||
latest = _pick_latest_msg_id(imap, id_list)
|
||||
id_list = [latest] if latest else []
|
||||
|
||||
for msg_id in id_list:
|
||||
status, msg_data = imap.fetch(msg_id, "(RFC822)")
|
||||
if status != "OK":
|
||||
@@ -485,7 +596,7 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
|
||||
dt = _parse_email_date(msg)
|
||||
month_str = dt.strftime("%Y-%m")
|
||||
|
||||
saved = _save_attachment(msg, month_str)
|
||||
saved = _save_attachment(msg, month_str, allowed_doc_types=allowed)
|
||||
for file_name, file_path, mime, raw_bytes, doc_type in saved:
|
||||
final_name = file_name
|
||||
final_path = file_path
|
||||
@@ -510,11 +621,28 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
|
||||
type=doc_type,
|
||||
file_name=final_name,
|
||||
file_path=final_path,
|
||||
tags=None,
|
||||
meta_json=None,
|
||||
amount=amount,
|
||||
billing_date=billing_date,
|
||||
)
|
||||
db.add(record)
|
||||
db.flush()
|
||||
|
||||
# 自动识别打标签(同步后自动跑)
|
||||
try:
|
||||
from backend.app.services.ai_service import extract_finance_tags
|
||||
content_text = _extract_text_for_tagging(final_path, mime, raw_bytes)
|
||||
tags, meta = asyncio.run(extract_finance_tags(content_text, doc_type, final_name)) # type: ignore[arg-type]
|
||||
if tags:
|
||||
record.tags = ",".join(tags)
|
||||
if meta:
|
||||
import json as _json
|
||||
record.meta_json = _json.dumps(meta, ensure_ascii=False)
|
||||
db.flush()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
results.append({
|
||||
"id": record.id,
|
||||
"month": record.month,
|
||||
@@ -526,7 +654,13 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
|
||||
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
|
||||
|
||||
|
||||
async def sync_finance_emails() -> List[Dict[str, Any]]:
|
||||
async def sync_finance_emails(
|
||||
*,
|
||||
mode: str = "incremental",
|
||||
start_date: date | None = None,
|
||||
end_date: date | None = None,
|
||||
doc_types: list[str] | None = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Sync from all active email configs (data/email_configs.json).
|
||||
Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
|
||||
@@ -546,7 +680,15 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
|
||||
try:
|
||||
for config in configs:
|
||||
try:
|
||||
_sync_one_account(config, db, results)
|
||||
_sync_one_account(
|
||||
config,
|
||||
db,
|
||||
results,
|
||||
mode=mode,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
doc_types=doc_types,
|
||||
)
|
||||
except Exception as e:
|
||||
# 不让单个账户的异常中断全部同步,记录错误并继续其他账户。
|
||||
user = config.get("user", "") or config.get("id", "")
|
||||
|
||||
Reference in New Issue
Block a user