fix:优化项目内容

This commit is contained in:
Daniel
2026-03-18 17:01:10 +08:00
parent da63282a10
commit 27dc89e251
64 changed files with 3421 additions and 4982 deletions

View File

@@ -7,7 +7,7 @@ import os
import re
import sqlite3
import ssl
from datetime import date, datetime
from datetime import date, datetime, timedelta
from email.header import decode_header
from pathlib import Path
from typing import Any, Dict, List, Tuple
@@ -109,6 +109,29 @@ def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[
loop.close()
def _extract_text_for_tagging(file_path: str, mime: str, raw_bytes: bytes) -> str:
"""
Extract best-effort text from PDF/image/xlsx for tagging.
- PDF: extract text via fitz; fallback to first page OCR image (handled elsewhere if needed)
- Image: no local OCR here; return empty and let AI decide (optional)
- XLSX: not parsed currently
"""
p = Path(file_path)
suf = p.suffix.lower()
if suf == ".pdf" or "pdf" in (mime or "").lower():
try:
import fitz # PyMuPDF
doc = fitz.open(stream=raw_bytes, filetype="pdf")
texts: list[str] = []
for i in range(min(5, doc.page_count)):
texts.append(doc.load_page(i).get_text("text") or "")
doc.close()
return "\n".join(texts).strip()
except Exception:
return ""
return ""
def _rename_invoice_file(
file_path: str,
amount: float | None,
@@ -173,6 +196,7 @@ def _has_sync_history() -> bool:
def _save_attachment(
msg: email.message.Message,
month_str: str,
allowed_doc_types: set[str] | None = None,
) -> List[Tuple[str, str, str, bytes, str]]:
"""
Save PDF/image attachments.
@@ -193,17 +217,20 @@ def _save_attachment(
_ensure_sync_history_table(conn)
for part in msg.walk():
content_disposition = part.get("Content-Disposition", "")
if "attachment" not in content_disposition:
continue
# 许多邮件附件会以 inline 或缺失 Content-Disposition 的形式出现,
# 只要存在 filename 且扩展名符合,就视为可下载附件。
content_disposition = (part.get("Content-Disposition", "") or "").lower()
filename = part.get_filename()
filename = _decode_header_value(filename)
if not filename:
continue
if content_disposition and ("attachment" not in content_disposition and "inline" not in content_disposition):
# 明确的非附件 disposition跳过
continue
ext = Path(filename).suffix.lower()
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".xlsx"):
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".xlsx", ".xls"):
continue
maintype = part.get_content_maintype()
@@ -216,6 +243,8 @@ def _save_attachment(
# 分类:基于主题 + 文件名
doc_type = _classify_type(subject, filename)
if allowed_doc_types is not None and doc_type not in allowed_doc_types:
continue
base_dir = _ensure_month_dir(month_str, doc_type)
# 增量去重:根据 (message_id, md5) 判断是否已同步过
@@ -421,7 +450,56 @@ def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
return False
def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
def _imap_date(d: date) -> str:
# IMAP date format: 16-Mar-2026 (English month)
import calendar
return f"{d.day:02d}-{calendar.month_abbr[d.month]}-{d.year}"
def _pick_latest_msg_id(imap: imaplib.IMAP4_SSL, msg_ids: List[bytes]) -> bytes | None:
"""从一批 msg_id 中按 INTERNALDATE 选择最新的一封。"""
latest_id: bytes | None = None
latest_ts: float = -1.0
for mid in msg_ids:
try:
typ, data = imap.fetch(mid, "(INTERNALDATE)")
if typ != "OK" or not data or not data[0]:
continue
# imaplib.Internaldate2tuple expects a bytes response line
raw = data[0]
if isinstance(raw, tuple):
raw = raw[0]
if not isinstance(raw, (bytes, bytearray)):
raw = str(raw).encode("utf-8", errors="ignore")
t = imaplib.Internaldate2tuple(raw)
if not t:
continue
import time
ts = time.mktime(t)
if ts > latest_ts:
latest_ts = ts
latest_id = mid
except Exception:
continue
return latest_id
def _sync_one_account(
config: Dict[str, Any],
db: Session,
results: List[Dict[str, Any]],
*,
mode: str = "incremental",
start_date: date | None = None,
end_date: date | None = None,
doc_types: list[str] | None = None,
) -> None:
allowed: set[str] | None = None
if doc_types:
allowed = {d.strip().lower() for d in doc_types if d and d.strip()}
allowed = {d for d in allowed if d in ("invoices", "receipts", "statements")}
if not allowed:
allowed = None
host = config.get("host")
user = config.get("user")
password = config.get("password")
@@ -461,20 +539,53 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
f"无法选择邮箱「{mailbox}」,请检查该账户的 Mailbox 配置(如 163 使用 INBOX"
)
# 首次同步(历史库无记录):拉取全部邮件中的附件,由 attachment_history 去重
# 已有历史:只拉取未读邮件,避免重复拉取
# 支持:
# - mode=incremental: 首次全量,否则 UNSEEN
# - mode=all: 全量(可加时间范围)
# - mode=latest: 仅最新一封(可加时间范围)
mode = (mode or "incremental").strip().lower()
if mode not in ("incremental", "all", "latest"):
mode = "incremental"
is_first_sync = not _has_sync_history()
search_criterion = "ALL" if is_first_sync else "UNSEEN"
base_criterion = "ALL"
if mode == "incremental":
base_criterion = "ALL" if is_first_sync else "UNSEEN"
elif mode == "all":
base_criterion = "ALL"
elif mode == "latest":
base_criterion = "ALL"
criteria: List[str] = [base_criterion]
if start_date:
criteria += ["SINCE", _imap_date(start_date)]
if end_date:
# BEFORE is exclusive; add one day to make end_date inclusive
criteria += ["BEFORE", _imap_date(end_date + timedelta(days=1))]
logging.getLogger(__name__).info(
"Finance sync: %s (criterion=%s)",
"全量" if is_first_sync else "增量",
search_criterion,
"Finance sync: mode=%s criterion=%s range=%s~%s",
mode,
base_criterion,
start_date,
end_date,
)
status, data = imap.search(None, search_criterion)
status, data = imap.search(None, *criteria)
if status != "OK":
return
id_list = data[0].split()
id_list: List[bytes] = data[0].split() if data and data[0] else []
logging.getLogger(__name__).info(
"Finance sync: matched messages=%d (mode=%s)", len(id_list), mode
)
if not id_list:
return
if mode == "latest":
latest = _pick_latest_msg_id(imap, id_list)
id_list = [latest] if latest else []
for msg_id in id_list:
status, msg_data = imap.fetch(msg_id, "(RFC822)")
if status != "OK":
@@ -485,7 +596,7 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
dt = _parse_email_date(msg)
month_str = dt.strftime("%Y-%m")
saved = _save_attachment(msg, month_str)
saved = _save_attachment(msg, month_str, allowed_doc_types=allowed)
for file_name, file_path, mime, raw_bytes, doc_type in saved:
final_name = file_name
final_path = file_path
@@ -510,11 +621,28 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
type=doc_type,
file_name=final_name,
file_path=final_path,
tags=None,
meta_json=None,
amount=amount,
billing_date=billing_date,
)
db.add(record)
db.flush()
# 自动识别打标签(同步后自动跑)
try:
from backend.app.services.ai_service import extract_finance_tags
content_text = _extract_text_for_tagging(final_path, mime, raw_bytes)
tags, meta = asyncio.run(extract_finance_tags(content_text, doc_type, final_name)) # type: ignore[arg-type]
if tags:
record.tags = ",".join(tags)
if meta:
import json as _json
record.meta_json = _json.dumps(meta, ensure_ascii=False)
db.flush()
except Exception:
pass
results.append({
"id": record.id,
"month": record.month,
@@ -526,7 +654,13 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
async def sync_finance_emails() -> List[Dict[str, Any]]:
async def sync_finance_emails(
*,
mode: str = "incremental",
start_date: date | None = None,
end_date: date | None = None,
doc_types: list[str] | None = None,
) -> List[Dict[str, Any]]:
"""
Sync from all active email configs (data/email_configs.json).
Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
@@ -546,7 +680,15 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
try:
for config in configs:
try:
_sync_one_account(config, db, results)
_sync_one_account(
config,
db,
results,
mode=mode,
start_date=start_date,
end_date=end_date,
doc_types=doc_types,
)
except Exception as e:
# 不让单个账户的异常中断全部同步,记录错误并继续其他账户。
user = config.get("user", "") or config.get("id", "")