fix:优化项目内容

2026-03-18 17:01:10 +08:00
parent da63282a10
commit 27dc89e251
64 changed files with 3421 additions and 4982 deletions
--- a/backend/app/services/ai_service.py
+++ b/backend/app/services/ai_service.py
@@ -3,7 +3,7 @@ import json
 import os
 import re
 from pathlib import Path
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, List

 from openai import AsyncOpenAI
 from openai import NotFoundError as OpenAINotFoundError
@@ -197,6 +197,65 @@ async def extract_invoice_metadata(image_bytes: bytes, mime: str = "image/jpeg")
    api_key = (config.get("api_key") or "").strip()
    if not api_key:
        return (None, None)
+
+
+async def extract_finance_tags(
+    content_text: str,
+    doc_type: str,
+    filename: str = "",
+) -> Tuple[List[str], Dict[str, Any]]:
+    """
+    Extract searchable tags and structured fields (JSON) from attachment text.
+    Returns (tags, meta): at most 12 non-empty string tags and the model's "meta" dict.
+    """
+    config = _load_ai_config()
+    client = _client_from_config(config)
+    model = config.get("model_name") or "gpt-4o-mini"
+    raw_temp = config.get("temperature")  # the key may exist but hold null
+    temperature = 0.2 if raw_temp is None else float(raw_temp)
+    prompt = (
+        "你是一名财务助理。请根据附件的文本内容，为它生成可检索的标签，并抽取关键字段。\n"
+        "只返回 JSON，不要任何解释文字。\n"
+        "输入信息：\n"
+        f"- 类型 doc_type: {doc_type}\n"
+        f"- 文件名 filename: {filename}\n"
+        "- 附件文本 content_text: (见下)\n\n"
+        "返回 JSON 格式：\n"
+        "{\n"
+        '  "tags": ["标签1","标签2"],\n'
+        '  "meta": {\n'
+        '    "counterparty": "对方单位/收款方/付款方（如能识别）或 null",\n'
+        '    "account": "账户/卡号后四位（如能识别）或 null",\n'
+        '    "amount": "金额数字字符串或 null",\n'
+        '    "date": "YYYY-MM-DD 或 null",\n'
+        '    "summary": "一句话摘要"\n'
+        "  }\n"
+        "}\n\n"
+        "content_text:\n"
+        f"{content_text[:12000]}\n"
+    )
+
+    completion = await client.chat.completions.create(
+        model=model,
+        response_format={"type": "json_object"},
+        messages=[{"role": "user", "content": prompt}],
+        temperature=temperature,
+        max_tokens=500,
+    )
+    content = completion.choices[0].message.content or "{}"
+    try:
+        data: Any = json.loads(content)
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return ([], {"summary": "", "raw": content})
+    payload = data if isinstance(data, dict) else {}
+    tags, meta = payload.get("tags"), payload.get("meta")
+    if not isinstance(tags, list):
+        tags = []
+    tags = [s for s in (str(t).strip() for t in tags if t is not None) if s][:12]  # skip nulls: str(None) would become the tag "None"
+    if not isinstance(meta, dict):
+        meta = {}
+    # NOTE(review): this hunk lands between `return (None, None)` and the `try:` of extract_invoice_metadata; the context lines after this return are now unreachable — move the insertion point.
+    return (tags, meta)
    try:
        client = _client_from_config(config)
        model = config.get("model_name") or "gpt-4o-mini"
--- a/backend/app/services/email_service.py
+++ b/backend/app/services/email_service.py
@@ -7,7 +7,7 @@ import os
 import re
 import sqlite3
 import ssl
-from datetime import date, datetime
+from datetime import date, datetime, timedelta
 from email.header import decode_header
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
@@ -109,6 +109,29 @@ def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[
        loop.close()


+def _extract_text_for_tagging(file_path: str, mime: str, raw_bytes: bytes) -> str:
+    """
+    Extract best-effort text from an attachment for tagging.
+    - PDF (by file suffix or MIME type): text of the first 5 pages via PyMuPDF; "" on any failure
+    - Image / XLSX / anything else: not parsed here, returns ""
+    """
+    is_pdf = Path(file_path).suffix.lower() == ".pdf" or "pdf" in (mime or "").lower()
+    if not is_pdf:
+        return ""
+    try:
+        import fitz  # PyMuPDF
+        doc = fitz.open(stream=raw_bytes, filetype="pdf")
+        try:
+            # Only the first 5 pages are read.
+            pages = [doc.load_page(i).get_text("text") or "" for i in range(min(5, doc.page_count))]
+            return "\n".join(pages).strip()
+        finally:
+            doc.close()  # previously skipped (document leaked) when a page failed to load
+    except Exception:
+        logging.getLogger(__name__).debug("Finance sync: PDF text extraction failed", exc_info=True)
+        return ""
+
+
 def _rename_invoice_file(
    file_path: str,
    amount: float | None,
@@ -173,6 +196,7 @@ def _has_sync_history() -> bool:
 def _save_attachment(
    msg: email.message.Message,
    month_str: str,
+    allowed_doc_types: set[str] | None = None,
 ) -> List[Tuple[str, str, str, bytes, str]]:
    """
    Save PDF/image attachments.
@@ -193,17 +217,20 @@ def _save_attachment(
        _ensure_sync_history_table(conn)

        for part in msg.walk():
-            content_disposition = part.get("Content-Disposition", "")
-            if "attachment" not in content_disposition:
-                continue
+            # 许多邮件附件会以 inline 或缺失 Content-Disposition 的形式出现，
+            # 只要存在 filename 且扩展名符合，就视为可下载附件。
+            content_disposition = (part.get("Content-Disposition", "") or "").lower()

            filename = part.get_filename()
            filename = _decode_header_value(filename)
            if not filename:
                continue
+            if content_disposition and ("attachment" not in content_disposition and "inline" not in content_disposition):
+                # 明确的非附件 disposition，跳过
+                continue

            ext = Path(filename).suffix.lower()
-            if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".xlsx"):
+            if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".xlsx", ".xls"):
                continue

            maintype = part.get_content_maintype()
@@ -216,6 +243,8 @@ def _save_attachment(

            # 分类：基于主题 + 文件名
            doc_type = _classify_type(subject, filename)
+            if allowed_doc_types is not None and doc_type not in allowed_doc_types:
+                continue
            base_dir = _ensure_month_dir(month_str, doc_type)

            # 增量去重：根据 (message_id, md5) 判断是否已同步过
@@ -421,7 +450,56 @@ def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
    return False


-def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
+def _imap_date(d: date) -> str:
+    # IMAP SEARCH dates look like 16-Mar-2026; calendar.month_abbr follows the process locale, so use fixed English names.
+    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
+    return f"{d.day:02d}-{months[d.month - 1]}-{d.year:04d}"
+
+
+def _pick_latest_msg_id(imap: imaplib.IMAP4_SSL, msg_ids: List[bytes]) -> bytes | None:
+    """Return the id in msg_ids with the newest INTERNALDATE, or None if no message can be dated."""
+    import time  # hoisted: was re-imported on every loop iteration
+    latest_id: bytes | None = None
+    latest_ts: float | None = None
+    for mid in msg_ids:
+        try:
+            typ, data = imap.fetch(mid, "(INTERNALDATE)")
+            if typ != "OK" or not data or not data[0]:
+                continue
+            # imaplib.Internaldate2tuple expects a bytes response line; unwrap tuple items first
+            raw = data[0]
+            if isinstance(raw, tuple):
+                raw = raw[0]
+            if not isinstance(raw, (bytes, bytearray)):
+                raw = str(raw).encode("utf-8", errors="ignore")
+            t = imaplib.Internaldate2tuple(raw)
+            if not t:
+                continue
+            ts = time.mktime(t)
+            if latest_ts is None or ts > latest_ts:
+                latest_ts = ts
+                latest_id = mid
+        except Exception:
+            logging.getLogger(__name__).debug("Finance sync: cannot date message %r", mid, exc_info=True)
+    return latest_id
+
+
+def _sync_one_account(
+    config: Dict[str, Any],
+    db: Session,
+    results: List[Dict[str, Any]],
+    *,
+    mode: str = "incremental",
+    start_date: date | None = None,
+    end_date: date | None = None,
+    doc_types: list[str] | None = None,
+) -> None:
+    allowed: set[str] | None = None
+    if doc_types:
+        allowed = {d.strip().lower() for d in doc_types if d and d.strip()}
+        allowed = {d for d in allowed if d in ("invoices", "receipts", "statements")}
+        if not allowed:
+            allowed = None
    host = config.get("host")
    user = config.get("user")
    password = config.get("password")
@@ -461,20 +539,53 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
                f"无法选择邮箱「{mailbox}」，请检查该账户的 Mailbox 配置（如 163 使用 INBOX）"
            )

-        # 首次同步（历史库无记录）：拉取全部邮件中的附件，由 attachment_history 去重
-        # 已有历史：只拉取未读邮件，避免重复拉取
+        # 支持：
+        # - mode=incremental: 首次全量，否则 UNSEEN
+        # - mode=all: 全量（可加时间范围）
+        # - mode=latest: 仅最新一封（可加时间范围）
+        mode = (mode or "incremental").strip().lower()
+        if mode not in ("incremental", "all", "latest"):
+            mode = "incremental"
+
        is_first_sync = not _has_sync_history()
-        search_criterion = "ALL" if is_first_sync else "UNSEEN"
+        base_criterion = "ALL"
+        if mode == "incremental":
+            base_criterion = "ALL" if is_first_sync else "UNSEEN"
+        elif mode == "all":
+            base_criterion = "ALL"
+        elif mode == "latest":
+            base_criterion = "ALL"
+
+        criteria: List[str] = [base_criterion]
+        if start_date:
+            criteria += ["SINCE", _imap_date(start_date)]
+        if end_date:
+            # BEFORE is exclusive; add one day to make end_date inclusive
+            criteria += ["BEFORE", _imap_date(end_date + timedelta(days=1))]
+
        logging.getLogger(__name__).info(
-            "Finance sync: %s (criterion=%s)",
-            "全量" if is_first_sync else "增量",
-            search_criterion,
+            "Finance sync: mode=%s criterion=%s range=%s~%s",
+            mode,
+            base_criterion,
+            start_date,
+            end_date,
        )
-        status, data = imap.search(None, search_criterion)
+
+        status, data = imap.search(None, *criteria)
        if status != "OK":
            return

-        id_list = data[0].split()
+        id_list: List[bytes] = data[0].split() if data and data[0] else []
+        logging.getLogger(__name__).info(
+            "Finance sync: matched messages=%d (mode=%s)", len(id_list), mode
+        )
+        if not id_list:
+            return
+
+        if mode == "latest":
+            latest = _pick_latest_msg_id(imap, id_list)
+            id_list = [latest] if latest else []
+
        for msg_id in id_list:
            status, msg_data = imap.fetch(msg_id, "(RFC822)")
            if status != "OK":
@@ -485,7 +596,7 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
            dt = _parse_email_date(msg)
            month_str = dt.strftime("%Y-%m")

-            saved = _save_attachment(msg, month_str)
+            saved = _save_attachment(msg, month_str, allowed_doc_types=allowed)
            for file_name, file_path, mime, raw_bytes, doc_type in saved:
                final_name = file_name
                final_path = file_path
@@ -510,11 +621,28 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
                    type=doc_type,
                    file_name=final_name,
                    file_path=final_path,
+                    tags=None,
+                    meta_json=None,
                    amount=amount,
                    billing_date=billing_date,
                )
                db.add(record)
                db.flush()
+
+                # 自动识别打标签（同步后自动跑）
+                try:
+                    from backend.app.services.ai_service import extract_finance_tags
+                    content_text = _extract_text_for_tagging(final_path, mime, raw_bytes)
+                    tags, meta = asyncio.run(extract_finance_tags(content_text, doc_type, final_name))  # type: ignore[arg-type]
+                    if tags:
+                        record.tags = ",".join(tags)
+                    if meta:
+                        import json as _json
+                        record.meta_json = _json.dumps(meta, ensure_ascii=False)
+                    db.flush()
+                except Exception:
+                    pass
+
                results.append({
                    "id": record.id,
                    "month": record.month,
@@ -526,7 +654,13 @@ def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[st
            imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")


-async def sync_finance_emails() -> List[Dict[str, Any]]:
+async def sync_finance_emails(
+    *,
+    mode: str = "incremental",
+    start_date: date | None = None,
+    end_date: date | None = None,
+    doc_types: list[str] | None = None,
+) -> List[Dict[str, Any]]:
    """
    Sync from all active email configs (data/email_configs.json).
    Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
@@ -546,7 +680,15 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
        try:
            for config in configs:
                try:
-                    _sync_one_account(config, db, results)
+                    _sync_one_account(
+                        config,
+                        db,
+                        results,
+                        mode=mode,
+                        start_date=start_date,
+                        end_date=end_date,
+                        doc_types=doc_types,
+                    )
                except Exception as e:
                    # 不让单个账户的异常中断全部同步，记录错误并继续其他账户。
                    user = config.get("user", "") or config.get("id", "")