AiTool/backend/app/services/email_service.py

import asyncio
import email
import hashlib
import imaplib
import logging
import os
import re
import sqlite3
import ssl
from datetime import date, datetime, timedelta
from email.header import decode_header
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Ensure IMAP ID command is recognised by imaplib so we can spoof a
# desktop mail client (Foxmail/Outlook) for providers like NetEase/163.
imaplib.Commands["ID"] = ("NONAUTH", "AUTH", "SELECTED")

from sqlalchemy.orm import Session

from backend.app.db import SessionLocal
from backend.app.models import FinanceRecord


FINANCE_BASE_DIR = Path("data/finance")
SYNC_DB_PATH = Path("data/finance/sync_history.db")

# Folder names for classification (invoices, receipts, statements)
INVOICES_DIR = "invoices"
RECEIPTS_DIR = "receipts"
STATEMENTS_DIR = "statements"


def _decode_header_value(value: str | None) -> str:
    if not value:
        return ""
    parts = decode_header(value)
    decoded = ""
    for text, enc in parts:
        if isinstance(text, bytes):
            decoded += text.decode(enc or "utf-8", errors="ignore")
        else:
            decoded += text
    return decoded


def _classify_type(subject: str, filename: str) -> str:
    """
    Classify finance document type. Returns: invoices, receipts, statements, others.
    Maps to folders: invoices/, receipts/, statements/.
    """
    text = f"{subject} {filename}".lower()
    # 发票 / 开票类
    if any(k in text for k in ["发票", "开票", "票据", "invoice", "fapiao"]):
        return "invoices"
    # 回执
    if any(k in text for k in ["回执", "签收单", "receipt"]):
        return "receipts"
    # 银行流水 / 账户明细 / 对公活期等
    if any(
        k in text
        for k in [
            "流水",
            "活期",
            "活期明细",
            "对公",
            "明细",
            "回单",
            "bank",
            "对账单",
            "statement",
        ]
    ):
        return "statements"
    return "others"


def _ensure_month_dir(month_str: str, doc_type: str) -> Path:
    month_dir = FINANCE_BASE_DIR / month_str / doc_type
    month_dir.mkdir(parents=True, exist_ok=True)
    return month_dir


def _parse_email_date(msg: email.message.Message) -> datetime:
    date_tuple = email.utils.parsedate_tz(msg.get("Date"))
    if date_tuple:
        dt = datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
    else:
        dt = datetime.utcnow()
    return dt


def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[float | None, str | None]:
    """Run extract_invoice_metadata from a sync context (new event loop). Handles PDF via first page image."""
    from backend.app.services.ai_service import extract_invoice_metadata
    from backend.app.services.invoice_upload import _pdf_first_page_to_image

    if "pdf" in (mime or "").lower() or Path(file_path).suffix.lower() == ".pdf":
        img_result = _pdf_first_page_to_image(raw_bytes)
        if img_result:
            image_bytes, img_mime = img_result
            raw_bytes, mime = image_bytes, img_mime
        # else keep raw_bytes and try anyway (may fail)
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(extract_invoice_metadata(raw_bytes, mime))
    finally:
        loop.close()


def _extract_text_for_tagging(file_path: str, mime: str, raw_bytes: bytes) -> str:
    """
    Extract best-effort text from PDF/image/xlsx for tagging.
    - PDF: extract text via fitz; fallback to first page OCR image (handled elsewhere if needed)
    - Image: no local OCR here; return empty and let AI decide (optional)
    - XLSX: not parsed currently
    """
    p = Path(file_path)
    suf = p.suffix.lower()
    if suf == ".pdf" or "pdf" in (mime or "").lower():
        try:
            import fitz  # PyMuPDF
            doc = fitz.open(stream=raw_bytes, filetype="pdf")
            texts: list[str] = []
            for i in range(min(5, doc.page_count)):
                texts.append(doc.load_page(i).get_text("text") or "")
            doc.close()
            return "\n".join(texts).strip()
        except Exception:
            return ""
    return ""


def _rename_invoice_file(
    file_path: str,
    amount: float | None,
    billing_date: date | None,
) -> Tuple[str, str]:
    """
    Rename invoice file to YYYYMMDD_金额_原文件名.
    Returns (new_file_name, new_file_path).
    """
    path = Path(file_path)
    if not path.exists():
        return (path.name, file_path)
    date_str = (billing_date or date.today()).strftime("%Y%m%d")
    amount_str = f"{amount:.2f}" if amount is not None else "0.00"
    # Sanitize original name: take stem, limit length
    orig_stem = path.stem[: 80] if len(path.stem) > 80 else path.stem
    suffix = path.suffix
    new_name = f"{date_str}_{amount_str}_{orig_stem}{suffix}"
    new_path = path.parent / new_name
    counter = 1
    while new_path.exists():
        new_path = path.parent / f"{date_str}_{amount_str}_{orig_stem}_{counter}{suffix}"
        counter += 1
    path.rename(new_path)
    return (new_path.name, str(new_path))


def _ensure_sync_history_table(conn: sqlite3.Connection) -> None:
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS attachment_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            message_id TEXT,
            file_hash TEXT NOT NULL,
            month TEXT,
            doc_type TEXT,
            file_name TEXT,
            file_path TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(message_id, file_hash)
        )
        """
    )
    conn.commit()


def _has_sync_history() -> bool:
    """是否有过同步记录；无记录视为首次同步，需拉全量；有记录则只拉增量（UNSEEN）。"""
    if not SYNC_DB_PATH.exists():
        return False
    try:
        conn = sqlite3.connect(SYNC_DB_PATH)
        try:
            cur = conn.execute("SELECT 1 FROM attachment_history LIMIT 1")
            return cur.fetchone() is not None
        finally:
            conn.close()
    except Exception:
        return False


def _save_attachment(
    msg: email.message.Message,
    month_str: str,
    allowed_doc_types: set[str] | None = None,
) -> List[Tuple[str, str, str, bytes, str]]:
    """
    Save PDF/image attachments.
    Returns list of (file_name, file_path, mime, raw_bytes, doc_type).
    raw_bytes kept for invoice OCR when doc_type == invoices.

    同时使用 data/finance/sync_history.db 做增量去重：
    - 以 (message_id, MD5(content)) 为唯一键，避免重复保存相同附件。
    """
    saved: List[Tuple[str, str, str, bytes, str]] = []

    msg_id = msg.get("Message-ID") or ""
    subject = _decode_header_value(msg.get("Subject"))

    SYNC_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(SYNC_DB_PATH)
    try:
        _ensure_sync_history_table(conn)

        for part in msg.walk():
            # 许多邮件附件会以 inline 或缺失 Content-Disposition 的形式出现，
            # 只要存在 filename 且扩展名符合，就视为可下载附件。
            content_disposition = (part.get("Content-Disposition", "") or "").lower()

            filename = part.get_filename()
            filename = _decode_header_value(filename)
            if not filename:
                continue
            if content_disposition and ("attachment" not in content_disposition and "inline" not in content_disposition):
                # 明确的非附件 disposition，跳过
                continue

            ext = Path(filename).suffix.lower()
            if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".xlsx", ".xls"):
                continue

            maintype = part.get_content_maintype()
            if maintype not in ("application", "image"):
                continue

            data = part.get_payload(decode=True)
            if not data:
                continue

            # 分类：基于主题 + 文件名
            doc_type = _classify_type(subject, filename)
            if allowed_doc_types is not None and doc_type not in allowed_doc_types:
                continue
            base_dir = _ensure_month_dir(month_str, doc_type)

            # 增量去重：根据 (message_id, md5) 判断是否已同步过
            file_hash = hashlib.md5(data).hexdigest()  # nosec - content hash only
            cur = conn.execute(
                "SELECT 1 FROM attachment_history WHERE message_id = ? AND file_hash = ?",
                (msg_id, file_hash),
            )
            if cur.fetchone():
                continue

            mime = part.get_content_type() or "application/octet-stream"
            file_path = base_dir / filename
            counter = 1
            while file_path.exists():
                stem, suffix = file_path.stem, file_path.suffix
                file_path = base_dir / f"{stem}_{counter}{suffix}"
                counter += 1

            file_path.write_bytes(data)

            conn.execute(
                """
                INSERT OR IGNORE INTO attachment_history
                (message_id, file_hash, month, doc_type, file_name, file_path)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (msg_id, file_hash, month_str, doc_type, file_path.name, str(file_path)),
            )

            saved.append((file_path.name, str(file_path), mime, data, doc_type))

    finally:
        conn.commit()
        conn.close()

    return saved


def _decode_imap_utf7(s: str | bytes) -> str:
    """Decode IMAP4 UTF-7 mailbox name (RFC 3501). Returns decoded string."""
    if isinstance(s, bytes):
        s = s.decode("ascii", errors="replace")
    if "&" not in s:
        return s
    parts = s.split("&")
    out = [parts[0]]
    for i in range(1, len(parts)):
        chunk = parts[i]
        if "-" in chunk:
            u, rest = chunk.split("-", 1)
            if u == "":
                out.append("&")
            else:
                try:
                    # IMAP UTF-7: &BASE64-  where BASE64 is modified (,+ instead of /,=)
                    pad = (4 - len(u) % 4) % 4
                    b = (u + "=" * pad).translate(str.maketrans(",+", "/="))
                    decoded = __import__("base64").b64decode(b).decode("utf-16-be")
                    out.append(decoded)
                except Exception:
                    out.append("&" + chunk)
                out.append(rest)
        else:
            out.append("&" + chunk)
    return "".join(out)


def _parse_list_response(data: List[bytes]) -> List[Tuple[str, str]]:
    """Parse imap.list() response to [(raw_name, decoded_name), ...]. Format: (flags) \"delim\" \"mailbox\"."""
    import shlex
    result: List[Tuple[str, str]] = []
    for line in data:
        if not isinstance(line, bytes):
            continue
        try:
            line_str = line.decode("ascii", errors="replace")
        except Exception:
            continue
        try:
            parts = shlex.split(line_str)
        except ValueError:
            continue
        if not parts:
            continue
        # Mailbox name is the last part (RFC 3501 LIST: (attrs) delim name)
        raw = parts[-1]
        decoded = _decode_imap_utf7(raw)
        result.append((raw, decoded))
    return result


def _list_mailboxes(imap: imaplib.IMAP4_SSL) -> List[Tuple[str, str]]:
    """List all mailboxes. Returns [(raw_name, decoded_name), ...]."""
    status, data = imap.list()
    if status != "OK" or not data:
        return []
    return _parse_list_response(data)


def list_mailboxes_for_config(host: str, port: int, user: str, password: str) -> List[Tuple[str, str]]:
    """Connect and list all mailboxes (for dropdown). Returns [(raw_name, decoded_name), ...]."""
    with imaplib.IMAP4_SSL(host, int(port)) as imap:
        imap.login(user, password)
        return _list_mailboxes(imap)


def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
    """
    Robust mailbox selection with deep discovery scan.

    Strategy:
    1. LIST all folders, log raw lines for debugging.
    2. Look for entry containing '\\Inbox' flag; if found, SELECT that folder.
    3. Try standard candidates: user-configured name / INBOX / common UTF-7 收件箱编码.
    4. As last resort, attempt SELECT on every listed folder and log which succeed/fail.
    """
    logger = logging.getLogger(__name__)

    name = (mailbox or "INBOX").strip() or "INBOX"

    # 1) Discovery scan: list all folders and log raw entries
    try:
        status, data = imap.list()
        if status != "OK" or not data:
            logger.warning("IMAP LIST returned no data or non-OK status: %s", status)
            data = []
    except Exception as exc:
        logger.error("IMAP LIST failed: %s", exc)
        data = []

    logger.info("IMAP Discovery Scan: listing all folders for mailbox=%s", name)
    for raw in data:
        logger.info("IMAP FOLDER RAW: %r", raw)

    # 2) 优先按 \\Inbox 属性查找“真正的收件箱”
    inbox_candidates: list[str] = []
    for raw in data:
        line = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else str(raw)
        if "\\Inbox" not in line:
            continue
        m = re.search(r'"([^"]+)"\s*$', line)
        if not m:
            continue
        folder_name = m.group(1)
        inbox_candidates.append(folder_name)

    # 3) 补充常规候选：配置名 / INBOX / 常见 UTF-7 收件箱编码
    primary_names = [name, "INBOX"]
    utf7_names = ["&XfJT0ZTx-"]
    for nm in primary_names + utf7_names:
        if nm not in inbox_candidates:
            inbox_candidates.append(nm)

    logger.info("IMAP Inbox candidate list (ordered): %r", inbox_candidates)

    # 4) 依次尝试候选收件箱
    for candidate in inbox_candidates:
        for readonly in (False, True):
            try:
                status, _ = imap.select(candidate, readonly=readonly)
                logger.info(
                    "IMAP SELECT candidate=%r readonly=%s -> %s", candidate, readonly, status
                )
                if status == "OK":
                    return True
            except Exception as exc:
                logger.warning(
                    "IMAP SELECT failed for candidate=%r readonly=%s: %s",
                    candidate,
                    readonly,
                    exc,
                )

    # 5) 最后手段：尝试 LIST 返回的每一个文件夹
    logger.info("IMAP Fallback: trying SELECT on every listed folder...")
    for raw in data:
        line = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else str(raw)
        m = re.search(r'"([^"]+)"\s*$', line)
        if not m:
            continue
        folder_name = m.group(1)
        for readonly in (False, True):
            try:
                status, _ = imap.select(folder_name, readonly=readonly)
                logger.info(
                    "IMAP SELECT fallback folder=%r readonly=%s -> %s",
                    folder_name,
                    readonly,
                    status,
                )
                if status == "OK":
                    return True
            except Exception as exc:
                logger.warning(
                    "IMAP SELECT fallback failed for folder=%r readonly=%s: %s",
                    folder_name,
                    readonly,
                    exc,
                )

    logger.error("IMAP: unable to SELECT any inbox-like folder for mailbox=%s", name)
    return False


def _imap_date(d: date) -> str:
    # IMAP date format: 16-Mar-2026 (English month)
    import calendar
    return f"{d.day:02d}-{calendar.month_abbr[d.month]}-{d.year}"


def _pick_latest_msg_id(imap: imaplib.IMAP4_SSL, msg_ids: List[bytes]) -> bytes | None:
    """从一批 msg_id 中按 INTERNALDATE 选择最新的一封。"""
    latest_id: bytes | None = None
    latest_ts: float = -1.0
    for mid in msg_ids:
        try:
            typ, data = imap.fetch(mid, "(INTERNALDATE)")
            if typ != "OK" or not data or not data[0]:
                continue
            # imaplib.Internaldate2tuple expects a bytes response line
            raw = data[0]
            if isinstance(raw, tuple):
                raw = raw[0]
            if not isinstance(raw, (bytes, bytearray)):
                raw = str(raw).encode("utf-8", errors="ignore")
            t = imaplib.Internaldate2tuple(raw)
            if not t:
                continue
            import time
            ts = time.mktime(t)
            if ts > latest_ts:
                latest_ts = ts
                latest_id = mid
        except Exception:
            continue
    return latest_id


def _sync_one_account(
    config: Dict[str, Any],
    db: Session,
    results: List[Dict[str, Any]],
    *,
    mode: str = "incremental",
    start_date: date | None = None,
    end_date: date | None = None,
    doc_types: list[str] | None = None,
) -> None:
    allowed: set[str] | None = None
    if doc_types:
        allowed = {d.strip().lower() for d in doc_types if d and d.strip()}
        allowed = {d for d in allowed if d in ("invoices", "receipts", "statements")}
        if not allowed:
            allowed = None
    host = config.get("host")
    user = config.get("user")
    password = config.get("password")
    port = int(config.get("port", 993))
    mailbox = (config.get("mailbox") or "INBOX").strip() or "INBOX"

    if not all([host, user, password]):
        return

    # Use strict TLS context for modern protocols (TLS 1.2+)
    tls_context = ssl.create_default_context()

    with imaplib.IMAP4_SSL(host, port, ssl_context=tls_context) as imap:
        # Enable low-level IMAP debug output to backend logs to help diagnose
        # handshake / protocol / mailbox selection issues with specific providers.
        imap.debug = 4
        imap.login(user, password)
        # NetEase / 163 等会对未知客户端静默限制 SELECT，这里通过 ID 命令伪装为常见桌面客户端。
        try:
            logger = logging.getLogger(__name__)
            id_str = (
                '("name" "Foxmail" '
                '"version" "7.2.25.170" '
                '"vendor" "Tencent" '
                '"os" "Windows" '
                '"os-version" "10.0")'
            )
            logger.info("IMAP sending Foxmail-style ID: %s", id_str)
            # Use low-level command so it works across Python versions.
            typ, dat = imap._command("ID", id_str)  # type: ignore[attr-defined]
            logger.info("IMAP ID command result: %s %r", typ, dat)
        except Exception as exc:
            # ID 失败不应阻断登录，只记录日志，方便后续排查。
            logging.getLogger(__name__).warning("IMAP ID command failed: %s", exc)
        if not _select_mailbox(imap, mailbox):
            raise RuntimeError(
                f"无法选择邮箱「{mailbox}」，请检查该账户的 Mailbox 配置（如 163 使用 INBOX）"
            )

        # 支持：
        # - mode=incremental: 首次全量，否则 UNSEEN
        # - mode=all: 全量（可加时间范围）
        # - mode=latest: 仅最新一封（可加时间范围）
        mode = (mode or "incremental").strip().lower()
        if mode not in ("incremental", "all", "latest"):
            mode = "incremental"

        is_first_sync = not _has_sync_history()
        base_criterion = "ALL"
        if mode == "incremental":
            base_criterion = "ALL" if is_first_sync else "UNSEEN"
        elif mode == "all":
            base_criterion = "ALL"
        elif mode == "latest":
            base_criterion = "ALL"

        criteria: List[str] = [base_criterion]
        if start_date:
            criteria += ["SINCE", _imap_date(start_date)]
        if end_date:
            # BEFORE is exclusive; add one day to make end_date inclusive
            criteria += ["BEFORE", _imap_date(end_date + timedelta(days=1))]

        logging.getLogger(__name__).info(
            "Finance sync: mode=%s criterion=%s range=%s~%s",
            mode,
            base_criterion,
            start_date,
            end_date,
        )

        status, data = imap.search(None, *criteria)
        if status != "OK":
            return

        id_list: List[bytes] = data[0].split() if data and data[0] else []
        logging.getLogger(__name__).info(
            "Finance sync: matched messages=%d (mode=%s)", len(id_list), mode
        )
        if not id_list:
            return

        if mode == "latest":
            latest = _pick_latest_msg_id(imap, id_list)
            id_list = [latest] if latest else []

        for msg_id in id_list:
            status, msg_data = imap.fetch(msg_id, "(RFC822)")
            if status != "OK":
                continue

            raw_email = msg_data[0][1]
            msg = email.message_from_bytes(raw_email)
            dt = _parse_email_date(msg)
            month_str = dt.strftime("%Y-%m")

            saved = _save_attachment(msg, month_str, allowed_doc_types=allowed)
            for file_name, file_path, mime, raw_bytes, doc_type in saved:
                final_name = file_name
                final_path = file_path
                amount = None
                billing_date = None

                if doc_type == "invoices":
                    amount, date_str = _run_invoice_ocr_sync(file_path, mime, raw_bytes)
                    if date_str:
                        try:
                            billing_date = date.fromisoformat(date_str[:10])
                        except ValueError:
                            billing_date = date.today()
                    else:
                        billing_date = date.today()
                    final_name, final_path = _rename_invoice_file(
                        file_path, amount, billing_date
                    )

                record = FinanceRecord(
                    month=month_str,
                    type=doc_type,
                    file_name=final_name,
                    file_path=final_path,
                    tags=None,
                    meta_json=None,
                    amount=amount,
                    billing_date=billing_date,
                )
                db.add(record)
                db.flush()

                # 自动识别打标签（同步后自动跑）
                try:
                    from backend.app.services.ai_service import extract_finance_tags
                    content_text = _extract_text_for_tagging(final_path, mime, raw_bytes)
                    tags, meta = asyncio.run(extract_finance_tags(content_text, doc_type, final_name))  # type: ignore[arg-type]
                    if tags:
                        record.tags = ",".join(tags)
                    if meta:
                        import json as _json
                        record.meta_json = _json.dumps(meta, ensure_ascii=False)
                    db.flush()
                except Exception:
                    pass

                results.append({
                    "id": record.id,
                    "month": record.month,
                    "type": record.type,
                    "file_name": record.file_name,
                    "file_path": record.file_path,
                })

            imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")


async def sync_finance_emails(
    *,
    mode: str = "incremental",
    start_date: date | None = None,
    end_date: date | None = None,
    doc_types: list[str] | None = None,
) -> List[Dict[str, Any]]:
    """
    Sync from all active email configs (data/email_configs.json).
    Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
    Invoices are renamed to YYYYMMDD_金额_原文件名 using OCR.
    """

    def _sync() -> List[Dict[str, Any]]:
        from backend.app.routers.email_configs import get_email_configs_for_sync

        configs = get_email_configs_for_sync()
        if not configs:
            raise RuntimeError("未配置邮箱。请在 设置 → 邮箱账户 中添加，或配置 IMAP_* 环境变量。")

        results: List[Dict[str, Any]] = []
        errors: List[str] = []
        db = SessionLocal()
        try:
            for config in configs:
                try:
                    _sync_one_account(
                        config,
                        db,
                        results,
                        mode=mode,
                        start_date=start_date,
                        end_date=end_date,
                        doc_types=doc_types,
                    )
                except Exception as e:
                    # 不让单个账户的异常中断全部同步，记录错误并继续其他账户。
                    user = config.get("user", "") or config.get("id", "")
                    errors.append(f"同步账户 {user} 失败: {e}")
            db.commit()
        finally:
            db.close()

        if not results and errors:
            # 所有账户都失败了，整体报错，前端可显示详细原因。
            raise RuntimeError("; ".join(errors))

        return results

    return await asyncio.to_thread(_sync)


async def create_monthly_zip(month_str: str) -> str:
    """
    Zip the finance folder for a given month (YYYY-MM).
    Preserves folder structure (invoices/, receipts/, statements/, manual/) inside the zip.
    """
    import zipfile

    def _zip() -> str:
        month_dir = FINANCE_BASE_DIR / month_str
        if not month_dir.exists():
            raise FileNotFoundError(f"Finance directory for {month_str} not found.")

        FINANCE_BASE_DIR.mkdir(parents=True, exist_ok=True)
        zip_path = FINANCE_BASE_DIR / f"{month_str}.zip"

        with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(month_dir):
                for file in files:
                    full_path = Path(root) / file
                    rel_path = full_path.relative_to(FINANCE_BASE_DIR)
                    zf.write(full_path, arcname=rel_path)

        return str(zip_path)

    return await asyncio.to_thread(_zip)