fix:优化数据

This commit is contained in:
丹尼尔
2026-03-15 16:38:59 +08:00
parent a609f81a36
commit 3aa1a586e5
43 changed files with 14565 additions and 294 deletions

View File

@@ -1,17 +1,34 @@
import asyncio
import email
import hashlib
import imaplib
import logging
import os
from datetime import datetime
import re
import sqlite3
import ssl
from datetime import date, datetime
from email.header import decode_header
from pathlib import Path
from typing import Any, Dict, List, Tuple
# Register the IMAP "ID" command with imaplib so it can be sent on a connection.
# The tuple lists the connection states in which the command is permitted.
# _sync_one_account sends ID right after login to present itself as a desktop
# mail client (Foxmail), which providers such as NetEase/163 expect.
imaplib.Commands["ID"] = ("NONAUTH", "AUTH", "SELECTED")
from sqlalchemy.orm import Session
from backend.app.db import SessionLocal
from backend.app.models import FinanceRecord
# Base directory for finance data files.
FINANCE_BASE_DIR = Path("data/finance")
# SQLite database holding the attachment_history table used for incremental de-duplication.
SYNC_DB_PATH = FINANCE_BASE_DIR / "sync_history.db"

# Folder names used for classification (invoices, receipts, statements).
INVOICES_DIR, RECEIPTS_DIR, STATEMENTS_DIR = "invoices", "receipts", "statements"
def _decode_header_value(value: str | None) -> str:
@@ -27,17 +44,21 @@ def _decode_header_value(value: str | None) -> str:
return decoded
def _classify_type(subject: str) -> str:
def _classify_type(subject: str, filename: str) -> str:
"""
Classify finance document type based on subject keywords.
Classify finance document type. Returns: invoices, receipts, statements, others.
Maps to folders: invoices/, receipts/, statements/.
"""
subject_lower = subject.lower()
text = f"{subject} {filename}".lower()
# 发票 / 开票类
if any(k in subject for k in ["发票", "开票", "票据", "invoice"]):
if any(k in text for k in ["发票", "开票", "票据", "invoice", "fapiao"]):
return "invoices"
# 回执
if any(k in text for k in ["回执", "签收单", "receipt"]):
return "receipts"
# 银行流水 / 账户明细 / 对公活期等
if any(
k in subject
k in text
for k in [
"流水",
"活期",
@@ -50,9 +71,7 @@ def _classify_type(subject: str) -> str:
"statement",
]
):
return "bank_records"
if any(k in subject for k in ["回执", "receipt"]):
return "receipts"
return "statements"
return "others"
@@ -71,132 +90,474 @@ def _parse_email_date(msg: email.message.Message) -> datetime:
return dt
def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[float | None, str | None]:
    """Run ``extract_invoice_metadata`` from synchronous code on a private event loop.

    PDFs (detected by MIME type or a ``.pdf`` suffix) are first converted to an
    image of their first page; if that conversion yields nothing, the raw PDF
    bytes are passed on unchanged and the extractor may fail on them.

    Args:
        file_path: Path of the saved attachment; only its suffix is inspected.
        mime: MIME type of ``raw_bytes``; may be empty or None.
        raw_bytes: The attachment content.

    Returns:
        Whatever ``extract_invoice_metadata`` resolves to. The annotation and the
        caller in this file treat it as ``(amount, date_string)``.

    Raises:
        Any exception raised by the conversion helper or the extractor.
    """
    from backend.app.services.ai_service import extract_invoice_metadata
    from backend.app.services.invoice_upload import _pdf_first_page_to_image

    is_pdf = "pdf" in (mime or "").lower() or Path(file_path).suffix.lower() == ".pdf"
    if is_pdf:
        img_result = _pdf_first_page_to_image(raw_bytes)
        if img_result:
            raw_bytes, mime = img_result
        # Otherwise keep the raw bytes and try anyway (may fail).

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(extract_invoice_metadata(raw_bytes, mime))
    finally:
        # Detach the loop before closing it; otherwise this thread keeps a
        # *closed* loop as its current event loop and later asyncio calls in
        # the same thread would fail with "Event loop is closed".
        asyncio.set_event_loop(None)
        loop.close()
def _rename_invoice_file(
file_path: str,
amount: float | None,
billing_date: date | None,
) -> Tuple[str, str]:
"""
Rename invoice file to YYYYMMDD_金额_原文件名.
Returns (new_file_name, new_file_path).
"""
path = Path(file_path)
if not path.exists():
return (path.name, file_path)
date_str = (billing_date or date.today()).strftime("%Y%m%d")
amount_str = f"{amount:.2f}" if amount is not None else "0.00"
# Sanitize original name: take stem, limit length
orig_stem = path.stem[: 80] if len(path.stem) > 80 else path.stem
suffix = path.suffix
new_name = f"{date_str}_{amount_str}_{orig_stem}{suffix}"
new_path = path.parent / new_name
counter = 1
while new_path.exists():
new_path = path.parent / f"{date_str}_{amount_str}_{orig_stem}_{counter}{suffix}"
counter += 1
path.rename(new_path)
return (new_path.name, str(new_path))
def _ensure_sync_history_table(conn: sqlite3.Connection) -> None:
    """Create the ``attachment_history`` table on *conn* if it is missing, then commit.

    One row is kept per saved attachment; ``(message_id, file_hash)`` is unique.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS attachment_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            message_id TEXT,
            file_hash TEXT NOT NULL,
            month TEXT,
            doc_type TEXT,
            file_name TEXT,
            file_path TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(message_id, file_hash)
        )
        """
    conn.execute(create_table_sql)
    conn.commit()
def _has_sync_history() -> bool:
    """Report whether a previous sync has recorded at least one attachment.

    Returns False when the history database file does not exist, when the
    ``attachment_history`` table has no rows, or when reading it fails for any
    reason. The caller treats False as "first sync" (fetch everything) and True
    as "incremental" (fetch only UNSEEN messages).
    """
    if not SYNC_DB_PATH.exists():
        return False
    try:
        conn = sqlite3.connect(SYNC_DB_PATH)
        try:
            first_row = conn.execute("SELECT 1 FROM attachment_history LIMIT 1").fetchone()
        finally:
            conn.close()
    except Exception:
        # Best effort: an unreadable history is treated like no history.
        return False
    return first_row is not None
def _save_attachment(
    msg: email.message.Message,
    month_str: str,
    doc_type: str | None = None,
) -> List[Tuple[str, str, str, bytes, str]]:
    """
    Save the PDF/image/xlsx attachments of *msg* and record them for de-duplication.

    Only parts whose Content-Disposition mentions ``attachment``, whose file
    extension is one of .pdf/.jpg/.jpeg/.png/.xlsx and whose main MIME type is
    ``application`` or ``image`` are considered. Each attachment is keyed by
    ``(Message-ID, MD5 of content)`` in the ``attachment_history`` table of
    ``SYNC_DB_PATH``; attachments already present there are skipped.

    Args:
        msg: The parsed e-mail message.
        month_str: Month bucket passed to ``_ensure_month_dir``.
        doc_type: Optional explicit document type. When omitted (the way the
            sync loop in this file calls it), every attachment is classified
            individually from the subject and its file name.

    Returns:
        A list of ``(file_name, file_path, mime, raw_bytes, doc_type)`` for the
        attachments written by this call. ``raw_bytes`` is returned so invoices
        can be passed to OCR without re-reading the file.
    """
    allowed_exts = (".pdf", ".jpg", ".jpeg", ".png", ".xlsx")
    saved: List[Tuple[str, str, str, bytes, str]] = []

    msg_id = msg.get("Message-ID") or ""
    subject = _decode_header_value(msg.get("Subject"))

    SYNC_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(SYNC_DB_PATH)
    try:
        _ensure_sync_history_table(conn)

        for part in msg.walk():
            content_disposition = str(part.get("Content-Disposition", ""))
            if "attachment" not in content_disposition:
                continue

            filename = _decode_header_value(part.get_filename())
            if not filename:
                continue

            # The file name comes from an untrusted e-mail: keep only its last
            # path component so "../x.pdf" or "/abs/x.pdf" cannot escape the
            # target directory.
            safe_name = Path(filename.replace("\\", "/")).name
            if safe_name in ("", ".", ".."):
                continue

            if Path(safe_name).suffix.lower() not in allowed_exts:
                continue
            if part.get_content_maintype() not in ("application", "image"):
                continue

            data = part.get_payload(decode=True)
            if not data:
                continue

            # Incremental de-duplication on (message_id, md5).
            file_hash = hashlib.md5(data).hexdigest()  # nosec - content hash only
            cur = conn.execute(
                "SELECT 1 FROM attachment_history WHERE message_id = ? AND file_hash = ?",
                (msg_id, file_hash),
            )
            if cur.fetchone():
                continue

            # Classification: subject + file name, unless the caller fixed the type.
            part_type = doc_type or _classify_type(subject, filename)
            base_dir = _ensure_month_dir(month_str, part_type)
            mime = part.get_content_type() or "application/octet-stream"

            # Pick a free name; the counter is always appended to the original
            # stem so repeated collisions give a_1, a_2, ... rather than a_1_2.
            stem, suffix = Path(safe_name).stem, Path(safe_name).suffix
            file_path = base_dir / safe_name
            counter = 1
            while file_path.exists():
                file_path = base_dir / f"{stem}_{counter}{suffix}"
                counter += 1

            file_path.write_bytes(data)

            conn.execute(
                """
                INSERT OR IGNORE INTO attachment_history
                (message_id, file_hash, month, doc_type, file_name, file_path)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (msg_id, file_hash, month_str, part_type, file_path.name, str(file_path)),
            )
            saved.append((file_path.name, str(file_path), mime, data, part_type))
    finally:
        # Persist history for whatever was written, and always release the connection.
        try:
            conn.commit()
        finally:
            conn.close()

    return saved
def _decode_imap_utf7(s: str | bytes) -> str:
"""Decode IMAP4 UTF-7 mailbox name (RFC 3501). Returns decoded string."""
if isinstance(s, bytes):
s = s.decode("ascii", errors="replace")
if "&" not in s:
return s
parts = s.split("&")
out = [parts[0]]
for i in range(1, len(parts)):
chunk = parts[i]
if "-" in chunk:
u, rest = chunk.split("-", 1)
if u == "":
out.append("&")
else:
try:
# IMAP UTF-7: &BASE64- where BASE64 is modified (,+ instead of /,=)
pad = (4 - len(u) % 4) % 4
b = (u + "=" * pad).translate(str.maketrans(",+", "/="))
decoded = __import__("base64").b64decode(b).decode("utf-16-be")
out.append(decoded)
except Exception:
out.append("&" + chunk)
out.append(rest)
else:
out.append("&" + chunk)
return "".join(out)
def _parse_list_response(data: List[bytes]) -> List[Tuple[str, str]]:
"""Parse imap.list() response to [(raw_name, decoded_name), ...]. Format: (flags) \"delim\" \"mailbox\"."""
import shlex
result: List[Tuple[str, str]] = []
for line in data:
if not isinstance(line, bytes):
continue
try:
line_str = line.decode("ascii", errors="replace")
except Exception:
continue
try:
parts = shlex.split(line_str)
except ValueError:
continue
if not parts:
continue
# Mailbox name is the last part (RFC 3501 LIST: (attrs) delim name)
raw = parts[-1]
decoded = _decode_imap_utf7(raw)
result.append((raw, decoded))
return result
def _list_mailboxes(imap: imaplib.IMAP4_SSL) -> List[Tuple[str, str]]:
    """Return every mailbox on *imap* as ``[(raw_name, decoded_name), ...]``.

    An empty list is returned when LIST does not answer OK or yields no data.
    """
    status, data = imap.list()
    if status == "OK" and data:
        return _parse_list_response(data)
    return []
def list_mailboxes_for_config(host: str, port: int, user: str, password: str) -> List[Tuple[str, str]]:
    """Connect, log in and list all mailboxes (for a dropdown).

    Args:
        host: IMAP server host name.
        port: IMAP-over-TLS port; coerced with ``int()``.
        user: Login name.
        password: Login password.

    Returns:
        ``[(raw_name, decoded_name), ...]`` as produced by ``_list_mailboxes``.

    Raises:
        ssl.SSLError / OSError: when the TLS connection cannot be established
            or the server certificate does not verify.
        imaplib.IMAP4.error: when the login is rejected.
    """
    # Without an explicit context imaplib does not verify the server
    # certificate, so the password could be sent to an impostor. Use the same
    # verifying default context as the sync path.
    tls_context = ssl.create_default_context()
    with imaplib.IMAP4_SSL(host, int(port), ssl_context=tls_context) as imap:
        imap.login(user, password)
        return _list_mailboxes(imap)
def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
    """
    Select a mailbox on *imap*, trying progressively broader candidates.

    Strategy:
    1. LIST all folders and log every raw line for debugging.
    2. Folders whose LIST line carries the ``\\Inbox`` flag are tried first.
    3. Then the configured name, ``INBOX`` and ``&ZTZO9nux-`` (the modified
       UTF-7 encoding of the Chinese inbox name "收件箱").
    4. As a last resort every remaining listed folder is tried in LIST order.

    Every folder is tried read-write first and read-only second.

    Args:
        imap: A logged-in IMAP connection.
        mailbox: The configured mailbox name; empty/None means ``INBOX``.

    Returns:
        True as soon as one SELECT answers OK, False if none does. Failures of
        individual LIST/SELECT commands are logged, never raised.
    """
    logger = logging.getLogger(__name__)
    name = (mailbox or "INBOX").strip() or "INBOX"
    # Mailbox name at the end of a LIST line: either "quoted" or a bare atom.
    trailing_name = re.compile(r'(?:"([^"]+)"|([^\s"]+))\s*$')

    def _line_and_folder(raw):
        """Return (decoded line, folder name or None) for one LIST entry."""
        if not isinstance(raw, (bytes, str)):
            # Literal/None entries carry no parseable trailing name.
            return str(raw), None
        line = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else raw
        found = trailing_name.search(line)
        return line, (found.group(1) or found.group(2)) if found else None

    def _try_select(folder: str, fallback: bool) -> bool:
        """SELECT *folder* read-write, then read-only; True on the first OK."""
        if fallback:
            result_msg = "IMAP SELECT fallback folder=%r readonly=%s -> %s"
            error_msg = "IMAP SELECT fallback failed for folder=%r readonly=%s: %s"
        else:
            result_msg = "IMAP SELECT candidate=%r readonly=%s -> %s"
            error_msg = "IMAP SELECT failed for candidate=%r readonly=%s: %s"
        for readonly in (False, True):
            try:
                status, _ = imap.select(folder, readonly=readonly)
            except Exception as exc:
                logger.warning(error_msg, folder, readonly, exc)
                continue
            logger.info(result_msg, folder, readonly, status)
            if status == "OK":
                return True
        return False

    # 1) Discovery scan: list all folders and log the raw entries.
    try:
        status, data = imap.list()
        if status != "OK" or not data:
            logger.warning("IMAP LIST returned no data or non-OK status: %s", status)
            data = []
    except Exception as exc:
        logger.error("IMAP LIST failed: %s", exc)
        data = []

    logger.info("IMAP Discovery Scan: listing all folders for mailbox=%s", name)
    for raw in data:
        logger.info("IMAP FOLDER RAW: %r", raw)

    listed = [_line_and_folder(raw) for raw in data]

    # 2) Prefer folders flagged \Inbox.
    # NOTE(review): this ranks a server-flagged inbox above the configured
    # mailbox name; confirm that is intended for non-INBOX configurations.
    inbox_candidates = [folder for line, folder in listed if folder and "\\Inbox" in line]

    # 3) Configured name, INBOX, and the UTF-7 form of "收件箱".
    for candidate in (name, "INBOX", "&ZTZO9nux-"):
        if candidate not in inbox_candidates:
            inbox_candidates.append(candidate)

    logger.info("IMAP Inbox candidate list (ordered): %r", inbox_candidates)

    # 4) Try the candidates in order.
    for candidate in inbox_candidates:
        if _try_select(candidate, fallback=False):
            return True

    # 5) Last resort: every listed folder that has not been tried yet.
    logger.info("IMAP Fallback: trying SELECT on every listed folder...")
    already_tried = set(inbox_candidates)
    for _, folder in listed:
        if not folder or folder in already_tried:
            continue
        already_tried.add(folder)
        if _try_select(folder, fallback=True):
            return True

    logger.error("IMAP: unable to SELECT any inbox-like folder for mailbox=%s", name)
    return False
def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
    """Sync the attachments of one IMAP account into files and FinanceRecord rows.

    Args:
        config: Account settings. Reads ``host``, ``user``, ``password``,
            ``port`` (default 993) and ``mailbox`` (default ``INBOX``). The
            account is silently skipped when host, user or password is missing.
        db: Session the new ``FinanceRecord`` rows are added to and flushed on;
            committing is left to the caller.
        results: Output list; one dict (id, month, type, file_name, file_path)
            is appended per record created.

    Raises:
        RuntimeError: when no mailbox can be selected.
        Exception: connection, login, search and database errors propagate.
    """
    logger = logging.getLogger(__name__)

    host = config.get("host")
    user = config.get("user")
    password = config.get("password")
    # "or" also covers an explicit None/empty port in the stored config.
    port = int(config.get("port") or 993)
    mailbox = (config.get("mailbox") or "INBOX").strip() or "INBOX"

    if not all([host, user, password]):
        return

    # Use a strict TLS context (certificate and host name verification).
    tls_context = ssl.create_default_context()
    with imaplib.IMAP4_SSL(host, port, ssl_context=tls_context) as imap:
        # imaplib's debug level 4 is deliberately NOT enabled here: it echoes
        # every command, including the LOGIN line with the password, and every
        # fetched message body to the process output.
        imap.login(user, password)

        # NetEase / 163 and similar providers silently restrict SELECT for
        # unknown clients, so identify as a common desktop client via ID.
        try:
            id_str = (
                '("name" "Foxmail" '
                '"version" "7.2.25.170" '
                '"vendor" "Tencent" '
                '"os" "Windows" '
                '"os-version" "10.0")'
            )
            logger.info("IMAP sending Foxmail-style ID: %s", id_str)
            # _simple_command sends the command AND waits for its tagged
            # response, returning (typ, data); _command only returns the tag.
            typ, dat = imap._simple_command("ID", id_str)  # type: ignore[attr-defined]
            logger.info("IMAP ID command result: %s %r", typ, dat)
        except Exception as exc:
            # A failed ID must not block the sync; log it for later diagnosis.
            logger.warning("IMAP ID command failed: %s", exc)

        if not _select_mailbox(imap, mailbox):
            raise RuntimeError(
                f"无法选择邮箱「{mailbox}」,请检查该账户的 Mailbox 配置(如 163 使用 INBOX)"
            )

        # First sync (empty history): fetch every message and rely on
        # attachment_history for de-duplication. Otherwise fetch only unread
        # messages to avoid downloading everything again.
        # NOTE(review): the history check is not per account, so an account
        # synced after another one has written history starts in incremental
        # mode - confirm this is acceptable.
        is_first_sync = not _has_sync_history()
        search_criterion = "ALL" if is_first_sync else "UNSEEN"
        logger.info(
            "Finance sync: %s (criterion=%s)",
            "全量" if is_first_sync else "增量",
            search_criterion,
        )
        status, data = imap.search(None, search_criterion)
        if status != "OK" or not data or not data[0]:
            return

        for msg_id in data[0].split():
            status, msg_data = imap.fetch(msg_id, "(RFC822)")
            # A message that vanished meanwhile yields no (envelope, body) tuple.
            if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                continue

            msg = email.message_from_bytes(msg_data[0][1])
            month_str = _parse_email_date(msg).strftime("%Y-%m")

            saved = _save_attachment(msg, month_str)
            for file_name, file_path, mime, raw_bytes, doc_type in saved:
                final_name = file_name
                final_path = file_path
                amount = None
                billing_date = None

                if doc_type == "invoices":
                    date_str = None
                    try:
                        amount, date_str = _run_invoice_ocr_sync(file_path, mime, raw_bytes)
                    except Exception as exc:
                        # The file and its history row already exist; losing the
                        # record because OCR failed would make it unrecoverable,
                        # so fall back to "no amount, today's date".
                        amount = None
                        logger.warning("Invoice OCR failed for %s: %s", file_path, exc)
                    billing_date = date.today()
                    if date_str:
                        try:
                            billing_date = date.fromisoformat(date_str[:10])
                        except (TypeError, ValueError):
                            pass  # keep today's date
                    final_name, final_path = _rename_invoice_file(
                        file_path, amount, billing_date
                    )

                record = FinanceRecord(
                    month=month_str,
                    type=doc_type,
                    file_name=final_name,
                    file_path=final_path,
                    amount=amount,
                    billing_date=billing_date,
                )
                db.add(record)
                db.flush()  # assigns record.id
                results.append({
                    "id": record.id,
                    "month": record.month,
                    "type": record.type,
                    "file_name": record.file_name,
                    "file_path": record.file_path,
                })

            # NOTE(review): this flags every fetched message, including those
            # without any saved attachment - confirm that is intended.
            try:
                imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
            except imaplib.IMAP4.error as exc:
                # E.g. the mailbox could only be selected read-only.
                logger.warning("IMAP STORE failed for message %r: %s", msg_id, exc)
async def sync_finance_emails() -> List[Dict[str, Any]]:
"""
Connect to IMAP, fetch unread finance-related emails, download attachments,
save to filesystem and record FinanceRecord entries.
Sync from all active email configs (data/email_configs.json).
Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
Invoices are renamed to YYYYMMDD_金额_原文件名 using OCR.
"""
def _sync() -> List[Dict[str, Any]]:
host = os.getenv("IMAP_HOST")
user = os.getenv("IMAP_USER")
password = os.getenv("IMAP_PASSWORD")
port = int(os.getenv("IMAP_PORT", "993"))
mailbox = os.getenv("IMAP_MAILBOX", "INBOX")
from backend.app.routers.email_configs import get_email_configs_for_sync
if not all([host, user, password]):
raise RuntimeError("IMAP_HOST, IMAP_USER, IMAP_PASSWORD must be set.")
configs = get_email_configs_for_sync()
if not configs:
raise RuntimeError("未配置邮箱。请在 设置 → 邮箱账户 中添加,或配置 IMAP_* 环境变量。")
results: List[Dict[str, Any]] = []
errors: List[str] = []
db = SessionLocal()
try:
for config in configs:
try:
_sync_one_account(config, db, results)
except Exception as e:
# 不让单个账户的异常中断全部同步,记录错误并继续其他账户。
user = config.get("user", "") or config.get("id", "")
errors.append(f"同步账户 {user} 失败: {e}")
db.commit()
finally:
db.close()
with imaplib.IMAP4_SSL(host, port) as imap:
imap.login(user, password)
imap.select(mailbox)
# Search for UNSEEN emails with finance related keywords in subject.
# Note: IMAP SEARCH is limited; here we search UNSEEN first then filter in Python.
status, data = imap.search(None, "UNSEEN")
if status != "OK":
return results
id_list = data[0].split()
db = SessionLocal()
try:
for msg_id in id_list:
status, msg_data = imap.fetch(msg_id, "(RFC822)")
if status != "OK":
continue
raw_email = msg_data[0][1]
msg = email.message_from_bytes(raw_email)
subject = _decode_header_value(msg.get("Subject"))
doc_type = _classify_type(subject)
# Filter by keywords first
if doc_type == "others":
continue
dt = _parse_email_date(msg)
month_str = dt.strftime("%Y-%m")
saved_files = _save_attachment(msg, month_str, doc_type)
for file_name, file_path in saved_files:
record = FinanceRecord(
month=month_str,
type=doc_type,
file_name=file_name,
file_path=file_path,
)
# NOTE: created_at defaults at DB layer
db.add(record)
db.flush()
results.append(
{
"id": record.id,
"month": record.month,
"type": record.type,
"file_name": record.file_name,
"file_path": record.file_path,
}
)
# Mark email as seen and flagged to avoid re-processing
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
db.commit()
finally:
db.close()
if not results and errors:
# 所有账户都失败了,整体报错,前端可显示详细原因。
raise RuntimeError("; ".join(errors))
return results
@@ -205,7 +566,8 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
async def create_monthly_zip(month_str: str) -> str:
"""
Zip the finance folder for a given month (YYYY-MM) and return the zip path.
Zip the finance folder for a given month (YYYY-MM).
Preserves folder structure (invoices/, receipts/, statements/, manual/) inside the zip.
"""
import zipfile
@@ -227,4 +589,3 @@ async def create_monthly_zip(month_str: str) -> str:
return str(zip_path)
return await asyncio.to_thread(_zip)