fix:优化数据

This commit is contained in:
丹尼尔
2026-03-15 16:38:59 +08:00
parent a609f81a36
commit 3aa1a586e5
43 changed files with 14565 additions and 294 deletions

View File

@@ -1,37 +1,71 @@
import base64
import json
import os
from typing import Any, Dict
import re
from pathlib import Path
from typing import Any, Dict, Tuple
from openai import AsyncOpenAI
from openai import NotFoundError as OpenAINotFoundError
AI_CONFIG_PATH = Path("data/ai_config.json")
AI_CONFIGS_PATH = Path("data/ai_configs.json")
_client: AsyncOpenAI | None = None
def get_ai_client() -> AsyncOpenAI:
def get_active_ai_config() -> Dict[str, Any]:
"""
Create (or reuse) a singleton AsyncOpenAI client.
The client is configured via:
- AI_API_KEY / OPENAI_API_KEY
- AI_BASE_URL (optional, defaults to official OpenAI endpoint)
- AI_MODEL (optional, defaults to gpt-4.1-mini or a similar capable model)
从 data/ai_configs.json 读取当前选用的配置;若无则从旧版 ai_config.json 迁移并返回。
供 router 与内部调用。
"""
global _client
if _client is not None:
return _client
defaults = {
"id": "",
"name": "",
"provider": "OpenAI",
"api_key": "",
"base_url": "",
"model_name": "gpt-4o-mini",
"temperature": 0.2,
"system_prompt_override": "",
}
if AI_CONFIGS_PATH.exists():
try:
data = json.loads(AI_CONFIGS_PATH.read_text(encoding="utf-8"))
configs = data.get("configs") or []
active_id = data.get("active_id") or ""
for c in configs:
if c.get("id") == active_id:
return {**defaults, **c}
if configs:
return {**defaults, **configs[0]}
except Exception:
pass
# 兼容旧版单文件
if AI_CONFIG_PATH.exists():
try:
data = json.loads(AI_CONFIG_PATH.read_text(encoding="utf-8"))
return {**defaults, **data}
except Exception:
pass
if not defaults.get("api_key"):
defaults["api_key"] = os.getenv("AI_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
if not defaults.get("base_url") and os.getenv("AI_BASE_URL"):
defaults["base_url"] = os.getenv("AI_BASE_URL")
if defaults.get("model_name") == "gpt-4o-mini" and os.getenv("AI_MODEL"):
defaults["model_name"] = os.getenv("AI_MODEL")
return defaults
api_key = os.getenv("AI_API_KEY") or os.getenv("OPENAI_API_KEY")
def _load_ai_config() -> Dict[str, Any]:
"""当前生效的 AI 配置(供需求解析、发票识别等使用)。"""
return get_active_ai_config()
def _client_from_config(config: Dict[str, Any]) -> AsyncOpenAI:
api_key = (config.get("api_key") or "").strip()
if not api_key:
raise RuntimeError("AI_API_KEY or OPENAI_API_KEY must be set in environment.")
base_url = os.getenv("AI_BASE_URL") # can point to OpenAI, DeepSeek, Qwen, etc.
_client = AsyncOpenAI(
api_key=api_key,
base_url=base_url or None,
)
return _client
raise RuntimeError("AI API Key 未配置,请在 设置 → AI 模型配置 中填写。")
base_url = (config.get("base_url") or "").strip() or None
return AsyncOpenAI(api_key=api_key, base_url=base_url)
def _build_requirement_prompt(raw_text: str) -> str:
@@ -71,38 +105,139 @@ def _build_requirement_prompt(raw_text: str) -> str:
async def analyze_requirement(raw_text: str) -> Dict[str, Any]:
"""
Call the AI model to analyze customer requirements.
Returns a Python dict matching the JSON structure described
in `_build_requirement_prompt`.
Reads config from data/ai_config.json (and env fallback) on every request.
"""
client = get_ai_client()
model = os.getenv("AI_MODEL", "gpt-4.1-mini")
import logging
logger = logging.getLogger(__name__)
config = _load_ai_config()
client = _client_from_config(config)
model = config.get("model_name") or "gpt-4o-mini"
temperature = float(config.get("temperature", 0.2))
system_override = (config.get("system_prompt_override") or "").strip()
logger.info("AI 需求解析: 调用模型 %s,输入长度 %d 字符", model, len(raw_text))
prompt = _build_requirement_prompt(raw_text)
completion = await client.chat.completions.create(
model=model,
response_format={"type": "json_object"},
messages=[
{
"role": "system",
"content": (
"你是一名严谨的系统架构师,只能输出有效的 JSON不要输出任何解释文字。"
),
},
{
"role": "user",
"content": prompt,
},
],
temperature=0.2,
system_content = (
system_override
if system_override
else "你是一名严谨的系统架构师,只能输出有效的 JSON不要输出任何解释文字。"
)
try:
completion = await client.chat.completions.create(
model=model,
response_format={"type": "json_object"},
messages=[
{"role": "system", "content": system_content},
{"role": "user", "content": prompt},
],
temperature=temperature,
)
except OpenAINotFoundError as e:
raise RuntimeError(
"当前配置的模型不存在或无权访问。请在 设置 → AI 模型配置 中确认「模型名称」与当前提供商一致(如阿里云使用 qwen 系列、OpenAI 使用 gpt-4o-mini 等)。"
) from e
content = completion.choices[0].message.content or "{}"
try:
data: Dict[str, Any] = json.loads(content)
data: Any = json.loads(content)
except json.JSONDecodeError as exc:
logger.error("AI 返回非 JSON片段: %s", (content or "")[:200])
raise RuntimeError(f"AI 返回的内容不是合法 JSON{content}") from exc
# Some models return a list (e.g. modules only); normalize to expected dict shape
if isinstance(data, list):
data = {
"modules": data,
"total_estimated_hours": None,
"total_amount": None,
"notes": None,
}
if not isinstance(data, dict):
data = {}
mods = data.get("modules") or []
logger.info("AI 需求解析完成: 模块数 %d", len(mods) if isinstance(mods, list) else 0)
return data
async def test_connection() -> str:
"""使用当前选用配置测试连接。"""
return await test_connection_with_config(get_active_ai_config())
async def test_connection_with_config(config: Dict[str, Any]) -> str:
"""
使用指定配置发送简单补全以验证 API Key 与 Base URL。
供测试当前配置或指定 config_id 时使用。
"""
client = _client_from_config(config)
model = config.get("model_name") or "gpt-4o-mini"
try:
completion = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Hello"}],
max_tokens=50,
)
except OpenAINotFoundError as e:
raise RuntimeError(
"当前配置的模型不存在或无权访问。请在 设置 → AI 模型配置 中确认「模型名称」(如阿里云使用 qwen 系列)。"
) from e
return (completion.choices[0].message.content or "").strip() or "OK"
async def extract_invoice_metadata(image_bytes: bytes, mime: str = "image/jpeg") -> Tuple[float | None, str | None]:
"""
Use AI vision to extract total amount and invoice date from an image.
Returns (amount, date_yyyy_mm_dd). On any error or unsupported model, returns (None, None).
"""
config = _load_ai_config()
api_key = (config.get("api_key") or "").strip()
if not api_key:
return (None, None)
try:
client = _client_from_config(config)
model = config.get("model_name") or "gpt-4o-mini"
b64 = base64.b64encode(image_bytes).decode("ascii")
data_url = f"data:{mime};base64,{b64}"
prompt = (
"从这张发票/收据图片中识别并提取1) 价税合计/总金额数字不含货币符号2) 开票日期(格式 YYYY-MM-DD"
"只返回 JSON不要其他文字格式{\"amount\": 数字或null, \"date\": \"YYYY-MM-DD\" 或 null}。"
)
completion = await client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": data_url}},
],
}
],
max_tokens=150,
)
content = (completion.choices[0].message.content or "").strip()
if not content:
return (None, None)
# Handle markdown code block
if "```" in content:
content = re.sub(r"^.*?```(?:json)?\s*", "", content).strip()
content = re.sub(r"\s*```.*$", "", content).strip()
data = json.loads(content)
amount_raw = data.get("amount")
date_raw = data.get("date")
amount = None
if amount_raw is not None:
try:
amount = float(amount_raw)
except (TypeError, ValueError):
pass
date_str = None
if isinstance(date_raw, str) and re.match(r"\d{4}-\d{2}-\d{2}", date_raw):
date_str = date_raw[:10]
return (amount, date_str)
except Exception:
return (None, None)

View File

@@ -0,0 +1,315 @@
"""
云文档集成:飞书、语雀、腾讯文档的文档创建/更新。
统一以 Markdown 为中间格式,由各平台 API 写入。
扩展建议:可增加「月度财务明细表」自动导出——每月在飞书/腾讯文档生成表格,
插入当月发票等附件预览链接,供财务查看(需对接财务记录与附件列表)。
"""
from typing import Any, Dict, List, Tuple
import httpx
FEISHU_BASE = "https://open.feishu.cn"
YUQUE_BASE = "https://www.yuque.com/api/v2"
async def get_feishu_tenant_token(app_id: str, app_secret: str) -> str:
"""获取飞书 tenant_access_token。"""
async with httpx.AsyncClient() as client:
r = await client.post(
f"{FEISHU_BASE}/open-apis/auth/v3/tenant_access_token/internal",
json={"app_id": app_id, "app_secret": app_secret},
timeout=10.0,
)
r.raise_for_status()
data = r.json()
if data.get("code") != 0:
raise RuntimeError(data.get("msg", "飞书鉴权失败"))
return data["tenant_access_token"]
def _feishu_text_block_elements(md: str) -> List[Dict[str, Any]]:
"""将 Markdown 转为飞书文本块 elements按行拆成 textRun简单实现"""
elements: List[Dict[str, Any]] = []
for line in md.split("\n"):
line = line.rstrip()
if not line:
elements.append({"type": "textRun", "text_run": {"text": "\n"}})
else:
elements.append({"type": "textRun", "text_run": {"text": line + "\n"}})
if not elements:
elements.append({"type": "textRun", "text_run": {"text": " "}})
return elements
async def feishu_create_doc(
token: str, title: str, body_md: str, folder_token: str = ""
) -> Tuple[str, str]:
"""
创建飞书文档并写入内容。返回 (document_id, url)。
使用 docx/v1创建文档后向根块下添加子块写入 Markdown 文本。
"""
async with httpx.AsyncClient() as client:
headers = {"Authorization": f"Bearer {token}"}
# 1. 创建文档
create_body: Dict[str, Any] = {"title": title[:50] or "未命名文档"}
if folder_token:
create_body["folder_token"] = folder_token
r = await client.post(
f"{FEISHU_BASE}/open-apis/docx/v1/documents",
headers=headers,
json=create_body,
timeout=15.0,
)
r.raise_for_status()
data = r.json()
if data.get("code") != 0:
raise RuntimeError(data.get("msg", "飞书创建文档失败"))
doc = data.get("data", {})
document_id = doc.get("document", {}).get("document_id")
if not document_id:
raise RuntimeError("飞书未返回 document_id")
url = doc.get("document", {}).get("url", "")
# 2. 根块 ID 即 document_id飞书约定
block_id = document_id
# 3. 添加子块(内容)
elements = _feishu_text_block_elements(body_md)
# 单块有长度限制,分批写入多块
chunk_size = 3000
for i in range(0, len(elements), chunk_size):
chunk = elements[i : i + chunk_size]
body_json = {"children": [{"block_type": "text", "text": {"elements": chunk}}], "index": -1}
r3 = await client.post(
f"{FEISHU_BASE}/open-apis/docx/v1/documents/{document_id}/blocks/{block_id}/children",
headers=headers,
json=body_json,
timeout=15.0,
)
r3.raise_for_status()
res = r3.json()
if res.get("code") != 0:
raise RuntimeError(res.get("msg", "飞书写入块失败"))
# 下一批挂在刚创建的块下
new_items = res.get("data", {}).get("children", [])
if new_items:
block_id = new_items[0].get("block_id", block_id)
return document_id, url or f"https://feishu.cn/docx/{document_id}"
async def feishu_update_doc(token: str, document_id: str, body_md: str) -> str:
"""
更新飞书文档内容:获取现有块并批量更新首个文本块,或追加新块。
返回文档 URL。
"""
async with httpx.AsyncClient() as client:
headers = {"Authorization": f"Bearer {token}"}
r = await client.get(
f"{FEISHU_BASE}/open-apis/docx/v1/documents/{document_id}/blocks",
headers=headers,
params={"document_id": document_id},
timeout=10.0,
)
r.raise_for_status()
data = r.json()
if data.get("code") != 0:
raise RuntimeError(data.get("msg", "飞书获取块失败"))
items = data.get("data", {}).get("items", [])
elements = _feishu_text_block_elements(body_md)
if items:
first_id = items[0].get("block_id")
if first_id:
# 批量更新:只更新第一个块的内容
update_body = {
"requests": [
{
"request_type": "blockUpdate",
"block_id": first_id,
"update_text": {"elements": elements},
}
]
}
r2 = await client.patch(
f"{FEISHU_BASE}/open-apis/docx/v1/documents/{document_id}/blocks/batch_update",
headers=headers,
json=update_body,
timeout=15.0,
)
r2.raise_for_status()
if r2.json().get("code") != 0:
# 若 PATCH 不支持该块类型,则追加新块
pass
else:
return f"https://feishu.cn/docx/{document_id}"
# 无块或更新失败:在根下追加子块
block_id = document_id
for i in range(0, len(elements), 3000):
chunk = elements[i : i + 3000]
body_json = {"children": [{"block_type": "text", "text": {"elements": chunk}}], "index": -1}
r3 = await client.post(
f"{FEISHU_BASE}/open-apis/docx/v1/documents/{document_id}/blocks/{block_id}/children",
headers=headers,
json=body_json,
timeout=15.0,
)
r3.raise_for_status()
res = r3.json()
if res.get("data", {}).get("children"):
block_id = res["data"]["children"][0].get("block_id", block_id)
return f"https://feishu.cn/docx/{document_id}"
# --------------- 语雀 ---------------
async def yuque_create_doc(
token: str, repo_id_or_namespace: str, title: str, body_md: str
) -> Tuple[str, str]:
"""
在语雀知识库创建文档。repo_id_or_namespace 可为 repo_id 或 namespace如 user/repo
返回 (doc_id, url)。
"""
async with httpx.AsyncClient() as client:
headers = {
"X-Auth-Token": token,
"Content-Type": "application/json",
"User-Agent": "OpsCore-CloudDoc/1.0",
}
# 若为 namespace 需先解析为 repo_id语雀 API 创建文档用 repo_id
repo_id = repo_id_or_namespace
if "/" in repo_id_or_namespace:
r_repo = await client.get(
f"{YUQUE_BASE}/repos/{repo_id_or_namespace}",
headers=headers,
timeout=10.0,
)
if r_repo.status_code == 200 and r_repo.json().get("data"):
repo_id = str(r_repo.json()["data"]["id"])
r = await client.post(
f"{YUQUE_BASE}/repos/{repo_id}/docs",
headers=headers,
json={
"title": title[:100] or "未命名",
"body": body_md,
"format": "markdown",
},
timeout=15.0,
)
r.raise_for_status()
data = r.json()
doc = data.get("data", {})
doc_id = str(doc.get("id", ""))
url = doc.get("url", "")
if not url and doc.get("slug"):
url = f"https://www.yuque.com/{doc.get('namespace', '').replace('/', '/')}/{doc.get('slug', '')}"
return doc_id, url or ""
async def yuque_update_doc(
token: str, repo_id_or_namespace: str, doc_id: str, title: str, body_md: str
) -> str:
"""更新语雀文档。返回文档 URL。"""
async with httpx.AsyncClient() as client:
headers = {
"X-Auth-Token": token,
"Content-Type": "application/json",
"User-Agent": "OpsCore-CloudDoc/1.0",
}
r = await client.put(
f"{YUQUE_BASE}/repos/{repo_id_or_namespace}/docs/{doc_id}",
headers=headers,
json={
"title": title[:100] or "未命名",
"body": body_md,
"format": "markdown",
},
timeout=15.0,
)
r.raise_for_status()
data = r.json()
doc = data.get("data", {})
return doc.get("url", "") or f"https://www.yuque.com/docs/{doc_id}"
async def yuque_list_docs(token: str, repo_id_or_namespace: str) -> List[Dict[str, Any]]:
"""获取知识库文档列表。"""
async with httpx.AsyncClient() as client:
headers = {
"X-Auth-Token": token,
"User-Agent": "OpsCore-CloudDoc/1.0",
}
r = await client.get(
f"{YUQUE_BASE}/repos/{repo_id_or_namespace}/docs",
headers=headers,
timeout=10.0,
)
r.raise_for_status()
data = r.json()
return data.get("data", [])
# --------------- 腾讯文档(占位) ---------------
async def tencent_create_doc(client_id: str, client_secret: str, title: str, body_md: str) -> Tuple[str, str]:
"""
腾讯文档需 OAuth 用户授权与文件创建 API此处返回占位。
正式接入需在腾讯开放平台创建应用并走 OAuth 流程。
"""
raise RuntimeError(
"腾讯文档 Open API 需在开放平台配置 OAuth 并获取用户授权;当前版本请先用飞书或语雀推送。"
)
# --------------- 统一入口 ---------------
class CloudDocManager:
"""统一封装:读取配置并执行创建/更新,支持增量(有 cloud_doc_id 则更新)。"""
def __init__(self, credentials: Dict[str, Dict[str, str]]):
self.credentials = credentials
async def push_markdown(
self,
platform: str,
title: str,
body_md: str,
existing_doc_id: str | None = None,
extra: Dict[str, str] | None = None,
) -> Tuple[str, str]:
"""
将 Markdown 推送到指定平台。若 existing_doc_id 存在则更新,否则创建。
返回 (cloud_doc_id, url)。
extra: 平台相关参数,如 yuque 的 default_repo。
"""
extra = extra or {}
if platform == "feishu":
cred = self.credentials.get("feishu") or {}
app_id = (cred.get("app_id") or "").strip()
app_secret = (cred.get("app_secret") or "").strip()
if not app_id or not app_secret:
raise RuntimeError("请先在设置中配置飞书 App ID 与 App Secret")
token = await get_feishu_tenant_token(app_id, app_secret)
if existing_doc_id:
url = await feishu_update_doc(token, existing_doc_id, body_md)
return existing_doc_id, url
return await feishu_create_doc(token, title, body_md)
if platform == "yuque":
cred = self.credentials.get("yuque") or {}
token = (cred.get("token") or "").strip()
default_repo = (cred.get("default_repo") or extra.get("repo") or "").strip()
if not token:
raise RuntimeError("请先在设置中配置语雀 Personal Access Token")
if not default_repo:
raise RuntimeError("请先在设置中配置语雀默认知识库namespace如 user/repo")
if existing_doc_id:
url = await yuque_update_doc(token, default_repo, existing_doc_id, title, body_md)
return existing_doc_id, url
return await yuque_create_doc(token, default_repo, title, body_md)
if platform == "tencent":
await tencent_create_doc("", "", title, body_md)
return "", ""
raise RuntimeError(f"不支持的平台: {platform}")

View File

@@ -44,7 +44,12 @@ async def generate_quote_excel(
# Assume the first worksheet is used for the quote.
ws = wb.active
modules: List[Dict[str, Any]] = project_data.get("modules", [])
raw_modules: List[Any] = project_data.get("modules", [])
# Normalize: only dicts have .get(); coerce others to a minimal dict
modules: List[Dict[str, Any]] = [
m if isinstance(m, dict) else {"name": str(m) or f"模块 {i}"}
for i, m in enumerate(raw_modules, start=1)
]
total_amount = project_data.get("total_amount")
total_hours = project_data.get("total_estimated_hours")
notes = project_data.get("notes")
@@ -157,7 +162,11 @@ async def generate_quote_pdf_from_data(
c.setFont("Helvetica", 10)
modules: List[Dict[str, Any]] = project_data.get("modules", [])
raw_modules: List[Any] = project_data.get("modules", [])
modules = [
m if isinstance(m, dict) else {"name": str(m) or f"模块 {i}"}
for i, m in enumerate(raw_modules, start=1)
]
for idx, module in enumerate(modules, start=1):
name = module.get("name", "")
hours = module.get("estimated_hours", "")

View File

@@ -1,17 +1,34 @@
import asyncio
import email
import hashlib
import imaplib
import logging
import os
from datetime import datetime
import re
import sqlite3
import ssl
from datetime import date, datetime
from email.header import decode_header
from pathlib import Path
from typing import Any, Dict, List, Tuple
# Ensure IMAP ID command is recognised by imaplib so we can spoof a
# desktop mail client (Foxmail/Outlook) for providers like NetEase/163.
imaplib.Commands["ID"] = ("NONAUTH", "AUTH", "SELECTED")
from sqlalchemy.orm import Session
from backend.app.db import SessionLocal
from backend.app.models import FinanceRecord
FINANCE_BASE_DIR = Path("data/finance")
SYNC_DB_PATH = Path("data/finance/sync_history.db")
# Folder names for classification (invoices, receipts, statements)
INVOICES_DIR = "invoices"
RECEIPTS_DIR = "receipts"
STATEMENTS_DIR = "statements"
def _decode_header_value(value: str | None) -> str:
@@ -27,17 +44,21 @@ def _decode_header_value(value: str | None) -> str:
return decoded
def _classify_type(subject: str) -> str:
def _classify_type(subject: str, filename: str) -> str:
"""
Classify finance document type based on subject keywords.
Classify finance document type. Returns: invoices, receipts, statements, others.
Maps to folders: invoices/, receipts/, statements/.
"""
subject_lower = subject.lower()
text = f"{subject} {filename}".lower()
# 发票 / 开票类
if any(k in subject for k in ["发票", "开票", "票据", "invoice"]):
if any(k in text for k in ["发票", "开票", "票据", "invoice", "fapiao"]):
return "invoices"
# 回执
if any(k in text for k in ["回执", "签收单", "receipt"]):
return "receipts"
# 银行流水 / 账户明细 / 对公活期等
if any(
k in subject
k in text
for k in [
"流水",
"活期",
@@ -50,9 +71,7 @@ def _classify_type(subject: str) -> str:
"statement",
]
):
return "bank_records"
if any(k in subject for k in ["回执", "receipt"]):
return "receipts"
return "statements"
return "others"
@@ -71,132 +90,474 @@ def _parse_email_date(msg: email.message.Message) -> datetime:
return dt
def _run_invoice_ocr_sync(file_path: str, mime: str, raw_bytes: bytes) -> Tuple[float | None, str | None]:
"""Run extract_invoice_metadata from a sync context (new event loop). Handles PDF via first page image."""
from backend.app.services.ai_service import extract_invoice_metadata
from backend.app.services.invoice_upload import _pdf_first_page_to_image
if "pdf" in (mime or "").lower() or Path(file_path).suffix.lower() == ".pdf":
img_result = _pdf_first_page_to_image(raw_bytes)
if img_result:
image_bytes, img_mime = img_result
raw_bytes, mime = image_bytes, img_mime
# else keep raw_bytes and try anyway (may fail)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
return loop.run_until_complete(extract_invoice_metadata(raw_bytes, mime))
finally:
loop.close()
def _rename_invoice_file(
file_path: str,
amount: float | None,
billing_date: date | None,
) -> Tuple[str, str]:
"""
Rename invoice file to YYYYMMDD_金额_原文件名.
Returns (new_file_name, new_file_path).
"""
path = Path(file_path)
if not path.exists():
return (path.name, file_path)
date_str = (billing_date or date.today()).strftime("%Y%m%d")
amount_str = f"{amount:.2f}" if amount is not None else "0.00"
# Sanitize original name: take stem, limit length
orig_stem = path.stem[: 80] if len(path.stem) > 80 else path.stem
suffix = path.suffix
new_name = f"{date_str}_{amount_str}_{orig_stem}{suffix}"
new_path = path.parent / new_name
counter = 1
while new_path.exists():
new_path = path.parent / f"{date_str}_{amount_str}_{orig_stem}_{counter}{suffix}"
counter += 1
path.rename(new_path)
return (new_path.name, str(new_path))
def _ensure_sync_history_table(conn: sqlite3.Connection) -> None:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS attachment_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id TEXT,
file_hash TEXT NOT NULL,
month TEXT,
doc_type TEXT,
file_name TEXT,
file_path TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(message_id, file_hash)
)
"""
)
conn.commit()
def _has_sync_history() -> bool:
"""是否有过同步记录无记录视为首次同步需拉全量有记录则只拉增量UNSEEN"""
if not SYNC_DB_PATH.exists():
return False
try:
conn = sqlite3.connect(SYNC_DB_PATH)
try:
cur = conn.execute("SELECT 1 FROM attachment_history LIMIT 1")
return cur.fetchone() is not None
finally:
conn.close()
except Exception:
return False
def _save_attachment(
msg: email.message.Message,
month_str: str,
doc_type: str,
) -> List[Tuple[str, str]]:
) -> List[Tuple[str, str, str, bytes, str]]:
"""
Save PDF/image attachments and return list of (file_name, file_path).
Save PDF/image attachments.
Returns list of (file_name, file_path, mime, raw_bytes, doc_type).
raw_bytes kept for invoice OCR when doc_type == invoices.
同时使用 data/finance/sync_history.db 做增量去重:
- 以 (message_id, MD5(content)) 为唯一键,避免重复保存相同附件。
"""
saved: List[Tuple[str, str]] = []
base_dir = _ensure_month_dir(month_str, doc_type)
saved: List[Tuple[str, str, str, bytes, str]] = []
for part in msg.walk():
content_disposition = part.get("Content-Disposition", "")
if "attachment" not in content_disposition:
continue
msg_id = msg.get("Message-ID") or ""
subject = _decode_header_value(msg.get("Subject"))
filename = part.get_filename()
filename = _decode_header_value(filename)
if not filename:
continue
SYNC_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(SYNC_DB_PATH)
try:
_ensure_sync_history_table(conn)
content_type = part.get_content_type()
maintype = part.get_content_maintype()
for part in msg.walk():
content_disposition = part.get("Content-Disposition", "")
if "attachment" not in content_disposition:
continue
# Accept pdf and common images
if maintype not in ("application", "image"):
continue
filename = part.get_filename()
filename = _decode_header_value(filename)
if not filename:
continue
data = part.get_payload(decode=True)
if not data:
continue
ext = Path(filename).suffix.lower()
if ext not in (".pdf", ".jpg", ".jpeg", ".png", ".xlsx"):
continue
file_path = base_dir / filename
# Ensure unique filename
counter = 1
while file_path.exists():
stem = file_path.stem
suffix = file_path.suffix
file_path = base_dir / f"{stem}_{counter}{suffix}"
counter += 1
maintype = part.get_content_maintype()
if maintype not in ("application", "image"):
continue
with open(file_path, "wb") as f:
f.write(data)
data = part.get_payload(decode=True)
if not data:
continue
saved.append((filename, str(file_path)))
# 分类:基于主题 + 文件名
doc_type = _classify_type(subject, filename)
base_dir = _ensure_month_dir(month_str, doc_type)
# 增量去重:根据 (message_id, md5) 判断是否已同步过
file_hash = hashlib.md5(data).hexdigest() # nosec - content hash only
cur = conn.execute(
"SELECT 1 FROM attachment_history WHERE message_id = ? AND file_hash = ?",
(msg_id, file_hash),
)
if cur.fetchone():
continue
mime = part.get_content_type() or "application/octet-stream"
file_path = base_dir / filename
counter = 1
while file_path.exists():
stem, suffix = file_path.stem, file_path.suffix
file_path = base_dir / f"{stem}_{counter}{suffix}"
counter += 1
file_path.write_bytes(data)
conn.execute(
"""
INSERT OR IGNORE INTO attachment_history
(message_id, file_hash, month, doc_type, file_name, file_path)
VALUES (?, ?, ?, ?, ?, ?)
""",
(msg_id, file_hash, month_str, doc_type, file_path.name, str(file_path)),
)
saved.append((file_path.name, str(file_path), mime, data, doc_type))
finally:
conn.commit()
conn.close()
return saved
def _decode_imap_utf7(s: str | bytes) -> str:
"""Decode IMAP4 UTF-7 mailbox name (RFC 3501). Returns decoded string."""
if isinstance(s, bytes):
s = s.decode("ascii", errors="replace")
if "&" not in s:
return s
parts = s.split("&")
out = [parts[0]]
for i in range(1, len(parts)):
chunk = parts[i]
if "-" in chunk:
u, rest = chunk.split("-", 1)
if u == "":
out.append("&")
else:
try:
# IMAP UTF-7: &BASE64- where BASE64 is modified (,+ instead of /,=)
pad = (4 - len(u) % 4) % 4
b = (u + "=" * pad).translate(str.maketrans(",+", "/="))
decoded = __import__("base64").b64decode(b).decode("utf-16-be")
out.append(decoded)
except Exception:
out.append("&" + chunk)
out.append(rest)
else:
out.append("&" + chunk)
return "".join(out)
def _parse_list_response(data: List[bytes]) -> List[Tuple[str, str]]:
"""Parse imap.list() response to [(raw_name, decoded_name), ...]. Format: (flags) \"delim\" \"mailbox\"."""
import shlex
result: List[Tuple[str, str]] = []
for line in data:
if not isinstance(line, bytes):
continue
try:
line_str = line.decode("ascii", errors="replace")
except Exception:
continue
try:
parts = shlex.split(line_str)
except ValueError:
continue
if not parts:
continue
# Mailbox name is the last part (RFC 3501 LIST: (attrs) delim name)
raw = parts[-1]
decoded = _decode_imap_utf7(raw)
result.append((raw, decoded))
return result
def _list_mailboxes(imap: imaplib.IMAP4_SSL) -> List[Tuple[str, str]]:
"""List all mailboxes. Returns [(raw_name, decoded_name), ...]."""
status, data = imap.list()
if status != "OK" or not data:
return []
return _parse_list_response(data)
def list_mailboxes_for_config(host: str, port: int, user: str, password: str) -> List[Tuple[str, str]]:
"""Connect and list all mailboxes (for dropdown). Returns [(raw_name, decoded_name), ...]."""
with imaplib.IMAP4_SSL(host, int(port)) as imap:
imap.login(user, password)
return _list_mailboxes(imap)
def _select_mailbox(imap: imaplib.IMAP4_SSL, mailbox: str) -> bool:
"""
Robust mailbox selection with deep discovery scan.
Strategy:
1. LIST all folders, log raw lines for debugging.
2. Look for entry containing '\\Inbox' flag; if found, SELECT that folder.
3. Try standard candidates: user-configured name / INBOX / common UTF-7 收件箱编码.
4. As last resort, attempt SELECT on every listed folder and log which succeed/fail.
"""
logger = logging.getLogger(__name__)
name = (mailbox or "INBOX").strip() or "INBOX"
# 1) Discovery scan: list all folders and log raw entries
try:
status, data = imap.list()
if status != "OK" or not data:
logger.warning("IMAP LIST returned no data or non-OK status: %s", status)
data = []
except Exception as exc:
logger.error("IMAP LIST failed: %s", exc)
data = []
logger.info("IMAP Discovery Scan: listing all folders for mailbox=%s", name)
for raw in data:
logger.info("IMAP FOLDER RAW: %r", raw)
# 2) 优先按 \\Inbox 属性查找“真正的收件箱”
inbox_candidates: list[str] = []
for raw in data:
line = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else str(raw)
if "\\Inbox" not in line:
continue
m = re.search(r'"([^"]+)"\s*$', line)
if not m:
continue
folder_name = m.group(1)
inbox_candidates.append(folder_name)
# 3) 补充常规候选:配置名 / INBOX / 常见 UTF-7 收件箱编码
primary_names = [name, "INBOX"]
utf7_names = ["&XfJT0ZTx-"]
for nm in primary_names + utf7_names:
if nm not in inbox_candidates:
inbox_candidates.append(nm)
logger.info("IMAP Inbox candidate list (ordered): %r", inbox_candidates)
# 4) 依次尝试候选收件箱
for candidate in inbox_candidates:
for readonly in (False, True):
try:
status, _ = imap.select(candidate, readonly=readonly)
logger.info(
"IMAP SELECT candidate=%r readonly=%s -> %s", candidate, readonly, status
)
if status == "OK":
return True
except Exception as exc:
logger.warning(
"IMAP SELECT failed for candidate=%r readonly=%s: %s",
candidate,
readonly,
exc,
)
# 5) 最后手段:尝试 LIST 返回的每一个文件夹
logger.info("IMAP Fallback: trying SELECT on every listed folder...")
for raw in data:
line = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else str(raw)
m = re.search(r'"([^"]+)"\s*$', line)
if not m:
continue
folder_name = m.group(1)
for readonly in (False, True):
try:
status, _ = imap.select(folder_name, readonly=readonly)
logger.info(
"IMAP SELECT fallback folder=%r readonly=%s -> %s",
folder_name,
readonly,
status,
)
if status == "OK":
return True
except Exception as exc:
logger.warning(
"IMAP SELECT fallback failed for folder=%r readonly=%s: %s",
folder_name,
readonly,
exc,
)
logger.error("IMAP: unable to SELECT any inbox-like folder for mailbox=%s", name)
return False
def _sync_one_account(config: Dict[str, Any], db: Session, results: List[Dict[str, Any]]) -> None:
host = config.get("host")
user = config.get("user")
password = config.get("password")
port = int(config.get("port", 993))
mailbox = (config.get("mailbox") or "INBOX").strip() or "INBOX"
if not all([host, user, password]):
return
# Use strict TLS context for modern protocols (TLS 1.2+)
tls_context = ssl.create_default_context()
with imaplib.IMAP4_SSL(host, port, ssl_context=tls_context) as imap:
# Enable low-level IMAP debug output to backend logs to help diagnose
# handshake / protocol / mailbox selection issues with specific providers.
imap.debug = 4
imap.login(user, password)
# NetEase / 163 等会对未知客户端静默限制 SELECT这里通过 ID 命令伪装为常见桌面客户端。
try:
logger = logging.getLogger(__name__)
id_str = (
'("name" "Foxmail" '
'"version" "7.2.25.170" '
'"vendor" "Tencent" '
'"os" "Windows" '
'"os-version" "10.0")'
)
logger.info("IMAP sending Foxmail-style ID: %s", id_str)
# Use low-level command so it works across Python versions.
typ, dat = imap._command("ID", id_str) # type: ignore[attr-defined]
logger.info("IMAP ID command result: %s %r", typ, dat)
except Exception as exc:
# ID 失败不应阻断登录,只记录日志,方便后续排查。
logging.getLogger(__name__).warning("IMAP ID command failed: %s", exc)
if not _select_mailbox(imap, mailbox):
raise RuntimeError(
f"无法选择邮箱「{mailbox}」,请检查该账户的 Mailbox 配置(如 163 使用 INBOX"
)
# 首次同步(历史库无记录):拉取全部邮件中的附件,由 attachment_history 去重
# 已有历史:只拉取未读邮件,避免重复拉取
is_first_sync = not _has_sync_history()
search_criterion = "ALL" if is_first_sync else "UNSEEN"
logging.getLogger(__name__).info(
"Finance sync: %s (criterion=%s)",
"全量" if is_first_sync else "增量",
search_criterion,
)
status, data = imap.search(None, search_criterion)
if status != "OK":
return
id_list = data[0].split()
for msg_id in id_list:
status, msg_data = imap.fetch(msg_id, "(RFC822)")
if status != "OK":
continue
raw_email = msg_data[0][1]
msg = email.message_from_bytes(raw_email)
dt = _parse_email_date(msg)
month_str = dt.strftime("%Y-%m")
saved = _save_attachment(msg, month_str)
for file_name, file_path, mime, raw_bytes, doc_type in saved:
final_name = file_name
final_path = file_path
amount = None
billing_date = None
if doc_type == "invoices":
amount, date_str = _run_invoice_ocr_sync(file_path, mime, raw_bytes)
if date_str:
try:
billing_date = date.fromisoformat(date_str[:10])
except ValueError:
billing_date = date.today()
else:
billing_date = date.today()
final_name, final_path = _rename_invoice_file(
file_path, amount, billing_date
)
record = FinanceRecord(
month=month_str,
type=doc_type,
file_name=final_name,
file_path=final_path,
amount=amount,
billing_date=billing_date,
)
db.add(record)
db.flush()
results.append({
"id": record.id,
"month": record.month,
"type": record.type,
"file_name": record.file_name,
"file_path": record.file_path,
})
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
async def sync_finance_emails() -> List[Dict[str, Any]]:
"""
Connect to IMAP, fetch unread finance-related emails, download attachments,
save to filesystem and record FinanceRecord entries.
Sync from all active email configs (data/email_configs.json).
Falls back to env vars if no configs. Classifies into invoices/, receipts/, statements/.
Invoices are renamed to YYYYMMDD_金额_原文件名 using OCR.
"""
def _sync() -> List[Dict[str, Any]]:
host = os.getenv("IMAP_HOST")
user = os.getenv("IMAP_USER")
password = os.getenv("IMAP_PASSWORD")
port = int(os.getenv("IMAP_PORT", "993"))
mailbox = os.getenv("IMAP_MAILBOX", "INBOX")
from backend.app.routers.email_configs import get_email_configs_for_sync
if not all([host, user, password]):
raise RuntimeError("IMAP_HOST, IMAP_USER, IMAP_PASSWORD must be set.")
configs = get_email_configs_for_sync()
if not configs:
raise RuntimeError("未配置邮箱。请在 设置 → 邮箱账户 中添加,或配置 IMAP_* 环境变量。")
results: List[Dict[str, Any]] = []
errors: List[str] = []
db = SessionLocal()
try:
for config in configs:
try:
_sync_one_account(config, db, results)
except Exception as e:
# 不让单个账户的异常中断全部同步,记录错误并继续其他账户。
user = config.get("user", "") or config.get("id", "")
errors.append(f"同步账户 {user} 失败: {e}")
db.commit()
finally:
db.close()
with imaplib.IMAP4_SSL(host, port) as imap:
imap.login(user, password)
imap.select(mailbox)
# Search for UNSEEN emails with finance related keywords in subject.
# Note: IMAP SEARCH is limited; here we search UNSEEN first then filter in Python.
status, data = imap.search(None, "UNSEEN")
if status != "OK":
return results
id_list = data[0].split()
db = SessionLocal()
try:
for msg_id in id_list:
status, msg_data = imap.fetch(msg_id, "(RFC822)")
if status != "OK":
continue
raw_email = msg_data[0][1]
msg = email.message_from_bytes(raw_email)
subject = _decode_header_value(msg.get("Subject"))
doc_type = _classify_type(subject)
# Filter by keywords first
if doc_type == "others":
continue
dt = _parse_email_date(msg)
month_str = dt.strftime("%Y-%m")
saved_files = _save_attachment(msg, month_str, doc_type)
for file_name, file_path in saved_files:
record = FinanceRecord(
month=month_str,
type=doc_type,
file_name=file_name,
file_path=file_path,
)
# NOTE: created_at defaults at DB layer
db.add(record)
db.flush()
results.append(
{
"id": record.id,
"month": record.month,
"type": record.type,
"file_name": record.file_name,
"file_path": record.file_path,
}
)
# Mark email as seen and flagged to avoid re-processing
imap.store(msg_id, "+FLAGS", "\\Seen \\Flagged")
db.commit()
finally:
db.close()
if not results and errors:
# 所有账户都失败了,整体报错,前端可显示详细原因。
raise RuntimeError("; ".join(errors))
return results
@@ -205,7 +566,8 @@ async def sync_finance_emails() -> List[Dict[str, Any]]:
async def create_monthly_zip(month_str: str) -> str:
"""
Zip the finance folder for a given month (YYYY-MM) and return the zip path.
Zip the finance folder for a given month (YYYY-MM).
Preserves folder structure (invoices/, receipts/, statements/, manual/) inside the zip.
"""
import zipfile
@@ -227,4 +589,3 @@ async def create_monthly_zip(month_str: str) -> str:
return str(zip_path)
return await asyncio.to_thread(_zip)

View File

@@ -0,0 +1,90 @@
"""
Manual invoice upload: save file, optionally run AI vision to extract amount/date.
"""
import io
from datetime import date, datetime
from pathlib import Path
from typing import Any, Dict, Tuple
from fastapi import UploadFile
from backend.app.services.ai_service import extract_invoice_metadata
FINANCE_BASE = Path("data/finance")
ALLOWED_IMAGE = {".jpg", ".jpeg", ".png", ".webp"}
ALLOWED_PDF = {".pdf"}
def _current_month() -> str:
return datetime.utcnow().strftime("%Y-%m")
def _pdf_first_page_to_image(pdf_bytes: bytes) -> Tuple[bytes, str] | None:
"""Render first page of PDF to PNG bytes. Returns (bytes, 'image/png') or None on error."""
try:
import fitz
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
if doc.page_count == 0:
doc.close()
return None
page = doc[0]
pix = page.get_pixmap(dpi=150)
png_bytes = pix.tobytes("png")
doc.close()
return (png_bytes, "image/png")
except Exception:
return None
async def process_invoice_upload(
file: UploadFile,
) -> Tuple[str, str, str, float | None, date | None]:
"""
Save uploaded file to data/finance/{YYYY-MM}/manual/, run OCR for amount/date.
Returns (file_name, file_path, month_str, amount, billing_date).
"""
month_str = _current_month()
manual_dir = FINANCE_BASE / month_str / "manual"
manual_dir.mkdir(parents=True, exist_ok=True)
raw = await file.read()
filename = file.filename or "upload"
suf = Path(filename).suffix.lower()
if suf in ALLOWED_IMAGE:
image_bytes, mime = raw, (file.content_type or "image/jpeg")
if "png" in (suf or ""):
mime = "image/png"
amount, date_str = await extract_invoice_metadata(image_bytes, mime)
elif suf in ALLOWED_PDF:
image_result = _pdf_first_page_to_image(raw)
if image_result:
image_bytes, mime = image_result
amount, date_str = await extract_invoice_metadata(image_bytes, mime)
else:
amount, date_str = None, None
# Save original PDF
else:
amount, date_str = None, None
# Unique filename
dest = manual_dir / filename
counter = 1
while dest.exists():
dest = manual_dir / f"{dest.stem}_{counter}{dest.suffix}"
counter += 1
dest.write_bytes(raw)
file_path = str(dest)
file_name = dest.name
billing_date = None
if date_str:
try:
billing_date = date.fromisoformat(date_str)
except ValueError:
pass
if billing_date is None:
billing_date = date.today()
return (file_name, file_path, month_str, amount, billing_date)