Files
usa/crawler/translate_utils.py
2026-03-05 19:18:45 +08:00

45 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""英译中,入库前统一翻译"""
import os
import re
from typing import Optional
def _is_mostly_chinese(text: str) -> bool:
    """Return True when more than 30% of *text* is CJK ideographs.

    Empty input, or input whose whitespace-trimmed length is below two
    characters, is never treated as Chinese. The ratio itself is computed
    against the full (untrimmed) length of ``text``, counting characters in
    the U+4E00..U+9FFF range.
    """
    if not text:
        return False
    if len(text.strip()) < 2:
        return False
    # Count ideographs lazily instead of materialising the list of matches.
    han_count = sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))
    total_chars = len(text) or 1  # guard against division by zero
    return han_count / total_chars > 0.3
def translate_to_chinese(text: str) -> str:
    """Translate *text* into Simplified Chinese on a best-effort basis.

    The original ``text`` is returned unchanged when:

    - it is not a string, or is empty / whitespace-only;
    - translation is disabled (the default, see below);
    - the text already looks mostly Chinese;
    - every backend fails or produces nothing different from the input.

    Translation through ``deep_translator`` is disabled by default so that
    network or proxy problems cannot stall the whole pipeline. The
    ``TRANSLATE_DISABLED`` environment variable controls this: when unset it
    is treated as ``"1"`` (disabled). Set ``TRANSLATE_DISABLED=0`` to enable
    translation. The values ``1``, ``true``, ``yes`` and ``on`` (case- and
    whitespace-insensitive) all keep it disabled.

    Only the first 2000 characters of the trimmed text are sent to a
    backend, so a successful result for longer input covers just that prefix.

    Args:
        text: The text to translate.

    Returns:
        The translated string, or ``text`` itself when no translation was
        performed.
    """
    import logging

    # Non-string input cannot be trimmed or translated; hand it back as-is
    # instead of failing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return text

    # Accept the common truthy spellings so that e.g. TRANSLATE_DISABLED=true
    # does not silently turn network translation on.
    disabled_flag = os.environ.get("TRANSLATE_DISABLED", "1").strip().lower()
    if disabled_flag in ("1", "true", "yes", "on"):
        return text

    snippet = text.strip()[:2000]
    if _is_mostly_chinese(snippet):
        return text

    log = logging.getLogger(__name__)
    # Try each backend in order; the first usable result wins.
    for backend in ("google", "mymemory"):
        try:
            # Imported lazily so the dependency is only needed when
            # translation is actually enabled.
            if backend == "google":
                from deep_translator import GoogleTranslator
                result = GoogleTranslator(source="auto", target="zh-CN").translate(snippet)
            else:
                from deep_translator import MyMemoryTranslator
                result = MyMemoryTranslator(source="auto", target="zh-CN").translate(snippet)
            if result and result.strip() and result != snippet:
                return result
        except Exception:
            # Best effort: record the failure and fall through to the next
            # backend rather than breaking the caller.
            log.debug("translate_to_chinese: backend %s failed", backend, exc_info=True)
            continue
    return text