fix:优化数据来源
This commit is contained in:
28
crawler/translate_utils.py
Normal file
28
crawler/translate_utils.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""英译中,入库前统一翻译"""
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _is_mostly_chinese(text: str) -> bool:
|
||||
if not text or len(text.strip()) < 2:
|
||||
return False
|
||||
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||||
return chinese / max(len(text), 1) > 0.3
|
||||
|
||||
|
||||
def translate_to_chinese(text: str) -> str:
    """Translate ``text`` into Simplified Chinese on a best-effort basis.

    The original ``text`` is returned unchanged when it is empty or
    whitespace-only, when it already looks mostly Chinese, when the
    translator returns an empty result, or when translation fails for any
    reason (for example ``deep_translator`` is not installed or the request
    raises).

    Only the first 2000 characters of the stripped input are sent to the
    translator, so a successful translation of a longer text covers just
    that prefix.

    Args:
        text: Text to translate. Non-string values are coerced with ``str()``.

    Returns:
        The translated text, or the original ``text`` as a fallback.
    """
    if not text:
        return text
    # Coerce before calling string methods so a non-str value does not raise
    # AttributeError on ``.strip()``.
    s = str(text).strip()
    if not s:
        return text
    if len(s) > 2000:
        s = s[:2000]
    if _is_mostly_chinese(s):
        return text
    try:
        # Imported inside the try so a missing package falls back to the
        # original text instead of failing at module import time.
        from deep_translator import GoogleTranslator

        out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
    except Exception as exc:
        # Best effort: a translation failure must not break the caller, but
        # leave a trace so untranslated records can be diagnosed.
        import logging

        logging.getLogger(__name__).warning(
            "translate_to_chinese failed, returning original text: %s", exc
        )
        return text
    return out if out else text
|
||||
Reference in New Issue
Block a user