This commit is contained in:
Daniel
2026-03-03 20:17:38 +08:00
parent 034c088bac
commit 09ec2e3a69
20 changed files with 395 additions and 19 deletions

View File

@@ -109,6 +109,33 @@ def run_full_pipeline(
if not items:
return 0, 0, 0
# 可选:仅保留指定起始时间之后的条目(如 CRAWL_START_DATE=2026-02-28T00:00:00)
start_date_env = os.environ.get("CRAWL_START_DATE", "").strip()
if start_date_env:
try:
raw = start_date_env.replace("Z", "+00:00").strip()
start_dt = datetime.fromisoformat(raw)
if start_dt.tzinfo is None:
start_dt = start_dt.replace(tzinfo=timezone.utc)
else:
start_dt = start_dt.astimezone(timezone.utc)
before = len(items)
items = [it for it in items if (it.get("published") or datetime.min.replace(tzinfo=timezone.utc)) >= start_dt]
if before > len(items):
print(f" [pipeline] 按 CRAWL_START_DATE={start_date_env} 过滤后保留 {len(items)} 条(原 {before} 条)")
except Exception as e:
print(f" [warn] CRAWL_START_DATE 解析失败,忽略: {e}")
if not items:
return 0, 0, 0
n_total = len(items)
print(f" [pipeline] 抓取 {n_total}")
for i, it in enumerate(items[:5]):
title = (it.get("title") or it.get("summary") or "").strip()[:60]
print(f" [{i + 1}] {title}" + ("" if len((it.get("title") or it.get("summary") or "")[:60]) >= 60 else ""))
if n_total > 5:
print(f" ... 共 {n_total}")
# 2. 清洗(标题/摘要/分类,符合面板 schema)
if translate:
from translate_utils import translate_to_chinese
@@ -128,6 +155,11 @@ def run_full_pipeline(
# 3. 去重:落库 news_content,仅新项返回
new_items, n_news = save_and_dedup(items, db_path=path)
if new_items:
print(f" [pipeline] 去重后新增 {n_news} 条,写入事件脉络 {len(new_items)}")
for i, it in enumerate(new_items[:3]):
title = (it.get("title") or it.get("summary") or "").strip()[:55]
print(f" 新增 [{i + 1}] {title}" + ("" if len((it.get("title") or it.get("summary") or "").strip()) > 55 else ""))
# 3.5 数据增强:为参与 AI 提取的条目抓取正文,便于从全文提取精确数据(伤亡、基地等)
if new_items: