fix: filter crawled items by CRAWL_START_DATE and log fetched/deduplicated items in run_full_pipeline
This commit is contained in:
@@ -109,6 +109,33 @@ def run_full_pipeline(
|
||||
if not items:
|
||||
return 0, 0, 0
|
||||
|
||||
# 可选:仅保留指定起始时间之后的条目(如 CRAWL_START_DATE=2026-02-28T00:00:00)
|
||||
start_date_env = os.environ.get("CRAWL_START_DATE", "").strip()
|
||||
if start_date_env:
|
||||
try:
|
||||
raw = start_date_env.replace("Z", "+00:00").strip()
|
||||
start_dt = datetime.fromisoformat(raw)
|
||||
if start_dt.tzinfo is None:
|
||||
start_dt = start_dt.replace(tzinfo=timezone.utc)
|
||||
else:
|
||||
start_dt = start_dt.astimezone(timezone.utc)
|
||||
before = len(items)
|
||||
items = [it for it in items if (it.get("published") or datetime.min.replace(tzinfo=timezone.utc)) >= start_dt]
|
||||
if before > len(items):
|
||||
print(f" [pipeline] 按 CRAWL_START_DATE={start_date_env} 过滤后保留 {len(items)} 条(原 {before} 条)")
|
||||
except Exception as e:
|
||||
print(f" [warn] CRAWL_START_DATE 解析失败,忽略: {e}")
|
||||
|
||||
if not items:
|
||||
return 0, 0, 0
|
||||
n_total = len(items)
|
||||
print(f" [pipeline] 抓取 {n_total} 条")
|
||||
for i, it in enumerate(items[:5]):
|
||||
title = (it.get("title") or it.get("summary") or "").strip()[:60]
|
||||
print(f" [{i + 1}] {title}" + ("…" if len((it.get("title") or it.get("summary") or "")[:60]) >= 60 else ""))
|
||||
if n_total > 5:
|
||||
print(f" ... 共 {n_total} 条")
|
||||
|
||||
# 2. 清洗(标题/摘要/分类,符合面板 schema)
|
||||
if translate:
|
||||
from translate_utils import translate_to_chinese
|
||||
@@ -128,6 +155,11 @@ def run_full_pipeline(
|
||||
|
||||
# 3. 去重:落库 news_content,仅新项返回
|
||||
new_items, n_news = save_and_dedup(items, db_path=path)
|
||||
if new_items:
|
||||
print(f" [pipeline] 去重后新增 {n_news} 条,写入事件脉络 {len(new_items)} 条")
|
||||
for i, it in enumerate(new_items[:3]):
|
||||
title = (it.get("title") or it.get("summary") or "").strip()[:55]
|
||||
print(f" 新增 [{i + 1}] {title}" + ("…" if len((it.get("title") or it.get("summary") or "").strip()) > 55 else ""))
|
||||
|
||||
# 3.5 数据增强:为参与 AI 提取的条目抓取正文,便于从全文提取精确数据(伤亡、基地等)
|
||||
if new_items:
|
||||
|
||||
Reference in New Issue
Block a user