#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
单独运行爬虫一轮:抓取 → 清洗 → 去重 → 写库 → 通知 Node(可选)

终端直接输出抓取条数及内容摘要,便于排查。

用法(项目根或 crawler 目录):
    python run_once.py
    python -c "import run_once; run_once.main()"
  或: npm run crawler:once
"""

import os
import sys
from datetime import datetime
# 保证可导入同目录模块
|
||
if __name__ == "__main__":
|
||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def main():
|
||
from config import DB_PATH, API_BASE
|
||
from pipeline import run_full_pipeline
|
||
|
||
crawl_start = os.environ.get("CRAWL_START_DATE", "").strip()
|
||
print("========================================")
|
||
print("爬虫单次运行(RSS → 清洗 → 去重 → 写库)")
|
||
print("DB:", DB_PATH)
|
||
print("API_BASE:", API_BASE)
|
||
if crawl_start:
|
||
print("时间范围: 仅保留 CRAWL_START_DATE 之后:", crawl_start)
|
||
print("========================================\n")
|
||
|
||
n_fetched, n_news, n_panel = run_full_pipeline(
|
||
db_path=DB_PATH,
|
||
api_base=API_BASE,
|
||
translate=True,
|
||
notify=True,
|
||
)
|
||
|
||
print("")
|
||
print("----------------------------------------")
|
||
print("本轮结果:")
|
||
print(f" 抓取: {n_fetched} 条")
|
||
print(f" 去重后新增资讯: {n_news} 条")
|
||
print(f" 写入事件脉络: {n_panel} 条")
|
||
if n_fetched == 0:
|
||
print(" (0 条:检查网络、RSS 源或 config.KEYWORDS 过滤)")
|
||
print("----------------------------------------")
|
||
return 0
if __name__ == "__main__":
|
||
sys.exit(main())
|