#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Run one crawler cycle standalone: fetch -> clean -> dedupe -> persist -> notify Node (optional).

Prints the fetched count and a content summary directly to the terminal
for easy troubleshooting.

Usage (from the project root or the crawler directory):
  python run_once.py
  python -c "import run_once; run_once.main()"
or:
  npm run crawler:once
"""
import os
import sys
from datetime import datetime  # NOTE(review): currently unused here; kept in case siblings rely on it

# Make sibling modules (config, pipeline) importable regardless of cwd.
# This must run on import too — the docstring advertises
# `python -c "import run_once; run_once.main()"`, which previously failed
# outside this directory because the insertion was gated on __main__.
# Inserting is idempotent enough for a one-shot script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))


def main():
    """Execute one full crawl pipeline run and print a summary.

    Reads CRAWL_START_DATE from the environment (display only here;
    presumably the pipeline reads it itself — TODO confirm).

    Returns:
        int: 0 on completion (used as the process exit code).
    """
    # Imported lazily so that merely importing this module has no side
    # effects beyond the sys.path tweak above.
    from config import DB_PATH, API_BASE
    from pipeline import run_full_pipeline

    crawl_start = os.environ.get("CRAWL_START_DATE", "").strip()
    print("========================================")
    print("爬虫单次运行(RSS → 清洗 → 去重 → 写库)")
    print("DB:", DB_PATH)
    print("API_BASE:", API_BASE)
    if crawl_start:
        print("时间范围: 仅保留 CRAWL_START_DATE 之后:", crawl_start)
    print("========================================\n")

    # (fetched, new-after-dedupe news, event-timeline rows written)
    n_fetched, n_news, n_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,
    )

    print("")
    print("----------------------------------------")
    print("本轮结果:")
    print(f"  抓取: {n_fetched} 条")
    print(f"  去重后新增资讯: {n_news} 条")
    print(f"  写入事件脉络: {n_panel} 条")
    if n_fetched == 0:
        # Zero fetched usually means a network / feed / keyword-filter issue.
        print("  (0 条:检查网络、RSS 源或 config.KEYWORDS 过滤)")
    print("----------------------------------------")
    return 0


if __name__ == "__main__":
    sys.exit(main())