Files
usa/crawler/run_once.py
2026-03-03 20:17:38 +08:00

52 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
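Run a single crawler round: fetch → clean → deduplicate → write to DB → notify Node (optional).
Prints the number of fetched items and a content summary directly to the terminal to ease troubleshooting.
Usage (from the project root or the crawler directory):
    python run_once.py
    python -c "import run_once; run_once.main()"
    or: npm run crawler:once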
"""
import os
import sys
from datetime import datetime
# Make modules that live next to this script importable when it is run
# directly: put the script's own directory at the front of sys.path.
if __name__ == "__main__":
    _script_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, _script_dir)
def main():
    """Run one crawl round and print a summary of its results.

    ``DB_PATH`` and ``API_BASE`` (from ``config``) and ``run_full_pipeline``
    (from ``pipeline``) are imported at call time. The function prints a
    banner showing both settings, plus the ``CRAWL_START_DATE`` environment
    variable when it is set to a non-blank value, then calls
    ``run_full_pipeline`` with ``translate=True`` and ``notify=True`` and
    prints the three counts it returns.

    Returns:
        Always ``0``. Exceptions raised by the imports or the pipeline are
        not caught here and propagate to the caller.
    """
    from config import DB_PATH, API_BASE
    from pipeline import run_full_pipeline

    heavy_rule = "========================================"
    light_rule = "----------------------------------------"

    # Only displayed here; this function does not pass it to the pipeline.
    start_date = os.environ.get("CRAWL_START_DATE", "").strip()

    # --- banner ---
    print(heavy_rule)
    print("爬虫单次运行RSS → 清洗 → 去重 → 写库)")
    print("DB:", DB_PATH)
    print("API_BASE:", API_BASE)
    if start_date:
        print("时间范围: 仅保留 CRAWL_START_DATE 之后:", start_date)
    # The embedded newline leaves one blank line after the banner.
    print(heavy_rule + "\n")

    # --- run the pipeline ---
    counts = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,
    )
    fetched, new_items, panel_rows = counts

    # --- result report ---
    report = [
        "",
        light_rule,
        "本轮结果:",
        f" 抓取: {fetched}",
        f" 去重后新增资讯: {new_items}",
        f" 写入事件脉络: {panel_rows}",
    ]
    if fetched == 0:
        # Nothing fetched: hint to check the network, the RSS sources, or
        # the config.KEYWORDS filter.
        report.append(" 0 条检查网络、RSS 源或 config.KEYWORDS 过滤)")
    report.append(light_rule)

    for line in report:
        print(line)

    return 0
# Script entry point: terminate the process with main()'s return value as
# the exit status.
if __name__ == "__main__":
    raise SystemExit(main())