This commit is contained in:
Daniel
2026-03-03 20:17:38 +08:00
parent 034c088bac
commit 09ec2e3a69
20 changed files with 395 additions and 19 deletions

51
crawler/run_once.py Normal file
View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Run a single crawler pass standalone: fetch → clean → dedupe → write to DB → notify Node (optional).
Prints the number of fetched items and a content summary directly to the terminal
for easy troubleshooting.
Usage (from the project root or the crawler directory):
    python run_once.py
    python -c "import run_once; run_once.main()"
Or: npm run crawler:once
"""
import os
import sys
from datetime import datetime
# Ensure sibling modules in this directory (config, pipeline) are importable
# when executed as a script; skipped on plain import, where the caller's
# sys.path is assumed to already cover this directory.
if __name__ == "__main__":
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def main():
    """Execute one full crawl cycle and print a human-readable summary.

    Reads the optional CRAWL_START_DATE environment variable to restrict
    the time range, then runs the full pipeline (fetch, clean, dedupe,
    persist, notify) and reports counts.

    Returns:
        int: 0 always, suitable for use as a process exit code.
    """
    # Deferred imports: these sibling modules are resolvable only after the
    # __main__ guard at module top has adjusted sys.path.
    from config import DB_PATH, API_BASE
    from pipeline import run_full_pipeline

    start_date = os.environ.get("CRAWL_START_DATE", "").strip()

    # --- banner ---------------------------------------------------------
    print("========================================")
    print("爬虫单次运行RSS → 清洗 → 去重 → 写库)")
    print("DB:", DB_PATH)
    print("API_BASE:", API_BASE)
    if start_date:
        print("时间范围: 仅保留 CRAWL_START_DATE 之后:", start_date)
    print("========================================\n")

    # --- one full pipeline pass ----------------------------------------
    n_fetched, n_news, n_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,
    )

    # --- result summary -------------------------------------------------
    print("")
    print("----------------------------------------")
    print("本轮结果:")
    print(f" 抓取: {n_fetched}")
    print(f" 去重后新增资讯: {n_news}")
    print(f" 写入事件脉络: {n_panel}")
    if not n_fetched:
        # Nothing came back at all — hint at the usual culprits.
        print(" 0 条检查网络、RSS 源或 config.KEYWORDS 过滤)")
    print("----------------------------------------")
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())