#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
单独运行爬虫一轮:抓取 → 清洗 → 去重 → 写库 → 通知 Node(可选)

终端直接输出抓取条数及内容摘要,便于排查。

用法(项目根或 crawler 目录):
    python run_once.py
    python -c "import run_once; run_once.main()"
  或: npm run crawler:once
"""

import os
import sys
from datetime import datetime
# 保证可导入同目录模块
|
||
if __name__ == "__main__":
|
||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def main():
|
||
from config import DB_PATH, API_BASE
|
||
from pipeline import run_full_pipeline
|
||
|
||
crawl_start = os.environ.get("CRAWL_START_DATE", "").strip()
|
||
print("========================================")
|
||
print("爬虫单次运行(RSS → 清洗 → 去重 → 写库)")
|
||
print("DB:", DB_PATH)
|
||
print("API_BASE:", API_BASE)
|
||
if crawl_start:
|
||
print("时间范围: 仅保留 CRAWL_START_DATE 之后:", crawl_start)
|
||
print("========================================\n")
|
||
|
||
n_fetched, n_news, n_panel = run_full_pipeline(
|
||
db_path=DB_PATH,
|
||
api_base=API_BASE,
|
||
translate=True,
|
||
notify=True,
|
||
)
|
||
|
||
print("")
|
||
print("----------------------------------------")
|
||
print("本轮结果:")
|
||
print(f" 抓取: {n_fetched} 条")
|
||
print(f" 去重后新增资讯: {n_news} 条")
|
||
print(f" 写入事件脉络: {n_panel} 条")
|
||
if n_fetched == 0:
|
||
print(" (0 条:检查网络、RSS 源或 config.KEYWORDS 过滤)")
|
||
print("----------------------------------------")
|
||
return 0
if __name__ == "__main__":
|
||
sys.exit(main())
|