# -*- coding: utf-8 -*-
|
||
"""爬虫入口:定时执行完整写库流水线(抓取 → 清洗 → 去重 → 映射 → 更新表 → 通知 API)"""
|
||
import time
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# 确保能导入 config
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||
|
||
from config import DB_PATH, API_BASE, CRAWL_INTERVAL
|
||
from pipeline import run_full_pipeline
|
||
|
||
|
||
def run_once() -> int:
    """Run one full round: fetch, clean, dedupe, map, write tables, notify the API.

    Returns:
        Number of newly added rows this round — the panel count, falling back
        to the news count when no panel rows were added (0 if neither).
    """
    # The raw fetch count is not used here; only the post-dedup counts matter,
    # so bind it to `_` instead of an unused local.
    _, n_news, n_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,  # translate fetched content before it is stored
        notify=True,     # ping the API after the tables are updated
    )
    # Prefer the panel count; report the news count only when it is zero.
    return n_panel or n_news
|
||
|
||
|
||
def main() -> None:
    """Run the crawl pipeline forever, pausing CRAWL_INTERVAL seconds between rounds.

    Exits cleanly on Ctrl-C; any other error is logged and the loop retries
    on the next round.
    """
    print("Crawler started. DB:", DB_PATH)
    print("API:", API_BASE, "| Interval:", CRAWL_INTERVAL, "s")
    while True:
        try:
            n = run_once()
            if n > 0:
                print(f"[{time.strftime('%H:%M:%S')}] 抓取完成,去重后新增 {n} 条,已写库并通知 API")
        except KeyboardInterrupt:
            break
        except Exception as e:
            # Best-effort loop: swallow pipeline errors, log, retry next round.
            print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
        # The process spends almost all its time sleeping here, so Ctrl-C
        # usually arrives during the sleep — catch it too, otherwise the
        # interrupt escapes the loop's try block and dumps a traceback.
        try:
            time.sleep(CRAWL_INTERVAL)
        except KeyboardInterrupt:
            break
|
||
|
||
|
||
# Script entry point: start the crawl loop only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|