# -*- coding: utf-8 -*-
"""Crawler entry point: periodically runs the full write-to-DB pipeline
(fetch -> clean -> dedupe -> map -> update tables -> notify API)."""
import sys
import time
from pathlib import Path

# Make sure `config`/`pipeline` are importable when run as a plain script.
sys.path.insert(0, str(Path(__file__).resolve().parent))

from config import DB_PATH, API_BASE, CRAWL_INTERVAL
from pipeline import run_full_pipeline


def run_once() -> int:
    """Run one round: fetch, clean, dedupe, map, write tables, notify.

    Returns:
        Number of new rows added this round — the panel count if non-zero,
        otherwise the news count.
    """
    _n_fetched, n_news, n_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,
    )
    # Prefer the panel count; fall back to news rows when no panel rows landed.
    return n_panel or n_news


def main() -> None:
    """Loop forever: run one crawl round, then sleep CRAWL_INTERVAL seconds."""
    print("Crawler started. DB:", DB_PATH)
    print("API:", API_BASE, "| Interval:", CRAWL_INTERVAL, "s")
    while True:
        try:
            n = run_once()
            if n > 0:
                print(f"[{time.strftime('%H:%M:%S')}] 抓取完成,去重后新增 {n} 条,已写库并通知 API")
        except KeyboardInterrupt:
            break
        except Exception as e:
            # Top-level boundary: log the error and keep the crawler alive.
            print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
        try:
            time.sleep(CRAWL_INTERVAL)
        except KeyboardInterrupt:
            # BUGFIX: Ctrl+C during the (long) sleep previously raised an
            # unhandled traceback; exit the loop cleanly instead.
            break


if __name__ == "__main__":
    main()