fix:优化数据来源

2026-03-02 01:00:04 +08:00
parent 91d9e48e1e
commit 4a8fff5a00
26 changed files with 1361 additions and 0 deletions
--- a/crawler/main.py
+++ b/crawler/main.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""Crawler entry point: scheduled fetch -> parse -> store in DB -> notify API."""
+import time
+import sys
+from pathlib import Path
+
+# Put this file's own directory at the front of sys.path so that `config`
+# can be imported (presumably `scrapers` and `db_writer` live alongside it
+# as well -- confirm against the repository layout). This must run before
+# the project imports below.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+from config import DB_PATH, API_BASE, CRAWL_INTERVAL
+from scrapers.rss_scraper import fetch_all
+from db_writer import write_updates
+
+def notify_api() -> bool:
+    """Ask the Node API to trigger an immediate broadcast.
+
+    Sends a body-less POST to ``{API_BASE}/api/crawler/notify`` with a
+    5-second timeout. This is best-effort: any exception is reported as a
+    warning on stdout instead of being propagated.
+
+    Returns:
+        True only when the response status is exactly 200; False for any
+        other status or when the request raised.
+    """
+    try:
+        import urllib.request
+
+        endpoint = f"{API_BASE}/api/crawler/notify"
+        json_headers = {"Content-Type": "application/json"}
+        request = urllib.request.Request(endpoint, headers=json_headers, method="POST")
+        with urllib.request.urlopen(request, timeout=5) as response:
+            delivered = response.status == 200
+    except Exception as e:
+        # Notification failure must not abort the crawl cycle.
+        print(f"  [warn] notify API failed: {e}")
+        return False
+    return delivered
+
+
+def run_once() -> int:
+    """Run a single crawl cycle: fetch items, write them, notify on new data.
+
+    The API is notified only when ``write_updates`` reports a positive
+    value; the result of ``notify_api`` itself is ignored.
+
+    Returns:
+        0 when ``fetch_all`` yields nothing, otherwise whatever
+        ``write_updates`` returned (presumably the number of newly
+        inserted rows -- confirm in ``db_writer``).
+    """
+    fetched = fetch_all()
+    if fetched:
+        inserted = write_updates(fetched)
+        if inserted > 0:
+            notify_api()
+        return inserted
+    return 0
+
+
+def main() -> None:
+    """Run the crawl loop until interrupted with Ctrl-C.
+
+    Prints the configured DB path, API base and interval once, then
+    repeatedly calls ``run_once`` and sleeps ``CRAWL_INTERVAL`` seconds
+    between cycles. A failure inside one cycle is printed with a timestamp
+    and the loop continues. ``KeyboardInterrupt`` ends the loop cleanly
+    whether it arrives during a cycle or during the sleep.
+    """
+    print("Crawler started. DB:", DB_PATH)
+    print("API:", API_BASE, "| Interval:", CRAWL_INTERVAL, "s")
+    try:
+        while True:
+            try:
+                n = run_once()
+                if n > 0:
+                    print(f"[{time.strftime('%H:%M:%S')}] Inserted {n} new update(s)")
+            except Exception as e:
+                # Per-cycle errors are logged; the next cycle still runs.
+                print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
+            time.sleep(CRAWL_INTERVAL)
+    except KeyboardInterrupt:
+        # The handler wraps the whole loop because the process spends most
+        # of its time in time.sleep(); a Ctrl-C there must also stop the
+        # crawler quietly rather than surface as a traceback.
+        return
+
+
+# Start the crawl loop only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()