fix: 优化后端数据更新机制

This commit is contained in:
Daniel
2026-03-03 13:02:28 +08:00
parent 7284a1a60d
commit fa6f7407f0
20 changed files with 592 additions and 201 deletions

View File

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""爬虫入口:定时抓取 → 解析入库 → 通知 API"""
"""爬虫入口:定时执行完整写库流水线(抓取 → 清洗去重 → 映射 → 更新表 → 通知 API"""
import time
import sys
from pathlib import Path
@@ -8,34 +8,18 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from config import DB_PATH, API_BASE, CRAWL_INTERVAL
from scrapers.rss_scraper import fetch_all
from db_writer import write_updates
def notify_api() -> bool:
    """POST to the Node API's crawler-notify endpoint to trigger an immediate broadcast.

    Returns:
        True if the API answered with HTTP 200, False on any failure
        (connection error, timeout, non-200 status raising, etc.).
    """
    try:
        # Imported lazily so the module loads even if only the scraping
        # path is exercised; urllib is stdlib, so this always succeeds.
        import urllib.request
        req = urllib.request.Request(
            f"{API_BASE}/api/crawler/notify",
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        # 5-second timeout keeps the crawl loop from hanging on a dead API.
        with urllib.request.urlopen(req, timeout=5) as resp:
            return resp.status == 200
    except Exception as e:
        # Best-effort notification: a failure is logged but never aborts the crawl.
        print(f" [warn] notify API failed: {e}")
        return False
from pipeline import run_full_pipeline
def run_once() -> int:
items = fetch_all()
if not items:
return 0
n = write_updates(items)
if n > 0:
notify_api()
return n
"""执行一轮:抓取、清洗、去重、映射、写表、通知。返回本轮新增条数(面板或资讯)。"""
n_fetched, n_news, n_panel = run_full_pipeline(
db_path=DB_PATH,
api_base=API_BASE,
translate=True,
notify=True,
)
return n_panel or n_news
def main() -> None:
@@ -45,7 +29,7 @@ def main() -> None:
try:
n = run_once()
if n > 0:
print(f"[{time.strftime('%H:%M:%S')}] Inserted {n} new update(s)")
print(f"[{time.strftime('%H:%M:%S')}] 抓取完成,去重后新增 {n} 条,已写库并通知 API")
except KeyboardInterrupt:
break
except Exception as e: