fix:优化数据来源

This commit is contained in:
Daniel
2026-03-02 01:00:04 +08:00
parent 91d9e48e1e
commit 4a8fff5a00
26 changed files with 1361 additions and 0 deletions

33
crawler/config.py Normal file
View File

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""爬虫配置"""
import os
from pathlib import Path
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database path (shared with the server); override with the DB_PATH
# environment variable.
DB_PATH = os.getenv(
    "DB_PATH",
    str(PROJECT_ROOT.joinpath("server", "data.db")),
)
# Base URL of the Node API (used for push notifications); override with API_BASE.
API_BASE = os.getenv("API_BASE", "http://localhost:3001")
# Crawl interval in seconds; override with CRAWL_INTERVAL (must parse as an int).
CRAWL_INTERVAL = int(os.getenv("CRAWL_INTERVAL", "300"))
# RSS sources (US–Iran / Middle East related). Several outlets are listed so the
# event timeline stays close to real time.
RSS_FEEDS = [
    # Reuters
    # NOTE(review): Reuters may have retired the feeds.reuters.com host —
    # confirm this URL still responds.
    "https://feeds.reuters.com/reuters/topNews",
    # BBC
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://feeds.bbci.co.uk/news/world/middle_east/rss.xml",
    # Al Jazeera
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://www.aljazeera.com/xml/rss/middleeast.xml",
    # The New York Times
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
]
# Keyword filter: an item must match at least one keyword to be stored.
#
# Every entry must be a non-empty string. An empty string is a substring of
# every string, so with substring matching it would accept every item and
# silently disable the filter; the empty entry that used to sit between
# "missile" and "nuclear" has been removed for that reason.
# NOTE(review): that empty entry may have been a keyword lost in transit —
# restore the intended term if one is known.
KEYWORDS = [
    # Iran / Israel
    "iran", "iranian", "tehran", "以色列", "israel",
    # United States ("us " keeps its trailing space; presumably to avoid
    # matching "us" inside longer words — verify against the matcher)
    "usa", "us ", "american", "美军", "美国",
    # Region
    "middle east", "中东", "persian gulf", "波斯湾",
    # Military activity
    "strike", "attack", "military", "missile", "nuclear",
    # Assets and armed groups
    "carrier", "航母", "houthi", "胡塞", "hamas",
]