Compare commits

...

53 Commits

Author SHA1 Message Date
Daniel
89145a6743 fix:修复启动文件 2026-03-05 20:22:15 +08:00
Daniel
07454b73c2 fix:优化爬虫配置,单独使用docker容器运行 2026-03-05 20:19:24 +08:00
Daniel
bbb9a5e1e1 fix:修复依赖文件报错 2026-03-05 20:00:15 +08:00
Daniel
98d928f457 fix:优化pm2配置项 2026-03-05 19:53:05 +08:00
Daniel
004b03b374 fix:优化爬虫链路 2026-03-05 19:18:45 +08:00
Daniel
475097d372 fix: 优化页面效果 2026-03-05 16:11:33 +08:00
Daniel
af59d6367f fix: 新增态 效果 2026-03-05 15:53:10 +08:00
Daniel
a3bf8abda5 fix:对齐生产环境的数据字段 2026-03-04 19:19:50 +08:00
Daniel
26938449f0 fix: bug 2026-03-04 16:48:17 +08:00
Daniel
64f4c438c3 fix: code update 2026-03-04 09:56:23 +08:00
Daniel
88c37408e8 fix: 化本 2026-03-04 09:43:21 +08:00
Daniel
3264b3252a fix: bug 2026-03-04 00:39:01 +08:00
Daniel
95e2fe1c41 fix: 2026-03-04 00:07:14 +08:00
Daniel
ac24c528f3 Merge branch; keep full EditDashboard 2026-03-03 22:44:24 +08:00
Daniel
86e50debec fix:增面 2026-03-03 22:42:21 +08:00
张成
8696549ba1 1 2026-03-03 20:34:26 +08:00
Daniel
09ec2e3a69 fix: bug 2026-03-03 20:17:38 +08:00
Daniel
034c088bac chore: stop tracking server/data.db, keep in .gitignore 2026-03-03 18:08:34 +08:00
Daniel
15800b1dad fix: update code for db file 2026-03-03 18:06:17 +08:00
Daniel
0cbeaf59a5 fix: update code 2026-03-03 17:54:43 +08:00
Daniel
1764a44eb3 fix: update 2026-03-03 17:27:55 +08:00
Daniel
29c921f498 fix: 更改数据库包 2026-03-03 14:49:02 +08:00
Daniel
85dea726e9 fix: 处理数据问题 2026-03-03 13:03:11 +08:00
Daniel
fa6f7407f0 fix: 优化后端数据更新机制 2026-03-03 13:02:28 +08:00
Daniel
7284a1a60d fix: 修复移动端报错 2026-03-03 11:14:34 +08:00
Daniel
4dd1f7e7dc fix:优化数据样式 2026-03-03 10:35:11 +08:00
Daniel
92914e6522 fix: 更新数据面板的驱动方式 2026-03-02 23:21:07 +08:00
Daniel
ef60f18cb0 fix: meger code 2026-03-02 21:51:18 +08:00
张成
c3ec459671 1 2026-03-02 21:43:36 +08:00
Daniel
75c58eecfc fix:优化界面布局 2026-03-02 19:32:56 +08:00
Daniel
3c55485648 fix: 优化留言和分享数据 2026-03-02 19:07:51 +08:00
Daniel
13ca470cad fix: 优化自适应界面 2026-03-02 18:39:29 +08:00
Daniel
4e91018752 fix: 修复移动端自适应问题 2026-03-02 17:48:12 +08:00
Daniel
55c030e3f5 fix: 修复自适应问题 2026-03-02 17:32:55 +08:00
Daniel
0027074b8b fix: 修复爬虫问题 2026-03-02 17:20:31 +08:00
Daniel
33e4786cd0 feat: 完成合并 2026-03-02 16:43:29 +08:00
Daniel
d646a93dcf Merge branch 'master' of https://git.bimwe.com/Daniel/usa 2026-03-02 16:42:55 +08:00
Daniel
af577400fb fix:移除繁体转简体,移除资讯 2026-03-02 16:42:35 +08:00
Daniel
84656f4a11 fix: 移除繁体转简体和资讯 2026-03-02 16:41:57 +08:00
张成
aa630aa479 Merge branch 'master' of https://git.bimwe.com/Daniel/usa 2026-03-02 16:38:40 +08:00
张成
ffcce0ad81 1 2026-03-02 16:38:39 +08:00
Daniel
ad73305ed1 fix: 更新token 2026-03-02 16:36:49 +08:00
Daniel
a9caf6e7c0 fix: 优化后端数据 2026-03-02 16:29:11 +08:00
Daniel
81628a136a fix: 优化后台数据 2026-03-02 15:35:40 +08:00
Daniel
84e97f3370 fix: 优化了代码 2026-03-02 14:32:32 +08:00
Daniel
049276fedd fix: 上传原始数据 2026-03-02 14:27:30 +08:00
Daniel
5460e806b6 fix: 优化git配置文件 2026-03-02 14:25:44 +08:00
Daniel
2d800094b1 fix:优化docker p配置 2026-03-02 14:23:36 +08:00
Daniel
36576592a2 fix: 优化docker 镜像 2026-03-02 14:10:43 +08:00
Daniel
783a69dad1 fix: 修复数据报错 2026-03-02 11:50:35 +08:00
Daniel
004d10b283 fix: 优化数据 2026-03-02 11:28:13 +08:00
Daniel
4a8fff5a00 fix:优化数据来源 2026-03-02 01:00:04 +08:00
Daniel
91d9e48e1e fix:优化整个大屏界面 2026-03-02 00:59:40 +08:00
141 changed files with 12038 additions and 987 deletions

13
.dockerignore Normal file
View File

@@ -0,0 +1,13 @@
node_modules
.git
.env
.env.local
*.log
dist
server/data.db
.DS_Store
*.md
.cursor
.venv
__pycache__
*.pyc

3
.env Normal file
View File

@@ -0,0 +1,3 @@
# Mapbox 地图令牌
VITE_MAPBOX_ACCESS_TOKEN=pk.eyJ1IjoiZDI5cTAiLCJhIjoiY21tYWQyOXI3MGFrZzJwcjJmZGltODI4ZCJ9.0jW_aK91VJExw6ffKGqWIA
DASHSCOPE_API_KEY=sk-029a4c4d761d49b99cfe6073234ac443

View File

@@ -1,46 +1,8 @@
# Mapbox 地图令牌 (波斯湾区域展示)
# Mapbox 地图令牌(仅在此或 .env 中配置,勿写进源码;若曾泄漏请到 Mapbox 控制台轮换)
# 免费申请: https://account.mapbox.com/access-tokens/
# 复制本文件为 .env 并填入你的 token
VITE_MAPBOX_ACCESS_TOKEN=your_mapbox_public_token_here
27 个基地完整 JSON 数据
[
{ "id": 1, "name": "Al Udeid Air Base", "country": "Qatar", "lat": 25.117, "lng": 51.314 },
{ "id": 2, "name": "Camp As Sayliyah", "country": "Qatar", "lat": 25.275, "lng": 51.520 },
{ "id": 3, "name": "Naval Support Activity Bahrain", "country": "Bahrain", "lat": 26.236, "lng": 50.608 },
{ "id": 4, "name": "Camp Arifjan", "country": "Kuwait", "lat": 28.832, "lng": 47.799 },
{ "id": 5, "name": "Ali Al Salem Air Base", "country": "Kuwait", "lat": 29.346, "lng": 47.520 },
{ "id": 6, "name": "Camp Buehring", "country": "Kuwait", "lat": 29.603, "lng": 47.456 },
{ "id": 7, "name": "Al Dhafra Air Base", "country": "UAE", "lat": 24.248, "lng": 54.547 },
{ "id": 8, "name": "Prince Sultan Air Base", "country": "Saudi Arabia", "lat": 24.062, "lng": 47.580 },
{ "id": 9, "name": "Eskan Village", "country": "Saudi Arabia", "lat": 24.774, "lng": 46.738 },
{ "id": 10, "name": "Al Asad Airbase", "country": "Iraq", "lat": 33.785, "lng": 42.441 },
{ "id": 11, "name": "Erbil Air Base", "country": "Iraq", "lat": 36.237, "lng": 43.963 },
{ "id": 12, "name": "Baghdad Diplomatic Support Center", "country": "Iraq", "lat": 33.315, "lng": 44.366 },
{ "id": 13, "name": "Camp Taji", "country": "Iraq", "lat": 33.556, "lng": 44.256 },
{ "id": 14, "name": "Ain al-Asad", "country": "Iraq", "lat": 33.800, "lng": 42.450 },
{ "id": 15, "name": "Al-Tanf Garrison", "country": "Syria", "lat": 33.490, "lng": 38.618 },
{ "id": 16, "name": "Rmelan Landing Zone", "country": "Syria", "lat": 37.015, "lng": 41.885 },
{ "id": 17, "name": "Shaddadi Base", "country": "Syria", "lat": 36.058, "lng": 40.730 },
{ "id": 18, "name": "Conoco Gas Field Base", "country": "Syria", "lat": 35.336, "lng": 40.295 },
{ "id": 19, "name": "Muwaffaq Salti Air Base", "country": "Jordan", "lat": 32.356, "lng": 36.259 },
{ "id": 20, "name": "Incirlik Air Base", "country": "Turkey", "lat": 37.002, "lng": 35.425 },
{ "id": 21, "name": "Kurecik Radar Station", "country": "Turkey", "lat": 38.354, "lng": 37.794 },
{ "id": 22, "name": "Nevatim Air Base", "country": "Israel", "lat": 31.208, "lng": 35.012 },
{ "id": 23, "name": "Ramon Air Base", "country": "Israel", "lat": 30.776, "lng": 34.666 },
{ "id": 24, "name": "Thumrait Air Base", "country": "Oman", "lat": 17.666, "lng": 54.024 },
{ "id": 25, "name": "Masirah Air Base", "country": "Oman", "lat": 20.675, "lng": 58.890 },
{ "id": 26, "name": "West Cairo Air Base", "country": "Egypt", "lat": 30.915, "lng": 30.298 },
{ "id": 27, "name": "Camp Lemonnier", "country": "Djibouti", "lat": 11.547, "lng": 43.159 }
]
# 阿里云 DashScope API Key爬虫 AI 提取用,不设则用规则或 Ollama
# 在 crawler 目录或系统环境变量中设置,例如:
# export DASHSCOPE_API_KEY=sk-xxx
DASHSCOPE_API_KEY=

3
.env的副本 Normal file
View File

@@ -0,0 +1,3 @@
# Mapbox 地图令牌
VITE_MAPBOX_ACCESS_TOKEN=pk.eyJ1IjoiZDI5cTAiLCJhIjoiY21tYWQyOXI3MGFrZzJwcjJmZGltODI4ZCJ9.0jW_aK91VJExw6ffKGqWIA
DASHSCOPE_API_KEY=sk-029a4c4d761d49b99cfe6073234ac443

5
.gitignore vendored
View File

@@ -23,10 +23,11 @@ dist-ssr
*.sln
*.sw?
# API database
# API databaseSQLite 文件,部署时应挂载卷持久化,勿提交)
server/data.db
# Env
# Env(含 token勿提交
.env
.env.local
.env.*.local
.pyc

0
=1.11.0 Normal file
View File

168
DEPLOY.md Normal file
View File

@@ -0,0 +1,168 @@
# Docker 部署到服务器
将 US-Iran 态势面板打包成 Docker 镜像,便于移植到任意服务器。
## 架构
| 服务 | 端口 | 说明 |
|--------|------|--------------------------|
| api | 3001 | 前端静态 + REST API + WebSocket |
| crawler| 8000 | RSS 爬虫 + GDELT内部服务 |
- 数据库SQLite挂载到 `app-data` volume`/data/data.db`
- 前端与 API 合并到同一镜像,构建时执行 `npm run build` 生成 dist含修订页 `/edit`),访问 `http://主机:3001` 即可
## 快速部署
```bash
# 1. 克隆项目
git clone <repo> usa-dashboard && cd usa-dashboard
# 2. 构建并启动(需先配置 Mapbox Token见下方
docker compose up -d --build
# 3. 访问
# 前端 + API: http://localhost:3001
# 爬虫状态: http://localhost:8000/crawler/status
```
## Mapbox Token地图展示
构建时需将 Token 传入前端,否则地图为占位模式:
```bash
# 方式 1.env 文件
echo "VITE_MAPBOX_ACCESS_TOKEN=pk.xxx" > .env
docker compose up -d --build
# 方式 2环境变量
VITE_MAPBOX_ACCESS_TOKEN=pk.xxx docker compose up -d --build
```
## 推送到私有仓库并移植
```bash
# 1. 打标签(替换为你的仓库地址)
docker compose build
docker tag usa-dashboard-api your-registry/usa-dashboard-api:latest
docker tag usa-dashboard-crawler your-registry/usa-dashboard-crawler:latest
# 2. 推送
docker push your-registry/usa-dashboard-api:latest
docker push your-registry/usa-dashboard-crawler:latest
# 3. 在目标服务器拉取并启动
docker pull your-registry/usa-dashboard-api:latest
docker pull your-registry/usa-dashboard-crawler:latest
# 需准备 docker-compose.yml 或等效编排,见下方
```
## 仅用镜像启动(无 compose
```bash
# 1. 创建网络与数据卷
docker network create usa-net
docker volume create usa-data
# 2. 启动 API前端+接口)
docker run -d --name api --network usa-net \
-p 3001:3001 \
-v usa-data:/data \
-e DB_PATH=/data/data.db \
usa-dashboard-api
# 3. 启动爬虫(通过 usa-net 访问 api
docker run -d --name crawler --network usa-net \
-v usa-data:/data \
-e DB_PATH=/data/data.db \
-e API_BASE=http://api:3001 \
-e CLEANER_AI_DISABLED=1 \
-e GDELT_DISABLED=1 \
usa-dashboard-crawler
```
爬虫通过 `API_BASE` 调用 Node 的 `/api/crawler/notify`,两容器需在同一网络内。
## 国内服务器 / 镜像加速
拉取 `node``python` 等基础镜像慢时:
1. **Docker 镜像加速**:见 [docs/DOCKER_MIRROR.md](docs/DOCKER_MIRROR.md)
2. **构建时使用国内镜像源**
```bash
docker compose build --build-arg REGISTRY=docker.m.daocloud.io/library/
docker compose up -d
```
## 常用操作
```bash
# 查看日志
docker compose logs -f
# 重启
docker compose restart
# 停止并删除容器(数据卷保留)
docker compose down
# 回填战损数据(从 situation_update 重新提取)
curl -X POST http://localhost:8000/crawler/backfill
```
## 服务器直接部署(不用 Docker
若在服务器上直接跑 Node不用 Docker要能访问修订页 `/edit`,需保证:
1. **先构建、再启动**:在项目根目录执行 `npm run build`,再启动 API如 `npm run api` 或 `node server/index.js`)。
未构建时没有 `dist` 目录,启动会打日志:`dist 目录不存在,前端页面(含 /edit 修订页)不可用`。
2. **若前面有 Nginx**`curl http://127.0.0.1:3001/edit` 已是 200 但浏览器访问 `/edit` 仍 404说明 Nginx 没有把前端路由交给后端或没做 SPA fallback。二选一即可
**方式 ANginx 只反代,所有页面由 Node 提供(推荐)**
```nginx
server {
listen 80;
server_name 你的域名;
location / {
proxy_pass http://127.0.0.1:3001;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
location /ws {
proxy_pass http://127.0.0.1:3001;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
}
```
**方式 BNginx 提供 dist 静态,仅 /api、/ws 反代**
```nginx
server {
listen 80;
server_name 你的域名;
root /path/to/项目根目录/dist; # 改成实际路径
index index.html;
location / {
try_files $uri $uri/ /index.html;
}
location /api {
proxy_pass http://127.0.0.1:3001;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
location /ws {
proxy_pass http://127.0.0.1:3001;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
}
```
修改后执行 `sudo nginx -t` 检查配置,再 `sudo systemctl reload nginx`(或 `sudo nginx -s reload`)。

41
Dockerfile Normal file
View File

@@ -0,0 +1,41 @@
# 前端 + 后端合一镜像:构建阶段产出 dist运行阶段提供静态与 API含修订页 /edit
# 国内服务器拉取慢时,可加 --build-arg REGISTRY=docker.m.daocloud.io/library
ARG REGISTRY=
# ---------- 阶段 1构建前端 ----------
FROM ${REGISTRY}node:20-slim AS frontend-builder
WORKDIR /app
RUN npm config set registry https://registry.npmmirror.com
COPY package*.json ./
RUN npm ci
COPY vite.config.ts index.html tsconfig.json tsconfig.app.json ./
COPY postcss.config.js tailwind.config.js ./
COPY src ./src
RUN npm run build
# ---------- 阶段 2运行API + 静态) ----------
FROM ${REGISTRY}node:20-slim
RUN npm config set registry https://registry.npmmirror.com
RUN rm -f /etc/apt/sources.list.d/debian.sources && \
echo 'deb http://mirrors.aliyun.com/debian bookworm main' > /etc/apt/sources.list && \
echo 'deb http://mirrors.aliyun.com/debian bookworm-updates main' >> /etc/apt/sources.list && \
echo 'deb http://mirrors.aliyun.com/debian-security bookworm-security main' >> /etc/apt/sources.list && \
apt-get update && apt-get install -y --no-install-recommends python3 make g++ && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY package*.json ./
RUN npm ci --omit=dev
COPY server ./server
COPY --from=frontend-builder /app/dist ./dist
ENV NODE_ENV=production
ENV API_PORT=3001
ENV DB_PATH=/data/data.db
EXPOSE 3001
COPY docker-entrypoint.sh ./
RUN chmod +x docker-entrypoint.sh
ENTRYPOINT ["./docker-entrypoint.sh"]

18
Dockerfile.crawler Normal file
View File

@@ -0,0 +1,18 @@
# Python 3.11+ 爬虫服务(与 requirements.txt / pyproject.toml 一致)
# 国内服务器可加 --build-arg REGISTRY=docker.m.daocloud.io/library
ARG REGISTRY=
FROM ${REGISTRY}python:3.11-slim
WORKDIR /app
COPY crawler/requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY crawler ./
ENV DB_PATH=/data/data.db
ENV API_BASE=http://api:3001
ENV GDELT_DISABLED=1
ENV RSS_INTERVAL_SEC=60
EXPOSE 8000
CMD ["uvicorn", "realtime_conflict_service:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -41,7 +41,13 @@ npm run api:seed
npm run api
```
开发时需同时运行前端与 API
开发时可用一键启动(推荐)
```bash
npm start
```
或分终端分别运行:
```bash
# 终端 1
@@ -53,6 +59,19 @@ npm run dev
API 会由 Vite 代理到 `/api`,前端通过 `/api/situation` 获取完整态势数据。数据库文件位于 `server/data.db`,可通过修改表数据实现动态调整。
### 爬虫不生效时
1. 测试 RSS 抓取:`npm run crawler:test`(需网络,返回抓取条数)
2. 单独启动爬虫查看日志:`npm run gdelt`(另开终端)
3. 查看爬虫状态:`curl http://localhost:8000/crawler/status`(需爬虫服务已启动)
4. 数据库面板 `/db` 每 30 秒自动刷新,可观察 situation_update 条数是否增加
### 面板数据 / 地图 / 战损不更新时
- **确保 API 与爬虫共用同一数据库**本地开发时Node 默认用 `server/data.db`,爬虫默认用 `../server/data.db`(同文件)。若 Node 在本地、爬虫在 Docker则数据库不同面板不会更新。
- **Docker 部署**`GDELT_DISABLED=1` 时,地图冲突点由 RSS 新闻填充;战损与基地状态由规则/AI 提取后写入 `combat_losses``key_location`
- **排查**:访问 `/db``situation_update``gdelt_events``combat_losses` 是否在增长;确认 API 已启动且前端能访问 `/api/situation`
## Development
```bash
@@ -65,6 +84,37 @@ npm run dev
npm run build
```
## Docker 部署
```bash
# 构建并启动(需 .env 中配置 VITE_MAPBOX_ACCESS_TOKEN 以启用地图)
docker compose up -d
# 访问前端http://localhost:3001
# 数据库与爬虫共享 volume首次启动自动 seed
```
**迁移到服务器**:见 [DEPLOY.md](DEPLOY.md)(构建、推送、单机/多机部署说明)
**拉取镜像超时?** 在 Docker Desktop 配置镜像加速,见 [docs/DOCKER_MIRROR.md](docs/DOCKER_MIRROR.md)
**开发时无需每次 rebuild**:使用开发模式挂载源码 + 热重载:
```bash
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d
```
- API`node --watch` 监听 `server/` 变更并自动重启
- 爬虫:`uvicorn --reload` 监听 `crawler/` 变更并自动重启
- 修改 `server/``crawler/` 后,服务会自动重载,无需重新 build
环境变量(可选,在 .env 或 docker-compose.yml 中配置):
- `VITE_MAPBOX_ACCESS_TOKEN`Mapbox 令牌,构建时注入
- `DB_PATH`:数据库路径(默认 /data/data.db
- `CLEANER_AI_DISABLED=1`:爬虫默认禁用 Ollama
- `GDELT_DISABLED=1`:爬虫默认禁用 GDELT国内易超时
## Project Structure
```
@@ -91,3 +141,6 @@ server/
├── seed.js # 数据库种子脚本
└── data.db # SQLite 数据库(运行 seed 后生成)
```
https://git.bimwe.com/Daniel/usa.git

459
crawler/README.md Normal file
View File

@@ -0,0 +1,459 @@
# GDELT 实时冲突服务 + 新闻爬虫
## 数据来源梳理
### 1. GDELT Project (gdelt_events)
| 项目 | 说明 |
|------|------|
| API | `https://api.gdeltproject.org/api/v2/doc/doc` |
| 查询 | `query=United States Iran military`(可配 `GDELT_QUERY` |
| 模式 | `mode=ArtList``format=json``maxrecords=30` |
| 时间范围 | **未指定时默认最近 3 个月**,按相关性排序,易返回较旧文章 |
| 更新频率 | GDELT 约 15 分钟级,爬虫 60 秒拉一次 |
**数据偏老原因**:未传 `timespan``sort=datedesc`API 返回 3 个月内“最相关”文章,不保证最新。
### 2. RSS 新闻 → 看板实时数据(主输出)+ 事件脉络
| 项目 | 说明 |
|------|------|
| **主输出** | **看板实时数据**战损combat_losses、据点状态key_location、冲突事件gdelt_events、统计conflict_stats供前端战损/基地/地图等面板展示。 |
| 辅助输出 | 事件脉络situation_update时间线摘要非主展示目标。 |
| 源 | 多国主流媒体:美/英/法/俄/中/伊/卡塔尔等(见 `config.RSS_FEEDS` |
| 过滤 | 标题/摘要需含 `KEYWORDS` 之一iran、usa、strike、military 等) |
| 更新 | 爬虫按 `RSS_INTERVAL_SEC` 拉取;每 `BACKFILL_CYCLES` 轮会从近期事件回填一次战损/据点,保证面板数据与最新内容一致。 |
**GDELT 无法访问时**:设置 `GDELT_DISABLED=1`,仅用 RSS部分境外源可能需代理。
### 3. AI 新闻清洗与分类(可选)
- **清洗**`cleaner_ai.py` 用 Ollama 提炼新闻为简洁摘要,供面板展示
- **分类**`parser_ai.py` 用 Ollama 替代规则做 category/severity 判定
- 需先安装并运行 Ollama`ollama run llama3.1`
- 环境变量:`OLLAMA_MODEL=llama3.1``PARSER_AI_DISABLED=1``CLEANER_AI_DISABLED=1`(禁用对应 AI
---
**看板实时数据更新**:爬虫抓取 → 提取战损/据点等 → 写入 combat_losses、key_location 等 → 调用 Node 通知 → WebSocket 广播 → 前端战损/基地/地图等面板刷新。事件脉络(时间线)为同一流水线的辅助输出。
## 依赖
- **Python 3.11+**(推荐 3.11 或 3.12
```bash
pip install -r requirements.txt
```
或使用 pyproject`pip install -e crawler/`(在项目根目录)。
- `deep-translator`GDELT 与 RSS 新闻入库前自动翻译为中文。
- `dashscope`:可选,配置 `DASHSCOPE_API_KEY` 后启用通义提取/清洗。
## 运行(需同时启动 3 个服务)
| 终端 | 命令 | 说明 |
|------|------|------|
| 1 | `npm run api` | Node API + WebSocket必须 |
| 2 | `npm run gdelt` | GDELT + RSS 爬虫(**事件脉络数据来源** |
| 3 | `npm run dev` | 前端开发 |
**事件脉络不更新时**:多半是未启动 `npm run gdelt`。只跑 `npm run api` 时,事件脉络会显示空或仅有缓存。
## 如何检查爬虫是否工作正常
按下面顺序做即可确认整条链路(爬虫 → 数据库 → Node 重载 → API/WebSocket正常。
### 1. 一键验证(推荐)
先启动 API再执行验证脚本可选是否顺带启动爬虫
```bash
# 终端 1必须
npm run api
# 终端 2执行验证不启动爬虫只检查当前状态
./scripts/verify-pipeline.sh
# 或:顺带启动爬虫并等首次抓取后再验证
./scripts/verify-pipeline.sh --start-crawler
```
脚本会检查API 健康、态势数据含 `lastUpdated`、爬虫服务是否可达、`news_content`/situation_update、战损字段、`POST /api/crawler/notify` 是否可用。
### 2. 手动快速检查
| 步骤 | 命令 / 操作 | 正常表现 |
|-----|-------------|----------|
| API 是否在跑 | `curl -s http://localhost:3001/api/health` | 返回 `{"ok":true}` |
| 态势是否可读 | `curl -s http://localhost:3001/api/situation \| head -c 300` | 含 `lastUpdated``usForces``recentUpdates` |
| RSS 能否抓到 | `npm run crawler:test` | 输出「RSS 抓取: N 条」N>0 表示有命中 |
| 爬虫服务gdelt | `curl -s http://localhost:8000/crawler/status` | 返回 JSON`db_path`/`db_exists` 等 |
| 库里有无爬虫数据 | `sqlite3 server/data.db "SELECT COUNT(*) FROM situation_update; SELECT COUNT(*) FROM news_content;"` 或访问 `http://localhost:3001/api/db/dashboard` | situation_update、news_content 条数 > 0跑过流水线后 |
| 通知后是否重载 | 爬虫写库后会 POST `/api/crawler/notify`Node 会 `reloadFromFile` 再广播 | 前端/`/api/situation``lastUpdated` 和内容会更新 |
### 3. 跑一轮流水线(不常驻爬虫时)
不启动 gdelt 时,可单次跑完整流水线(抓取 → 去重 → 写表 → notify
```bash
npm run api # 保持运行
cd crawler && python3 -c "
from pipeline import run_full_pipeline
from config import DB_PATH, API_BASE
n_fetched, n_news, n_panel = run_full_pipeline(db_path=DB_PATH, api_base=API_BASE, notify=True)
print('抓取:', n_fetched, '去重新增:', n_news, '面板写入:', n_panel)
"
```
有网络且有关键词命中时,应看到非零数字;再查 `curl -s http://localhost:3001/api/situation` 或前端事件脉络是否出现新数据。
**按时间范围测试(例如 2 月 28 日 0 时至今)**RSS 流水线支持只保留指定起始时间之后的条目,便于测试「从某日 0 点到现在」的数据。
```bash
# 默认从 2026-02-28 0:00 到现在
npm run crawler:once:range
# 或指定起始时间
./scripts/run-crawler-range.sh 2026-02-28T00:00:00
```
需设置环境变量 `CRAWL_START_DATE`ISO 时间,如 `2026-02-28T00:00:00`。GDELT 时间范围在启动 gdelt 服务时设置,例如:`GDELT_TIMESPAN=3d npm run gdelt`(最近 3 天)。
### 4. 仅测提取逻辑(不写库)
```bash
npm run crawler:test:extraction # 规则/db_merge 测试
# 或按 README「快速自测命令」用示例文本调 extract_from_news 看 combat_losses_delta / key_location_updates
```
**常见现象**:抓取 0 条 → 网络/RSS 被墙或关键词未命中situation_update 为空 → 未跑流水线或去重后无新增;前端不刷新 → 未开 `npm run api` 或未开爬虫gdelt
### 5. 爬虫与面板是否联通
专门检查「爬虫写库」与「面板展示」是否一致:
```bash
./scripts/check-crawler-panel-connectivity.sh
```
会对比:爬虫侧的 `situation_update` 条数 vs 面板 API 返回的 `recentUpdates` 条数,并说明为何战损/基地等不一定随每条新闻变化。
## 爬虫与面板数据联动说明
| 面板展示 | 数据来源(表/接口) | 是否由爬虫更新 | 说明 |
|----------|---------------------|----------------|------|
| **事件脉络** (recentUpdates) | situation_update → getSituation() | ✅ 是 | 每条去重后的新闻会写入 situation_updateNode 收到 notify 后重载 DB 再广播 |
| **地图冲突点** (conflictEvents) | gdelt_events 或 RSS→gdelt 回填 | ✅ 是 | GDELT 或 GDELT 禁用时由 situation_update 同步到 gdelt_events |
| **战损/装备毁伤** (combatLosses) | combat_losses | ⚠️ 有条件 | 仅当 AI/规则从新闻中提取到数字如「2 名美军死亡」merge 才写入增量 |
| **基地/地点状态** (keyLocations) | key_location | ⚠️ 有条件 | 仅当提取到 key_location_updates如某基地遭袭时更新 |
| **地图打击/攻击动画** (mapData.strikeSources, strikeLines) | map_strike_source, map_strike_line | ⚠️ 有条件 | 仅当提取到 map_strike_sources / map_strike_lines 时写入;格式见下「地图打击数据」 |
| **力量摘要/指数/资产** (summary, powerIndex, assets) | force_summary, power_index, force_asset | ❌ 否 | 仅 seed 初始化,爬虫不写 |
| **华尔街/报复情绪** (wallStreet, retaliation) | wall_street_trend, retaliation_* | ⚠️ 有条件 | 仅当提取器输出对应字段时更新 |
因此:**新闻很多、但战损/基地数字不动**是正常现象——多数标题不含可解析的伤亡/基地数字只有事件脉络recentUpdates和地图冲突点会随每条新闻增加。若**事件脉络也不更新**,请确认 Node 终端在爬虫每轮抓取后是否出现 `[crawler/notify] DB 已重载`;若无,检查爬虫的 `API_BASE` 是否指向当前 API默认 `http://localhost:3001`)。
## 写库流水线(与 server/README 第五节一致)
RSS 与主入口均走统一流水线 `pipeline.run_full_pipeline`
1. **抓取** → 2. **AI 清洗**(标题/摘要/分类)→ 3. **去重**news_content.content_hash→ 4. **映射到前端库字段**situation_update、combat_losses、key_location 等)→ 5. **更新表** → 6. **有新增时 POST /api/crawler/notify**
- `npm run crawler`main.py`npm run gdelt`realtime_conflict_service的 RSS 分支都调用该流水线。
- 实现见 `crawler/pipeline.py`
## 数据流
```
GDELT API → 抓取(60s) → SQLite (gdelt_events, conflict_stats) → POST /api/crawler/notify
RSS → 抓取 → 清洗 → 去重 → 写 news_content / situation_update / 战损等 → POST /api/crawler/notify
Node 更新 situation.updated_at + WebSocket 广播
前端实时展示
```
## 配置
环境变量:
- `DB_PATH`: SQLite 路径,默认 `../server/data.db`
- `API_BASE`: Node API 地址,默认 `http://localhost:3001`
- **`DASHSCOPE_API_KEY`**阿里云通义DashScopeAPI Key。**设置后全程使用商业模型,无需本机安装 Ollama**(适合 Mac 版本较低无法跑 Ollama 的情况)。获取: [阿里云百炼 / DashScope](https://dashscope.console.aliyun.com/) → 创建 API-KEY复制到环境变量或项目根目录 `.env``DASHSCOPE_API_KEY=sk-xxx`。摘要、分类、战损/基地提取均走通义。
- `GDELT_QUERY`: 搜索关键词,默认 `United States Iran military`
- `GDELT_MAX_RECORDS`: 最大条数,默认 30
- `GDELT_TIMESPAN`: 时间范围,`1h` / `1d` / `1week`,默认 `1d`(近日资讯)
- `GDELT_DISABLED`: 设为 `1` 则跳过 GDELT仅用 RSS 新闻GDELT 无法访问时用)
- `FETCH_INTERVAL_SEC`: GDELT 抓取间隔(秒),默认 60
- `RSS_INTERVAL_SEC`: RSS 抓取间隔(秒),默认 45优先保证事件脉络
- `OLLAMA_MODEL`: AI 分类模型,默认 `llama3.1`
- `PARSER_AI_DISABLED`: 设为 `1` 则禁用 AI 分类,仅用规则
- `CLEANER_AI_DISABLED`: 设为 `1` 则禁用 AI 清洗,仅用规则截断
- `FETCH_FULL_ARTICLE`: 设为 `0` 则不再抓取正文,仅用标题+摘要做 AI 提取(默认 `1` 抓取正文)
- `ARTICLE_FETCH_LIMIT`: 每轮为多少条新资讯抓取正文,默认 10
- `ARTICLE_FETCH_TIMEOUT`: 单篇正文请求超时(秒),默认 12
- `ARTICLE_MAX_BODY_CHARS`: 正文最大字符数,默认 6000
- `EXTRACT_TEXT_MAX_LEN`: 送入 AI 提取的原文最大长度,默认 4000
**增量与地点**:战损一律按**增量**处理——AI 只填本则报道的「本次/此次」新增数,不填累计总数;合并时与库内当前值叠加。双方攻击地点通过 `key_location_updates` 更新(美军基地被打击 side=us伊朗设施被打击 side=iran会写入 `key_location` 的 status/damage_level。
---
## 主要新闻资讯来源RSS
配置在 `crawler/config.py``RSS_FEEDS`,当前包含:
| 来源 | URL / 说明 |
|------|------------|
| **美国** | Reuters Top News、NYT World |
| **英国** | BBC World、BBC Middle East、The Guardian World |
| **法国** | France 24 |
| **德国** | DW World |
| **俄罗斯** | TASS、RT |
| **中国** | Xinhua World、CGTN World |
| **凤凰** | 凤凰军事、凤凰国际feedx.net 镜像) |
| **伊朗** | Press TV |
| **卡塔尔/中东** | Al Jazeera All、Al Jazeera Middle East |
单源超时由 `FEED_TIMEOUT`(默认 12 秒)控制;某源失败不影响其他源。
**过滤**:每条条目的标题+摘要必须命中 `config.KEYWORDS` 中至少一个关键词才会进入流水线(伊朗/美国/中东/军事/基地/霍尔木兹等,见 `config.KEYWORDS`)。
### 境内可访问情况(仅供参考,以实际网络为准)
| 通常境内可直接访问 | 说明 |
|-------------------|------|
| **新华网** `english.news.cn/rss/world.xml` | 中国官方外文社 |
| **CGTN** `cgtn.com/rss/world` | 中国国际台 |
| **凤凰** `feedx.net/rss/ifengmil.xml``ifengworld.xml` | 第三方 RSS 镜像,中文军事/国际 |
| **人民网** `people.com.cn/rss/military.xml``world.xml` | 军事、国际 |
| **新浪** `rss.sina.com.cn` 军事/新闻 | 新浪军事、新浪新闻滚动 |
| **中国日报** `chinadaily.com.cn/rss/world_rss.xml` | 国际新闻 |
| **中国军网** `english.chinamil.com.cn/rss.xml` | 解放军报英文 |
| **俄通社 TASS** `tass.com/rss/v2.xml` | 俄罗斯官媒 |
| **RT** `rt.com/rss/` | 俄罗斯今日俄罗斯 |
| **DW** `rss.dw.com/xml/rss-en-world` | 德国之声,部分地区/时段可访问 |
**境内常需代理**Reuters、NYT、BBC、Guardian、France 24、Al Jazeera、Press TV 等境外主站 RSS直连易超时或被墙。境内部署建议`CRAWLER_USE_PROXY=1` 并配置代理,或仅保留上表源(可在 `config.py` 中注释掉不可达的 URL减少超时等待
**国内其他媒体(今日头条、网易、腾讯、新浪微博等)**:今日头条、腾讯新闻、新浪微博等多为 App/信息流产品,**无官方公开 RSS**。如需接入可考虑:第三方 RSS 聚合(如 FeedX、RSSHub 等若有对应频道)、或平台开放 API若有且合规使用。当前爬虫已加入新浪rss.sina.com.cn、人民网、中国日报、中国军网等有明确 RSS 的境内源;网易新闻曾有 RSS 中心页,具体栏目 XML 需在其订阅页查找后加入 `config.py`
---
## 为什么爬虫一直抓不到有效信息0 条)
常见原因与应对如下。
| 原因 | 说明 | 建议 |
|------|------|------|
| **RSS 源在国内不可达** | 多数源为境外站Reuters、BBC、NYT、Guardian、France24、DW、TASS、RT、Al Jazeera、Press TV 等),国内直连易超时或被墙。 | 使用代理:设 `CRAWLER_USE_PROXY=1` 并配置系统/环境 HTTP(S) 代理,或部署到海外服务器再跑爬虫。 |
| **关键词无一命中** | 只有标题或摘要里包含 `KEYWORDS` 中至少一个词才会保留(如 iran、usa、middle east、strike、基地 等)。若当前头条都不涉及美伊/中东,整轮会 0 条。 | 先跑 `npm run crawler:test` 看是否 0 条;若长期为 0 且网络正常,可在 `config.py` 中适当放宽或增加 `KEYWORDS`(如增加通用词做测试)。 |
| **单源超时导致整轮无结果** | 若所有源都在 `FEED_TIMEOUT` 内未返回,则每源返回空列表,汇总仍为 0 条。 | 增大 `FEED_TIMEOUT`(如 20或先单独用浏览器/curl 测某条 RSS URL 是否可访问;国内建议代理后再试。 |
| **分类/清洗依赖 AI 且失败** | 每条命中关键词的条目会调 `classify_and_severity`Ollama 或 DashScope。若本机未起 Ollama、未设 DashScope且规则兜底异常可能影响该条。 | 设 `PARSER_AI_DISABLED=1` 使用纯规则分类,避免依赖 Ollama/DashScope或配置好 `DASHSCOPE_API_KEY` / 本地 Ollama 再跑。 |
| **去重后无新增** | 抓到的条数 >0但经 `news_content` 的 content_hash 去重后「新增」为 0则不会写 `situation_update`,事件脉络不增加。 | 属正常:同一批新闻再次抓取不会重复写入。等有新头条命中关键词后才会出现新条目。 |
**快速自检**
```bash
npm run crawler:test
```
输出「RSS 抓取: N 条」。若始终为 0优先检查网络/代理与 `KEYWORDS`;若 N>0 但面板无新事件,多为去重后无新增或未调 `POST /api/crawler/notify`
---
## 数据流与 AI 自检
**完整链路**RSS 抓取 → 关键词过滤 → 翻译/清洗 → 去重news_content→ 写 situation_update → 正文抓取(可选)→ **AI 提取**(战损/基地等)→ db_merge 写 combat_losses/key_location 等 → POST /api/crawler/notify → Node 重载并广播。
| 环节 | 说明 | 自检 |
|------|------|------|
| 抓取 | `scrapers/rss_scraper.fetch_all()`,按 KEYWORDS 过滤 | `npm run crawler:test` 看条数 |
| 去重 | `news_storage.save_and_dedup()`content_hash 落库 news_content | 查 `news_content` 表条数 |
| 事件脉络 | `db_writer.write_updates()` 写 situation_update与 pipeline 使用同一 db_path | 查 `situation_update` 表 |
| AI 提取 | 战损/基地等:**有 DASHSCOPE_API_KEY 用通义****否则 CLEANER_AI_DISABLED=1 用规则**,否则用 **Ollama**extractor_ai | 见下 |
| 分类/严重度 | 每条 RSS 的 category/severity**PARSER_AI_DISABLED=1 用规则**,否则 DashScope 或 Ollama | 无 AI 时设 `PARSER_AI_DISABLED=1` 可正常跑 |
**如何保证「面板实时数据」有更新**(战损、据点等):
- **推荐**:设 `CLEANER_AI_DISABLED=1` → 使用 `extractor_rules`(纯规则),无需 Ollama/通义,即可从新闻中提取战损/基地并写入 combat_losses、key_location。
- 或设 `DASHSCOPE_API_KEY` → 用通义做更细的提取。
- 否则用 `extractor_ai`(需本机 `ollama run llama3.1`),未起则提取静默失败、面板数字不更新。
- 服务会每 `BACKFILL_CYCLES` 轮(默认 2 轮)从近期事件再跑一次提取并合并,保证战损/据点与最新内容一致。
**常见 bug 与修复**
- **事件脉络有、战损/基地不更新**:多为 AI 未跑通Ollama 未起且未设 DashScope、未设 CLEANER_AI_DISABLED。可设 `CLEANER_AI_DISABLED=1` 用规则提取,或起 Ollama / 配置 DashScope。
- **多 DB 路径不一致**pipeline 已统一 `db_path``write_updates``save_and_dedup``merge` 均使用同一 path`config.DB_PATH`)。
---
## 优化后验证效果示例
以下为「正文抓取 + AI 精确提取 + 增量与地点更新」优化后,单条新闻从输入到前端展示的完整示例,便于对照验证。
### 1. 示例输入(新闻摘要/全文片段)
```
伊朗向伊拉克阿萨德空军基地发射 12 枚弹道导弹,造成此次袭击中 2 名美军人员死亡、14 人受伤,
另有 1 架战机在跑道受损。乌代德基地未遭直接命中。同日以色列对伊朗伊斯法罕一处设施发动打击。
```
### 2. AI 提取输出(增量 + 攻击地点)
```json
{
"summary": "伊朗导弹袭击伊拉克阿萨德基地致美军 2 死 14 伤1 架战机受损;以军打击伊斯法罕。",
"category": "alert",
"severity": "high",
"us_personnel_killed": 2,
"us_personnel_wounded": 14,
"us_aircraft": 1,
"us_bases_damaged": 1,
"key_location_updates": [
{ "name_keywords": "阿萨德|asad|al-asad", "side": "us", "status": "attacked", "damage_level": 2 },
{ "name_keywords": "伊斯法罕|isfahan", "side": "iran", "status": "attacked", "damage_level": 1 }
]
}
```
说明:战损为**本则报道的新增数**(此次 2 死、14 伤、1 架战机),不是累计总数;地点为双方遭袭设施(美军基地 side=us伊朗设施 side=iran
### 3. 合并后数据库变化
| 表/字段 | 合并前 | 本则增量 | 合并后 |
|--------|--------|----------|--------|
| combat_losses.us.personnel_killed | 127 | +2 | 129 |
| combat_losses.us.personnel_wounded | 384 | +14 | 398 |
| combat_losses.us.aircraft | 2 | +1 | 3 |
| combat_losses.us.bases_damaged | 27 | +1 | 28 |
| key_locationname 含「阿萨德」) | status=operational | — | status=attacked, damage_level=2 |
| key_locationname 含「伊斯法罕」) | status=operational | — | status=attacked, damage_level=1 |
若 AI 误提「累计 2847 人丧生」并填成 personnel_killed=2847单次合并会被上限截断如最多 +500避免一次写入导致数据剧增。
### 4. 前端验证效果
- **事件脉络**出现一条新条目summary 为上述 12 句概括category=alert、severity=high。
- **装备毁伤面板**:美军「阵亡」+2、「受伤」+14、「战机」+1基地毁/损数字随 bases_damaged +1 更新。
- **地图**:阿萨德基地、伊斯法罕对应点位显示为「遭袭」状态(脉冲/标色随现有地图逻辑)。
- **API**`GET /api/situation``usForces.combatLosses``usForces.keyLocations`(含 status/damage_level为更新后值`lastUpdated` 为合并后时间。
### 5. 快速自测命令
```bash
# 仅测提取逻辑(不写库):用示例文本调 AI 提取,看是否得到增量 + key_location_updates
cd crawler && python3 -c "
from extractor_ai import extract_from_news
text = '''伊朗向伊拉克阿萨德空军基地发射导弹,此次袭击造成 2 名美军死亡、14 人受伤1 架战机受损。'''
out = extract_from_news(text)
print('combat_losses_delta:', out.get('combat_losses_delta'))
print('key_location_updates:', out.get('key_location_updates'))
"
```
期望:`combat_losses_delta.us` 含 personnel_killed=2、personnel_wounded=14、aircraft=1 等增量;`key_location_updates` 含阿萨德 side=us 等条目。
### 地图打击数据(与前端攻击动画统一格式)
爬虫/AI 若输出以下字段,`db_merge` 会写入 `map_strike_source``map_strike_line``GET /api/situation``mapData.strikeSources` / `mapData.strikeLines` 会更新,前端可直接追加打击线与飞行动画。
- **map_strike_sources**(可选):`[{ "id": "israel"|"lincoln"|"ford", "name": "显示名", "lng": 经度, "lat": 纬度 }]`,与 seed 中打击源 id 一致时可覆盖位置。
- **map_strike_lines**(可选):`[{ "source_id": "israel"|"lincoln"|"ford", "target_lng", "target_lat", "target_name": "目标名", "struck_at": "ISO 时间" }]`,每条追加一条打击线(不删已有),便于按时间回放。
示例:`{ "map_strike_lines": [{ "source_id": "israel", "target_lng": 51.916, "target_lat": 33.666, "target_name": "纳坦兹", "struck_at": "2026-03-01T02:04:00.000Z" }] }`
---
## 冲突强度 (impact_score)
| 分数 | 地图效果 |
|------|------------|
| 13 | 绿色点 |
| 46 | 橙色闪烁 |
| 710 | 红色脉冲扩散 |
## API
- `GET http://localhost:8000/events`返回事件列表与冲突统计Python 服务直连)
- `GET http://localhost:3001/api/events`:从 Node 读取(推荐,含 WebSocket 同步)
## 本地验证链路
按下面任选一种方式,确认「抓取 → 清洗 → 去重 → 映射 → 写表 → 通知」整条链路正常。
### 方式一:最小验证(不启动前端)
1. **启动 API必须**
```bash
npm run api
```
保持运行,默认 `http://localhost:3001`。
2. **安装爬虫依赖并跑一轮流水线**
```bash
cd crawler && pip install -r requirements.txt
python -c "
from pipeline import run_full_pipeline
from config import DB_PATH, API_BASE
n_fetched, n_news, n_panel = run_full_pipeline(db_path=DB_PATH, api_base=API_BASE, translate=True, notify=True)
print('抓取:', n_fetched, '去重新增:', n_news, '面板写入:', n_panel)
"
```
- 有网络且有关键词命中时,应看到非零数字;无网络或全被过滤则为 `0 0 0`。
- 若报错 `module 'socket' has no attribute 'settimeout'`,已修复为 `setdefaulttimeout`,请拉取最新代码。
3. **查库确认**
```bash
sqlite3 server/data.db "SELECT COUNT(*) FROM situation_update; SELECT COUNT(*) FROM news_content;"
```
或浏览器打开 `http://localhost:3001/api/db/dashboard`,看 `situation_update`、`news_content` 是否有数据。
4. **确认态势接口**
```bash
curl -s http://localhost:3001/api/situation | head -c 500
```
应包含 `lastUpdated`、`recentUpdates` 等。
### 方式二:用现有验证脚本(推荐)
1. 终端 1`npm run api`
2. 终端 2可选`npm run gdelt`(会定时跑 RSS + GDELT
3. 执行验证脚本:
```bash
./scripts/verify-pipeline.sh
```
若爬虫未启动想一并测爬虫,可:
```bash
./scripts/verify-pipeline.sh --start-crawler
```
脚本会检查API 健康、态势数据、爬虫状态、资讯表、战损字段、通知接口。
### 方式三:只测 RSS 抓取(不写库)
```bash
npm run crawler:test
```
输出为「RSS 抓取: N 条」。0 条时检查网络或 `config.py` 里 `RSS_FEEDS` / `KEYWORDS`。
### 常见问题
| 现象 | 可能原因 |
|------|----------|
| 抓取 0 条 | 网络不通、RSS 被墙、关键词无一命中 |
| `situation_update` 为空 | 去重后无新增,或未跑流水线(只跑了 `fetch_all` 未跑 `run_full_pipeline` |
| 前端事件脉络不刷新 | 未启动 `npm run api` 或 WebSocket 未连上(需通过 Vite 代理访问前端) |
| 翻译/AI 清洗很慢或报错 | 设 `TRANSLATE_DISABLED=1` 或 `CLEANER_AI_DISABLED=1` 可跳过,用规则兜底 |
---
## 故障排查
| 现象 | 可能原因 | 排查 |
|------|----------|------|
| 事件脉络始终为空 | 未启动 GDELT 爬虫 | 另开终端运行 `npm run gdelt`,观察是否有 `GDELT 更新 X 条事件` 输出 |
| 事件脉络不刷新 | WebSocket 未连上 | 确认 `npm run api` 已启动,前端需通过 `npm run dev` 访问Vite 会代理 /ws |
| GDELT 抓取失败 | 系统代理超时 / ProxyError | 爬虫默认直连,不走代理;若需代理请设 `CRAWLER_USE_PROXY=1` |
| GDELT 抓取失败 | 网络 / GDELT API 限流 | 检查 Python 终端报错GDELT 在国外,国内网络可能较慢或超时 |
| 新闻条数为 0 | RSS 源被墙或关键词不匹配 | 检查 crawler/config.py 中 RSS_FEEDS、KEYWORDS国内需代理 |
| **返回数据偏老** | GDELT 默认 3 个月内按相关性 | 设置 `GDELT_TIMESPAN=1d` 限制为近日;加 `sort=datedesc` 最新优先 |

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
"""
Fetch full article bodies from source URLs so the AI extractor has precise data.

RSS feeds only carry a title and a short summary; the full body provides the
concrete numbers and facts (casualties, unit designations, locations, etc.)
that downstream extraction needs.
"""
import os
import re
from typing import Optional

# Per-page request timeout in seconds (env ARTICLE_FETCH_TIMEOUT, default 12).
FETCH_TIMEOUT = int(os.environ.get("ARTICLE_FETCH_TIMEOUT", "12"))
# Maximum number of body characters kept, to avoid over-long AI inputs.
MAX_BODY_CHARS = int(os.environ.get("ARTICLE_MAX_BODY_CHARS", "6000"))
# Whether body fetching is enabled at all (set FETCH_FULL_ARTICLE=0 to use
# only the title + summary from the feed).
FETCH_FULL_ARTICLE = os.environ.get("FETCH_FULL_ARTICLE", "1") == "1"
def _strip_html(html: str) -> str:
"""简单去除 HTML 标签与多余空白"""
if not html:
return ""
text = re.sub(r"<script[^>]*>[\s\S]*?</script>", " ", html, flags=re.I)
text = re.sub(r"<style[^>]*>[\s\S]*?</style>", " ", text, flags=re.I)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def fetch_article_body(url: str, timeout: int = FETCH_TIMEOUT) -> Optional[str]:
    """
    Fetch an article URL and return its body as plain text, or None.

    Returns None when: the URL is empty / not http(s), the request fails,
    the Content-Type is neither HTML nor XML, or the payload is shorter
    than 200 characters. Prefers BeautifulSoup to locate article/main or
    common content containers; falls back to regex tag stripping when bs4
    is unavailable or no container yields enough text. Output is capped
    at MAX_BODY_CHARS.
    """
    if not url or not url.strip().startswith("http"):
        return None
    try:
        import requests
        headers = {"User-Agent": "US-Iran-Dashboard/1.0 (News Aggregator)"}
        # Bypass any system proxy by default (a proxy can time out);
        # set CRAWLER_USE_PROXY=1 to honor environment proxy settings.
        proxies = {"http": None, "https": None} if os.environ.get("CRAWLER_USE_PROXY") != "1" else None
        r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        ct = (r.headers.get("Content-Type") or "").lower()
        if "html" not in ct and "xml" not in ct:
            return None
        html = r.text
        if not html or len(html) < 200:
            return None
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # bs4 not installed: degrade to plain regex tag stripping.
            return _strip_html(html)[:MAX_BODY_CHARS]
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Try common article containers first; entries starting with
            # '.' or '[' are CSS selectors, the rest are tag names.
            for tag in ("article", "main", "[role='main']", ".article-body", ".post-content", ".entry-content", ".content"):
                if tag.startswith((".", "[")):
                    node = soup.select_one(tag)
                else:
                    node = soup.find(tag)
                if node:
                    body = node.get_text(separator=" ", strip=True)
                    # Only accept a container with substantial text.
                    if len(body) > 300:
                        return _strip_html(body)[:MAX_BODY_CHARS]
            body = soup.body.get_text(separator=" ", strip=True) if soup.body else ""
            if len(body) > 300:
                return _strip_html(body)[:MAX_BODY_CHARS]
        except Exception:
            pass
        # Last resort: strip tags from the raw HTML.
        return _strip_html(html)[:MAX_BODY_CHARS]
    except Exception:
        return None
def enrich_item_with_body(item: dict, max_chars: int = MAX_BODY_CHARS) -> None:
    """Fetch the article body for item["url"] and store the combined
    title/summary/body text in item["full_text"], truncated to max_chars.
    Mutates item in place; no-op when fetching is disabled, the URL is
    empty, the item already has full_text, or the fetch yields nothing.
    Gives the AI extractor more context than the RSS snippet alone.
    """
    if not FETCH_FULL_ARTICLE:
        return
    link = (item.get("url") or "").strip()
    if not link or item.get("full_text"):
        return
    body_text = fetch_article_body(link)
    if not body_text:
        return
    parts = [(item.get("title") or "").strip()]
    abstract = (item.get("summary") or "").strip()
    if abstract:
        parts.append(abstract)
    parts.append(body_text)
    item["full_text"] = "\n".join(parts)[:max_chars]

125
crawler/cleaner_ai.py Normal file
View File

@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-
"""
AI 清洗新闻数据,严格按面板字段约束输出
面板 EventTimelinePanel 所需summary(≤120字)、category(枚举)、severity(枚举)
优先使用 DASHSCOPE_API_KEY通义无需 Ollama否则 Ollama最后规则兜底
"""
import os
import re
from typing import Optional
CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "").strip()
# 面板 schema必须与 EventTimelinePanel / SituationUpdate 一致
SUMMARY_MAX_LEN = 120 # 面板 line-clamp-2 展示
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")
def _sanitize_summary(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
"""确保 summary 符合面板:纯文本、无换行、限制长度"""
if not text or not isinstance(text, str):
return ""
s = re.sub(r"\s+", " ", str(text).strip())
s = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", s) # 去除控制字符
return s[:max_len].rstrip()
def _rule_clean(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
"""规则清洗:去空白、去控制符、截断"""
return _sanitize_summary(text, max_len)
def _call_dashscope_summary(text: str, max_len: int, timeout: int = 8) -> Optional[str]:
    """Summarize news text via Alibaba DashScope (qwen-turbo); no Ollama needed.

    Requires DASHSCOPE_API_KEY. Returns a sanitized plain-text summary of at
    most max_len characters, or None when disabled, the input is too short,
    the API call fails, or the model output is unusable (callers then fall
    back to Ollama or rule cleaning).
    NOTE(review): `timeout` is not forwarded to the SDK call — confirm
    whether dashscope.Generation.call supports a timeout parameter.
    """
    if not DASHSCOPE_API_KEY or CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
        return None
    try:
        import dashscope
        from http import HTTPStatus
        dashscope.api_key = DASHSCOPE_API_KEY
        # Bug fix: the length limit in the prompt was a garbled literal
        # ("限69,684字内"); it now states the actual max_len constraint.
        prompt = f"""将新闻提炼为1-2句简洁中文事实直接输出纯文本不要标号、引号、解释。限{max_len}字内。
原文:{str(text)[:350]}
输出:"""
        r = dashscope.Generation.call(
            model="qwen-turbo",
            messages=[{"role": "user", "content": prompt}],
            result_format="message",
            max_tokens=150,
        )
        if r.status_code != HTTPStatus.OK:
            return None
        out = (r.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip()
        out = re.sub(r"^[\d\.\-\*\s]+", "", out)  # drop leading list numbering
        out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)  # drop wrapping quotes
        out = _sanitize_summary(out, max_len)
        if out and len(out) > 3:
            return out
        return None
    except Exception:
        return None
def _call_ollama_summary(text: str, max_len: int, timeout: int = 6) -> Optional[str]:
    """Summarize news text via a local Ollama chat model.

    Returns a sanitized plain-text summary of at most max_len characters, or
    None when AI cleaning is disabled, the input is too short, the Ollama
    server is unreachable, or the model output is unusable (callers then
    fall back to rule cleaning).
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 5:
        return None
    try:
        import requests
        # Bug fix: the length limit in the prompt was a garbled literal
        # ("限69,684字内"); it now states the actual max_len constraint.
        prompt = f"""将新闻提炼为1-2句简洁中文事实直接输出纯文本不要标号、引号、解释。限{max_len}字内。
原文:{str(text)[:350]}
输出:"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 150},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        out = (r.json().get("message", {}).get("content", "") or "").strip()
        out = re.sub(r"^[\d\.\-\*\s]+", "", out)  # drop leading list numbering
        out = re.sub(r"^['\"\s]+|['\"\s]+$", "", out)  # drop wrapping quotes
        out = _sanitize_summary(out, max_len)
        if out and len(out) > 3:
            return out
        return None
    except Exception:
        return None
def clean_news_for_panel(text: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Clean a summary for EventTimelinePanel; always returns plain text of
    at most max_len characters. Order of preference: DashScope (when the
    API key is configured), Ollama, then the rule-based fallback."""
    if not text or not isinstance(text, str):
        return ""
    stripped = str(text).strip()
    if not stripped:
        return ""
    if DASHSCOPE_API_KEY:
        ai_summary = _call_dashscope_summary(stripped, max_len, timeout=8)
    else:
        ai_summary = _call_ollama_summary(stripped, max_len, timeout=6)
    return ai_summary if ai_summary else _rule_clean(stripped, max_len)
def ensure_category(cat: str) -> str:
    """Coerce a category to the panel enum; unknown values become "other"."""
    if cat in CATEGORIES:
        return cat
    return "other"
def ensure_severity(sev: str) -> str:
    """Coerce a severity to the panel enum; unknown values become "medium"."""
    if sev in SEVERITIES:
        return sev
    return "medium"

108
crawler/config.py Normal file
View File

@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
"""爬虫配置"""
import os
from pathlib import Path
# 数据库路径(与 server 共用 SQLite
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
# Node API 地址(用于通知推送)
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
# 阿里云 DashScope API Key用于 AI 提取面板数据,不设则回退到规则/Ollama
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
# 抓取间隔(秒)
CRAWL_INTERVAL = int(os.environ.get("CRAWL_INTERVAL", "300"))
# 单源抓取超时(秒),避免某源卡住拖垮整轮
FEED_TIMEOUT = int(os.environ.get("FEED_TIMEOUT", "12"))
# RSS 源:世界主流媒体,覆盖美伊/中东多视角
# 每项为 URL 字符串,或 {"name": "显示名", "url": "..."} 便于日志与排查
RSS_FEEDS = [
    # United States
    # NOTE(review): feeds.reuters.com was reportedly retired by Reuters —
    # verify this endpoint still resolves and replace it if dead.
    "https://feeds.reuters.com/reuters/topNews",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    # United Kingdom
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://feeds.bbci.co.uk/news/world/middle_east/rss.xml",
    "https://www.theguardian.com/world/rss",
    # France
    "https://www.france24.com/en/rss",
    # Germany
    "https://rss.dw.com/xml/rss-en-world",
    # Russia
    "https://tass.com/rss/v2.xml",
    "https://www.rt.com/rss/",
    # China (English-language)
    "https://english.news.cn/rss/world.xml",
    "https://www.cgtn.com/rss/world",
    # Phoenix Media (military + world; Chinese-language perspective)
    {"name": "凤凰军事", "url": "https://feedx.net/rss/ifengmil.xml"},
    {"name": "凤凰国际", "url": "https://feedx.net/rss/ifengworld.xml"},
    # Mainland outlets (reachable without a proxy; add Chinese coverage)
    {"name": "人民网军事", "url": "http://www.people.com.cn/rss/military.xml"},
    {"name": "人民网国际", "url": "http://www.people.com.cn/rss/world.xml"},
    {"name": "新浪军事", "url": "http://rss.sina.com.cn/rss/jczs/index.shtml"},
    {"name": "新浪新闻", "url": "http://rss.sina.com.cn/rss/roll/news.xml"},
    {"name": "中国日报国际", "url": "http://www.chinadaily.com.cn/rss/world_rss.xml"},
    {"name": "中国军网", "url": "https://english.chinamil.com.cn/rss.xml"},
    # Iran
    "https://www.presstv.ir/rss",
    # Qatar (Middle East)
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://www.aljazeera.com/xml/rss/middleeast.xml",
]
def get_feed_sources():
    """Return RSS_FEEDS normalized to [(name, url), ...].

    `name` is used for logging: dict entries use their "name" key, bare URL
    strings default to the URL's host; either falls back to "rss". Entries
    with an empty URL are dropped.
    """
    import urllib.parse
    sources = []
    for entry in RSS_FEEDS:
        if isinstance(entry, dict):
            label = entry.get("name") or "rss"
            link = entry.get("url", "").strip()
        else:
            link = (entry or "").strip()
            label = urllib.parse.urlparse(link).netloc or "rss"
        if link:
            sources.append((label, link))
    return sources
# Keyword filter: an item must match at least one keyword to be stored
# (keywords mirror the map regions: Iraq / Syria / Gulf / Red Sea / Mediterranean etc.).
KEYWORDS = [
    # Iran
    "iran", "iranian", "tehran", "德黑兰", "bushehr", "布什尔", "abbas", "阿巴斯",
    # Israel / Palestine
    "israel", "以色列", "hamas", "gaza", "加沙", "hezbollah", "真主党",
    # United States ("us " keeps its trailing space to avoid matching words like "using")
    "usa", "us ", "american", "美军", "美国", "pentagon",
    # Regions (map coverage)
    "middle east", "中东", "persian gulf", "波斯湾", "gulf of oman", "阿曼湾",
    "arabian sea", "阿拉伯海", "red sea", "红海", "mediterranean", "地中海",
    "strait of hormuz", "霍尔木兹",
    # Iraq / Syria
    "iraq", "伊拉克", "baghdad", "巴格达", "erbil", "埃尔比勒", "basra", "巴士拉",
    "syria", "叙利亚", "damascus", "大马士革", "deir", "代尔祖尔",
    # Gulf states
    "saudi", "沙特", "riyadh", "利雅得", "qatar", "卡塔尔", "doha", "多哈",
    "uae", "emirates", "阿联酋", "dubai", "迪拜", "abu dhabi",
    "bahrain", "巴林", "kuwait", "科威特", "oman", "阿曼", "yemen", "也门",
    # Jordan / Turkey / Egypt / Djibouti / Lebanon
    "jordan", "约旦", "amman", "安曼",
    "lebanon", "黎巴嫩",
    "turkey", "土耳其", "incirlik", "因吉尔利克",
    "egypt", "埃及", "cairo", "开罗", "sinai", "西奈",
    "djibouti", "吉布提",
    # Military / bases
    "al-asad", "al asad", "阿萨德", "al udeid", "乌代德", "incirlik",
    # Bug fix: an empty string "" sat between "missile" and "nuclear"
    # (apparently a Chinese term lost in an encoding pass). An empty keyword
    # matches EVERY text under substring filtering, defeating the whole
    # filter; restored as "导弹" (missile).
    "strike", "attack", "military", "missile", "导弹", "nuclear",
    "carrier", "航母", "drone", "uav", "无人机", "retaliation", "报复",
    "base", "基地", "troops", "troop", "soldier", "personnel",
    # Houthis / militias / forces
    "houthi", "胡塞", "houthis",
    "idf", "irgc", "革命卫队", "qassem soleimani", "苏莱曼尼",
]

282
crawler/db_merge.py Normal file
View File

@@ -0,0 +1,282 @@
# -*- coding: utf-8 -*-
"""
将 AI 提取的结构化数据合并到 SQLite
与 panel schema 及 situationData.getSituation 对齐,支持回放。
地图打击数据(与前端攻击动画一致):
- map_strike_sources: [{ "id": "israel"|"lincoln"|"ford", "name": "显示名", "lng", "lat" }] 写入 map_strike_source
- map_strike_lines: [{ "source_id", "target_lng", "target_lat", "target_name?", "struck_at?" }] 追加到 map_strike_line
爬虫/AI 可按此格式输出,落库后 GET /api/situation 的 mapData.strikeSources/strikeLines 会更新,前端直接追加攻击动画。
"""
import os
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
# 单次合并时各字段增量的上限,防止误把「累计总数」当增量导致数据剧增(可选,设为 0 表示不设限)
MAX_DELTA_PER_MERGE = {
"personnel_killed": 500, "personnel_wounded": 1000, "civilian_killed": 300, "civilian_wounded": 500,
"bases_destroyed": 5, "bases_damaged": 10,
"aircraft": 50, "warships": 10, "armor": 30, "vehicles": 100,
"drones": 50, "missiles": 200, "helicopters": 20, "submarines": 5, "carriers": 2,
"civilian_ships": 20, "airport_port": 10,
}
def _clamp_delta(key: str, value: int) -> int:
"""单次增量上限,避免误提「累计」导致波动"""
cap = MAX_DELTA_PER_MERGE.get(key, 0)
if cap <= 0:
return max(0, value)
return max(0, min(value, cap))
def _ensure_tables(conn: sqlite3.Connection) -> None:
    """Create or migrate every table this module writes (schema mirrors server/db.js).

    Idempotent: tables use CREATE TABLE IF NOT EXISTS, and column additions
    via ALTER TABLE are wrapped in try/except sqlite3.OperationalError so
    they are no-ops when the column already exists.
    """
    conn.execute("""
    CREATE TABLE IF NOT EXISTS situation_update (
        id TEXT PRIMARY KEY, timestamp TEXT NOT NULL, category TEXT NOT NULL,
        summary TEXT NOT NULL, severity TEXT NOT NULL
    )
    """)
    conn.execute("""
    CREATE TABLE IF NOT EXISTS combat_losses (
        side TEXT PRIMARY KEY CHECK (side IN ('us', 'iran')),
        bases_destroyed INTEGER NOT NULL, bases_damaged INTEGER NOT NULL,
        personnel_killed INTEGER NOT NULL, personnel_wounded INTEGER NOT NULL,
        aircraft INTEGER NOT NULL, warships INTEGER NOT NULL, armor INTEGER NOT NULL, vehicles INTEGER NOT NULL
    )
    """)
    # Older databases may predate these columns; add them one at a time,
    # swallowing the "duplicate column" OperationalError.
    try:
        conn.execute("ALTER TABLE combat_losses ADD COLUMN civilian_killed INTEGER NOT NULL DEFAULT 0")
    except sqlite3.OperationalError:
        pass
    try:
        conn.execute("ALTER TABLE combat_losses ADD COLUMN civilian_wounded INTEGER NOT NULL DEFAULT 0")
    except sqlite3.OperationalError:
        pass
    try:
        conn.execute("ALTER TABLE combat_losses ADD COLUMN updated_at TEXT DEFAULT (datetime('now'))")
    except sqlite3.OperationalError:
        pass
    for col in ("drones", "missiles", "helicopters", "submarines", "tanks", "carriers", "civilian_ships", "airport_port"):
        try:
            conn.execute(f"ALTER TABLE combat_losses ADD COLUMN {col} INTEGER NOT NULL DEFAULT 0")
        except sqlite3.OperationalError:
            pass
    conn.execute("CREATE TABLE IF NOT EXISTS wall_street_trend (id INTEGER PRIMARY KEY AUTOINCREMENT, time TEXT NOT NULL, value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS retaliation_current (id INTEGER PRIMARY KEY CHECK (id = 1), value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS retaliation_history (id INTEGER PRIMARY KEY AUTOINCREMENT, time TEXT NOT NULL, value INTEGER NOT NULL)")
    conn.execute("CREATE TABLE IF NOT EXISTS situation (id INTEGER PRIMARY KEY CHECK (id = 1), data TEXT NOT NULL, updated_at TEXT NOT NULL)")
    # Map strike sources/lines (same schema as server/db.js); feed the
    # frontend attack animation via GET /api/situation mapData.
    conn.execute("""
    CREATE TABLE IF NOT EXISTS map_strike_source (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        lng REAL NOT NULL,
        lat REAL NOT NULL
    )
    """)
    conn.execute("""
    CREATE TABLE IF NOT EXISTS map_strike_line (
        source_id TEXT NOT NULL,
        target_lng REAL NOT NULL,
        target_lat REAL NOT NULL,
        target_name TEXT,
        struck_at TEXT,
        FOREIGN KEY (source_id) REFERENCES map_strike_source(id)
    )
    """)
    try:
        conn.execute("CREATE INDEX IF NOT EXISTS idx_map_strike_line_source ON map_strike_line(source_id)")
    except sqlite3.OperationalError:
        pass
    try:
        # PRAGMA-based column migration (ALTER ... ADD COLUMN has no IF NOT EXISTS).
        for col in ("struck_at",):
            cur = conn.execute("PRAGMA table_info(map_strike_line)")
            cols = [r[1] for r in cur.fetchall()]
            if col not in cols:
                conn.execute(f"ALTER TABLE map_strike_line ADD COLUMN {col} TEXT")
    except sqlite3.OperationalError:
        pass
    conn.commit()
def merge(extracted: Dict[str, Any], db_path: Optional[str] = None) -> bool:
    """Merge AI-extracted structured data into SQLite; return True when any row changed.

    Recognized keys in `extracted` (all optional): situation_update,
    combat_losses_delta, force_summary_delta, retaliation, wall_street,
    key_location_updates, map_strike_sources, map_strike_lines (map shapes
    are documented in the module docstring). When anything changed,
    situation.updated_at is bumped so clients refresh. Returns False when
    the DB file does not exist; rolls back and re-raises on unexpected errors.

    Fixes over the previous revision:
      * situation_update ids are now a stable SHA-256 content hash; the old
        built-in hash() is salted per process (PYTHONHASHSEED), so identical
        items got fresh ids on every run and INSERT OR IGNORE never
        deduplicated across runs.
      * change detection compares conn.total_changes against a snapshot;
        conn.total_changes is cumulative for the connection, so the old bare
        `> 0` checks stayed true forever after the first successful write.
    """
    import hashlib  # local import keeps the module's import block unchanged
    path = db_path or DB_PATH
    if not os.path.exists(path):
        return False
    conn = sqlite3.connect(path, timeout=10)
    try:
        _ensure_tables(conn)
        updated = False
        # situation_update: dedupe on a stable content hash of summary+timestamp.
        if "situation_update" in extracted:
            u = extracted["situation_update"]
            digest = hashlib.sha256((u.get("summary", "") + u.get("timestamp", "")).encode("utf-8")).hexdigest()[:16]
            uid = f"ai_{digest}"
            mark = conn.total_changes
            conn.execute(
                "INSERT OR IGNORE INTO situation_update (id, timestamp, category, summary, severity) VALUES (?, ?, ?, ?, ?)",
                (uid, u.get("timestamp", ""), u.get("category", "other"), u.get("summary", "")[:500], u.get("severity", "medium")),
            )
            if conn.total_changes > mark:
                updated = True
        # combat_losses: strictly incremental. The AI reports per-article
        # deltas; each is clamped (_clamp_delta) and added onto the stored
        # totals so a mis-extracted cumulative figure cannot inflate them.
        if "combat_losses_delta" in extracted:
            for side, delta in extracted["combat_losses_delta"].items():
                if side not in ("us", "iran"):
                    continue
                try:
                    row = conn.execute(
                        "SELECT personnel_killed,personnel_wounded,civilian_killed,civilian_wounded,bases_destroyed,bases_damaged,aircraft,warships,armor,vehicles,drones,missiles,helicopters,submarines,tanks,carriers,civilian_ships,airport_port FROM combat_losses WHERE side = ?",
                        (side,),
                    ).fetchone()
                    cur = {"personnel_killed": 0, "personnel_wounded": 0, "civilian_killed": 0, "civilian_wounded": 0,
                           "bases_destroyed": 0, "bases_damaged": 0, "aircraft": 0, "warships": 0, "armor": 0, "vehicles": 0,
                           "drones": 0, "missiles": 0, "helicopters": 0, "submarines": 0, "tanks": 0, "carriers": 0, "civilian_ships": 0, "airport_port": 0}
                    if row:
                        # Defensive indexing: older rows may lack trailing columns.
                        cur = {
                            "personnel_killed": row[0], "personnel_wounded": row[1], "civilian_killed": row[2] or 0,
                            "civilian_wounded": row[3] or 0, "bases_destroyed": row[4], "bases_damaged": row[5],
                            "aircraft": row[6], "warships": row[7], "armor": row[8], "vehicles": row[9],
                            "drones": row[10] if len(row) > 10 else 0, "missiles": row[11] if len(row) > 11 else 0,
                            "helicopters": row[12] if len(row) > 12 else 0, "submarines": row[13] if len(row) > 13 else 0,
                            "tanks": row[14] if len(row) > 14 else 0, "carriers": row[15] if len(row) > 15 else (row[14] if len(row) > 14 else 0),
                            "civilian_ships": row[16] if len(row) > 16 else 0, "airport_port": row[17] if len(row) > 17 else 0,
                        }
                    pk = max(0, (cur["personnel_killed"] or 0) + _clamp_delta("personnel_killed", delta.get("personnel_killed", 0)))
                    pw = max(0, (cur["personnel_wounded"] or 0) + _clamp_delta("personnel_wounded", delta.get("personnel_wounded", 0)))
                    ck = max(0, (cur["civilian_killed"] or 0) + _clamp_delta("civilian_killed", delta.get("civilian_killed", 0)))
                    cw = max(0, (cur["civilian_wounded"] or 0) + _clamp_delta("civilian_wounded", delta.get("civilian_wounded", 0)))
                    bd = max(0, (cur["bases_destroyed"] or 0) + _clamp_delta("bases_destroyed", delta.get("bases_destroyed", 0)))
                    bm = max(0, (cur["bases_damaged"] or 0) + _clamp_delta("bases_damaged", delta.get("bases_damaged", 0)))
                    ac = max(0, (cur["aircraft"] or 0) + _clamp_delta("aircraft", delta.get("aircraft", 0)))
                    ws = max(0, (cur["warships"] or 0) + _clamp_delta("warships", delta.get("warships", 0)))
                    ar = max(0, (cur["armor"] or 0) + _clamp_delta("armor", delta.get("armor", 0)))
                    vh = max(0, (cur["vehicles"] or 0) + _clamp_delta("vehicles", delta.get("vehicles", 0)))
                    dr = max(0, (cur["drones"] or 0) + _clamp_delta("drones", delta.get("drones", 0)))
                    ms = max(0, (cur["missiles"] or 0) + _clamp_delta("missiles", delta.get("missiles", 0)))
                    hp = max(0, (cur["helicopters"] or 0) + _clamp_delta("helicopters", delta.get("helicopters", 0)))
                    sb = max(0, (cur["submarines"] or 0) + _clamp_delta("submarines", delta.get("submarines", 0)))
                    cr = max(0, (cur["carriers"] or 0) + _clamp_delta("carriers", delta.get("carriers", 0)))
                    cs = max(0, (cur["civilian_ships"] or 0) + _clamp_delta("civilian_ships", delta.get("civilian_ships", 0)))
                    ap = max(0, (cur["airport_port"] or 0) + _clamp_delta("airport_port", delta.get("airport_port", 0)))
                    ts = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
                    mark = conn.total_changes
                    if row:
                        conn.execute(
                            """UPDATE combat_losses SET personnel_killed=?, personnel_wounded=?, civilian_killed=?, civilian_wounded=?,
                            bases_destroyed=?, bases_damaged=?, aircraft=?, warships=?, armor=?, vehicles=?,
                            drones=?, missiles=?, helicopters=?, submarines=?, tanks=?, carriers=?, civilian_ships=?, airport_port=?, updated_at=? WHERE side=?""",
                            (pk, pw, ck, cw, bd, bm, ac, ws, ar, vh, dr, ms, hp, sb, cur.get("tanks", 0), cr, cs, ap, ts, side),
                        )
                    else:
                        conn.execute(
                            """INSERT OR REPLACE INTO combat_losses (side, personnel_killed, personnel_wounded, civilian_killed, civilian_wounded,
                            bases_destroyed, bases_damaged, aircraft, warships, armor, vehicles, drones, missiles, helicopters, submarines, tanks, carriers, civilian_ships, airport_port, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                            (side, pk, pw, ck, cw, bd, bm, ac, ws, ar, vh, dr, ms, hp, sb, 0, cr, cs, ap, ts),
                        )
                    if conn.total_changes > mark:
                        updated = True
                except Exception:
                    # Best-effort per side; a malformed delta must not abort the merge.
                    pass
        # force_summary deltas: missile consumption feeds the dashboard's
        # "missiles consumed" / "missile stock" figures; capped at 500/merge.
        if "force_summary_delta" in extracted:
            for side, delta in extracted["force_summary_delta"].items():
                if side not in ("us", "iran"):
                    continue
                mc = delta.get("missile_consumed")
                if mc is not None and isinstance(mc, (int, float)) and mc > 0:
                    mc = min(int(mc), 500)
                    try:
                        cur = conn.execute(
                            "UPDATE force_summary SET missile_consumed = missile_consumed + ?, missile_stock = max(0, missile_stock - ?) WHERE side = ?",
                            (mc, mc, side),
                        )
                        if cur.rowcount > 0:
                            updated = True
                    except Exception:
                        pass
        # retaliation index: replace the current value, append to history.
        if "retaliation" in extracted:
            r = extracted["retaliation"]
            conn.execute("INSERT OR REPLACE INTO retaliation_current (id, value) VALUES (1, ?)", (r["value"],))
            conn.execute("INSERT INTO retaliation_history (time, value) VALUES (?, ?)", (r["time"], r["value"]))
            updated = True
        # wall_street_trend: append-only time series.
        if "wall_street" in extracted:
            w = extracted["wall_street"]
            conn.execute("INSERT INTO wall_street_trend (time, value) VALUES (?, ?)", (w["time"], w["value"]))
            updated = True
        # key_location: update status/damage_level of struck locations
        # (side=us → US/allied bases hit; side=iran → Iranian sites hit).
        if "key_location_updates" in extracted:
            try:
                for u in extracted["key_location_updates"]:
                    kw_raw = (u.get("name_keywords") or "").strip()
                    if not kw_raw:
                        continue
                    # Accept "a|b|c" or "a b c" separators.
                    kw = [k.strip() for k in kw_raw.replace("|", " ").split() if k.strip()]
                    side = u.get("side")
                    status = (u.get("status") or "attacked")[:20]
                    dmg = u.get("damage_level", 2)
                    if not kw or side not in ("us", "iran"):
                        continue
                    # Simple matching: OR-join `name LIKE '%kw%'` per keyword (works for CJK and Latin).
                    conditions = " OR ".join("name LIKE ?" for _ in kw)
                    params = [status, dmg, side] + [f"%{k}%" for k in kw]
                    cur = conn.execute(
                        f"UPDATE key_location SET status=?, damage_level=? WHERE side=? AND ({conditions})",
                        params,
                    )
                    if cur.rowcount > 0:
                        updated = True
            except Exception:
                pass
        # map_strike_source: upsert strike origins (mirrors mapData.strikeSources).
        if "map_strike_sources" in extracted:
            try:
                mark = conn.total_changes
                for s in extracted["map_strike_sources"]:
                    sid = (s.get("id") or "").strip()
                    name = (s.get("name") or "").strip() or sid
                    lng = float(s.get("lng", 0))
                    lat = float(s.get("lat", 0))
                    if sid:
                        conn.execute(
                            "INSERT OR REPLACE INTO map_strike_source (id, name, lng, lat) VALUES (?, ?, ?, ?)",
                            (sid, name[:200], lng, lat),
                        )
                if conn.total_changes > mark:
                    updated = True
            except Exception:
                pass
        # map_strike_line: append strikes (mirrors mapData.strikeLines); the
        # frontend appends an attack animation for each new line.
        if "map_strike_lines" in extracted:
            try:
                mark = conn.total_changes
                for line in extracted["map_strike_lines"]:
                    source_id = (line.get("source_id") or "").strip()
                    target_lng = float(line.get("target_lng", 0))
                    target_lat = float(line.get("target_lat", 0))
                    target_name = (line.get("target_name") or "").strip()[:200] or None
                    struck_at = (line.get("struck_at") or "").strip() or None
                    if source_id:
                        conn.execute(
                            "INSERT INTO map_strike_line (source_id, target_lng, target_lat, target_name, struck_at) VALUES (?, ?, ?, ?, ?)",
                            (source_id, target_lng, target_lat, target_name, struck_at),
                        )
                if conn.total_changes > mark:
                    updated = True
            except Exception:
                pass
        if updated:
            # Bump updated_at so polling clients notice the change.
            conn.execute("INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)", (datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),))
        conn.commit()
        return updated
    except Exception:
        conn.rollback()
        raise  # bare raise preserves the original traceback
    finally:
        conn.close()

126
crawler/db_writer.py Normal file
View File

@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
"""写入 SQLite 并确保 situation_update 表存在"""
import sqlite3
import hashlib
import os
from datetime import datetime, timezone
from typing import List, Optional
from config import DB_PATH
CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SEVERITIES = ("low", "medium", "high", "critical")
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS situation_update (
id TEXT PRIMARY KEY,
timestamp TEXT NOT NULL,
category TEXT NOT NULL,
summary TEXT NOT NULL,
severity TEXT NOT NULL
)
""")
conn.commit()
def _make_id(title: str, url: str, published: str) -> str:
raw = f"{title}|{url}|{published}"
return "nw_" + hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]
def _to_utc_iso(dt: datetime) -> str:
if dt.tzinfo:
dt = dt.astimezone(timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
def insert_update(
    conn: sqlite3.Connection,
    title: str,
    summary: str,
    url: str,
    published: datetime,
    category: str = "other",
    severity: str = "medium",
) -> bool:
    """Insert one situation update; skip silently when the id already exists.

    category/severity outside the panel enums fall back to "other"/"medium";
    summary is truncated to 500 characters. Returns True only when a NEW row
    was inserted; False on duplicates or errors (errors roll back).
    """
    _ensure_table(conn)
    ts = _to_utc_iso(published)
    uid = _make_id(title, url, ts)
    if category not in CATEGORIES:
        category = "other"
    if severity not in SEVERITIES:
        severity = "medium"
    try:
        # Bug fix: conn.total_changes is cumulative for the connection, so
        # the old `return conn.total_changes > 0` reported True for duplicate
        # rows once ANY earlier insert on this connection had succeeded.
        # Compare against a snapshot taken before the INSERT instead.
        before = conn.total_changes
        conn.execute(
            "INSERT OR IGNORE INTO situation_update (id, timestamp, category, summary, severity) VALUES (?, ?, ?, ?, ?)",
            (uid, ts, category, summary[:500], severity),
        )
        conn.commit()
        return conn.total_changes > before
    except Exception:
        conn.rollback()
        return False
def touch_situation_updated_at(conn: sqlite3.Connection) -> None:
    """Upsert situation row 1 with an empty payload and the current UTC time
    (frontend uses updated_at as the "last crawl" timestamp)."""
    now_iso = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
    conn.execute(
        "INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
        (now_iso,),
    )
    conn.commit()
def touch_situation_updated_at_path(db_path: Optional[str] = None) -> bool:
    """Open the database at db_path (default: config.DB_PATH) and bump
    situation.updated_at so the frontend shows the latest crawl time.
    Returns False when the DB file does not exist, True otherwise."""
    target = db_path or DB_PATH
    if not os.path.exists(target):
        return False
    conn = sqlite3.connect(target, timeout=10)
    try:
        touch_situation_updated_at(conn)
        return True
    finally:
        conn.close()
def write_updates(updates: List[dict], db_path: Optional[str] = None) -> int:
    """
    Insert a batch of situation updates; return the number of NEW rows.

    updates: [{"title","summary","url","published","category","severity"}, ...]
      - published may be an ISO-8601 string (a trailing 'Z' is accepted),
        a datetime, or None (falls back to the current UTC time).
      - summary falls back to title when empty; title is truncated to 200 chars.
    db_path: same semantics as the pipeline; defaults to config.DB_PATH.
    Returns 0 without writing when the DB file does not exist.
    """
    path = db_path or DB_PATH
    if not os.path.exists(path):
        return 0
    conn = sqlite3.connect(path, timeout=10)
    try:
        count = 0
        for u in updates:
            pub = u.get("published")
            if isinstance(pub, str):
                try:
                    # fromisoformat rejects a bare 'Z'; normalize to +00:00.
                    pub = datetime.fromisoformat(pub.replace("Z", "+00:00"))
                except ValueError:
                    pub = datetime.utcnow()
            elif pub is None:
                pub = datetime.utcnow()
            ok = insert_update(
                conn,
                title=u.get("title", "")[:200],
                summary=u.get("summary", "") or u.get("title", ""),
                url=u.get("url", ""),
                published=pub,
                category=u.get("category", "other"),
                severity=u.get("severity", "medium"),
            )
            if ok:
                count += 1
        if count > 0:
            # Bump situation.updated_at only when something new landed.
            touch_situation_updated_at(conn)
        return count
    finally:
        conn.close()

144
crawler/extractor_ai.py Normal file
View File

@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
从新闻文本中 AI 提取结构化数据,映射到面板 schema
输出符合 panel_schema 的字段,供 db_merge 写入
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from panel_schema import validate_category, validate_severity, validate_summary
CLEANER_AI_DISABLED = os.environ.get("CLEANER_AI_DISABLED", "0") == "1"
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
# 用于 AI 提取的原文最大长度(有正文时取更长以提取精确数据)
EXTRACT_TEXT_MAX_LEN = int(os.environ.get("EXTRACT_TEXT_MAX_LEN", "4000"))
def _call_ollama_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
    """Ask a local Ollama model to extract structured facts from a news item.

    Returns the parsed JSON dict, or None when AI cleaning is disabled, the
    input is shorter than 10 chars, Ollama is unreachable, or the response
    is not valid JSON. The prompt restricts the model to figures explicitly
    stated in the article, reported as per-article increments only.
    """
    if CLEANER_AI_DISABLED or not text or len(str(text).strip()) < 10:
        return None
    try:
        import requests
        raw = str(text).strip()[:EXTRACT_TEXT_MAX_LEN]
        prompt = f"""从以下美伊/中东新闻**全文或摘要**中,提取**报道明确给出的数字与事实**,输出 JSON。
输入说明:
- 原文可能是英文、中文或其他语言English / Chinese / Arabic / Persian 等),请先理解含义,再按要求输出。
规则:
1. 仅填写报道中**直接出现、可核对**的数据,不要推测或估算。
2. 无明确依据的字段**必须省略**,不要填 0 或猜。
3. **战损一律按增量**:只填本则报道中「本次/此次/今日/本轮」**新增**的伤亡或损毁数量。若报道只给「累计总数」「迄今共」「total so far」等**不要填写**该字段(避免与库内已有累计值重复叠加)。
4. **攻击地点**:提取双方遭袭的具体地点。美军/盟军基地被打击 → side=us伊朗/亲伊设施被打击 → side=iran。name_keywords 用「中文名|英文名」便于匹配,可填多处。
字段说明:
- summary: 1-2 句中文事实概括≤80 字
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- 战损(**仅填本则报道的新增增量**,如「此次 5 人丧生」「今日又损 2 架」):
us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded,
us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded,
us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged,
us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles,
us_drones, iran_drones, us_missiles, iran_missiles, us_helicopters, iran_helicopters, us_submarines, iran_submarines,
us_carriers, iran_carriers, us_civilian_ships, iran_civilian_ships, us_airport_port, iran_airport_port
- retaliation_sentiment: 0-100仅当报道涉及伊朗报复/反击情绪时)
- wall_street_value: 0-100仅当报道涉及美股/市场时)
- key_location_updates: **双方攻击地点**。每项 {{ "name_keywords": "阿萨德|asad|al-asad", "side": "us或iran被打击方", "status": "attacked", "damage_level": 1-3 }}。美军基地例:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗例德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz
- **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填,用于看板导弹消耗累计): us_missile_consumed_delta, iran_missile_consumed_delta本则报道中该方新增消耗枚数整数
原文:
{raw}
直接输出 JSON 对象,不要解释,不要加反引号或代码块标记:"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"num_predict": 384},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        # NOTE: `raw` is reused here for the model's response text.
        raw = (r.json().get("message", {}).get("content", "") or "").strip()
        # Strip an optional markdown code fence around the JSON.
        raw = re.sub(r"^```\w*\s*|\s*```$", "", raw)
        return json.loads(raw)
    except Exception:
        return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract structured panel data from news text, strictly matching the panel schema.

    Returns a dict with any of: situation_update, combat_losses_delta,
    retaliation, wall_street, key_location_updates, force_summary_delta.
    Returns an empty dict when the AI call fails or yields nothing usable.
    timestamp defaults to the current UTC time in the panel's ISO format.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    parsed = _call_ollama_extract(text)
    if not parsed:
        return out
    # situation_update: summary/category/severity validated against panel enums.
    if parsed.get("summary"):
        out["situation_update"] = {
            "summary": validate_summary(str(parsed["summary"])[:120], 120),
            "category": validate_category(str(parsed.get("category", "other")).lower()),
            "severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
            "timestamp": ts,
        }
    # combat_losses deltas: numeric fields only, negatives floored at 0.
    loss_us = {}
    loss_ir = {}
    for k in ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles", "drones", "missiles", "helicopters", "submarines", "carriers", "civilian_ships", "airport_port"]:
        uk = f"us_{k}"
        ik = f"iran_{k}"
        if uk in parsed and isinstance(parsed[uk], (int, float)):
            loss_us[k] = max(0, int(parsed[uk]))
        if ik in parsed and isinstance(parsed[ik], (int, float)):
            loss_ir[k] = max(0, int(parsed[ik]))
    if loss_us or loss_ir:
        out["combat_losses_delta"] = {}
        if loss_us:
            out["combat_losses_delta"]["us"] = loss_us
        if loss_ir:
            out["combat_losses_delta"]["iran"] = loss_ir
    # retaliation sentiment: accepted only when inside 0-100.
    if "retaliation_sentiment" in parsed:
        v = parsed["retaliation_sentiment"]
        if isinstance(v, (int, float)) and 0 <= v <= 100:
            out["retaliation"] = {"value": int(v), "time": ts}
    # wall_street index: accepted only when inside 0-100.
    if "wall_street_value" in parsed:
        v = parsed["wall_street_value"]
        if isinstance(v, (int, float)) and 0 <= v <= 100:
            out["wall_street"] = {"time": ts, "value": int(v)}
    # key_location_updates: struck locations; damage_level coerced into 1-3
    # (defaults to 2 when missing or non-numeric), status capped at 20 chars.
    if "key_location_updates" in parsed and isinstance(parsed["key_location_updates"], list):
        valid = []
        for u in parsed["key_location_updates"]:
            if isinstance(u, dict) and u.get("name_keywords") and u.get("side") in ("us", "iran"):
                valid.append({
                    "name_keywords": str(u["name_keywords"]),
                    "side": u["side"],
                    "status": str(u.get("status", "attacked"))[:20],
                    "damage_level": min(3, max(1, int(u["damage_level"]))) if isinstance(u.get("damage_level"), (int, float)) else 2,
                })
        if valid:
            out["key_location_updates"] = valid
    # force_summary deltas: missile consumption (feeds the dashboard's
    # missile_consumed counter), capped at 500 per article.
    fs_delta = {}
    for side_key, side_val in [("us_missile_consumed_delta", "us"), ("iran_missile_consumed_delta", "iran")]:
        v = parsed.get(side_key)
        if isinstance(v, (int, float)) and v > 0:
            fs_delta[side_val] = {"missile_consumed": min(500, int(v))}
    if fs_delta:
        out["force_summary_delta"] = fs_delta
    return out

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""
阿里云 DashScope通义千问提取面板结构化数据
从新闻文本中提取战损、报复指数、基地状态等,供 db_merge 落库
API Key 通过环境变量 DASHSCOPE_API_KEY 配置
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from panel_schema import validate_category, validate_severity, validate_summary
EXTRACT_TEXT_MAX_LEN = int(os.environ.get("EXTRACT_TEXT_MAX_LEN", "4000"))
def _call_dashscope_extract(text: str, timeout: int = 15) -> Optional[Dict[str, Any]]:
    """Extract structured facts from a news item via Alibaba DashScope (qwen-turbo).

    Requires the DASHSCOPE_API_KEY env var. Returns the parsed JSON dict, or
    None when the key is missing, the input is shorter than 10 chars, the API
    call fails, or the response is not valid JSON. The prompt restricts the
    model to explicitly reported figures, as per-article increments only.
    NOTE(review): `timeout` is not forwarded to the SDK call — confirm
    whether dashscope.Generation.call accepts a timeout parameter.
    """
    api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
    if not api_key or not text or len(str(text).strip()) < 10:
        return None
    try:
        import dashscope
        from http import HTTPStatus
        dashscope.api_key = api_key
        raw = str(text).strip()[:EXTRACT_TEXT_MAX_LEN]
        prompt = f"""从以下美伊/中东新闻**全文或摘要**中,提取**报道明确给出的数字与事实**,输出 JSON。规则
1. 仅填写报道中**直接出现、可核对**的数据,不要推测或估算。
2. 无明确依据的字段**必须省略**,不要填 0 或猜。
3. **战损一律按增量**:只填本则报道中「本次/此次/今日」**新增**数量。报道若只给「累计总数」「迄今共」**不要填**该字段。
4. **攻击地点**:提取双方遭袭地点。美军/盟军基地被打击 → side=us伊朗/亲伊设施被打击 → side=iran。name_keywords 用「中文|英文」,可填多处。
字段:
- summary: 1-2 句中文事实概括≤80 字
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
- 战损(**仅填本则报道的新增增量**: us_personnel_killed, iran_personnel_killed, us_personnel_wounded, iran_personnel_wounded, us_civilian_killed, iran_civilian_killed, us_civilian_wounded, iran_civilian_wounded, us_bases_destroyed, iran_bases_destroyed, us_bases_damaged, iran_bases_damaged, us_aircraft, iran_aircraft, us_warships, iran_warships, us_armor, iran_armor, us_vehicles, iran_vehicles, us_drones, iran_drones, us_missiles, iran_missiles, us_helicopters, iran_helicopters, us_submarines, iran_submarines, us_carriers, iran_carriers, us_civilian_ships, iran_civilian_ships, us_airport_port, iran_airport_port
- retaliation_sentiment: 0-100仅当报道涉及伊朗报复情绪时
- wall_street_value: 0-100仅当报道涉及美股/市场时)
- key_location_updates: **双方攻击地点**。每项 {{"name_keywords":"阿萨德|asad","side":"us或iran被打击方","status":"attacked","damage_level":1-3}}。美军基地:阿萨德|asad、乌代德|udeid、埃尔比勒|erbil、因吉尔利克|incirlik。伊朗德黑兰|tehran、布什尔|bushehr、伊斯法罕|isfahan、阿巴斯|abbas、纳坦兹|natanz
- **导弹消耗增量**(仅当报道明确提到「发射/消耗 了 X 枚导弹」时填): us_missile_consumed_delta, iran_missile_consumed_delta本则该方新增消耗枚数整数
原文:
{raw}
直接输出 JSON不要其他解释"""
        response = dashscope.Generation.call(
            model="qwen-turbo",
            messages=[{"role": "user", "content": prompt}],
            result_format="message",
            max_tokens=512,
        )
        if response.status_code != HTTPStatus.OK:
            return None
        # NOTE: `raw` is reused here for the model's response text.
        raw = (response.output.get("choices", [{}])[0].get("message", {}).get("content", "") or "").strip()
        # Strip an optional markdown code fence around the JSON.
        raw = re.sub(r"^```\w*\s*|\s*```$", "", raw)
        return json.loads(raw)
    except Exception:
        return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """Extract panel-schema structured data from news text via DashScope.

    Returns a dict containing any of: situation_update, combat_losses_delta,
    retaliation, wall_street, force_summary_delta, key_location_updates.
    An empty dict means the model produced nothing usable.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    result: Dict[str, Any] = {}
    parsed = _call_dashscope_extract(text)
    if not parsed:
        return result

    # Event-timeline entry (only when the model produced a summary).
    if parsed.get("summary"):
        result["situation_update"] = {
            "summary": validate_summary(str(parsed["summary"])[:120], 120),
            "category": validate_category(str(parsed.get("category", "other")).lower()),
            "severity": validate_severity(str(parsed.get("severity", "medium")).lower()),
            "timestamp": ts,
        }

    # Combat-loss increments: the model emits flat "us_<field>"/"iran_<field>"
    # numeric keys; regroup them per side, clamping to non-negative ints.
    fields = ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded",
              "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles",
              "drones", "missiles", "helicopters", "submarines", "carriers", "civilian_ships", "airport_port"]
    losses: Dict[str, Dict[str, int]] = {}
    for side in ("us", "iran"):
        per_side: Dict[str, int] = {}
        for field in fields:
            value = parsed.get(f"{side}_{field}")
            if isinstance(value, (int, float)):
                per_side[field] = max(0, int(value))
        if per_side:
            losses[side] = per_side
    if losses:
        result["combat_losses_delta"] = losses

    # Retaliation sentiment / Wall Street gauge: both are 0-100 values.
    sentiment = parsed.get("retaliation_sentiment")
    if isinstance(sentiment, (int, float)) and 0 <= sentiment <= 100:
        result["retaliation"] = {"value": int(sentiment), "time": ts}
    market = parsed.get("wall_street_value")
    if isinstance(market, (int, float)) and 0 <= market <= 100:
        result["wall_street"] = {"time": ts, "value": int(market)}

    # force_summary increments: missile consumption (feeds the dashboard's
    # force_summary.missile_consumed counter), capped at 500 per report.
    consumed: Dict[str, Dict[str, int]] = {}
    for side, key in (("us", "us_missile_consumed_delta"), ("iran", "iran_missile_consumed_delta")):
        value = parsed.get(key)
        if isinstance(value, (int, float)) and value > 0:
            consumed[side] = {"missile_consumed": min(500, int(value))}
    if consumed:
        result["force_summary_delta"] = consumed

    # Attacked key locations: keep only well-formed entries; damage_level is
    # clamped to 1..3 and defaults to 2 when missing/non-numeric.
    raw_updates = parsed.get("key_location_updates")
    if isinstance(raw_updates, list):
        cleaned = []
        for entry in raw_updates:
            if not (isinstance(entry, dict) and entry.get("name_keywords") and entry.get("side") in ("us", "iran")):
                continue
            level = entry.get("damage_level")
            cleaned.append({
                "name_keywords": str(entry["name_keywords"]),
                "side": entry["side"],
                "status": str(entry.get("status", "attacked"))[:20],
                "damage_level": min(3, max(1, int(level))) if isinstance(level, (int, float)) else 2,
            })
        if cleaned:
            result["key_location_updates"] = cleaned
    return result

254
crawler/extractor_rules.py Normal file
View File

@@ -0,0 +1,254 @@
# -*- coding: utf-8 -*-
"""
基于规则的新闻数据提取(无需 Ollama
从新闻文本中提取战损、报复情绪等数值,供 db_merge 写入
"""
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional
def _first_int(text: str, pattern: str) -> Optional[int]:
m = re.search(pattern, text, re.I)
if m and m.group(1) and m.group(1).replace(",", "").isdigit():
return max(0, int(m.group(1).replace(",", "")))
return None
def extract_from_news(text: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
    """
    Rule-based extraction (no Ollama/LLM needed): match number+keyword regex
    patterns in the news text and emit fields conforming to the panel schema
    (combat_losses_delta, retaliation, wall_street, key_location_updates).

    Note: `t` is the lowercased text used for most matching; some Chinese
    patterns deliberately run against the original-case `text`.
    """
    ts = timestamp or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    out: Dict[str, Any] = {}
    t = (text or "").lower()
    loss_us, loss_ir = {}, {}
    # US personnel casualties (Chinese patterns, matched first)
    v = _first_int(t, r"造成\s*(\d+)\s*名?\s*美军\s*伤亡")
    if v is not None:
        loss_us["personnel_killed"] = v
    # Looser pattern only if the stricter one above did not match.
    v = _first_int(t, r"(\d+)\s*名?\s*美军\s*伤亡") if loss_us.get("personnel_killed") is None else None
    if v is not None:
        loss_us["personnel_killed"] = v
    v = _first_int(t, r"(\d+)\s*名?\s*(?:美军|美国军队|美国)\s*(?:死亡|阵亡)")
    if v is not None:
        loss_us["personnel_killed"] = v
    v = _first_int(t, r"(\d+)\s*名?\s*(?:美军|美国)\s*受伤")
    # "另有 X 人受伤" is only attributed to the US when the text mentions the US.
    if v is None and ("美军" in (text or "") or "美国" in (text or "")):
        v = _first_int(text or t, r"另有\s*(\d+)\s*人\s*受伤")
    if v is not None:
        loss_us["personnel_wounded"] = v
    v = _first_int(t, r"美军\s*伤亡\s*(\d+)")
    if v is not None and loss_us.get("personnel_killed") is None:
        loss_us["personnel_killed"] = v
    # US personnel casualties (English)
    v = _first_int(t, r"(?:us|american|u\.?s\.?)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|military)[\s\w]*(?:killed|dead)")
    if v is not None:
        loss_us["personnel_killed"] = v
    v = _first_int(t, r"(\d+)[\s\w]*(?:us|american)[\s\w]*(?:troop|soldier|military)[\s\w]*(?:killed|dead)")
    if v is not None:
        loss_us["personnel_killed"] = v
    v = _first_int(t, r"(?:us|american)[\s\w]*(\d+)[\s\w]*(?:wounded|injured)")
    if v is not None:
        loss_us["personnel_wounded"] = v
    # Iranian personnel casualties (Chinese)
    v = _first_int(t, r"(\d+)\s*名?\s*伊朗\s*伤亡")
    if v is not None:
        loss_ir["personnel_killed"] = v
    v = _first_int(t, r"(\d+)\s*名?\s*(?:伊朗|伊朗军队)[\s\w]*(?:死亡|阵亡)")
    if v is not None:
        loss_ir["personnel_killed"] = v
    v = _first_int(t, r"(\d+)\s*名?\s*伊朗\s*受伤")
    if v is not None:
        loss_ir["personnel_wounded"] = v
    # Iranian personnel casualties (English)
    v = _first_int(t, r"(?:iran|iranian)[\s\w]*(?:say|report)[\s\w]*(\d+)[\s\w]*(?:troop|soldier|guard|killed|dead)")
    if v is not None:
        loss_ir["personnel_killed"] = v
    v = _first_int(t, r"(\d+)[\s\w]*(?:iranian|iran)[\s\w]*(?:troop|soldier|guard|killed|dead)")
    if v is not None:
        loss_ir["personnel_killed"] = v
    v = _first_int(t, r"(?:iran|iranian)[\s\w]*(\d+)[\s\w]*(?:wounded|injured)")
    if v is not None:
        loss_ir["personnel_wounded"] = v
    # Civilian casualties (EN + ZH), attributed to a side by context.
    v = _first_int(t, r"(\d+)\s*名?\s*平民\s*(?:伤亡|死亡)")
    if v is not None:
        # NOTE(review): bare `text` (not `text or ""`) below — only safe
        # because the regex above cannot match when text is empty/None.
        if "伊朗" in text or "iran" in t:
            loss_ir["civilian_killed"] = v
        else:
            loss_us["civilian_killed"] = v
    v = _first_int(t, r"(\d+)[\s\w]*(?:civilian|civil)[\s\w]*(?:killed|dead)") if loss_us.get("civilian_killed") is None and loss_ir.get("civilian_killed") is None else None
    if v is not None:
        if "iran" in t:
            loss_ir["civilian_killed"] = v
        else:
            loss_us["civilian_killed"] = v
    v = _first_int(t, r"(\d+)[\s\w]*(?:civilian|civil)[\s\w]*(?:wounded|injured)")
    if v is not None:
        if "iran" in t:
            loss_ir["civilian_wounded"] = v
        else:
            loss_us["civilian_wounded"] = v
    v = _first_int(text or t, r"伊朗[\s\w]*(?:空袭|打击)[\s\w]*造成[^\d]*(\d+)[\s\w]*(?:平民|人|伤亡)")
    if v is not None:
        loss_ir["civilian_killed"] = v
    # Base damage (only match base/基地; skip generic "military targets").
    skip_bases = "军事目标" in (text or "") and "基地" not in (text or "") and "base" not in t
    if not skip_bases:
        v = _first_int(t, r"(\d+)[\s\w]*(?:base|基地)[\s\w]*(?:destroyed|leveled|摧毁|夷平)")
        if v is not None:
            loss_us["bases_destroyed"] = v
        v = _first_int(t, r"(\d+)[\s\w]*(?:base|基地)[\s\w]*(?:damaged|hit|struck|受损|袭击)")
        if v is not None:
            loss_us["bases_damaged"] = v
        # Keyword-only fallback: count 1 when wording is present but no number.
        if ("base" in t or "基地" in t) and ("destroy" in t or "level" in t or "摧毁" in t or "夷平" in t) and not loss_us.get("bases_destroyed"):
            loss_us["bases_destroyed"] = 1
        if ("base" in t or "基地" in t) and ("damage" in t or "hit" in t or "struck" in t or "strike" in t or "袭击" in t or "受损" in t) and not loss_us.get("bases_damaged"):
            loss_us["bases_damaged"] = 1
    # Aircraft / warships (side decided by surrounding context; defaults US).
    v = _first_int(t, r"(\d+)[\s\w]*(?:aircraft|plane|jet|fighter|f-?16|f-?35|f-?18)[\s\w]*(?:down|destroyed|lost|shot)")
    if v is not None:
        if "us" in t or "american" in t or "u.s" in t:
            loss_us["aircraft"] = v
        elif "iran" in t:
            loss_ir["aircraft"] = v
        else:
            loss_us["aircraft"] = v
    v = _first_int(t, r"(\d+)[\s\w]*(?:ship|destroyer|warship|vessel)[\s\w]*(?:hit|damaged|sunk)")
    if v is not None:
        if "iran" in t:
            loss_ir["warships"] = v
        else:
            loss_us["warships"] = v
    # Drones: drone / uav / 无人机 (three pattern orderings tried in turn)
    v = _first_int(t, r"(\d+)[\s\w]*(?:drone|uav|无人机)[\s\w]*(?:down|destroyed|shot|击落|摧毁)")
    if v is None:
        v = _first_int(text or t, r"(?:击落|摧毁)[^\d]*(\d+)[\s\w]*(?:drone|uav|无人机|架)")
    if v is None:
        v = _first_int(t, r"(?:drone|uav|无人机)[\s\w]*(\d+)[\s\w]*(?:down|destroyed|shot|击落|摧毁)")
    if v is not None:
        if "iran" in t or "iranian" in t or "shahed" in t or "沙希德" in t or "伊朗" in (text or ""):
            loss_ir["drones"] = v
        else:
            loss_us["drones"] = v
    # Missiles: missile / 导弹
    v = _first_int(t, r"(\d+)[\s\w]*(?:missile|导弹)[\s\w]*(?:fired|launched|intercepted|destroyed|发射|拦截|击落)")
    if v is not None:
        if "iran" in t or "iranian" in t:
            loss_ir["missiles"] = v
        else:
            loss_us["missiles"] = v
    v = _first_int(t, r"(?:missile|导弹)[\s\w]*(\d+)[\s\w]*(?:fired|launched|intercepted|destroyed|发射|拦截)") if not loss_us.get("missiles") and not loss_ir.get("missiles") else None
    if v is not None:
        if "iran" in t:
            loss_ir["missiles"] = v
        else:
            loss_us["missiles"] = v
    # Helicopters: helicopter / 直升机
    v = _first_int(t, r"(\d+)[\s\w]*(?:helicopter|直升机)[\s\w]*(?:down|destroyed|crashed|crashes|击落|坠毁)")
    if v is not None:
        if "iran" in t or "iranian" in t:
            loss_ir["helicopters"] = v
        else:
            loss_us["helicopters"] = v
    # Submarines: submarine / 潜艇
    v = _first_int(t, r"(\d+)[\s\w]*(?:submarine|潜艇)[\s\w]*(?:sunk|damaged|hit|destroyed|击沉|受损)")
    if v is not None:
        if "iran" in t or "iranian" in t:
            loss_ir["submarines"] = v
        else:
            loss_us["submarines"] = v
    # Carriers: carrier / 航空母舰 / 航母
    v = _first_int(t, r"(\d+)[\s\w]*(?:carrier|aircraft\s*carrier|航母|航空母舰)[\s\w]*(?:destroyed|damaged|lost|hit|sunk|摧毁|损毁|击毁|沉没)")
    if v is not None:
        if "iran" in t or "iranian" in t:
            loss_ir["carriers"] = v
        else:
            loss_us["carriers"] = v
    # Civilian ships: civilian ship / 商船 / 民船
    v = _first_int(t, r"(\d+)[\s\w]*(?:civilian\s*ship|merchant|商船|民船)[\s\w]*(?:sunk|damaged|hit|击沉|受损)")
    if v is None:
        v = _first_int(text or t, r"(?:民船|商船|货船)[\s\w]*(\d+)[\s\w]*(?:艘)?[\s\w]*(?:击沉|受损|袭击)")
    if v is not None:
        if "iran" in t or "iranian" in t or "伊朗" in (text or ""):
            loss_ir["civilian_ships"] = v
        else:
            loss_us["civilian_ships"] = v
    # Airports/ports: airport / port / 机场 / 港口
    v = _first_int(t, r"(\d+)[\s\w]*(?:airport|port|机场|港口)[\s\w]*(?:destroyed|damaged|hit|struck|摧毁|受损|袭击)")
    if v is None:
        v = _first_int(text or t, r"(?:机场|港口)[\s\w]*(\d+)[\s\w]*(?:处|个)?[\s\w]*(?:受损|袭击|摧毁)")
    if v is not None:
        if "iran" in t or "iranian" in t or "伊朗" in (text or ""):
            loss_ir["airport_port"] = v
        else:
            loss_us["airport_port"] = v
    if loss_us:
        out.setdefault("combat_losses_delta", {})["us"] = loss_us
    if loss_ir:
        out.setdefault("combat_losses_delta", {})["iran"] = loss_ir
    # Retaliation / Wall Street gauges use fixed heuristic values.
    if "retaliat" in t or "revenge" in t or "报复" in t or "反击" in t:
        out["retaliation"] = {"value": 75, "time": ts}
    if "wall street" in t or " dow " in t or "s&p" in t or "market slump" in t or "stock fall" in t or "美股" in t:
        out["wall_street"] = {"time": ts, "value": 55}
    # key_location_updates: attacked bases (matched against key_location.name)
    # Broad trigger-word set so more English reports update base status.
    attack_words = ("attack" in t or "attacked" in t or "hit" in t or "strike" in t or "struck" in t or "strikes" in t
                    or "damage" in t or "damaged" in t or "target" in t or "targeted" in t or "bomb" in t or "bombed" in t
                    or "袭击" in (text or "") or "遭袭" in (text or "") or "打击" in (text or "") or "受损" in (text or "") or "摧毁" in (text or ""))
    base_attacked = ("base" in t or "基地" in t or "outpost" in t or "facility" in t) and attack_words
    if base_attacked:
        updates: list = []
        # Known base keywords -> name_keywords (used by db_merge's LIKE
        # matching; must be able to match key_location.name).
        bases_all = [
            ("阿萨德|阿因|asad|assad|ain", "us"),
            ("巴格达|baghdad", "us"),
            ("乌代德|udeid|卡塔尔|qatar", "us"),
            ("阿克罗蒂里|akrotiri|塞浦路斯|cyprus", "us"),
            ("巴格拉姆|bagram|阿富汗|afghanistan", "us"),
            ("埃尔比勒|erbil", "us"),
            ("因吉尔利克|incirlik|土耳其|turkey", "us"),
            ("苏尔坦|sultan|沙特|saudi", "us"),
            ("坦夫|tanf|叙利亚|syria", "us"),
            ("达夫拉|dhafra|阿联酋|uae", "us"),
            ("内瓦提姆|nevatim|拉蒙|ramon|以色列|israel", "us"),
            ("赛利耶|sayliyah", "us"),
            ("巴林|bahrain", "us"),
            ("科威特|kuwait", "us"),
            # Iranian sites
            ("阿巴斯港|abbas|bandar abbas", "iran"),
            ("德黑兰|tehran", "iran"),
            ("布什尔|bushehr", "iran"),
            ("伊斯法罕|isfahan|esfahan", "iran"),
            ("纳坦兹|natanz", "iran"),
            ("米纳布|minab", "iran"),
            ("卡拉季|karaj", "iran"),
            ("克尔曼沙赫|kermanshah", "iran"),
            ("大不里士|tabriz", "iran"),
            ("霍尔木兹|hormuz", "iran"),
        ]
        for kws, side in bases_all:
            if any(k in t for k in kws.split("|")):
                updates.append({"name_keywords": kws, "side": side, "status": "attacked", "damage_level": 2})
        if updates:
            out["key_location_updates"] = updates
    return out

41
crawler/main.py Normal file
View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""爬虫入口:定时执行完整写库流水线(抓取 → 清洗 → 去重 → 映射 → 更新表 → 通知 API"""
import time
import sys
from pathlib import Path
# 确保能导入 config
sys.path.insert(0, str(Path(__file__).resolve().parent))
from config import DB_PATH, API_BASE, CRAWL_INTERVAL
from pipeline import run_full_pipeline
def run_once() -> int:
    """Run one full crawl round: fetch, clean, dedup, map, write, notify.

    Returns the number of new items this round -- panel rows if any were
    written, otherwise the count of new news rows.
    """
    _, new_news, new_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,
    )
    return new_panel or new_news
def main() -> None:
    """Crawler entry point: run `run_once` forever, one round every
    CRAWL_INTERVAL seconds.

    Ctrl-C exits the loop cleanly; any other per-round error is printed and
    the loop continues with the next round.
    """
    print("Crawler started. DB:", DB_PATH)
    print("API:", API_BASE, "| Interval:", CRAWL_INTERVAL, "s")
    while True:
        try:
            n = run_once()
            if n > 0:
                print(f"[{time.strftime('%H:%M:%S')}] 抓取完成,去重后新增 {n} 条,已写库并通知 API")
            # Fix: sleep inside the try so Ctrl-C during the wait (which is
            # where the process spends most of its time) triggers the clean
            # `break` instead of an unhandled KeyboardInterrupt traceback.
            time.sleep(CRAWL_INTERVAL)
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
            time.sleep(CRAWL_INTERVAL)


if __name__ == "__main__":
    main()

141
crawler/news_storage.py Normal file
View File

@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""
资讯内容独立存储,支持历史去重
爬虫拉回数据 → 计算 content_hash → 若已存在则跳过(去重)→ 新数据落库 news_content
"""
import hashlib
import os
import re
import sqlite3
from datetime import datetime, timezone
from typing import List, Optional, Tuple
from config import DB_PATH
def _to_utc_iso(dt: datetime) -> str:
if dt.tzinfo:
dt = dt.astimezone(timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
def _normalize_for_hash(text: str) -> str:
"""归一化文本用于生成去重 hash"""
if not text:
return ""
t = re.sub(r"\s+", " ", str(text).strip().lower())[:600]
return re.sub(r"[\x00-\x1f]", "", t)
def content_hash(title: str, summary: str, url: str) -> str:
"""根据标题、摘要、URL 生成去重 hash相似内容视为重复"""
raw = _normalize_for_hash(title) + "|" + _normalize_for_hash(summary) + "|" + (url or "").strip()
return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:32]
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS news_content (
id TEXT PRIMARY KEY,
content_hash TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
summary TEXT NOT NULL,
url TEXT NOT NULL DEFAULT '',
source TEXT NOT NULL DEFAULT '',
published_at TEXT NOT NULL,
category TEXT NOT NULL DEFAULT 'other',
severity TEXT NOT NULL DEFAULT 'medium',
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
try:
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)")
except sqlite3.OperationalError:
pass
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)")
except sqlite3.OperationalError:
pass
conn.commit()
def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool:
    """Return True when a news_content row with this content_hash exists."""
    cur = conn.execute("SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1", (h,))
    return cur.fetchone() is not None
def insert_news(
    conn: sqlite3.Connection,
    *,
    title: str,
    summary: str,
    url: str = "",
    source: str = "",
    published: datetime,
    category: str = "other",
    severity: str = "medium",
) -> Optional[str]:
    """Insert one news item into news_content, skipping duplicates by hash.

    Args:
        conn: Open SQLite connection; the table is created on demand.
        title/summary/url/source: Item fields, truncated to column limits.
        published: Publication time (aware or naive) stored as published_at.
        category/severity: Panel classification labels, stored as given.

    Returns:
        The new row id ("nc_" + 14 hex chars), or None when an item with the
        same content hash already exists (dedup skip).
    """
    _ensure_table(conn)
    h = content_hash(title, summary, url)
    if exists_by_hash(conn, h):
        return None
    # Mix the current time into the id so ids stay unique even if an equal
    # hash is re-inserted after deletion. Fix: datetime.now(timezone.utc)
    # replaces the deprecated, naive datetime.utcnow() (removal slated in
    # future Python versions); `timezone` is already imported at module top.
    uid = "nc_" + hashlib.sha256(f"{h}{datetime.now(timezone.utc).isoformat()}".encode()).hexdigest()[:14]
    ts = _to_utc_iso(published)
    conn.execute(
        """INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (uid, h, (title or "")[:500], (summary or "")[:2000], (url or "")[:500], (source or "")[:100], ts, category, severity),
    )
    conn.commit()
    return uid
def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]:
    """Deduplicate crawled items and persist new ones into news_content.

    items: [{"title","summary","url","published","category","severity","source"?}, ...]
    Returns (items that survived dedup -- each augmented with "news_id",
    count of rows actually inserted). If the DB file does not exist yet,
    nothing is written and ([], 0) is returned.
    """
    path = db_path or DB_PATH
    if not os.path.exists(path):
        return [], 0

    def _as_datetime(value):
        # Accept ISO strings (with trailing Z), datetime objects, or None.
        if isinstance(value, str):
            try:
                return datetime.fromisoformat(value.replace("Z", "+00:00"))
            except ValueError:
                return datetime.now(timezone.utc)
        if value is None:
            return datetime.now(timezone.utc)
        return value

    conn = sqlite3.connect(path, timeout=10)
    try:
        _ensure_table(conn)
        kept: List[dict] = []
        added = 0
        for item in items:
            uid = insert_news(
                conn,
                title=(item.get("title") or "")[:500],
                summary=(item.get("summary") or item.get("title") or "")[:2000],
                url=(item.get("url") or "")[:500],
                source=(item.get("source") or "")[:100],
                published=_as_datetime(item.get("published")),
                category=item.get("category", "other"),
                severity=item.get("severity", "medium"),
            )
            if uid:
                added += 1
                kept.append({**item, "news_id": uid})
        return kept, added
    finally:
        conn.close()

42
crawler/panel_schema.py Normal file
View File

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
前端面板完整数据 schema与 DB / situationData / useReplaySituation 对齐
爬虫 + AI 清洗后的数据必须符合此 schema 才能正确更新前端
"""
from typing import Any, Dict, List, Optional, Tuple
# Event-timeline enums (must stay in sync with the frontend situationData /
# useReplaySituation contract).
SITUATION_UPDATE_CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
SITUATION_UPDATE_SEVERITIES = ("low", "medium", "high", "critical")
SUMMARY_MAX_LEN = 120
# Combat-losses row: bases_destroyed, bases_damaged, personnel_killed, ...
CombatLossesRow = Dict[str, Any]
# Replay time-series point: (ISO time, value).
TimeSeriesPoint = Tuple[str, int]
# Fields the AI may extract from news text.
# NOTE(review): this list contains "tanks" while the extractors emit
# "armor" -- confirm which key the DB / frontend actually expects.
EXTRACTABLE_FIELDS = {
    "situation_update": ["summary", "category", "severity", "timestamp"],
    "combat_losses": ["personnel_killed", "personnel_wounded", "civilian_killed", "civilian_wounded", "bases_destroyed", "bases_damaged", "aircraft", "warships", "armor", "vehicles", "drones", "missiles", "helicopters", "submarines", "tanks", "carriers", "civilian_ships", "airport_port"],
    "retaliation": ["value"],  # 0-100
    "wall_street_trend": ["time", "value"],  # 0-100
    "conflict_stats": ["estimated_casualties", "estimated_strike_count"],
}


def validate_category(cat: str) -> str:
    """Clamp a category to the known set; unknown values become "other"."""
    return cat if cat in SITUATION_UPDATE_CATEGORIES else "other"


def validate_severity(sev: str) -> str:
    """Clamp a severity to the known set; unknown values become "medium"."""
    return sev if sev in SITUATION_UPDATE_SEVERITIES else "medium"


def validate_summary(s: str, max_len: int = SUMMARY_MAX_LEN) -> str:
    """Sanitize a summary: collapse whitespace, cap the length, and strip
    ASCII control characters. Non-strings and empty input yield ""."""
    import re
    if not s or not isinstance(s, str):
        return ""
    cleaned = re.sub(r"\s+", " ", str(s).strip())[:max_len]
    return re.sub(r"[\x00-\x1f]", "", cleaned).rstrip()

66
crawler/parser.py Normal file
View File

@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
"""新闻分类与严重度判定"""
import re
from typing import List
try:
from typing import Literal # type: ignore
except ImportError:
try:
from typing_extensions import Literal # type: ignore
except ImportError:
from typing import Any
class _LiteralFallback:
def __getitem__(self, item):
return Any
Literal = _LiteralFallback()
Category = Literal["deployment", "alert", "intel", "diplomatic", "other"]
Severity = Literal["low", "medium", "high", "critical"]
# 分类关键词
CAT_DEPLOYMENT = ["deploy", "carrier", "航母", "military build", "troop", "forces"]
CAT_ALERT = ["strike", "attack", "fire", "blast", "hit", "爆炸", "袭击", "打击"]
CAT_INTEL = ["satellite", "intel", "image", "surveillance", "卫星", "情报"]
CAT_DIPLOMATIC = ["talk", "negotiation", "diplomat", "sanction", "谈判", "制裁"]
def _match(text: str, words: List[str]) -> bool:
t = (text or "").lower()
for w in words:
if w.lower() in t:
return True
return False
def classify(text: str) -> Category:
if _match(text, CAT_ALERT):
return "alert"
if _match(text, CAT_DEPLOYMENT):
return "deployment"
if _match(text, CAT_INTEL):
return "intel"
if _match(text, CAT_DIPLOMATIC):
return "diplomatic"
return "other"
def severity(text: str, category: Category) -> Severity:
    """Rule-based severity for a news item.

    Order: critical keyword > high keyword > category mapping
    (alert -> high, deployment -> medium) > low.
    """
    t = (text or "").lower()
    # Fix: the critical list contained an empty string "" (almost certainly
    # a lost "核" / "nuclear" CJK character, matching the EN/ZH pairing used
    # throughout these tables). Since "" is a substring of every string,
    # *all* items were being rated critical. Restored to "核".
    critical = [
        "nuclear", "核", "strike", "attack", "killed", "dead", "casualty",
        "war", "invasion", "袭击", "打击", "死亡",
    ]
    high = [
        "missile", "drone", "bomb", "explosion", "blasted", "fire",
        "导弹", "无人机", "爆炸", "轰炸",
    ]
    if _match(t, critical):
        return "critical"
    if _match(t, high) or category == "alert":
        return "high"
    if category == "deployment":
        return "medium"
    return "low"

150
crawler/parser_ai.py Normal file
View File

@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
AI 新闻分类与严重度判定
优先 DASHSCOPE_API_KEY通义无需 Ollama否则 Ollama最后规则
设置 PARSER_AI_DISABLED=1 可只用规则(更快)
"""
import os
from typing import Any, Optional, Tuple
# `typing.Literal` with graceful degradation: stdlib (3.8+), then
# typing_extensions, then a dummy subscriptable that turns Literal[...] into
# Any so the module still imports anywhere. `Any` comes from the module's
# top-level typing import.
try:
    from typing import Literal  # type: ignore
except ImportError:
    try:
        from typing_extensions import Literal  # type: ignore
    except ImportError:
        class _LiteralFallback:
            # Literal["a", "b"] -> Any (no static checking, runtime no-op).
            def __getitem__(self, item):
                return Any
        Literal = _LiteralFallback()
Category = Literal["deployment", "alert", "intel", "diplomatic", "other"]
Severity = Literal["low", "medium", "high", "critical"]
# PARSER_AI_DISABLED=1 forces rule-only classification (faster, no AI calls).
PARSER_AI_DISABLED = os.environ.get("PARSER_AI_DISABLED", "0") == "1"
# Local Ollama model name used when no DashScope key is configured.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")
# Aliyun DashScope key; when set, DashScope is preferred over Ollama.
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "").strip()
# Canonical label sets, in the order they are searched in AI replies.
_CATEGORIES = ("deployment", "alert", "intel", "diplomatic", "other")
_SEVERITIES = ("low", "medium", "high", "critical")


def _parse_ai_response(text: str) -> Tuple[Category, Severity]:
    """Parse a "category:severity" style AI reply.

    The first known category / severity substring found in the (lowercased)
    reply wins; defaults are ("other", "low") when nothing matches.
    """
    reply = (text or "").strip().lower()
    category = next((c for c in _CATEGORIES if c in reply), "other")
    level = next((s for s in _SEVERITIES if s in reply), "low")
    return category, level  # type: ignore
def _call_dashscope(text: str, timeout: int = 6) -> Optional[Tuple[Category, Severity]]:
    """Classify via Aliyun DashScope (qwen-turbo); no Ollama required.

    Requires DASHSCOPE_API_KEY. Returns (category, severity) or None when
    disabled, unconfigured, or on any SDK/HTTP failure.
    NOTE(review): `timeout` is accepted but not forwarded to the SDK call.
    """
    if not DASHSCOPE_API_KEY or PARSER_AI_DISABLED:
        return None
    try:
        import dashscope
        from http import HTTPStatus
        dashscope.api_key = DASHSCOPE_API_KEY
        prompt = f"""Classify this news about US-Iran/middle east (one line only):
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
News: {text[:300]}
Reply format: category:severity (e.g. alert:high)"""
        r = dashscope.Generation.call(
            model="qwen-turbo",
            messages=[{"role": "user", "content": prompt}],
            result_format="message",
            max_tokens=32,
        )
        if r.status_code != HTTPStatus.OK:
            return None
        out = r.output.get("choices", [{}])[0].get("message", {}).get("content", "")
        return _parse_ai_response(out)
    except Exception:
        # Silent fallback: caller degrades to Ollama or rule-based parsing.
        return None
def _call_ollama(text: str, timeout: int = 5) -> Optional[Tuple[Category, Severity]]:
    """Classify via a local Ollama model (requires `ollama run llama3.1`
    or whatever OLLAMA_MODEL names).

    Returns (category, severity) or None when disabled or on any
    network/HTTP failure.
    """
    if PARSER_AI_DISABLED:
        return None
    try:
        import requests
        prompt = f"""Classify this news about US-Iran/middle east (one line only):
- category: deployment|alert|intel|diplomatic|other
- severity: low|medium|high|critical
News: {text[:300]}
Reply format: category:severity (e.g. alert:high)"""
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                # Cap generation length; the reply is a single short line.
                "options": {"num_predict": 32},
            },
            timeout=timeout,
        )
        if r.status_code != 200:
            return None
        out = r.json().get("message", {}).get("content", "")
        return _parse_ai_response(out)
    except Exception:
        # Silent fallback: caller degrades to rule-based parsing.
        return None
def _rule_classify(text: str) -> Category:
    """Fallback: keyword-rule classifier from parser.py."""
    from parser import classify as rule_classify
    return rule_classify(text)
def _rule_severity(text: str, category: Category) -> Severity:
    """Fallback: keyword-rule severity from parser.py."""
    from parser import severity as rule_severity
    return rule_severity(text, category)
def _call_ai(text: str) -> Optional[Tuple[Category, Severity]]:
    """Route to DashScope when an API key is configured, else local Ollama."""
    return _call_dashscope(text) if DASHSCOPE_API_KEY else _call_ollama(text)
def classify(text: str) -> Category:
    """AI-backed category; falls back to the rule classifier on AI failure."""
    result = _call_ai(text)
    return result[0] if result else _rule_classify(text)
def severity(text: str, category: Category) -> Severity:
    """AI-backed severity; falls back to the rule parser on AI failure."""
    result = _call_ai(text)
    return result[1] if result else _rule_severity(text, category)
def classify_and_severity(text: str) -> Tuple[Category, Severity]:
    """Return (category, severity) with a single AI call (instead of the two
    calls that classify() + severity() would make).

    When PARSER_AI_DISABLED is set, uses the rule parser directly; otherwise
    tries the AI once and falls back to the rules on failure.
    """
    if PARSER_AI_DISABLED:
        from parser import classify, severity
        c = classify(text)
        return c, severity(text, c)
    res = _call_ai(text)
    if res:
        return res
    # Fix: the rule fallback previously evaluated _rule_classify(text)
    # twice; compute the category once and reuse it.
    c = _rule_classify(text)
    return c, _rule_severity(text, c)

190
crawler/pipeline.py Normal file
View File

@@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
"""
统一写库流水线:抓取 → 清洗 → 去重 → 映射到前端库字段 → 更新表 → 通知
与 server/README.md 第五节「爬虫侧写库链路」一致,供 main.py 与 realtime_conflict_service 共用。
"""
import os
from datetime import datetime, timezone
from typing import Callable, Optional, Tuple
from config import DB_PATH, API_BASE
from db_writer import touch_situation_updated_at_path
def _notify_api(api_base: str) -> bool:
    """POST /api/crawler/notify on the Node API to trigger an immediate
    broadcast.

    Sends X-Crawler-Token when API_CRAWLER_TOKEN is set. Returns True on
    HTTP 200, False (with a warning printed) on any failure.
    """
    try:
        import urllib.request
        token = os.environ.get("API_CRAWLER_TOKEN", "").strip()
        headers = {"Content-Type": "application/json"}
        if token:
            headers["X-Crawler-Token"] = token
        req = urllib.request.Request(
            f"{api_base.rstrip('/')}/api/crawler/notify",
            method="POST",
            headers=headers,
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            return resp.status == 200
    except Exception as e:
        print(f" [warn] notify API failed: {e}")
        return False
def _extract_and_merge(items: list, db_path: str) -> bool:
    """AI-extract precise structured data from article text (full body when
    available, else title+summary) and merge it into combat_losses /
    key_location and related tables via db_merge.

    Extractor selection: DashScope when DASHSCOPE_API_KEY is set, else the
    pure rule extractor when CLEANER_AI_DISABLED=1, else the Ollama-based
    extractor. Rule extraction is cheap, so it gets a larger per-round limit.

    Returns True when at least one item produced data that was merged.
    """
    if not items or not os.path.exists(db_path):
        return False
    try:
        from db_merge import merge
        use_dashscope = bool(os.environ.get("DASHSCOPE_API_KEY", "").strip())
        if use_dashscope:
            from extractor_dashscope import extract_from_news
            limit = 10
        elif os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
            from extractor_rules import extract_from_news
            limit = 25
        else:
            from extractor_ai import extract_from_news
            limit = 10
        merged_any = False
        for it in items[:limit]:
            # Prefer the full article body (fetched by article_fetcher);
            # otherwise fall back to "title summary" so the AI still has
            # something to extract numbers from.
            text = it.get("full_text") or ((it.get("title", "") or "") + " " + (it.get("summary", "") or ""))
            if len(text.strip()) < 20:
                continue
            pub = it.get("published")
            ts = None
            if pub:
                try:
                    if isinstance(pub, str):
                        pub_dt = datetime.fromisoformat(pub.replace("Z", "+00:00"))
                    else:
                        pub_dt = pub
                    if pub_dt.tzinfo:
                        pub_dt = pub_dt.astimezone(timezone.utc)
                    ts = pub_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
                except Exception:
                    # Unparsable timestamps fall back to the extractor default.
                    pass
            extracted = extract_from_news(text, timestamp=ts)
            if extracted and merge(extracted, db_path=db_path):
                merged_any = True
        return merged_any
    except Exception as e:
        print(f" [warn] AI 面板数据提取/合并: {e}")
        return False
def run_full_pipeline(
    db_path: Optional[str] = None,
    api_base: Optional[str] = None,
    *,
    translate: bool = True,
    notify: bool = True,
    on_notify: Optional[Callable[[], None]] = None,
) -> Tuple[int, int, int]:
    """Run the complete write-to-DB pipeline:

    1. Fetch live items via the scrapers.
    2. AI-clean titles/summaries/categories into valid panel data.
    3. Deduplicate via news_content content_hash; only new items continue.
    4. Map valid data onto the frontend DB fields (situation_update,
       news_content, combat_losses, ...).
    5. Update the DB tables; notify the backend.

    translate: translate titles/summaries (EN -> ZH) before cleaning.
    notify: call POST /api/crawler/notify at the end of the pipeline.
    on_notify: if given, invoked right before notification (used by the
        gdelt service for GDELT backfill).

    Returns: (items fetched this round, new news rows after dedup,
    situation_update rows written).
    """
    path = db_path or DB_PATH
    base = api_base or API_BASE
    from scrapers.rss_scraper import fetch_all
    from db_writer import write_updates
    from news_storage import save_and_dedup
    from cleaner_ai import clean_news_for_panel, ensure_category, ensure_severity
    # 1. Fetch
    items = fetch_all()
    if not items:
        return 0, 0, 0
    # Optional: keep only items published after a configured start time
    # (e.g. CRAWL_START_DATE=2026-02-28T00:00:00).
    start_date_env = os.environ.get("CRAWL_START_DATE", "").strip()
    if start_date_env:
        try:
            raw = start_date_env.replace("Z", "+00:00").strip()
            start_dt = datetime.fromisoformat(raw)
            if start_dt.tzinfo is None:
                start_dt = start_dt.replace(tzinfo=timezone.utc)
            else:
                start_dt = start_dt.astimezone(timezone.utc)
            before = len(items)
            # Items with no published date compare as datetime.min (kept out).
            items = [it for it in items if (it.get("published") or datetime.min.replace(tzinfo=timezone.utc)) >= start_dt]
            if before > len(items):
                print(f" [pipeline] 按 CRAWL_START_DATE={start_date_env} 过滤后保留 {len(items)} 条(原 {before} 条)")
        except Exception as e:
            print(f" [warn] CRAWL_START_DATE 解析失败,忽略: {e}")
    if not items:
        return 0, 0, 0
    n_total = len(items)
    print(f" [pipeline] 抓取 {n_total} 条")
    for i, it in enumerate(items[:5]):
        title = (it.get("title") or it.get("summary") or "").strip()[:60]
        # NOTE(review): both branches below append "" — this looks like a
        # lost ellipsis character ("…") in one branch; confirm against VCS.
        print(f" [{i + 1}] {title}" + ("" if len((it.get("title") or it.get("summary") or "")[:60]) >= 60 else ""))
    if n_total > 5:
        print(f" ... 共 {n_total} 条")
    # 2. Clean titles/summaries to fit the panel schema (with optional
    # EN -> ZH translation first).
    if translate:
        from translate_utils import translate_to_chinese
        for it in items:
            raw_title = translate_to_chinese(it.get("title", "") or "")
            raw_summary = translate_to_chinese(it.get("summary", "") or it.get("title", ""))
            it["title"] = clean_news_for_panel(raw_title, max_len=80)
            it["summary"] = clean_news_for_panel(raw_summary or raw_title, max_len=120)
    else:
        for it in items:
            it["title"] = clean_news_for_panel(it.get("title", "") or "", max_len=80)
            it["summary"] = clean_news_for_panel(it.get("summary", "") or it.get("title", ""), max_len=120)
    for it in items:
        it["category"] = ensure_category(it.get("category", "other"))
        it["severity"] = ensure_severity(it.get("severity", "medium"))
        it["source"] = it.get("source") or "rss"
    # 3. Dedup: persist to news_content; only genuinely new items come back.
    new_items, n_news = save_and_dedup(items, db_path=path)
    if new_items:
        print(f" [pipeline] 去重后新增 {n_news} 条,写入事件脉络 {len(new_items)} 条")
        for i, it in enumerate(new_items[:3]):
            title = (it.get("title") or it.get("summary") or "").strip()[:55]
            # NOTE(review): same suspected lost-ellipsis artifact as above.
            print(f" 新增 [{i + 1}] {title}" + ("" if len((it.get("title") or it.get("summary") or "").strip()) > 55 else ""))
    # 3.5 Enrichment: fetch full article bodies for items that will go
    # through AI extraction, so precise numbers (casualties, bases, ...)
    # can be pulled from the full text.
    if new_items:
        try:
            from article_fetcher import enrich_item_with_body
            # Only fetch bodies for the first few items to cap request volume.
            enrich_limit = int(os.environ.get("ARTICLE_FETCH_LIMIT", "10"))
            for it in new_items[:enrich_limit]:
                enrich_item_with_body(it)
        except Exception as e:
            print(f" [warn] 正文抓取: {e}")
    # 4. Map onto frontend DB fields and update tables (same db path as the
    # dedup/AI steps so everything lands in one database).
    n_panel = write_updates(new_items, db_path=path) if new_items else 0
    if new_items:
        _extract_and_merge(new_items, path)
    # 4.5 Refresh situation.updated_at on every run so the frontend can show
    # a current "last crawl time" even when no new rows were added.
    touch_situation_updated_at_path(db_path=path)
    # 5. Notify on every run so the API reloads and broadcasts lastUpdated.
    if on_notify:
        on_notify()
    if notify:
        _notify_api(base)
    return len(items), n_news, n_panel

20
crawler/pyproject.toml Normal file
View File

@@ -0,0 +1,20 @@
[project]
name = "usa-crawler"
version = "1.0.0"
description = "GDELT + RSS 爬虫与实时冲突服务"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"requests>=2.32.0",
"feedparser>=6.0.10",
"beautifulsoup4>=4.12.0",
"pytest>=8.0.0",
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"deep-translator>=1.11.0",
"dashscope>=1.20.0",
]
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]

View File

@@ -0,0 +1,504 @@
# -*- coding: utf-8 -*-
"""
GDELT 实时冲突抓取 + API 服务
核心数据源GDELT Project约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度
"""
import os
# Connect directly (bypass the system proxy) to avoid ProxyError/timeouts;
# set CRAWLER_USE_PROXY=1 when a proxy is actually required.
if os.environ.get("CRAWLER_USE_PROXY") != "1":
    os.environ.setdefault("NO_PROXY", "*")
import hashlib
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import List, Optional
import asyncio
import logging
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
logging.getLogger("uvicorn").setLevel(logging.INFO)
app = FastAPI(title="GDELT Conflict Service")
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
# Configuration (all values overridable via environment variables)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = os.environ.get("DB_PATH", str(PROJECT_ROOT / "server" / "data.db"))
API_BASE = os.environ.get("API_BASE", "http://localhost:3001")
QUERY = os.environ.get("GDELT_QUERY", "United States Iran military")
MAX_RECORDS = int(os.environ.get("GDELT_MAX_RECORDS", "30"))
FETCH_INTERVAL_SEC = int(os.environ.get("FETCH_INTERVAL_SEC", "60"))
RSS_INTERVAL_SEC = int(os.environ.get("RSS_INTERVAL_SEC", "60"))  # poll major world media every minute
# GDELT timespan: 1h=1 hour, 1d=1 day, 1week=1 week; the API default is
# ~3 months, which tends to return stale articles.
GDELT_TIMESPAN = os.environ.get("GDELT_TIMESPAN", "1d")
# Set to 1 to skip GDELT and build the event timeline from RSS news only
# (GDELT may be unreachable from some regions).
GDELT_DISABLED = os.environ.get("GDELT_DISABLED", "0") == "1"
# Default Iranian attack origin when an event has no coordinates.
IRAN_COORD = [51.3890, 35.6892]  # Tehran [lng, lat]
# Shared kwargs for direct requests, bypassing the system proxy
# (avoids ProxyError / proxy timeouts).
_REQ_KW = {"timeout": 15, "headers": {"User-Agent": "US-Iran-Dashboard/1.0"}}
if os.environ.get("CRAWLER_USE_PROXY") != "1":
    _REQ_KW["proxies"] = {"http": None, "https": None}
# In-memory cache of the most recently fetched events.
EVENT_CACHE: List[dict] = []
# ==========================
# 冲突强度评分 (110)
# ==========================
def calculate_impact_score(title: str) -> int:
    """Score the conflict intensity of a headline on a 1-10 scale.

    Each matched keyword group (English or Chinese) adds a fixed weight and
    the total is clamped to 10. Empty/None titles score the minimum of 1.

    Args:
        title: Headline text; may be None or empty.

    Returns:
        Integer score in [1, 10].
    """
    score = 1
    t = (title or "").lower()
    if "missile" in t or "导弹" in t:
        score += 3
    if "strike" in t or "袭击" in t or "打击" in t:
        score += 2
    if "killed" in t or "death" in t or "casualt" in t or "死亡" in t or "伤亡" in t:
        score += 4
    if "troops" in t or "soldier" in t or "士兵" in t or "军人" in t:
        score += 2
    if "attack" in t or "attacked" in t or "攻击" in t:
        score += 3
    # BUG FIX: the original condition was `"nuclear" in t or "" in t` — an empty
    # string is contained in every string, so +4 was added unconditionally to
    # every title. The Chinese keyword was evidently lost to an encoding
    # mishap; restore it as "核" (nuclear).
    if "nuclear" in t or "核" in t:
        score += 4
    if "explosion" in t or "blast" in t or "bomb" in t or "爆炸" in t:
        score += 2
    return min(score, 10)
# 根据 severity 映射到 impact_score
def _severity_to_score(sev: str) -> int:
m = {"critical": 9, "high": 7, "medium": 5, "low": 2}
return m.get((sev or "").lower(), 5)
# Keyword -> (lng, lat) lookup used to geolocate RSS text when GDELT is
# disabled and events must still be placed on the map.
_LOC_COORDS = [
    (["阿克罗蒂里", "akrotiri", "塞浦路斯", "cyprus"], (32.98, 34.58)),
    (["巴格拉姆", "bagram", "阿富汗", "afghanistan"], (69.26, 34.95)),
    (["巴格达", "baghdad", "伊拉克", "iraq"], (44.37, 33.31)),
    (["贝鲁特", "beirut", "黎巴嫩", "lebanon"], (35.49, 33.89)),
    (["耶路撒冷", "jerusalem", "特拉维夫", "tel aviv", "以色列", "israel"], (35.21, 31.77)),
    (["阿巴斯港", "bandar abbas", "霍尔木兹", "hormuz"], (56.27, 27.18)),
    (["米纳布", "minab"], (57.08, 27.13)),
    (["德黑兰", "tehran", "伊朗", "iran"], (51.389, 35.689)),
    (["大马士革", "damascus", "叙利亚", "syria"], (36.28, 33.50)),
    (["迪拜", "dubai", "阿联酋", "uae"], (55.27, 25.20)),
    (["沙特", "saudi"], (46.73, 24.71)),
    (["巴基斯坦", "pakistan"], (73.06, 33.72)),
    (["奥斯汀", "austin"], (-97.74, 30.27)),
]

def _infer_coords(text: str) -> tuple:
    """Return (lng, lat) inferred from location keywords found in `text`.

    The first table row with any matching keyword wins; when nothing matches,
    the Tehran coordinate (IRAN_COORD, the presumed attack origin) is used.
    """
    haystack = (text or "").lower()
    for keywords, coords in _LOC_COORDS:
        if any(kw in haystack for kw in keywords):
            return coords
    return (IRAN_COORD[0], IRAN_COORD[1])
# ==========================
# 获取 GDELT 实时事件
# ==========================
def _parse_article(article: dict) -> Optional[dict]:
    """Normalize one GDELT article dict into an internal event record.

    Returns None when no usable title can be found. The title is translated
    to Chinese and cleaned before storage; missing or non-numeric coordinates
    fall back to Tehran (IRAN_COORD), the presumed attack origin.
    """
    # NOTE(review): falls back to `seendate` as the title when `title` is
    # absent — looks intentional (keeps the record), but confirm upstream.
    title_raw = article.get("title") or article.get("seendate") or ""
    if not title_raw:
        return None
    # Imported lazily so importing this module does not pull in the
    # translation/cleaning dependencies.
    from translate_utils import translate_to_chinese
    from cleaner_ai import clean_news_for_panel
    title = translate_to_chinese(str(title_raw)[:500])
    title = clean_news_for_panel(title, max_len=150)
    url = article.get("url") or article.get("socialimage") or ""
    seendate = article.get("seendate") or datetime.utcnow().isoformat()
    lat = article.get("lat")
    lng = article.get("lng")
    # Use the Iran coordinate (attack origin) when no lat/lng is provided.
    if lat is None or lng is None:
        lat, lng = IRAN_COORD[1], IRAN_COORD[0]
    try:
        lat, lng = float(lat), float(lng)
    except (TypeError, ValueError):
        lat, lng = IRAN_COORD[1], IRAN_COORD[0]
    # Score intensity from the raw (untranslated) title so English keywords match.
    impact = calculate_impact_score(title_raw)
    # Stable 24-hex id derived from URL + first-seen date.
    event_id = hashlib.sha256(f"{url}{seendate}".encode()).hexdigest()[:24]
    return {
        "event_id": event_id,
        "event_time": seendate,
        "title": title[:500],
        "lat": lat,
        "lng": lng,
        "impact_score": impact,
        "url": url,
    }
def fetch_gdelt_events() -> None:
    """Fetch recent GDELT articles, refresh EVENT_CACHE, persist, and notify Node.

    No-op when GDELT_DISABLED is set. Failures are logged and leave the
    existing cache untouched; previously they were swallowed silently
    (`except Exception: pass`), which made network outages invisible.
    """
    if GDELT_DISABLED:
        return
    url = (
        "https://api.gdeltproject.org/api/v2/doc/doc"
        f"?query={QUERY}"
        "&mode=ArtList"
        "&format=json"
        f"&maxrecords={MAX_RECORDS}"
        f"&timespan={GDELT_TIMESPAN}"
        "&sort=datedesc"
    )
    try:
        resp = requests.get(url, **_REQ_KW)
        resp.raise_for_status()
        data = resp.json()
        # The API may return {"articles": [...]} or a bare list.
        articles = data.get("articles", data) if isinstance(data, dict) else (data if isinstance(data, list) else [])
        if not isinstance(articles, list):
            articles = []
        new_events = []
        for a in articles:
            ev = _parse_article(a) if isinstance(a, dict) else None
            if ev:
                new_events.append(ev)
        # Newest first by event_time.
        new_events.sort(key=lambda e: e.get("event_time", ""), reverse=True)
        global EVENT_CACHE
        EVENT_CACHE = new_events
        # Persist to SQLite and ask the Node API to rebroadcast.
        _write_to_db(new_events)
        _notify_node()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] GDELT 更新 {len(new_events)} 条事件")
    except Exception as e:
        # Best-effort: keep the service loop alive, but surface the failure
        # instead of hiding it.
        print(f" [warn] GDELT fetch failed: {e}")
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS gdelt_events (
event_id TEXT PRIMARY KEY,
event_time TEXT NOT NULL,
title TEXT NOT NULL,
lat REAL NOT NULL,
lng REAL NOT NULL,
impact_score INTEGER NOT NULL,
url TEXT,
created_at TEXT DEFAULT (datetime('now'))
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS conflict_stats (
id INTEGER PRIMARY KEY CHECK (id = 1),
total_events INTEGER NOT NULL,
high_impact_events INTEGER NOT NULL,
estimated_casualties INTEGER NOT NULL,
estimated_strike_count INTEGER NOT NULL,
updated_at TEXT NOT NULL
)
""")
conn.commit()
def _write_to_db(events: List[dict]) -> None:
    """Upsert GDELT events plus derived conflict stats into SQLite.

    Returns silently when the DB file does not exist yet (the Node side seeds
    it). Also touches the `situation` row so the frontend sees a fresh
    `updated_at` timestamp. Any failure is logged and rolled back.
    """
    if not os.path.exists(DB_PATH):
        return
    conn = sqlite3.connect(DB_PATH, timeout=10)
    try:
        _ensure_table(conn)
        for e in events:
            conn.execute(
                "INSERT OR REPLACE INTO gdelt_events (event_id, event_time, title, lat, lng, impact_score, url) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    e["event_id"],
                    e.get("event_time", ""),
                    e.get("title", ""),
                    e.get("lat", 0),
                    e.get("lng", 0),
                    e.get("impact_score", 1),
                    e.get("url", ""),
                ),
            )
        # Display-only battle-damage model derived from the current batch.
        high = sum(1 for x in events if x.get("impact_score", 0) >= 7)
        strikes = sum(1 for x in events if "strike" in (x.get("title") or "").lower() or "attack" in (x.get("title") or "").lower())
        casualties = min(5000, high * 80 + len(events) * 10)  # rough estimate, capped at 5000
        conn.execute(
            "INSERT OR REPLACE INTO conflict_stats (id, total_events, high_impact_events, estimated_casualties, estimated_strike_count, updated_at) VALUES (1, ?, ?, ?, ?, ?)",
            (len(events), high, casualties, strikes, datetime.utcnow().isoformat()),
        )
        # Touch situation.updated_at (table owned by the Node side) so the
        # frontend "last updated" display refreshes.
        conn.execute(
            "INSERT OR REPLACE INTO situation (id, data, updated_at) VALUES (1, '{}', ?)",
            (datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),),
        )
        conn.commit()
    except Exception as e:
        print(f"写入 DB 失败: {e}")
        conn.rollback()
    finally:
        conn.close()
def _notify_node() -> None:
    """POST to the Node API so it reloads data and rebroadcasts to clients.

    Sends the optional X-Crawler-Token header when API_CRAWLER_TOKEN is set.
    Bypasses any system proxy. Failures are logged, never raised.
    """
    token = os.environ.get("API_CRAWLER_TOKEN", "").strip()
    headers = {"X-Crawler-Token": token} if token else {}
    try:
        response = requests.post(
            f"{API_BASE}/api/crawler/notify",
            timeout=5,
            headers=headers,
            proxies={"http": None, "https": None},
        )
        if response.status_code != 200:
            print(" [warn] notify API 失败")
    except Exception as e:
        print(f" [warn] notify API: {e}")
def _rss_to_gdelt_fallback() -> None:
    """When GDELT is disabled, mirror situation_update rows into gdelt_events.

    Gives the map conflict points even without GDELT access: coordinates are
    inferred from keywords in each summary and impact is derived from the row
    severity. Best-effort — any failure is logged and ignored.
    """
    if not GDELT_DISABLED or not os.path.exists(DB_PATH):
        return
    try:
        conn = sqlite3.connect(DB_PATH, timeout=10)
        rows = conn.execute(
            "SELECT id, timestamp, category, summary, severity FROM situation_update ORDER BY timestamp DESC LIMIT 50"
        ).fetchall()
        conn.close()
        events = []
        for r in rows:
            uid, ts, cat, summary, sev = r
            # Only the first 300 chars are scanned for location keywords.
            lng, lat = _infer_coords((summary or "")[:300])
            impact = _severity_to_score(sev)
            events.append({
                "event_id": f"rss_{uid}",  # prefix avoids clashing with GDELT ids
                "event_time": ts,
                "title": (summary or "")[:500],
                "lat": lat,
                "lng": lng,
                "impact_score": impact,
                "url": "",
            })
        if events:
            global EVENT_CACHE
            EVENT_CACHE = events
            _write_to_db(events)
            _notify_node()
    except Exception as e:
        print(f" [warn] RSS→gdelt fallback: {e}")
# ==========================
# RSS 新闻抓取:使用统一流水线(抓取 → 清洗 → 去重 → 映射 → 写表 → 通知)
# ==========================
LAST_FETCH = {"items": 0, "inserted": 0, "error": None}
def _refresh_panel_data() -> int:
    """Re-extract panel data (combat losses, bases, …) from recent events.

    Runs independently of whether the current cycle produced new RSS items:
    re-reads the latest 50 situation_update rows, runs the configured
    extractor on each, and merges results into the panel tables.

    Returns:
        Number of rows successfully merged; 0 on any top-level failure.
    """
    if not os.path.exists(DB_PATH):
        return 0
    try:
        from db_merge import merge
        # Extractor priority: DashScope (API key set) > rules (AI disabled) > Ollama AI.
        use_dashscope = bool(os.environ.get("DASHSCOPE_API_KEY", "").strip())
        if use_dashscope:
            from extractor_dashscope import extract_from_news
        elif os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
            from extractor_rules import extract_from_news
        else:
            from extractor_ai import extract_from_news
        conn = sqlite3.connect(DB_PATH, timeout=10)
        rows = conn.execute(
            "SELECT id, timestamp, category, summary FROM situation_update ORDER BY timestamp DESC LIMIT 50"
        ).fetchall()
        conn.close()
        merged = 0
        for r in rows:
            uid, ts, cat, summary = r
            text = ((cat or "") + " " + (summary or "")).strip()
            if len(text) < 20:  # too short to extract anything meaningful
                continue
            try:
                extracted = extract_from_news(text, timestamp=ts)
                if extracted and merge(extracted, db_path=DB_PATH):
                    merged += 1
            except Exception:
                # Per-row best-effort: one bad row must not abort the batch.
                pass
        return merged
    except Exception:
        return 0
def fetch_news() -> None:
    """Run the full RSS write pipeline and record the outcome in LAST_FETCH.

    Produces panel data (combat losses, bases, conflict events) plus the
    event timeline. When GDELT is disabled, backfills gdelt_events from RSS
    so the map still has points. Always notifies the Node API afterwards.
    """
    try:
        from pipeline import run_full_pipeline
        LAST_FETCH["error"] = None
        n_fetched, n_news, n_panel = run_full_pipeline(
            db_path=DB_PATH,
            api_base=API_BASE,
            translate=True,
            notify=False,  # notification happens below, after the optional fallback
        )
        LAST_FETCH["items"] = n_fetched
        LAST_FETCH["inserted"] = n_news
        if GDELT_DISABLED:
            _rss_to_gdelt_fallback()
        _notify_node()
        ts = datetime.now().strftime("%H:%M:%S")
        print(f"[{ts}] 抓取 {n_fetched} 条,去重新增 {n_news} 条,写脉络 {n_panel} 条 → 面板实时数据(战损/据点)已由本批提取更新")
        if n_fetched == 0:
            print(f"[{ts}] 0 条检查网络、RSS 源或 KEYWORDS 过滤)")
    except Exception as e:
        LAST_FETCH["error"] = str(e)
        print(f"[{datetime.now().strftime('%H:%M:%S')}] 新闻抓取失败: {e}")
# 每 N 轮做一次「从近期事件回填面板实时数据」,保证战损/据点等与最新内容一致
BACKFILL_CYCLES = int(os.environ.get("BACKFILL_CYCLES", "2"))
_cycle_count = 0
# ==========================
# 定时任务asyncio 后台任务,避免 APScheduler executor 关闭竞态)
# ==========================
_bg_task: Optional[asyncio.Task] = None
async def _periodic_fetch() -> None:
    """Background loop: fetch RSS + GDELT, periodically backfill panel data.

    Blocking fetchers run in the default thread-pool executor so the event
    loop stays responsive. Exits cleanly when the task is cancelled; all
    other exceptions are logged and the loop continues.
    """
    global _cycle_count
    # get_running_loop() is the supported call inside a coroutine;
    # get_event_loop() is deprecated there since Python 3.10 and this
    # project requires >= 3.11 (see pyproject.toml).
    loop = asyncio.get_running_loop()
    while True:
        try:
            await loop.run_in_executor(None, fetch_news)
            await loop.run_in_executor(None, fetch_gdelt_events)
            _cycle_count += 1
            # Every BACKFILL_CYCLES rounds, re-derive panel data from recent
            # events so losses/bases stay consistent with the latest content.
            if _cycle_count >= BACKFILL_CYCLES:
                _cycle_count = 0
                merged = _refresh_panel_data()
                if merged > 0:
                    _notify_node()
                    ts = datetime.now().strftime("%H:%M:%S")
                    print(f"[{ts}] 面板实时数据回填:从近期事件合并 {merged} 条(战损/据点)")
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f" [warn] 定时抓取: {e}")
        await asyncio.sleep(min(RSS_INTERVAL_SEC, FETCH_INTERVAL_SEC))
# ==========================
# API 接口
# ==========================
@app.post("/crawler/backfill")
def crawler_backfill():
    """Re-extract and merge panel data from recent situation_update rows.

    Repairs historical rows stored before extraction existed. Processes the
    latest 50 rows with the configured extractor, merges results, notifies
    the Node API, and reports {ok, processed, merged} (or {ok: False, error}).
    """
    if not os.path.exists(DB_PATH):
        return {"ok": False, "error": "db not found"}
    try:
        from db_merge import merge
        # Extractor priority: DashScope (API key set) > rules (AI disabled) > Ollama AI.
        use_dashscope = bool(os.environ.get("DASHSCOPE_API_KEY", "").strip())
        if use_dashscope:
            from extractor_dashscope import extract_from_news
        elif os.environ.get("CLEANER_AI_DISABLED", "0") == "1":
            from extractor_rules import extract_from_news
        else:
            from extractor_ai import extract_from_news
        conn = sqlite3.connect(DB_PATH, timeout=10)
        rows = conn.execute(
            "SELECT id, timestamp, category, summary FROM situation_update ORDER BY timestamp DESC LIMIT 50"
        ).fetchall()
        conn.close()
        merged = 0
        for r in rows:
            uid, ts, cat, summary = r
            text = ((cat or "") + " " + (summary or "")).strip()
            if len(text) < 20:  # too short to extract anything meaningful
                continue
            try:
                extracted = extract_from_news(text, timestamp=ts)
                if extracted and merge(extracted, db_path=DB_PATH):
                    merged += 1
            except Exception:
                # Per-row best-effort: one bad row must not abort the batch.
                pass
        _notify_node()
        return {"ok": True, "processed": len(rows), "merged": merged}
    except Exception as e:
        return {"ok": False, "error": str(e)}
@app.get("/crawler/status")
def crawler_status():
    """Crawler health endpoint for debugging the data-update chain.

    Reports the DB path/existence, the situation_update row count, and the
    outcome of the most recent fetch cycle (LAST_FETCH).

    Note: the redundant function-local `import os` was removed — `os` is
    already imported at module level.
    """
    db_ok = os.path.exists(DB_PATH)
    total = 0
    if db_ok:
        try:
            conn = sqlite3.connect(DB_PATH, timeout=3)
            total = conn.execute("SELECT COUNT(*) FROM situation_update").fetchone()[0]
            conn.close()
        except Exception:
            # DB may be mid-write or missing the table; report 0 rather than fail.
            pass
    return {
        "db_path": DB_PATH,
        "db_exists": db_ok,
        "situation_update_count": total,
        "last_fetch_items": LAST_FETCH.get("items", 0),
        "last_fetch_inserted": LAST_FETCH.get("inserted", 0),
        "last_fetch_error": LAST_FETCH.get("error"),
    }
@app.get("/events")
def get_events():
    """Return the in-memory GDELT event cache plus aggregate conflict stats."""
    stats = _get_conflict_stats()
    payload = {
        "updated_at": datetime.utcnow().isoformat(),
        "count": len(EVENT_CACHE),
        "events": EVENT_CACHE,
        "conflict_stats": stats,
    }
    return payload
def _get_conflict_stats() -> dict:
    """Read the singleton conflict_stats row; all-zero dict when unavailable."""
    empty = {
        "total_events": 0,
        "high_impact_events": 0,
        "estimated_casualties": 0,
        "estimated_strike_count": 0,
    }
    if not os.path.exists(DB_PATH):
        return empty
    try:
        conn = sqlite3.connect(DB_PATH, timeout=5)
        row = conn.execute(
            "SELECT total_events, high_impact_events, estimated_casualties, estimated_strike_count FROM conflict_stats WHERE id = 1"
        ).fetchone()
        conn.close()
    except Exception:
        # Missing table / locked DB — fall back to zeros.
        return empty
    if not row:
        return empty
    keys = ("total_events", "high_impact_events", "estimated_casualties", "estimated_strike_count")
    return dict(zip(keys, row))
@app.on_event("startup")
async def startup():
    """Launch the periodic fetch loop as a background task.

    Deliberately does NOT run a first fetch inline, so the app (and its
    /crawler/status probe) becomes ready immediately instead of blocking on
    the network during startup.
    """
    # NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
    # lifespan handlers — consider migrating when upgrading.
    global _bg_task
    _bg_task = asyncio.create_task(_periodic_fetch())
@app.on_event("shutdown")
async def shutdown():
    """Cancel the background fetch task and wait for it to wind down."""
    global _bg_task
    if _bg_task and not _bg_task.done():
        _bg_task.cancel()
        try:
            # Await so the CancelledError is consumed and the task finishes
            # cleanly before the process exits.
            await _bg_task
        except asyncio.CancelledError:
            pass
if __name__ == "__main__":
    # Local/dev entrypoint; in production uvicorn is launched externally
    # (see crawler/run_uvicorn.sh).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

10
crawler/requirements.txt Normal file
View File

@@ -0,0 +1,10 @@
# Python 3.11+ 爬虫依赖(使用当前最新兼容版本)
# 安装: pip install -r crawler/requirements.txt
requests>=2.32.0
feedparser>=6.0.10
beautifulsoup4>=4.12.0
pytest>=8.0.0
fastapi>=0.115.0
uvicorn[standard]>=0.32.0
deep-translator>=1.11.0
dashscope>=1.20.0

51
crawler/run_once.py Normal file
View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
单独运行爬虫一轮:抓取 → 清洗 → 去重 → 写库 → 通知 Node可选
终端直接输出抓取条数及内容摘要,便于排查。
用法(项目根或 crawler 目录):
python run_once.py
python -c "import run_once; run_once.main()"
或: npm run crawler:once
"""
import os
import sys
from datetime import datetime
# 保证可导入同目录模块
if __name__ == "__main__":
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def main():
    """Run one crawl cycle (fetch → clean → dedup → write → notify) and print a summary.

    Returns 0 so it can be used directly as a process exit code.
    """
    from config import DB_PATH, API_BASE
    from pipeline import run_full_pipeline
    # Optional lower bound on item dates, read from the environment.
    crawl_start = os.environ.get("CRAWL_START_DATE", "").strip()
    print("========================================")
    print("爬虫单次运行RSS → 清洗 → 去重 → 写库)")
    print("DB:", DB_PATH)
    print("API_BASE:", API_BASE)
    if crawl_start:
        print("时间范围: 仅保留 CRAWL_START_DATE 之后:", crawl_start)
    print("========================================\n")
    n_fetched, n_news, n_panel = run_full_pipeline(
        db_path=DB_PATH,
        api_base=API_BASE,
        translate=True,
        notify=True,  # single-shot run notifies the Node API itself
    )
    print("")
    print("----------------------------------------")
    print("本轮结果:")
    print(f" 抓取: {n_fetched}")
    print(f" 去重后新增资讯: {n_news}")
    print(f" 写入事件脉络: {n_panel}")
    if n_fetched == 0:
        print(" 0 条检查网络、RSS 源或 config.KEYWORDS 过滤)")
    print("----------------------------------------")
    return 0
if __name__ == "__main__":
sys.exit(main())

9
crawler/run_uvicorn.sh Normal file
View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# For PM2: start uvicorn (the GDELT/RSS realtime service on :8000) from the
# crawler directory.
set -e
cd "$(dirname "$0")"
# Ensure a UTF-8 locale so Python can print non-ASCII log lines.
# NOTE(review): the `[ -n "$VAR" ] ||` guard is redundant with the
# ${VAR:-default} expansion, but left as-is (doc-only change).
[ -n "$LANG" ] || export LANG="${LANG:-en_US.UTF-8}"
[ -n "$LC_ALL" ] || export LC_ALL="${LC_ALL:-en_US.UTF-8}"
# Load the project-root .env if present (PM2 normally supplies env via its
# ecosystem config, so this is a fallback).
if [ -f ../.env ]; then set -a; . ../.env; set +a; fi
exec python3 -m uvicorn realtime_conflict_service:app --host 0.0.0.0 --port 8000

View File

@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
"""RSS 抓取:按源独立超时与错误隔离,单源失败不影响其他源"""
import re
import socket
from datetime import datetime, timezone
from typing import List, Set, Tuple
import feedparser
from config import KEYWORDS, FEED_TIMEOUT, get_feed_sources
from parser_ai import classify_and_severity
def _parse_date(entry) -> datetime:
for attr in ("published_parsed", "updated_parsed"):
val = getattr(entry, attr, None)
if val:
try:
return datetime(*val[:6], tzinfo=timezone.utc)
except (TypeError, ValueError):
pass
return datetime.now(timezone.utc)
def _strip_html(s: str) -> str:
return re.sub(r"<[^>]+>", "", s) if s else ""
def _matches_keywords(text: str) -> bool:
    """Return True when `text` contains at least one configured keyword.

    Matching is case-insensitive substring containment against config.KEYWORDS.
    """
    haystack = (text or "").lower()
    return any(keyword.lower() in haystack for keyword in KEYWORDS)
def _fetch_one_feed(name: str, url: str, timeout: int) -> List[dict]:
    """Fetch and keyword-filter a single RSS feed; errors/timeouts yield [].

    Deduplication is the caller's responsibility. The socket default timeout
    is set for the duration of the parse (feedparser exposes no timeout
    parameter) and restored afterwards.
    """
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)
    try:
        feed = feedparser.parse(
            url,
            request_headers={"User-Agent": "US-Iran-Dashboard/1.0"},
            agent="US-Iran-Dashboard/1.0",
        )
    except Exception as e:
        print(f" [rss] {name} error: {e}")
        return []
    finally:
        # Always restore the process-wide default timeout.
        socket.setdefaulttimeout(old_timeout)
    out = []
    for entry in feed.entries:
        title = getattr(entry, "title", "") or ""
        raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
        summary = _strip_html(raw_summary)
        link = getattr(entry, "link", "") or ""
        text = f"{title} {summary}"
        # Keep only entries matching the configured keyword list.
        if not _matches_keywords(text):
            continue
        published = _parse_date(entry)
        cat, sev = classify_and_severity(text)
        out.append({
            "title": title,
            "summary": summary[:400] if summary else title,  # fall back to title
            "url": link,
            "published": published,
            "category": cat,
            "severity": sev,
            "source": name,
        })
    return out
def fetch_all() -> List[dict]:
    """Fetch every configured RSS source with per-source isolation.

    Each source gets its own timeout and a failing source contributes
    nothing. Results are deduplicated globally by (truncated title, url).
    """
    sources = get_feed_sources()
    items: List[dict] = []
    seen: Set[Tuple[str, str]] = set()
    for name, url in sources or []:
        for item in _fetch_one_feed(name, url, FEED_TIMEOUT):
            key = (item["title"][:80], item["url"])
            if key not in seen:
                seen.add(key)
                items.append(item)
    return items

View File

@@ -0,0 +1 @@
# crawler tests

Binary file not shown.

View File

@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""
爬虫数据清洗与字段映射测试
验证 extractor_rules、extractor_dashscope、db_merge 的正确性
"""
import os
import sqlite3
import tempfile
from pathlib import Path
import pytest
# 确保 crawler 在 path 中
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in __import__("sys").path:
__import__("sys").path.insert(0, str(ROOT))
from extractor_rules import extract_from_news as extract_rules
class TestExtractorRules:
    """Unit tests for the rule-based extractor (extractor_rules)."""

    def test_trump_1000_targets_no_bases(self):
        """Rhetorical 'Trump: 1000 Iranian military targets hit' text must NOT yield base losses."""
        text = "特朗普说伊朗有1000个军事目标遭到袭击美国已做好进一步打击准备"
        out = extract_rules(text)
        delta = out.get("combat_losses_delta", {})
        for side in ("us", "iran"):
            if side in delta:
                assert delta[side].get("bases_destroyed") is None, f"{side} bases_destroyed 不应被提取"
                assert delta[side].get("bases_damaged") is None, f"{side} bases_damaged 不应被提取"

    def test_base_damaged_when_explicit(self):
        """'Al-Asad air base attacked' should produce key_location_updates for the US side."""
        text = "阿萨德空军基地遭袭,损失严重"
        out = extract_rules(text)
        # The rules trigger key_location_updates (base attacked + name match).
        assert "key_location_updates" in out
        kl = out["key_location_updates"]
        assert len(kl) >= 1
        assert any(u.get("side") == "us" and "阿萨德" in (u.get("name_keywords") or "") for u in kl)

    def test_us_personnel_killed(self):
        """'3 US troops killed, 5 wounded' -> personnel_killed=3, personnel_wounded=5."""
        text = "据报道3名美军阵亡另有5人受伤"
        out = extract_rules(text)
        assert "combat_losses_delta" in out
        us = out["combat_losses_delta"].get("us", {})
        assert us.get("personnel_killed") == 3
        assert us.get("personnel_wounded") == 5

    def test_iran_personnel_killed(self):
        """'10 Iranian soldiers killed' -> iran.personnel_killed=10."""
        text = "伊朗方面称10名伊朗士兵死亡"
        out = extract_rules(text)
        iran = out.get("combat_losses_delta", {}).get("iran", {})
        assert iran.get("personnel_killed") == 10

    def test_civilian_us_context(self):
        """US strike causing 50 civilian casualties attributes losses to the US side."""
        text = "美军空袭造成50名平民伤亡"
        out = extract_rules(text)
        us = out.get("combat_losses_delta", {}).get("us", {})
        assert us.get("civilian_killed") == 50

    def test_civilian_iran_context(self):
        """Iranian strike on Iraqi civilians attributes losses to the Iran side."""
        text = "伊朗空袭造成伊拉克平民50人伤亡"
        out = extract_rules(text)
        iran = out.get("combat_losses_delta", {}).get("iran", {})
        assert iran.get("civilian_killed") == 50

    def test_drone_attribution_iran(self):
        """'US shot down 10 Iranian drones' -> iran.drones=10."""
        text = "美军击落伊朗10架无人机"
        out = extract_rules(text)
        iran = out.get("combat_losses_delta", {}).get("iran", {})
        assert iran.get("drones") == 10

    def test_empty_or_short_text(self):
        """Empty or too-short text yields no combat_losses_delta."""
        # NOTE(review): these expressions rely on `==`/`in` binding tighter
        # than `or` — correct in Python, but hard to read at a glance.
        assert extract_rules("") == {} or "combat_losses_delta" not in extract_rules("")
        assert "combat_losses_delta" not in extract_rules("abc") or not extract_rules("abc").get("combat_losses_delta")
class TestDbMerge:
    """Field-mapping and incremental-accumulation tests for db_merge."""

    @pytest.fixture
    def temp_db(self):
        # Fresh temp SQLite file per test, removed afterwards (best-effort).
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            path = f.name
        yield path
        try:
            os.unlink(path)
        except OSError:
            pass

    def test_merge_combat_losses_delta(self, temp_db):
        """Two merges of combat_losses_delta accumulate (3 + 2 killed = 5)."""
        from db_merge import merge
        merge({"combat_losses_delta": {"us": {"personnel_killed": 3, "personnel_wounded": 2}}}, db_path=temp_db)
        merge({"combat_losses_delta": {"us": {"personnel_killed": 2}}}, db_path=temp_db)
        conn = sqlite3.connect(temp_db)
        row = conn.execute("SELECT personnel_killed, personnel_wounded FROM combat_losses WHERE side='us'").fetchone()
        conn.close()
        assert row[0] == 5
        assert row[1] == 2

    def test_merge_all_combat_fields(self, temp_db):
        """merge maps every combat_losses field to its column, in order."""
        from db_merge import merge
        delta = {
            "personnel_killed": 1,
            "personnel_wounded": 2,
            "civilian_killed": 3,
            "civilian_wounded": 4,
            "bases_destroyed": 1,
            "bases_damaged": 2,
            "aircraft": 3,
            "warships": 4,
            "armor": 5,
            "vehicles": 6,
            "drones": 7,
            "missiles": 8,
            "helicopters": 9,
            "submarines": 10,
        }
        merge({"combat_losses_delta": {"iran": delta}}, db_path=temp_db)
        conn = sqlite3.connect(temp_db)
        row = conn.execute(
            """SELECT personnel_killed, personnel_wounded, civilian_killed, civilian_wounded,
            bases_destroyed, bases_damaged, aircraft, warships, armor, vehicles,
            drones, missiles, helicopters, submarines FROM combat_losses WHERE side='iran'"""
        ).fetchone()
        conn.close()
        assert row == (1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    def test_merge_key_location_requires_table(self, temp_db):
        """key_location_updates only updates rows that already exist in key_location."""
        from db_merge import merge
        # Seed a key_location row for the update to match against.
        conn = sqlite3.connect(temp_db)
        conn.execute(
            """CREATE TABLE IF NOT EXISTS key_location (id INTEGER PRIMARY KEY, side TEXT, name TEXT, lat REAL, lng REAL, type TEXT, region TEXT, status TEXT, damage_level INTEGER)"""
        )
        conn.execute(
            "INSERT INTO key_location (side, name, lat, lng, type, region, status, damage_level) VALUES ('us', '阿萨德空军基地', 33.0, 43.0, 'Base', 'IRQ', 'operational', 0)"
        )
        conn.commit()
        conn.close()
        merge(
            {"key_location_updates": [{"name_keywords": "阿萨德|asad", "side": "us", "status": "attacked", "damage_level": 2}]},
            db_path=temp_db,
        )
        conn = sqlite3.connect(temp_db)
        row = conn.execute("SELECT status, damage_level FROM key_location WHERE name LIKE '%阿萨德%'").fetchone()
        conn.close()
        assert row[0] == "attacked"
        assert row[1] == 2
class TestEndToEndTrumpExample:
    """End-to-end: the 'Trump 1000 military targets' rhetorical-claim case."""

    def test_full_pipeline_trump_no_bases(self, tmp_path):
        """Rules extraction + merge must not add base losses for this text."""
        from db_merge import merge
        db_path = str(tmp_path / "test.db")
        (tmp_path / "test.db").touch()  # file must exist or merge is a no-op
        # Seed zeroed base counters for both sides.
        merge({"combat_losses_delta": {"us": {"bases_destroyed": 0, "bases_damaged": 0}, "iran": {"bases_destroyed": 0, "bases_damaged": 0}}}, db_path=db_path)
        text = "特朗普说伊朗有1000个军事目标遭到袭击"
        out = extract_rules(text)
        # Extractor must not emit base counts for this text.
        # NOTE(review): `A or B or C and D` parses as A or B or (C and D) —
        # presumably intentional, but worth confirming against the intent.
        assert "combat_losses_delta" not in out or (
            "iran" not in out.get("combat_losses_delta", {})
            or out["combat_losses_delta"].get("iran", {}).get("bases_destroyed") is None
            and out["combat_losses_delta"].get("iran", {}).get("bases_damaged") is None
        )
        if "combat_losses_delta" in out:
            merge(out, db_path=db_path)
        conn = sqlite3.connect(db_path)
        iran = conn.execute("SELECT bases_destroyed, bases_damaged FROM combat_losses WHERE side='iran'").fetchone()
        conn.close()
        # If the extractor emitted no base fields, merge leaves them untouched;
        # any erroneous output must still leave them at 0.
        if iran:
            assert iran[0] == 0
            assert iran[1] == 0

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
"""英译中,入库前统一翻译"""
import os
import re
from typing import Optional
def _is_mostly_chinese(text: str) -> bool:
if not text or len(text.strip()) < 2:
return False
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
return chinese / max(len(text), 1) > 0.3
def translate_to_chinese(text: str) -> str:
    """Translate text to Chinese; return the input on failure or if already Chinese.

    Notes:
        - External translation (deep_translator) is DISABLED by default:
          with TRANSLATE_DISABLED unset (treated as "1") the input is
          returned untouched, so network/proxy issues cannot stall the
          pipeline.
        - Set TRANSLATE_DISABLED=0 explicitly to enable translation.
    """
    if not text or not text.strip():
        return text
    # Default-off switch: an unset variable behaves like "1" (disabled).
    if os.environ.get("TRANSLATE_DISABLED", "1") == "1":
        return text
    s = str(text).strip()
    if len(s) > 2000:
        # Cap request size for the translation backends.
        s = s[:2000]
    if _is_mostly_chinese(s):
        return text
    # Try Google first, then MyMemory; first non-trivial result wins.
    for translator in ["google", "mymemory"]:
        try:
            if translator == "google":
                from deep_translator import GoogleTranslator
                out = GoogleTranslator(source="auto", target="zh-CN").translate(s)
            else:
                from deep_translator import MyMemoryTranslator
                out = MyMemoryTranslator(source="auto", target="zh-CN").translate(s)
            if out and out.strip() and out != s:
                return out
        except Exception:
            # Backend failed — fall through to the next one.
            continue
    return text

13
docker-compose.dev.yml Normal file
View File

@@ -0,0 +1,13 @@
# 开发模式:挂载源码 + 热重载,代码更新后无需重新 build
# 使用: docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d
# 或: docker compose --profile dev up -d (需在 dev 服务加 profiles)
services:
api:
volumes:
- ./server:/app/server:ro
command: ["node", "--watch", "server/index.js"]
crawler:
volumes:
- ./crawler:/app
command: ["uvicorn", "realtime_conflict_service:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

37
docker-compose.yml Normal file
View File

@@ -0,0 +1,37 @@
services:
api:
image: usa-dashboard-api:latest
build:
context: .
args:
- VITE_MAPBOX_ACCESS_TOKEN=${VITE_MAPBOX_ACCESS_TOKEN:-}
ports:
- "3001:3001"
environment:
- DB_PATH=/data/data.db
- API_PORT=3001
volumes:
- app-data:/data
restart: unless-stopped
crawler:
image: usa-dashboard-crawler:latest
build:
context: .
dockerfile: Dockerfile.crawler
ports:
- "8000:8000"
environment:
- DB_PATH=/data/data.db
- API_BASE=http://api:3001
- GDELT_DISABLED=1
- RSS_INTERVAL_SEC=60
- DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-}
volumes:
- app-data:/data
depends_on:
- api
restart: unless-stopped
volumes:
app-data:

8
docker-entrypoint.sh Normal file
View File

@@ -0,0 +1,8 @@
#!/bin/sh
# Container entrypoint: seed the SQLite database on first run, then start
# the Node API server in the foreground (exec keeps PID 1 semantics).
set -e
export DB_PATH="${DB_PATH:-/data/data.db}"
# Seed only when the DB file does not exist yet (the /data volume persists).
if [ ! -f "$DB_PATH" ]; then
    echo "==> Seeding database..."
    node server/seed.js
fi
exec node server/index.js

91
docs/BACKEND_MODULES.md Normal file
View File

@@ -0,0 +1,91 @@
# 后端模块说明
## 一、现有模块结构
```
server/
├── index.js # HTTP + WebSocket 入口
├── routes.js # REST API 路由
├── db.js # SQLite schema 与连接
├── situationData.js # 态势数据聚合 (从 DB 读取)
├── seed.js # 初始数据填充
├── data.db # SQLite 数据库
└── package.json
crawler/
├── realtime_conflict_service.py # GDELT 实时冲突服务 (核心)
├── requirements.txt
├── config.py, db_writer.py # 旧 RSS 爬虫(可保留)
├── main.py
└── README.md
```
### 1. server/index.js
- Express + CORS
- WebSocket (`/ws`),每 5 秒广播 `situation`
- `POST /api/crawler/notify`:爬虫写入后触发立即广播
### 2. server/routes.js
- `GET /api/situation`:完整态势
- `GET /api/events`GDELT 事件 + 冲突统计
- `GET /api/health`:健康检查
### 3. server/db.js
- 表:`situation``force_summary``power_index``force_asset`
`key_location``combat_losses``wall_street_trend`
`retaliation_current``retaliation_history``situation_update`
**`gdelt_events`**、**`conflict_stats`**
---
## 二、GDELT 核心数据源
**GDELT Project**:全球冲突数据库,约 15 分钟级更新,含经纬度、事件编码、参与方、事件强度。
### realtime_conflict_service.py
- 定时(默认 60 秒)从 GDELT API 抓取
- 冲突强度评分missile +3, strike +2, killed +4 等
- 无经纬度时默认攻击源:`IRAN_COORD = [51.3890, 35.6892]`
- 写入 `gdelt_events``conflict_stats`
- 调用 `POST /api/crawler/notify` 触发 Node 广播
### 冲突强度 → 地图效果
| impact_score | 效果 |
|--------------|------------|
| 13 | 绿色点 |
| 46 | 橙色闪烁 |
| 710 | 红色脉冲扩散 |
### 战损统计模型(展示用)
- `total_events`
- `high_impact_events` (impact ≥ 7)
- `estimated_casualties`
- `estimated_strike_count`
---
## 三、数据流
```
GDELT API → Python 服务(60s) → gdelt_events, conflict_stats
POST /api/crawler/notify → situation.updated_at
WebSocket 广播 getSituation() → 前端
```
---
## 四、运行方式
```bash
# 1. 启动 Node API
npm run api
# 2. 启动 GDELT 服务
npm run gdelt
# 或: cd crawler && uvicorn realtime_conflict_service:app --port 8000
```

137
docs/CRAWLER_LOGIC.md Normal file
View File

@@ -0,0 +1,137 @@
# 爬虫逻辑梳理与数据校验
## 一、两条入口,数据流不同
### 1. 入口 A`npm run crawler`main.py
- **流程**RSS 抓取 → 关键词过滤 → 分类/严重度 → **直接写 situation_update** → 通知 API
- **不经过**翻译、news_content、AI 提取(战损/基地等)
- **写入表**`situation_update``situation.updated_at`
- **用途**:轻量、只给「事件脉络」喂新条目,不更新战损/基地/报复指数
```
RSS_FEEDS → fetch_all() → KEYWORDS 过滤 → parser_ai.classify_and_severity
→ write_updates(items) → situation_update INSERT + situation 表 touch
→ notify_api()
```
### 2. 入口 B`npm run gdelt`realtime_conflict_service.py
- **流程**RSS 抓取 → 翻译 → 清洗 → **news_content 去重** → situation_update → **AI 提取 → db_merge** → GDELT 事件(可选)→ 通知 API
- **写入表**`news_content``situation_update``situation`;提取后还有 `combat_losses``key_location``retaliation_*``wall_street_trend`
- **用途**:完整管线,前端「战损 / 军事基地 / 报复 / 美股」等数据都依赖这条
```
RSS → fetch_all() → translate_to_chinese → cleaner_ai → save_and_dedup → news_content
→ write_updates(new_items) → situation_update
→ _extract_and_merge_panel_data(new_items) → extract_from_news() → db_merge.merge()
→ (可选) fetch_gdelt_events() → gdelt_events, conflict_stats
→ _notify_node()
```
**结论**:要检查「抓回的数据是否有效」且包含战损/基地等,应跑 **入口 B**gdelt 服务);若只关心事件脉络条数,可看入口 A。
---
## 二、入口 B 逐步拆解(用于逐段校验)
### 2.1 RSS 抓取与过滤
| 步骤 | 位置 | 说明 |
|------|------|------|
| 源列表 | `config.RSS_FEEDS` | 多国媒体 RSS见 config.py |
| 抓取 | `scrapers.rss_scraper.fetch_all()` | feedparser单源超时 10s |
| 过滤 | `_matches_keywords(text)` | 标题+摘要 至少命中 `config.KEYWORDS` 中一个才保留 |
| 去重 | `(title[:80], link)` | 同一条不重复加入当次列表 |
| 分类 | `parser_ai.classify_and_severity(text)` | 得到 category、severityOllama 或规则) |
**校验**`npm run crawler:test` 看本次抓到的条数;若为 0查网络或放宽/检查 KEYWORDS。
### 2.2 翻译与清洗(仅入口 B
| 步骤 | 位置 | 说明 |
|------|------|------|
| 翻译 | `translate_utils.translate_to_chinese()` | 标题/摘要译成中文(依赖配置) |
| 清洗 | `cleaner_ai.clean_news_for_panel()` | 截断、清理;`ensure_category` / `ensure_severity` 合法化 |
### 2.3 落库news_content去重与 situation_update
| 步骤 | 位置 | 说明 |
|------|------|------|
| 去重 | `news_storage.save_and_dedup(items)` | 按 `content_hash(title, summary, url)` 判重,只插入新记录 |
| 表 | `news_content` | id, content_hash, title, summary, url, source, published_at, category, severity |
| 表 | `situation_update` | 仅对 **去重后的 new_items** 调用 `write_updates()`,供前端「事件脉络」 |
**校验**
- `news_content``SELECT COUNT(*), MAX(published_at) FROM news_content`
- `situation_update``SELECT COUNT(*), MAX(timestamp) FROM situation_update`
- 服务状态:`GET http://localhost:8000/crawler/status``last_fetch_items` / `last_fetch_inserted` / `last_fetch_error`
### 2.4 AI 提取与 db_merge战损 / 基地 / 报复等)
| 步骤 | 位置 | 说明 |
|------|------|------|
| 输入 | `_extract_and_merge_panel_data(new_items)` | 仅处理本次 **新增** 的 new_items前 limit 条DashScope 10 条,规则 25 条Ollama 10 条) |
| 文本 | 每条 `title + " " + summary`,长度 < 20 跳过 |
| 提取器选择 | 环境变量 | `DASHSCOPE_API_KEY` → extractor_dashscope`CLEANER_AI_DISABLED=1` → extractor_rules否则 extractor_aiOllama |
| 输出结构 | 见 panel_schema / 各 extractor | `situation_update?`, `combat_losses_delta?`, `retaliation?`, `wall_street?`, `key_location_updates?` |
| 合并 | `db_merge.merge(extracted)` | 见下表 |
**merge 映射概要**
| 提取字段 | 写入表/逻辑 |
|----------|-------------|
| situation_update | situation_update 表 INSERTid 为 hash |
| combat_losses_delta | combat_losses 表,按 side 增量叠加 |
| retaliation | retaliation_current 替换 + retaliation_history 追加 |
| wall_street | wall_street_trend 表 INSERT |
| key_location_updates | key_location 表 UPDATE status/damage_levelname LIKE 关键词) |
**校验**
- 战损:`SELECT * FROM combat_losses`
- 基地:`SELECT id, name, side, status, damage_level FROM key_location WHERE status != 'operational' OR damage_level > 0`
- 报复:`SELECT * FROM retaliation_current``retaliation_history` 最近几条
- 事件脉络:`SELECT id, timestamp, category, summary, severity FROM situation_update ORDER BY timestamp DESC LIMIT 20`
### 2.5 GDELT可选
- `GDELT_DISABLED=1` 时跳过 GDELT仅用 RSS可用 `_rss_to_gdelt_fallback()` 用 RSS 标题生成 gdelt_events。
- 未禁用时:`fetch_gdelt_events()` 拉 GDELT → 写 `gdelt_events``conflict_stats`
**校验**`SELECT COUNT(*), MAX(event_time) FROM gdelt_events``SELECT * FROM conflict_stats WHERE id=1`
---
## 三、如何检查「抓回的数据是否有效」
1. **确认跑的入口**
- 只跑 `npm run crawler`:只有 situation_update 会有新数据,战损/基地不会变。
-`npm run gdelt` 且服务常驻:才会既有 situation_update又有 combat_losses、key_location 等。
2. **看 DB 与 API**
- 同上:查 `news_content``situation_update``combat_losses``key_location``retaliation_*``gdelt_events``conflict_stats`
- 前端数据来源:`GET /api/situation`(见 server/situationData.js对照上述表即可。
3. **看提取是否触发**
-`combat_losses` / `key_location` 一直不更新:确认是入口 B、有 new_items、提取器未报错可对单条新闻跑 `extract_from_news(text)` 看是否产出 combat_losses_delta / key_location_updates。
4. **重跑历史提取(补数据)**
- `POST http://localhost:8000/crawler/backfill`:用当前 situation_update 最近 50 条重新做一次提取并 merge可用来修历史未提取的数据。
---
## 四、配置与环境变量(与数据有效性相关)
| 变量 | 作用 |
|------|------|
| DB_PATH | 与 server 共用的 SQLite 路径,必须一致 |
| API_BASE | 通知 Node 的地址merge 后通知前端 |
| DASHSCOPE_API_KEY | 有则用 DashScope 提取;无则用 Ollama 或规则 |
| CLEANER_AI_DISABLED=1 | 用规则提取extractor_rules不用 Ollama |
| GDELT_DISABLED=1 | 不用 GDELT仅 RSSRSS 可转 gdelt_events 占位 |
| CRAWL_INTERVAL | main.py 抓取间隔(秒) |
| RSS_INTERVAL_SEC / FETCH_INTERVAL_SEC | realtime 服务里 RSS / GDELT 间隔 |
按上述顺序对照「入口 → RSS → 去重 → situation_update → 提取 → merge → 表」即可逐段检查爬虫抓回的数据是否有效。

65
docs/CRAWLER_PIPELINE.md Normal file
View File

@@ -0,0 +1,65 @@
# 爬虫数据流水线
## 数据流
```
RSS 抓取
↓ 翻译、清洗
↓ news_storage.save_and_dedup() → 历史去重
news_content资讯独立表供后续消费
↓ 去重后的新数据
situation_update面板展示用
↓ AI 提取(阿里云 DashScope
combat_losses / retaliation / key_location / wall_street_trend
↓ notify Node
前端 WebSocket + 轮询
```
## 阿里云 DashScope API Key
设置环境变量 `DASHSCOPE_API_KEY` 后,爬虫使用阿里云通义千问进行 AI 提取。不设置时回退到规则提取(`extractor_rules`)或 Ollama若可用
```bash
# 本地
export DASHSCOPE_API_KEY=sk-xxx
# Docker
DASHSCOPE_API_KEY=sk-xxx docker compose up -d
# 或在 .env 中写入 DASHSCOPE_API_KEY=sk-xxx
```
## 表说明
| 表 | 用途 |
|----|------|
| `news_content` | 资讯原文独立存储支持去重content_hash供后续消费 |
| `situation_update` | 面板「近期更新」展示 |
| `combat_losses` | 战损数据AI/规则提取) |
| `key_location` | 基地状态 |
| `gdelt_events` | 地图冲突点 |
## 去重逻辑
根据 `content_hash = sha256(normalize(title) + normalize(summary) + url)` 判断,相同或高度相似内容视为重复,不入库。
## 消费资讯
- HTTP: `GET /api/news?limit=50`
- 调试: `/db` 面板查看 `news_content`
## 链路验证
运行脚本一键检查全链路:
```bash
./scripts/verify-pipeline.sh
```
支持环境变量覆盖:`API_URL``CRAWLER_URL`

62
docs/DATA_FLOW.md Normal file
View File

@@ -0,0 +1,62 @@
# 前端数据更新链路与字段映射
## 1. 前端数据点
| 组件 | 数据 | API 字段 | DB 表/列 |
|------|------|----------|----------|
| HeaderPanel | lastUpdated | situation.lastUpdated | situation.updated_at |
| HeaderPanel | powerIndex | usForces/iranForces.powerIndex | power_index |
| HeaderPanel | feedbackCount, shareCount | POST /api/feedback, /api/share | feedback, share_count |
| TimelinePanel | recentUpdates | situation.recentUpdates | situation_update |
| WarMap | keyLocations | usForces/iranForces.keyLocations | key_location |
| BaseStatusPanel | 基地统计 | keyLocations (status, damage_level) | key_location |
| CombatLossesPanel | 人员/平民伤亡 | combatLosses, civilianCasualtiesTotal | combat_losses |
| CombatLossesOtherPanel | 装备毁伤 | combatLosses (bases, aircraft, drones, …) | combat_losses |
| PowerChart | 雷达图 | powerIndex | power_index |
| WallStreetTrend | 美股趋势 | wallStreetInvestmentTrend | wall_street_trend |
| RetaliationGauge | 报复指数 | retaliationSentiment | retaliation_current/history |
**轮询**: `fetchSituation()` 加载WebSocket `/ws` 每 3 秒广播。`GET /api/situation``getSituation()`
## 2. 爬虫 → DB 字段映射
| 提取器输出 | DB 表 | 逻辑 |
|------------|-------|------|
| situation_update | situation_update | INSERT |
| combat_losses_delta | combat_losses | 增量叠加 (ADD) |
| retaliation | retaliation_current, retaliation_history | REPLACE / APPEND |
| wall_street | wall_street_trend | INSERT |
| key_location_updates | key_location | UPDATE status, damage_level WHERE name LIKE |
### combat_losses 字段对应
| 提取器 (us/iran) | DB 列 |
|------------------|-------|
| personnel_killed | personnel_killed |
| personnel_wounded | personnel_wounded |
| civilian_killed | civilian_killed |
| civilian_wounded | civilian_wounded |
| bases_destroyed | bases_destroyed |
| bases_damaged | bases_damaged |
| aircraft, warships, armor, vehicles | 同名 |
| drones, missiles, helicopters, submarines | 同名 |
## 3. 测试用例
运行: `npm run crawler:test:extraction`
| 用例 | 输入 | 预期 |
|------|------|------|
| 特朗普 1000 军事目标 | "特朗普说伊朗有1000个军事目标遭到袭击" | 不提取 bases_destroyed/bases_damaged |
| 阿萨德基地遭袭 | "阿萨德空军基地遭袭,损失严重" | 输出 key_location_updates |
| 美军伤亡 | "3名美军阵亡另有5人受伤" | personnel_killed=3, personnel_wounded=5 |
| 伊朗平民 | "伊朗空袭造成伊拉克平民50人伤亡" | iran.civilian_killed=50 |
| 伊朗无人机 | "美军击落伊朗10架无人机" | iran.drones=10 |
| db_merge 增量 | 两次 merge 3+2 | personnel_killed=5 |
## 4. 注意事项
- **bases_***: 仅指已确认损毁/受损的基地;"军事目标"/targets 不填 bases_*。
- **正则 [\s\w]***: 会匹配数字,导致 (\d+) 只捕获末位;数字前用 `[^\d]*`
- **伊朗平民**: 规则已支持 "伊朗空袭造成…平民" 归入 loss_ir。
- **key_location**: 需 name LIKE '%keyword%' 匹配,关键词见 extractor_rules.bases_all。

269
docs/DEBUG_PANELS.md Normal file
View File

@@ -0,0 +1,269 @@
# 看板板块逐项调试指南
本文档按前端每个板块列出:**数据来源表**、**谁写入**、**如何验证**、**常见问题**,便于逐项排查。
---
## 数据流总览
```
前端 Dashboard
→ useReplaySituation() → situation (来自 WebSocket / GET /api/situation)
→ getSituation() 读 server/situationData.js
→ 从 SQLite (server/data.db) 多表 SELECT 后拼成 JSON
```
- **写入方**`server/seed.js`(初始化)、爬虫流水线(`crawler/pipeline.py` + `db_merge.py`、GDELT 服务(`gdelt_events` / `conflict_stats`)。
- **读入方**:仅 `server/situationData.js``getSituation()`,被 `/api/situation` 与 WebSocket 广播使用。
---
## 1. 顶栏 (HeaderPanel)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 最后更新时间 | `situation.lastUpdated` | `situation.updated_at`(表 `situation` id=1 | 爬虫 notify 时更新 |
| 在看/看过 | `stats.viewers` / `stats.cumulative` | `visits` / `visitor_count`,见 `POST /api/visit` | 与爬虫无关 |
| 美/伊战力条 | `usForces.powerIndex.overall` / `iranForces.powerIndex.overall` | `power_index` 表 | **仅 seed** |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.lastUpdated, .usForces.powerIndex.overall, .iranForces.powerIndex.overall'`
- 看板顶栏是否显示时间、双战力数值。
**常见问题**
- `lastUpdated` 不变:爬虫未调 `POST /api/crawler/notify` 或 Node 未执行 `reloadFromFile()`
- 战力条为 0未跑 seed 或 `power_index` 无数据。
---
## 2. 事件脉络 / 时间线 (TimelinePanel → EventTimelinePanel + RecentUpdatesPanel)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 近期更新列表 | `situation.recentUpdates` | `situation_update`ORDER BY timestamp DESC LIMIT 50 | 爬虫 `write_updates(new_items)` + seed 若干条 |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.recentUpdates | length'`
- `curl -s http://localhost:3001/api/situation | jq '.recentUpdates[0]'`
- 或用调试接口:`curl -s -H "x-api-key: $API_ADMIN_KEY" http://localhost:3001/api/db/dashboard | jq '.situation_update | length'`
**常见问题**
- 条数为 0未 seed 且爬虫未写入;或爬虫只跑 main.py入口 A未跑 gdelt入口 B仍会写 `situation_update`,但若 RSS 抓取 0 条则无新数据。
- 不更新:爬虫未启动;或未调 notify或 Node 与爬虫用的不是同一个 `data.db`(路径/环境变量不一致)。
---
## 3. 地图 (WarMap)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 美军据点 | `usForces.keyLocations` | `key_location` WHERE side='us' | seed 全量;爬虫通过 `key_location_updates` 只更新 status/damage_level |
| 伊朗据点 | `iranForces.keyLocations` | `key_location` WHERE side='iran' | 同上 |
| 冲突点(绿/橙/红) | `situation.conflictEvents` | `gdelt_events`ORDER BY event_time DESC LIMIT 30 | GDELT API 写入;或 GDELT 关闭时 RSS 回填 |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.usForces.keyLocations | length, .conflictEvents | length'`
- 地图上是否有基地/舰船点位、是否有冲突点图层。
**常见问题**
- 无冲突点:`gdelt_events` 为空;未跑 gdelt 或 GDELT 被墙且未用 RSS 回填(`_rss_to_gdelt_fallback`)。
- 基地状态不更新:爬虫提取的 `key_location_updates``name_keywords``key_location.name` 无法 LIKE 匹配(名称不一致)。
---
## 4. 美国基地状态 (BaseStatusPanel)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 基地列表 | `usForces.keyLocations``type === 'Base'` | `key_location` side='us' | 同 WarMap |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '[.usForces.keyLocations[] | select(.type == "Base")] | length'`
- 看板左侧「美国基地」是否展示且状态/损伤与预期一致。
**常见问题**
- 与「地图」一致;若 seed 的 key_location 有 type/region而爬虫只更新 status/damage_level名称必须能与 extractor 的 name_keywords 匹配。
---
## 5. 战损 (CombatLossesPanel + CombatLossesOtherPanel)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 美军/伊朗阵亡/受伤/装备等 | `usForces.combatLosses` / `iranForces.combatLosses` | `combat_losses`side=us/iran | seed 初始值;爬虫 AI 提取 `combat_losses_delta` 后 db_merge **增量**叠加 |
| 冲突统计(估计伤亡等) | `situation.conflictStats` | `conflict_stats` 表 id=1 | GDELT 或 RSS 回填时写入 |
| 平民伤亡合计 | `situation.civilianCasualtiesTotal` | 由 combat_losses 双方平民字段 + conflict_stats.estimated_casualties 计算 | 见 situationData.js |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.usForces.combatLosses, .iranForces.combatLosses, .conflictStats'`
- 看板战损数字是否与 API 一致。
**常见问题**
- 战损一直不变:新闻中无明确伤亡/装备数字;或未跑入口 Bgdelt或 AI 提取器未启用/报错Ollama/通义/规则);或 merge 时单次增量被上限截断。
- 数字异常大:提取器误把「累计总数」当成本条增量;已用 `MAX_DELTA_PER_MERGE` 做上限。
---
## 6. 伊朗基地状态 (IranBaseStatusPanel)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 基地/港/核/导弹等 | `iranForces.keyLocations` 中 type 为 Base/Port/Nuclear/Missile | `key_location` side='iran' | 同 WarMap |
**验证与常见问题**
- 同「美国基地」;确保 seed 中伊朗 key_location 的 name 与爬虫 extractor 的 name_keywords 能匹配(如德黑兰、伊斯法罕、布什尔等)。
---
## 7. 战力对比图 (PowerChart)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 美/伊战力指数 | `usForces.powerIndex` / `iranForces.powerIndex` | `power_index` 表 | **仅 seed**,爬虫不写 |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.usForces.powerIndex, .iranForces.powerIndex'`
**常见问题**
- 为 0 或缺失:未执行 seed`power_index` 表空。
---
## 8. 华尔街/投资趋势 (InvestmentTrendChart)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 时间序列 | `usForces.wallStreetInvestmentTrend` | `wall_street_trend`time, value | seed 写入初始曲线;爬虫仅在提取出 `wall_street`**INSERT 新点** |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.usForces.wallStreetInvestmentTrend | length'`
- 看板右侧美国下方趋势图是否有数据。
**常见问题**
- 无曲线:未 seed 或表空。
- 不随新闻更新:提取器未输出 `wall_street` 或新闻中无相关表述。
---
## 9. 美国力量摘要 (ForcePanel side=us)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 摘要数字 | `usForces.summary` | `force_summary` side='us' | **仅 seed** |
| 战力指数 | `usForces.powerIndex` | `power_index` | **仅 seed** |
| 资产列表 | `usForces.assets` | `force_asset` side='us' | **仅 seed** |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.usForces.summary, .usForces.assets | length'`
**常见问题**
- 全为 0 或空:未 seed爬虫不更新这些表。
---
## 10. 报复情绪 (RetaliationGauge)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 当前值 | `iranForces.retaliationSentiment` | `retaliation_current` id=1 | seed 初始;爬虫提取 `retaliation`**替换** 当前值并 **追加** history |
| 历史曲线 | `iranForces.retaliationSentimentHistory` | `retaliation_history` 表 | 同上 |
**验证**
- `curl -s http://localhost:3001/api/situation | jq '.iranForces.retaliationSentiment, .iranForces.retaliationSentimentHistory | length'`
**常见问题**
- 不更新:新闻中无报复相关表述;或提取器未输出 `retaliation`
---
## 11. 伊朗力量摘要 (ForcePanel side=iran)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 同美国侧 | `iranForces.summary` / `powerIndex` / `assets` | `force_summary` / `power_index` / `force_asset` side='iran' | **仅 seed** |
**验证与常见问题**
- 同「美国力量摘要」。
---
## 12. 资讯列表 (GET /api/news若有单独页面消费)
| 项目 | 数据来源 | 写入方 | 说明 |
|------|----------|--------|------|
| 资讯行 | `news_content` 表 | 爬虫 `save_and_dedup` 后写入 | 仅入口 B 流水线;事件脉络来自 situation_update资讯表独立 |
**验证**
- `curl -s -H "x-api-key: $API_ADMIN_KEY" http://localhost:3001/api/news?limit=5 | jq '.items | length'`
- 若未配 ADMIN_KEY部分环境可能不鉴权也可访问视 routes 配置而定。
**常见问题**
- `items` 为 0未跑入口 B或去重后无新增或 RSS 抓取 0 条。
---
## 快速检查命令汇总
```bash
# 1. API 与态势整体
curl -s http://localhost:3001/api/health
curl -s http://localhost:3001/api/situation | jq '{
lastUpdated,
recentUpdates: (.recentUpdates | length),
conflictEvents: (.conflictEvents | length),
usPower: .usForces.powerIndex.overall,
iranPower: .iranForces.powerIndex.overall,
usLosses: .usForces.combatLosses.personnelCasualties,
iranLosses: .iranForces.combatLosses.personnelCasualties,
usBases: (.usForces.keyLocations | length),
iranBases: (.iranForces.keyLocations | length),
wallStreetLen: (.usForces.wallStreetInvestmentTrend | length),
retaliationCur: .iranForces.retaliationSentiment
}'
# 2. 各表行数(需 sqlite3
DB="${DB_PATH:-server/data.db}"
for t in force_summary power_index force_asset key_location combat_losses wall_street_trend retaliation_current retaliation_history situation_update gdelt_events conflict_stats news_content; do
echo -n "$t: "; sqlite3 "$DB" "SELECT COUNT(*) FROM $t" 2>/dev/null || echo "?"
done
# 3. 爬虫状态与通知
curl -s http://localhost:8000/crawler/status | jq .
curl -s -X POST http://localhost:3001/api/crawler/notify
```
---
## 建议调试顺序
1. **先确认 API 与 DB 一致**`npm run api` 已起、`GET /api/situation` 返回 200`lastUpdated``recentUpdates` 等存在。
2. **确认 seed**:若从未 seed先跑 `node server/seed.js`(或项目提供的 seed 命令),再刷新看板,检查战力/摘要/基地/战损等是否有初始值。
3. **事件脉络**:确认爬虫已起(`npm run gdelt`、RSS 能抓到条数、`situation_update` 条数增加、notify 后前端/API 的 `recentUpdates` 增加。
4. **战损/基地/报复/美股**:确认跑的是入口 B、提取器可用Ollama 或 DASHSCOPE_API_KEY 或规则)、新闻内容包含可解析的伤亡/基地/报复表述;必要时用 crawler 的提取单测或 backfill 接口验证。
5. **地图冲突点**:确认 `gdelt_events` 有数据GDELT 或 RSS 回填);冲突统计看 `conflict_stats`
按上述顺序逐板块对照「数据来源 → 写入方 → 验证命令 → 常见问题」,即可定位每个板块不更新或显示异常的原因。
**若只关心战损、基地、地图战区**:见 **docs/DEBUG_战损_基地_地图.md**,并运行 `./scripts/debug-panels-focus.sh` 做专项检查。

View File

@@ -0,0 +1,135 @@
# 战损、基地、地图战区 — 专项调试
只关心这三块时,按下面数据源 + 排查顺序即可。
---
## 一、战损 (combat_losses)
### 数据流
```
RSS 新闻(标题+摘要/正文) → 爬虫流水线 run_full_pipeline
→ extract_from_news(text) → combat_losses_delta { us: { personnel_killed, ... }, iran: { ... } }
→ db_merge.merge() → 按「增量」叠加到 combat_losses 表
→ POST /api/crawler/notify → Node 重载 DB
→ getSituation() 读 combat_losses → 前端 CombatLossesPanel / CombatLossesOtherPanel
```
- **表**`combat_losses`side=us / iran字段含 personnel_killed、personnel_wounded、bases_destroyed、bases_damaged、aircraft、drones、missiles 等。
- **初始值**`node server/seed.js` 会写入美/伊两行。
- **更新条件**:只有新闻里**明确出现可解析的伤亡/装备数字**如「2 名美军死亡」「14 人受伤」「1 架战机受损」)时,提取器才会输出 `combat_losses_delta`merge 才会叠加。
### 提取器选择(三选一)
| 环境变量 | 使用模块 | 说明 |
|----------|----------|------|
| `DASHSCOPE_API_KEY` 已设 | `extractor_dashscope` | 通义抽取,精度较好 |
| 未设通义 且 `CLEANER_AI_DISABLED≠1` | `extractor_ai` | 需本机 Ollama如 llama3.1 |
| 未设通义 且 `CLEANER_AI_DISABLED=1` | `extractor_rules` | 规则正则,无需模型 |
### 验证命令
```bash
# API 返回的战损
curl -s http://localhost:3001/api/situation | jq '{
us: .usForces.combatLosses.personnelCasualties,
iran: .iranForces.combatLosses.personnelCasualties,
conflictStats: .conflictStats
}'
# 表内原始值
sqlite3 server/data.db "SELECT side, personnel_killed, personnel_wounded, bases_destroyed, bases_damaged, aircraft FROM combat_losses"
```
### 常见问题
| 现象 | 可能原因 | 处理 |
|------|----------|------|
| 战损数字从不变化 | 1) 只跑了 main.py 未跑 gdelt<br>2) 新闻里没有明确伤亡/装备数字<br>3) 提取器未启用或报错Ollama 未起、通义未配) | 跑 `npm run gdelt`;用带数字的新闻测;看爬虫日志是否有提取/merge 报错 |
| 数字暴增一次 | 提取器把「累计总数」当成本条增量 | 已用 MAX_DELTA_PER_MERGE 做单次上限;可查 db_merge.py |
| 想用已有事件脉络重算战损 | 历史新闻当时未做提取 | `curl -X POST http://localhost:8000/crawler/backfill` 用 situation_update 最近 50 条重新提取并 merge |
---
## 二、基地 (key_location)
### 数据流
```
RSS 新闻 → extract_from_news → key_location_updates: [ { name_keywords, side, status, damage_level } ]
→ db_merge.merge() → UPDATE key_location SET status=?, damage_level=? WHERE side=? AND (name LIKE ? OR ...)
→ getSituation() 读 key_location → 前端 BaseStatusPanel(美) / IranBaseStatusPanel(伊) / WarMap 据点层
```
- **表**`key_location`side=us / iran字段含 name、lat、lng、type、region、**status**、**damage_level**。
- **初始数据**seed 写入大量美/伊据点和基地(含 name**爬虫只更新已有行的 status、damage_level**,不新增行。
- **匹配规则**:提取器的 `name_keywords`(如 `阿萨德|asad`)会按 **LIKE '%关键词%'**`key_location.name` 匹配。例如 name 为「阿萨德空军基地」时,关键词「阿萨德」能匹配。
### 规则提取器支持的基地关键词(与 seed name 对应关系)
- **美军**:阿萨德|阿因|asad → 匹配 seed「阿萨德空军基地」「阿因·阿萨德」巴格达 → 巴格达外交支援中心;乌代德|卡塔尔 → 乌代德空军基地;埃尔比勒 → 埃尔比勒空军基地;因吉尔利克|土耳其 → 因吉尔利克空军基地;苏尔坦|沙特 → 苏尔坦亲王空军基地;坦夫|叙利亚 → 坦夫驻军;达夫拉|阿联酋 → 达夫拉空军基地;内瓦提姆|拉蒙|以色列 → 内瓦提姆/拉蒙等;赛利耶、巴林、科威特 等。
- **伊朗**:阿巴斯港、德黑兰、布什尔、伊斯法罕、纳坦兹、米纳布、霍尔木兹 等seed 中需有对应 name 的伊朗据点)。
若 seed 里没有某据点,或 name 与关键词完全对不上(例如英文报道只写 "Al-Asad" 而 seed 只有「阿萨德空军基地」),规则里已含 asad/阿萨德,一般能匹配;若仍不匹配,可查 `key_location.name` 与 extractor_rules.py / extractor_dashscope 的 name_keywords 是否有一致子串。
### 验证命令
```bash
# 被标为遭袭的据点
curl -s http://localhost:3001/api/situation | jq '[.usForces.keyLocations[], .iranForces.keyLocations[]] | map(select(.status == "attacked")) | length'
# 表内 status / damage_level
sqlite3 server/data.db "SELECT side, name, status, damage_level FROM key_location WHERE status != 'operational' OR damage_level IS NOT NULL LIMIT 20"
```
### 常见问题
| 现象 | 可能原因 | 处理 |
|------|----------|------|
| 基地状态从不更新 | 1) 新闻未提及「某基地遭袭」类表述<br>2) 提取的 name_keywords 与 key_location.name 无法 LIKE 匹配 | 确认 seed 的 name 含中文/英文与提取器关键词一致;或扩展 extractor 的 name_keywords |
| 地图/基地面板无据点 | key_location 表空 | 先执行 `node server/seed.js` |
---
## 三、地图战区 / 冲突点 (gdelt_events + conflict_stats)
### 数据流
- **正常模式**`fetch_gdelt_events()` 请求 GDELT API → 解析为事件列表 → `_write_to_db(events)` 写入 `gdelt_events``conflict_stats`(总事件数、高影响事件数、估计伤亡、打击次数等)。
- **GDELT 不可用**:设 `GDELT_DISABLED=1` 时,`fetch_news()` 里在流水线结束后调 `_rss_to_gdelt_fallback()`,用 **situation_update 最近 50 条** 按 summary 推断经纬度(`_infer_coords`)和 impact_score由 severity 映射),写入 `gdelt_events`,这样地图仍有冲突点。
前端 WarMap 根据 `conflictEvents`= gdelt_events的 impact_score 分绿/橙/红三层显示;战损区「冲突统计」来自 `conflict_stats`
### 验证命令
```bash
# 冲突点条数 + 冲突统计
curl -s http://localhost:3001/api/situation | jq '{ conflictEvents: (.conflictEvents | length), conflictStats: .conflictStats }'
# 表内
sqlite3 server/data.db "SELECT COUNT(*) FROM gdelt_events"
sqlite3 server/data.db "SELECT * FROM conflict_stats WHERE id = 1"
```
### 常见问题
| 现象 | 可能原因 | 处理 |
|------|----------|------|
| 地图没有冲突点 | 1) gdelt_events 表空<br>2) 未跑 gdelt 或 GDELT 被墙且未开 RSS 回填 | 跑 `npm run gdelt`;国内可设 `GDELT_DISABLED=1`,靠 situation_update 回填 |
| 冲突点不更新 | 爬虫未调 notify或 Node/爬虫用的不是同一个 data.db | 确认 API_BASE、DB_PATH 一致;看 Node 终端是否有 `[crawler/notify] DB 已重载` |
| conflict_stats 全 0 | 从未成功写入过 gdelt_eventsGDELT 与 RSS 回填都未执行) | 先让 gdelt_events 有数据(见上) |
---
## 四、一键检查(仅战损 / 基地 / 地图)
在项目根执行:
```bash
./scripts/debug-panels-focus.sh
```
会检查API 是否通、`combat_losses` / `key_location` / `gdelt_events` / `conflict_stats` 行数及关键字段、并给出简短结论。需已启动 API`npm run api`);可选 `jq``sqlite3` 以输出更全。
详细逐板块说明见 `docs/DEBUG_PANELS.md`

30
docs/DOCKER_MIRROR.md Normal file
View File

@@ -0,0 +1,30 @@
# Docker 拉取超时 / 配置镜像加速
国内环境从 Docker Hub 拉取镜像常超时,需在 Docker 中配置镜像加速。
## Docker DesktopmacOS / Windows
1. 打开 **Docker Desktop**
2. **Settings****Docker Engine**
3. 在 JSON 中增加 `registry-mirrors`(若已有其他配置,只需合并进该字段):
```json
{
"registry-mirrors": [
"https://docker.m.daocloud.io",
"https://docker.1ms.run"
]
}
```
4. 点击 **Apply & Restart**
5. 重新执行:`docker compose up -d --build`
## 备选镜像源
可替换或补充到 `registry-mirrors` 中:
- `https://docker.m.daocloud.io`DaoCloud
- `https://docker.1ms.run`
- `https://docker.rainbond.cc`(好雨科技)
- 阿里云 / 腾讯云等:在对应云控制台的「容器镜像服务」中获取个人专属加速地址

68
docs/PRODUCTION.md Normal file
View File

@@ -0,0 +1,68 @@
# 生产部署与数据对齐
## 1. 当前项目是否能在 Docker 中单独运行
- **能**。爬虫镜像 `Dockerfile.crawler` 自包含 Python 3.11 + `crawler/requirements.txt`(含 dashscope无宿主机 Python 版本依赖。
- **两种常见用法**
- **docker-compose 一起跑**API + 爬虫都在容器内,共用一个命名卷 `app-data`,天然对齐。
- **爬虫单独 Docker、API 在宿主机**:爬虫容器通过挂载宿主机上的 **同一个** `server/data.db`,并设置 `API_BASE` 指向宿主机 API即可单独运行且数据一致。
## 2. 数据对齐(必须满足)
| 角色 | 使用的 DB 路径(示例) | 说明 |
|--------|-------------------------------|------|
| Node API | `process.env.DB_PATH``server/data.db` | 见 `server/db.js``docker-entrypoint.sh` |
| 爬虫Docker 内) | `DB_PATH=/data/data.db`,且 `/data/data.db` 由宿主机同一文件挂载 | 见 `Dockerfile.crawler``crawler/config.py` |
**原则**API 和爬虫必须读写 **同一个 SQLite 文件**。否则会出现「爬虫写了库、API 读不到」或反之。
- **docker-compose 全容器**:两边都用卷 `app-data`,路径均为 `/data/data.db`,自动对齐。
- **API 宿主机 + 爬虫 Docker**:宿主机 API 的 `DB_PATH` 指向例如 `$PROJECT/server/data.db`;爬虫启动时用 `-v $PROJECT/server/data.db:/data/data.db``-e DB_PATH=/data/data.db`,即对齐。
## 3. 生产脚本与用法
### 3.1 爬虫单独 DockerAPI 在宿主机,如 PM2
```bash
# 首次:构建镜像并启动爬虫容器(会读 .env 中的 DASHSCOPE_API_KEY
./scripts/production-start.sh
# 或分步:
docker build -t usa-dashboard-crawler:latest -f Dockerfile.crawler .
./scripts/run-crawler-docker-standalone.sh
```
可调环境变量(在运行脚本前 export 或写在 .env
- `PROJECT_ROOT`:项目根目录,默认当前目录;用于解析 `server/data.db`
- `DB_FILE`:宿主机 DB 绝对路径,默认 `$PROJECT_ROOT/server/data.db`
- `API_BASE`:爬虫通知 API 的地址,默认 `http://host.docker.internal:3001`Linux 下脚本会自动加 `--add-host=host.docker.internal:host-gateway`)。
- `DASHSCOPE_API_KEY`:阿里云 DashScope启用 AI 清洗(可选)。
### 3.2 docker-compose 全栈API + 爬虫都在容器)
```bash
# 启动
docker compose up -d
# 或传入 DASHSCOPE_API_KEY
DASHSCOPE_API_KEY=sk-xxx docker compose up -d
# 停止
docker compose down
```
此时 API 与爬虫共用卷 `app-data`DB 路径均为 `/data/data.db`,无需额外对齐。
### 3.3 宿主机 APIPM2使用的 DB 路径
确保 PM2 启动 API 时使用的 DB 与爬虫挂载的是同一文件,例如:
- 在 ecosystem 或启动命令里设置:`DB_PATH=/www/wwwroot/www.airtep.com2/usa/server/data.db`
- 或项目根目录即部署目录时,不设则默认为 `server/data.db`(相对路径以进程 cwd 为准)。
## 4. 检查清单
- [ ] API 与爬虫使用**同一 DB 文件**(见上表)。
- [ ] 爬虫能访问到 API`API_BASE` 在「爬虫单独 Docker」场景下指向宿主机`http://host.docker.internal:3001`),在 compose 场景下为 `http://api:3001`
- [ ] 若需 AI 清洗:在爬虫侧设置 `DASHSCOPE_API_KEY`compose 或 standalone 脚本的 .env/环境变量)。
- [ ] 首次部署或无 DB 时:先创建并初始化 DB例如 `DB_PATH=server/data.db node server/seed.js`),再启动爬虫容器。

43
ecosystem.config.cjs Normal file
View File

@@ -0,0 +1,43 @@
/**
* PM2 进程配置API + 爬虫GDELT/RSS uvicorn 服务)
* 用法:
* pm2 start ecosystem.config.cjs # 启动全部
* pm2 restart ecosystem.config.cjs # 重启全部
* pm2 stop ecosystem.config.cjs # 停止全部
* pm2 logs nsa_api / pm2 logs nsa_crawler
* 需 .env 时可在启动前 source .env或在应用内用 dotenv 加载。
*/
module.exports = {
apps: [
{
name: 'nsa_api',
script: 'server/index.js',
cwd: __dirname,
interpreter: 'node',
instances: 1,
autorestart: true,
watch: false,
max_memory_restart: '300M',
env: {
NODE_ENV: 'production',
API_PORT: 3001,
},
},
{
name: 'nsa_crawler',
script: 'crawler/run_uvicorn.sh',
cwd: __dirname,
interpreter: 'bash',
instances: 1,
autorestart: true,
watch: false,
max_memory_restart: '300M',
env: {
CLEANER_AI_DISABLED: '1',
PARSER_AI_DISABLED: '0',
GDELT_DISABLED: '1',
RSS_INTERVAL_SEC: '60',
},
},
],
};

22
g.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Quick git add + commit + push helper.
# Usage: g "fix: short commit message"
set -e

commit_msg="${1:-}"

# A commit message argument is mandatory.
if [ -z "$commit_msg" ]; then
  echo "用法: g \"commit message\""
  echo "示例: g \"fix: 修复登录问题\""
  exit 1
fi

# Nothing modified or untracked: skip the commit entirely.
if [ -z "$(git status --porcelain)" ]; then
  echo "无文件改动,跳过提交"
  exit 0
fi

git add .
git commit -m "$commit_msg"
git push
echo "✓ 已推送"

View File

@@ -3,7 +3,7 @@
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/usa_logo.png" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1, viewport-fit=cover" />
<title>美伊军事态势显示</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>

8
map.md
View File

@@ -281,4 +281,10 @@ const IRAN_SOURCE = [51.3890, 35.6892] // Tehran
所有动画走 WebGL 图层
禁止 DOM 动画
禁止 DOM 动画
git代码更新:git fetch origin && git reset --hard origin/master
前端发版npm run build
后端发版pm2 restart 3

477
package-lock.json generated
View File

@@ -8,16 +8,18 @@
"name": "us-iran-military-dashboard",
"version": "1.0.0",
"dependencies": {
"better-sqlite3": "^11.6.0",
"cors": "^2.8.5",
"echarts": "^5.5.0",
"echarts-for-react": "^3.0.2",
"express": "^4.21.1",
"lucide-react": "^0.460.0",
"lucide-react": "^0.576.0",
"mapbox-gl": "^3.6.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-map-gl": "^7.1.7",
"react-router-dom": "^7.13.1",
"sql.js": "^1.11.0",
"swagger-ui-express": "^5.0.1",
"ws": "^8.19.0",
"zustand": "^5.0.0"
},
@@ -1342,6 +1344,12 @@
"win32"
]
},
"node_modules/@scarf/scarf": {
"version": "1.4.0",
"resolved": "https://registry.npmmirror.com/@scarf/scarf/-/scarf-1.4.0.tgz",
"integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==",
"hasInstallScript": true
},
"node_modules/@types/babel__core": {
"version": "7.20.5",
"resolved": "https://registry.npmmirror.com/@types/babel__core/-/babel__core-7.20.5.tgz",
@@ -1921,25 +1929,6 @@
"integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
"dev": true
},
"node_modules/base64-js": {
"version": "1.5.1",
"resolved": "https://registry.npmmirror.com/base64-js/-/base64-js-1.5.1.tgz",
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/baseline-browser-mapping": {
"version": "2.10.0",
"resolved": "https://registry.npmmirror.com/baseline-browser-mapping/-/baseline-browser-mapping-2.10.0.tgz",
@@ -1952,16 +1941,6 @@
"node": ">=6.0.0"
}
},
"node_modules/better-sqlite3": {
"version": "11.10.0",
"resolved": "https://registry.npmmirror.com/better-sqlite3/-/better-sqlite3-11.10.0.tgz",
"integrity": "sha512-EwhOpyXiOEL/lKzHz9AW1msWFNzGc/z+LzeB3/jnFJpxu+th2yqvzsSWas1v9jgs9+xiXJcD5A8CJxAG2TaghQ==",
"hasInstallScript": true,
"dependencies": {
"bindings": "^1.5.0",
"prebuild-install": "^7.1.1"
}
},
"node_modules/binary-extensions": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/binary-extensions/-/binary-extensions-2.3.0.tgz",
@@ -1974,24 +1953,6 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/bindings": {
"version": "1.5.0",
"resolved": "https://registry.npmmirror.com/bindings/-/bindings-1.5.0.tgz",
"integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==",
"dependencies": {
"file-uri-to-path": "1.0.0"
}
},
"node_modules/bl": {
"version": "4.1.0",
"resolved": "https://registry.npmmirror.com/bl/-/bl-4.1.0.tgz",
"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
"dependencies": {
"buffer": "^5.5.0",
"inherits": "^2.0.4",
"readable-stream": "^3.4.0"
}
},
"node_modules/body-parser": {
"version": "1.20.4",
"resolved": "https://registry.npmmirror.com/body-parser/-/body-parser-1.20.4.tgz",
@@ -2083,29 +2044,6 @@
"node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7"
}
},
"node_modules/buffer": {
"version": "5.7.1",
"resolved": "https://registry.npmmirror.com/buffer/-/buffer-5.7.1.tgz",
"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"base64-js": "^1.3.1",
"ieee754": "^1.1.13"
}
},
"node_modules/bytes": {
"version": "3.1.2",
"resolved": "https://registry.npmmirror.com/bytes/-/bytes-3.1.2.tgz",
@@ -2253,11 +2191,6 @@
"node": ">= 6"
}
},
"node_modules/chownr": {
"version": "1.1.4",
"resolved": "https://registry.npmmirror.com/chownr/-/chownr-1.1.4.tgz",
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
},
"node_modules/color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmmirror.com/color-convert/-/color-convert-2.0.1.tgz",
@@ -2399,28 +2332,6 @@
}
}
},
"node_modules/decompress-response": {
"version": "6.0.0",
"resolved": "https://registry.npmmirror.com/decompress-response/-/decompress-response-6.0.0.tgz",
"integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
"dependencies": {
"mimic-response": "^3.1.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/deep-extend": {
"version": "0.6.0",
"resolved": "https://registry.npmmirror.com/deep-extend/-/deep-extend-0.6.0.tgz",
"integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/deep-is": {
"version": "0.1.4",
"resolved": "https://registry.npmmirror.com/deep-is/-/deep-is-0.1.4.tgz",
@@ -2444,14 +2355,6 @@
"npm": "1.2.8000 || >= 1.4.16"
}
},
"node_modules/detect-libc": {
"version": "2.1.2",
"resolved": "https://registry.npmmirror.com/detect-libc/-/detect-libc-2.1.2.tgz",
"integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==",
"engines": {
"node": ">=8"
}
},
"node_modules/didyoumean": {
"version": "1.2.2",
"resolved": "https://registry.npmmirror.com/didyoumean/-/didyoumean-1.2.2.tgz",
@@ -2523,14 +2426,6 @@
"node": ">= 0.8"
}
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmmirror.com/end-of-stream/-/end-of-stream-1.4.5.tgz",
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/es-define-property": {
"version": "1.0.1",
"resolved": "https://registry.npmmirror.com/es-define-property/-/es-define-property-1.0.1.tgz",
@@ -2797,14 +2692,6 @@
"node": ">= 0.6"
}
},
"node_modules/expand-template": {
"version": "2.0.3",
"resolved": "https://registry.npmmirror.com/expand-template/-/expand-template-2.0.3.tgz",
"integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
"engines": {
"node": ">=6"
}
},
"node_modules/express": {
"version": "4.22.1",
"resolved": "https://registry.npmmirror.com/express/-/express-4.22.1.tgz",
@@ -2940,11 +2827,6 @@
"node": ">=16.0.0"
}
},
"node_modules/file-uri-to-path": {
"version": "1.0.0",
"resolved": "https://registry.npmmirror.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
"integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw=="
},
"node_modules/fill-range": {
"version": "7.1.1",
"resolved": "https://registry.npmmirror.com/fill-range/-/fill-range-7.1.1.tgz",
@@ -3051,11 +2933,6 @@
"node": ">= 0.6"
}
},
"node_modules/fs-constants": {
"version": "1.0.0",
"resolved": "https://registry.npmmirror.com/fs-constants/-/fs-constants-1.0.0.tgz",
"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="
},
"node_modules/fsevents": {
"version": "2.3.3",
"resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.3.tgz",
@@ -3135,11 +3012,6 @@
"node": ">=0.10.0"
}
},
"node_modules/github-from-package": {
"version": "0.0.0",
"resolved": "https://registry.npmmirror.com/github-from-package/-/github-from-package-0.0.0.tgz",
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw=="
},
"node_modules/gl-matrix": {
"version": "3.4.4",
"resolved": "https://registry.npmmirror.com/gl-matrix/-/gl-matrix-3.4.4.tgz",
@@ -3246,25 +3118,6 @@
"node": ">=0.10.0"
}
},
"node_modules/ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmmirror.com/ieee754/-/ieee754-1.2.1.tgz",
"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/ignore": {
"version": "5.3.2",
"resolved": "https://registry.npmmirror.com/ignore/-/ignore-5.3.2.tgz",
@@ -3304,11 +3157,6 @@
"resolved": "https://registry.npmmirror.com/inherits/-/inherits-2.0.4.tgz",
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
},
"node_modules/ini": {
"version": "1.3.8",
"resolved": "https://registry.npmmirror.com/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew=="
},
"node_modules/ipaddr.js": {
"version": "1.9.1",
"resolved": "https://registry.npmmirror.com/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
@@ -3567,11 +3415,11 @@
}
},
"node_modules/lucide-react": {
"version": "0.460.0",
"resolved": "https://registry.npmmirror.com/lucide-react/-/lucide-react-0.460.0.tgz",
"integrity": "sha512-BVtq/DykVeIvRTJvRAgCsOwaGL8Un3Bxh8MbDxMhEWlZay3T4IpEKDEpwt5KZ0KJMHzgm6jrltxlT5eXOWXDHg==",
"version": "0.576.0",
"resolved": "https://registry.npmmirror.com/lucide-react/-/lucide-react-0.576.0.tgz",
"integrity": "sha512-koNxU14BXrxUfZQ9cUaP0ES1uyPZKYDjk31FQZB6dQ/x+tXk979sVAn9ppZ/pVeJJyOxVM8j1E+8QEuSc02Vug==",
"peerDependencies": {
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0-rc"
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
}
},
"node_modules/mapbox-gl": {
@@ -3706,17 +3554,6 @@
"node": ">= 0.6"
}
},
"node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmmirror.com/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimatch": {
"version": "3.1.5",
"resolved": "https://registry.npmmirror.com/minimatch/-/minimatch-3.1.5.tgz",
@@ -3737,11 +3574,6 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/mkdirp-classic": {
"version": "0.5.3",
"resolved": "https://registry.npmmirror.com/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
},
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.1.3.tgz",
@@ -3781,11 +3613,6 @@
"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
}
},
"node_modules/napi-build-utils": {
"version": "2.0.0",
"resolved": "https://registry.npmmirror.com/napi-build-utils/-/napi-build-utils-2.0.0.tgz",
"integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA=="
},
"node_modules/natural-compare": {
"version": "1.4.0",
"resolved": "https://registry.npmmirror.com/natural-compare/-/natural-compare-1.4.0.tgz",
@@ -3800,28 +3627,6 @@
"node": ">= 0.6"
}
},
"node_modules/node-abi": {
"version": "3.87.0",
"resolved": "https://registry.npmmirror.com/node-abi/-/node-abi-3.87.0.tgz",
"integrity": "sha512-+CGM1L1CgmtheLcBuleyYOn7NWPVu0s0EJH2C4puxgEZb9h8QpR9G2dBfZJOAUhi7VQxuBPMd0hiISWcTyiYyQ==",
"dependencies": {
"semver": "^7.3.5"
},
"engines": {
"node": ">=10"
}
},
"node_modules/node-abi/node_modules/semver": {
"version": "7.7.4",
"resolved": "https://registry.npmmirror.com/semver/-/semver-7.7.4.tgz",
"integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/node-releases": {
"version": "2.0.27",
"resolved": "https://registry.npmmirror.com/node-releases/-/node-releases-2.0.27.tgz",
@@ -3876,14 +3681,6 @@
"node": ">= 0.8"
}
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmmirror.com/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmmirror.com/optionator/-/optionator-0.9.4.tgz",
@@ -4188,32 +3985,6 @@
"resolved": "https://registry.npmmirror.com/potpack/-/potpack-2.1.0.tgz",
"integrity": "sha512-pcaShQc1Shq0y+E7GqJqvZj8DTthWV1KeHGdi0Z6IAin2Oi3JnLCOfwnCo84qc+HAp52wT9nK9H7FAJp5a44GQ=="
},
"node_modules/prebuild-install": {
"version": "7.1.3",
"resolved": "https://registry.npmmirror.com/prebuild-install/-/prebuild-install-7.1.3.tgz",
"integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==",
"deprecated": "No longer maintained. Please contact the author of the relevant native addon; alternatives are available.",
"dependencies": {
"detect-libc": "^2.0.0",
"expand-template": "^2.0.3",
"github-from-package": "0.0.0",
"minimist": "^1.2.3",
"mkdirp-classic": "^0.5.3",
"napi-build-utils": "^2.0.0",
"node-abi": "^3.3.0",
"pump": "^3.0.0",
"rc": "^1.2.7",
"simple-get": "^4.0.0",
"tar-fs": "^2.0.0",
"tunnel-agent": "^0.6.0"
},
"bin": {
"prebuild-install": "bin.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/prelude-ls": {
"version": "1.2.1",
"resolved": "https://registry.npmmirror.com/prelude-ls/-/prelude-ls-1.2.1.tgz",
@@ -4240,15 +4011,6 @@
"node": ">= 0.10"
}
},
"node_modules/pump": {
"version": "3.0.4",
"resolved": "https://registry.npmmirror.com/pump/-/pump-3.0.4.tgz",
"integrity": "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/punycode": {
"version": "2.3.1",
"resolved": "https://registry.npmmirror.com/punycode/-/punycode-2.3.1.tgz",
@@ -4319,28 +4081,6 @@
"node": ">= 0.8"
}
},
"node_modules/rc": {
"version": "1.2.8",
"resolved": "https://registry.npmmirror.com/rc/-/rc-1.2.8.tgz",
"integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
"dependencies": {
"deep-extend": "^0.6.0",
"ini": "~1.3.0",
"minimist": "^1.2.0",
"strip-json-comments": "~2.0.1"
},
"bin": {
"rc": "cli.js"
}
},
"node_modules/rc/node_modules/strip-json-comments": {
"version": "2.0.1",
"resolved": "https://registry.npmmirror.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
"integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/react": {
"version": "18.3.1",
"resolved": "https://registry.npmmirror.com/react/-/react-18.3.1.tgz",
@@ -4396,6 +4136,54 @@
"node": ">=0.10.0"
}
},
"node_modules/react-router": {
"version": "7.13.1",
"resolved": "https://registry.npmmirror.com/react-router/-/react-router-7.13.1.tgz",
"integrity": "sha512-td+xP4X2/6BJvZoX6xw++A2DdEi++YypA69bJUV5oVvqf6/9/9nNlD70YO1e9d3MyamJEBQFEzk6mbfDYbqrSA==",
"dependencies": {
"cookie": "^1.0.1",
"set-cookie-parser": "^2.6.0"
},
"engines": {
"node": ">=20.0.0"
},
"peerDependencies": {
"react": ">=18",
"react-dom": ">=18"
},
"peerDependenciesMeta": {
"react-dom": {
"optional": true
}
}
},
"node_modules/react-router-dom": {
"version": "7.13.1",
"resolved": "https://registry.npmmirror.com/react-router-dom/-/react-router-dom-7.13.1.tgz",
"integrity": "sha512-UJnV3Rxc5TgUPJt2KJpo1Jpy0OKQr0AjgbZzBFjaPJcFOb2Y8jA5H3LT8HUJAiRLlWrEXWHbF1Z4SCZaQjWDHw==",
"dependencies": {
"react-router": "7.13.1"
},
"engines": {
"node": ">=20.0.0"
},
"peerDependencies": {
"react": ">=18",
"react-dom": ">=18"
}
},
"node_modules/react-router/node_modules/cookie": {
"version": "1.1.1",
"resolved": "https://registry.npmmirror.com/cookie/-/cookie-1.1.1.tgz",
"integrity": "sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==",
"engines": {
"node": ">=18"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/express"
}
},
"node_modules/read-cache": {
"version": "1.0.0",
"resolved": "https://registry.npmmirror.com/read-cache/-/read-cache-1.0.0.tgz",
@@ -4405,19 +4193,6 @@
"pify": "^2.3.0"
}
},
"node_modules/readable-stream": {
"version": "3.6.2",
"resolved": "https://registry.npmmirror.com/readable-stream/-/readable-stream-3.6.2.tgz",
"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
"dependencies": {
"inherits": "^2.0.3",
"string_decoder": "^1.1.1",
"util-deprecate": "^1.0.1"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/readdirp": {
"version": "3.6.0",
"resolved": "https://registry.npmmirror.com/readdirp/-/readdirp-3.6.0.tgz",
@@ -4645,6 +4420,11 @@
"node": ">= 0.8.0"
}
},
"node_modules/set-cookie-parser": {
"version": "2.7.2",
"resolved": "https://registry.npmmirror.com/set-cookie-parser/-/set-cookie-parser-2.7.2.tgz",
"integrity": "sha512-oeM1lpU/UvhTxw+g3cIfxXHyJRc/uidd3yK1P242gzHds0udQBYzs3y8j4gCCW+ZJ7ad0yctld8RYO+bdurlvw=="
},
"node_modules/set-value": {
"version": "2.0.1",
"resolved": "https://registry.npmmirror.com/set-value/-/set-value-2.0.1.tgz",
@@ -4753,49 +4533,6 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/simple-concat": {
"version": "1.0.1",
"resolved": "https://registry.npmmirror.com/simple-concat/-/simple-concat-1.0.1.tgz",
"integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/simple-get": {
"version": "4.0.1",
"resolved": "https://registry.npmmirror.com/simple-get/-/simple-get-4.0.1.tgz",
"integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"decompress-response": "^6.0.0",
"once": "^1.3.1",
"simple-concat": "^1.0.0"
}
},
"node_modules/size-sensor": {
"version": "1.0.3",
"resolved": "https://registry.npmmirror.com/size-sensor/-/size-sensor-1.0.3.tgz",
@@ -4881,6 +4618,11 @@
"node": ">=0.10.0"
}
},
"node_modules/sql.js": {
"version": "1.14.0",
"resolved": "https://registry.npmmirror.com/sql.js/-/sql.js-1.14.0.tgz",
"integrity": "sha512-NXYh+kFqLiYRCNAaHD0PcbjFgXyjuolEKLMk5vRt2DgPENtF1kkNzzMlg42dUk5wIsH8MhUzsRhaUxIisoSlZQ=="
},
"node_modules/statuses": {
"version": "2.0.2",
"resolved": "https://registry.npmmirror.com/statuses/-/statuses-2.0.2.tgz",
@@ -4889,14 +4631,6 @@
"node": ">= 0.8"
}
},
"node_modules/string_decoder": {
"version": "1.3.0",
"resolved": "https://registry.npmmirror.com/string_decoder/-/string_decoder-1.3.0.tgz",
"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
"dependencies": {
"safe-buffer": "~5.2.0"
}
},
"node_modules/strip-json-comments": {
"version": "3.1.1",
"resolved": "https://registry.npmmirror.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
@@ -4963,6 +4697,28 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/swagger-ui-dist": {
"version": "5.32.0",
"resolved": "https://registry.npmmirror.com/swagger-ui-dist/-/swagger-ui-dist-5.32.0.tgz",
"integrity": "sha512-nKZB0OuDvacB0s/lC2gbge+RigYvGRGpLLMWMFxaTUwfM+CfndVk9Th2IaTinqXiz6Mn26GK2zriCpv6/+5m3Q==",
"dependencies": {
"@scarf/scarf": "=1.4.0"
}
},
"node_modules/swagger-ui-express": {
"version": "5.0.1",
"resolved": "https://registry.npmmirror.com/swagger-ui-express/-/swagger-ui-express-5.0.1.tgz",
"integrity": "sha512-SrNU3RiBGTLLmFU8GIJdOdanJTl4TOmT27tt3bWWHppqYmAZ6IDuEuBvMU6nZq0zLEe6b/1rACXCgLZqO6ZfrA==",
"dependencies": {
"swagger-ui-dist": ">=5.0.0"
},
"engines": {
"node": ">= v0.10.32"
},
"peerDependencies": {
"express": ">=4.0.0 || >=5.0.0-beta"
}
},
"node_modules/tailwindcss": {
"version": "3.4.19",
"resolved": "https://registry.npmmirror.com/tailwindcss/-/tailwindcss-3.4.19.tgz",
@@ -5000,32 +4756,6 @@
"node": ">=14.0.0"
}
},
"node_modules/tar-fs": {
"version": "2.1.4",
"resolved": "https://registry.npmmirror.com/tar-fs/-/tar-fs-2.1.4.tgz",
"integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==",
"dependencies": {
"chownr": "^1.1.1",
"mkdirp-classic": "^0.5.2",
"pump": "^3.0.0",
"tar-stream": "^2.1.4"
}
},
"node_modules/tar-stream": {
"version": "2.2.0",
"resolved": "https://registry.npmmirror.com/tar-stream/-/tar-stream-2.2.0.tgz",
"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
"dependencies": {
"bl": "^4.0.3",
"end-of-stream": "^1.4.1",
"fs-constants": "^1.0.0",
"inherits": "^2.0.3",
"readable-stream": "^3.1.1"
},
"engines": {
"node": ">=6"
}
},
"node_modules/thenify": {
"version": "3.3.1",
"resolved": "https://registry.npmmirror.com/thenify/-/thenify-3.3.1.tgz",
@@ -5140,17 +4870,6 @@
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
},
"node_modules/tunnel-agent": {
"version": "0.6.0",
"resolved": "https://registry.npmmirror.com/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
"dependencies": {
"safe-buffer": "^5.0.1"
},
"engines": {
"node": "*"
}
},
"node_modules/type-check": {
"version": "0.4.0",
"resolved": "https://registry.npmmirror.com/type-check/-/type-check-0.4.0.tgz",
@@ -5288,7 +5007,8 @@
"node_modules/util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmmirror.com/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
"dev": true
},
"node_modules/utils-merge": {
"version": "1.0.1",
@@ -5389,11 +5109,6 @@
"node": ">=0.10.0"
}
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmmirror.com/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
},
"node_modules/ws": {
"version": "8.19.0",
"resolved": "https://registry.npmmirror.com/ws/-/ws-8.19.0.tgz",

View File

@@ -4,25 +4,38 @@
"version": "1.0.0",
"type": "module",
"scripts": {
"start": "./start.sh",
"dev": "vite",
"api": "node server/index.js",
"api:seed": "node server/seed.js",
"crawler": "cd crawler && python main.py",
"gdelt": "cd crawler && uvicorn realtime_conflict_service:app --host 0.0.0.0 --port 8000",
"crawler:once": "cd crawler && python run_once.py",
"crawler:once:range": "./scripts/run-crawler-range.sh",
"crawler:test": "cd crawler && python3 -c \"import sys; sys.path.insert(0,'.'); from scrapers.rss_scraper import fetch_all; n=len(fetch_all()); print('RSS 抓取:', n, '条' if n else '(0 条,检查网络或关键词过滤)')\"",
"crawler:test:extraction": "cd crawler && python3 -m pytest tests/test_extraction.py -v",
"build": "vite build",
"typecheck": "tsc --noEmit",
"lint": "eslint .",
"preview": "vite preview"
"preview": "vite preview",
"verify": "./scripts/verify-pipeline.sh",
"verify:full": "./scripts/verify-pipeline.sh --start-crawler",
"verify-panels": "node scripts/verify-panels.cjs",
"check-crawler-data": "node scripts/check-crawler-data.cjs"
},
"dependencies": {
"better-sqlite3": "^11.6.0",
"cors": "^2.8.5",
"echarts": "^5.5.0",
"echarts-for-react": "^3.0.2",
"express": "^4.21.1",
"lucide-react": "^0.460.0",
"lucide-react": "^0.576.0",
"mapbox-gl": "^3.6.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-map-gl": "^7.1.7",
"react-router-dom": "^7.13.1",
"sql.js": "^1.11.0",
"swagger-ui-express": "^5.0.1",
"ws": "^8.19.0",
"zustand": "^5.0.0"
},

3
run.sh Normal file
View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Deploy helper: sync the working tree to origin/master, rebuild, restart pm2.
# NOTE(review): assumes pm2 process id 4 is this app — confirm before reuse.
set -euo pipefail
# Guard the cd: without it a failed cd would leave the shell in an arbitrary
# directory and `git reset --hard` would clobber the wrong repository.
cd /www/wwwroot/www.airtep.com2/usa || exit 1
git fetch origin && git reset --hard origin/master && npm run build && pm2 restart 4

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# Generated from the local server/data.db table schema by
# scripts/gen-align-schema-from-local.sh; run on production to align its schema.
# Usage (from the production directory): DB_PATH=server/data.db ./scripts/align-production-schema.sh
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"
# Errors are swallowed so re-runs are idempotent (column may already exist).
run() { sqlite3 "$DB_PATH" "$1" 2>/dev/null || true; }
echo "=== 对齐生产库表结构(与本地 data.db 一致):$DB_PATH ==="
# BUG FIX: SQLite refuses to ADD COLUMN with NOT NULL unless a non-NULL
# DEFAULT is given ("Cannot add a NOT NULL column with default value NULL").
# The previous version omitted the DEFAULTs, so on any populated table every
# such ALTER failed — silently, because run() discards errors — and the
# columns were never created. DEFAULT 0 / DEFAULT '' added throughout.
run "ALTER TABLE combat_losses ADD COLUMN bases_destroyed INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN bases_damaged INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN personnel_killed INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN personnel_wounded INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN aircraft INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN warships INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN armor INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN vehicles INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN civilian_killed INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN civilian_wounded INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN drones INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN missiles INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN helicopters INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN submarines INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN tanks INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN carriers INTEGER NOT NULL DEFAULT 0;"
# NOTE(review): seeds `carriers` from `tanks` — looks like a schema-migration
# backfill; confirm against server/seed.js before changing.
run "UPDATE combat_losses SET carriers = COALESCE(tanks, 0) WHERE carriers = 0;"
run "ALTER TABLE combat_losses ADD COLUMN civilian_ships INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE combat_losses ADD COLUMN airport_port INTEGER NOT NULL DEFAULT 0;"
echo " combat_losses done"
run "ALTER TABLE conflict_stats ADD COLUMN total_events INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE conflict_stats ADD COLUMN high_impact_events INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE conflict_stats ADD COLUMN estimated_casualties INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE conflict_stats ADD COLUMN estimated_strike_count INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE conflict_stats ADD COLUMN updated_at TEXT NOT NULL DEFAULT '';"
echo " conflict_stats done"
run "ALTER TABLE display_stats ADD COLUMN viewers INTEGER;"
run "ALTER TABLE display_stats ADD COLUMN share_count INTEGER;"
run "ALTER TABLE display_stats ADD COLUMN like_count INTEGER;"
run "ALTER TABLE display_stats ADD COLUMN override_enabled INTEGER NOT NULL DEFAULT 0;"
echo " display_stats done"
run "ALTER TABLE feedback ADD COLUMN content TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE feedback ADD COLUMN ip TEXT;"
run "ALTER TABLE feedback ADD COLUMN created_at TEXT NOT NULL DEFAULT '';"
echo " feedback done"
run "ALTER TABLE force_asset ADD COLUMN side TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE force_asset ADD COLUMN name TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE force_asset ADD COLUMN type TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE force_asset ADD COLUMN count INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_asset ADD COLUMN status TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE force_asset ADD COLUMN lat REAL;"
run "ALTER TABLE force_asset ADD COLUMN lng REAL;"
echo " force_asset done"
run "ALTER TABLE force_summary ADD COLUMN total_assets INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN personnel INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN naval_ships INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN aircraft INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN ground_units INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN uav INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN missile_consumed INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE force_summary ADD COLUMN missile_stock INTEGER NOT NULL DEFAULT 0;"
echo " force_summary done"
run "ALTER TABLE gdelt_events ADD COLUMN event_time TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE gdelt_events ADD COLUMN title TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE gdelt_events ADD COLUMN lat REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE gdelt_events ADD COLUMN lng REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE gdelt_events ADD COLUMN impact_score INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE gdelt_events ADD COLUMN url TEXT;"
run "ALTER TABLE gdelt_events ADD COLUMN created_at TEXT;"
echo " gdelt_events done"
run "ALTER TABLE key_location ADD COLUMN side TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE key_location ADD COLUMN name TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE key_location ADD COLUMN lat REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE key_location ADD COLUMN lng REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE key_location ADD COLUMN type TEXT;"
run "ALTER TABLE key_location ADD COLUMN region TEXT;"
run 'ALTER TABLE key_location ADD COLUMN status TEXT DEFAULT '\''operational'\'';'
run "ALTER TABLE key_location ADD COLUMN damage_level INTEGER;"
run "ALTER TABLE key_location ADD COLUMN attacked_at TEXT;"
echo " key_location done"
run "ALTER TABLE like_count ADD COLUMN total INTEGER NOT NULL DEFAULT 0;"
echo " like_count done"
run "ALTER TABLE map_strike_line ADD COLUMN target_lng REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE map_strike_line ADD COLUMN target_lat REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE map_strike_line ADD COLUMN target_name TEXT;"
run "ALTER TABLE map_strike_line ADD COLUMN struck_at TEXT;"
echo " map_strike_line done"
run "ALTER TABLE map_strike_source ADD COLUMN name TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE map_strike_source ADD COLUMN lng REAL NOT NULL DEFAULT 0;"
run "ALTER TABLE map_strike_source ADD COLUMN lat REAL NOT NULL DEFAULT 0;"
echo " map_strike_source done"
run "ALTER TABLE news_content ADD COLUMN content_hash TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE news_content ADD COLUMN title TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE news_content ADD COLUMN summary TEXT NOT NULL DEFAULT '';"
run 'ALTER TABLE news_content ADD COLUMN url TEXT NOT NULL DEFAULT '\'''\'';'
run 'ALTER TABLE news_content ADD COLUMN source TEXT NOT NULL DEFAULT '\'''\'';'
run "ALTER TABLE news_content ADD COLUMN published_at TEXT NOT NULL DEFAULT '';"
run 'ALTER TABLE news_content ADD COLUMN category TEXT NOT NULL DEFAULT '\''other'\'';'
run 'ALTER TABLE news_content ADD COLUMN severity TEXT NOT NULL DEFAULT '\''medium'\'';'
run "ALTER TABLE news_content ADD COLUMN created_at TEXT NOT NULL DEFAULT '';"
echo " news_content done"
run "ALTER TABLE power_index ADD COLUMN overall INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE power_index ADD COLUMN military_strength INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE power_index ADD COLUMN economic_power INTEGER NOT NULL DEFAULT 0;"
run "ALTER TABLE power_index ADD COLUMN geopolitical_influence INTEGER NOT NULL DEFAULT 0;"
echo " power_index done"
run "ALTER TABLE retaliation_current ADD COLUMN value INTEGER NOT NULL DEFAULT 0;"
echo " retaliation_current done"
run "ALTER TABLE retaliation_history ADD COLUMN time TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE retaliation_history ADD COLUMN value INTEGER NOT NULL DEFAULT 0;"
echo " retaliation_history done"
run "ALTER TABLE share_count ADD COLUMN total INTEGER NOT NULL DEFAULT 0;"
echo " share_count done"
run "ALTER TABLE situation ADD COLUMN data TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE situation ADD COLUMN updated_at TEXT NOT NULL DEFAULT '';"
echo " situation done"
run "ALTER TABLE situation_update ADD COLUMN timestamp TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE situation_update ADD COLUMN category TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE situation_update ADD COLUMN summary TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE situation_update ADD COLUMN severity TEXT NOT NULL DEFAULT '';"
echo " situation_update done"
run "ALTER TABLE visitor_count ADD COLUMN total INTEGER NOT NULL DEFAULT 0;"
echo " visitor_count done"
run "ALTER TABLE visits ADD COLUMN last_seen TEXT NOT NULL DEFAULT '';"
echo " visits done"
run "ALTER TABLE wall_street_trend ADD COLUMN time TEXT NOT NULL DEFAULT '';"
run "ALTER TABLE wall_street_trend ADD COLUMN value INTEGER NOT NULL DEFAULT 0;"
echo " wall_street_trend done"
echo "=== 完成、核对示例: ==="
echo " sqlite3 $DB_PATH \"PRAGMA table_info(key_location);\""
echo " sqlite3 $DB_PATH \"PRAGMA table_info(combat_losses);\""

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
# Verify that the attack-location / strike-line rows in the DB are complete
# (expected counts mirror server/seed.js).
# Usage:     DB_PATH=server/data.db ./scripts/check-attack-locations.sh
# Bare host: cd /root/usa && ./scripts/check-attack-locations.sh
set -e
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"

echo "=========================================="
echo "攻击地点 / 打击线 检查"
echo "DB: $DB_PATH"
echo "=========================================="

if [[ ! -f "$DB_PATH" ]]; then
  echo "错误: 数据库文件不存在"
  exit 1
fi
if ! command -v sqlite3 &>/dev/null; then
  echo "需要 sqlite3。安装: yum install sqlite 或 apt install sqlite3"
  exit 1
fi

# Expected row counts — keep in sync with server/seed.js.
EXPECT_US=62        # getUsLocations: naval + attacked + newBases
EXPECT_IRAN=18      # number of iranLocs entries
EXPECT_ISRAEL=4
EXPECT_LINCOLN=5
EXPECT_FORD=7

# COUNT(*) over the given table/WHERE fragment; prints 0 if the query fails.
count_rows() {
  sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM $1;" 2>/dev/null || echo "0"
}

us_rows=$(count_rows "key_location WHERE side='us'")
iran_rows=$(count_rows "key_location WHERE side='iran'")
israel_rows=$(count_rows "map_strike_line WHERE source_id='israel'")
lincoln_rows=$(count_rows "map_strike_line WHERE source_id='lincoln'")
ford_rows=$(count_rows "map_strike_line WHERE source_id='ford'")

echo ""
echo "key_location:"
echo " us (美军基地等): 当前 $us_rows 条,期望 $EXPECT_US"
echo " iran (伊朗被袭点): 当前 $iran_rows 条,期望 $EXPECT_IRAN"
echo ""
echo "map_strike_line (盟军打击伊朗):"
echo " israel: 当前 $israel_rows 条,期望 $EXPECT_ISRAEL"
echo " lincoln: 当前 $lincoln_rows 条,期望 $EXPECT_LINCOLN"
echo " ford: 当前 $ford_rows 条,期望 $EXPECT_FORD"
echo "=========================================="

all_ok=1
[[ "$us_rows" -ge "$EXPECT_US" ]] || all_ok=0
[[ "$iran_rows" -ge "$EXPECT_IRAN" ]] || all_ok=0
[[ "$israel_rows" -ge "$EXPECT_ISRAEL" ]] || all_ok=0
[[ "$lincoln_rows" -ge "$EXPECT_LINCOLN" ]] || all_ok=0
[[ "$ford_rows" -ge "$EXPECT_FORD" ]] || all_ok=0

if [[ $all_ok -eq 1 ]]; then
  echo "结论: 攻击地点/打击线数量正常"
  exit 0
fi

echo "结论: 数量不足,请在生产执行 seed 以与当前代码一致:"
echo " cd $PROJECT_ROOT"
echo " cp server/data.db server/data.db.bak-\$(date +%Y%m%d-%H%M%S)"
echo " DB_PATH=server/data.db node server/seed.js"
echo " 重启 API 后刷新页面"
exit 1

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env node
/**
 * Inspect the data the crawler has written: row counts plus the most recent
 * rows of situation_update, news_content and gdelt_events.
 *
 * Usage (from the project root): node scripts/check-crawler-data.cjs
 * Optionally start the crawler first (npm run gdelt) to also see its live
 * status; the script reads the DB directly either way.
 */
const path = require('path')
const http = require('http')

// chdir to the project root so ../server/db resolves its DB file correctly.
const projectRoot = path.resolve(__dirname, '..')
process.chdir(projectRoot)
const db = require('../server/db')

const CRAWLER_URL = process.env.CRAWLER_URL || 'http://localhost:8000'
const SHOW_ROWS = 10

/**
 * GET /crawler/status from the crawler service.
 * Resolves with the parsed JSON payload, or null when the service is
 * unreachable, times out, or returns a non-JSON body. Never rejects.
 * @returns {Promise<object|null>}
 */
function fetchCrawlerStatus() {
  return new Promise((resolve) => {
    const url = new URL(`${CRAWLER_URL}/crawler/status`)
    const req = http.request(
      { hostname: url.hostname, port: url.port || 80, path: url.pathname, method: 'GET', timeout: 3000 },
      (res) => {
        let body = ''
        res.on('data', (c) => (body += c))
        res.on('end', () => {
          try {
            resolve(JSON.parse(body))
          } catch {
            resolve(null)
          }
        })
      }
    )
    // BUG FIX: the `timeout` option only *emits* 'timeout'; without this
    // handler the request keeps waiting and the promise could hang forever.
    // destroy() aborts the socket, which fires 'error' and resolves null.
    req.on('timeout', () => req.destroy())
    req.on('error', () => resolve(null))
    req.end()
  })
}

/** Query the three crawler-fed tables and print a human-readable report. */
async function run() {
  console.log('========================================')
  console.log('爬虫数据检查(条数 + 最近内容)')
  console.log('========================================\n')
  // ---------- Crawler service status (optional) ----------
  const status = await fetchCrawlerStatus()
  if (status) {
    console.log('--- 爬虫服务状态 GET /crawler/status ---')
    console.log(' db_path:', status.db_path)
    console.log(' db_exists:', status.db_exists)
    console.log(' situation_update_count:', status.situation_update_count)
    console.log(' last_fetch_items:', status.last_fetch_items, '(本轮抓取条数)')
    console.log(' last_fetch_inserted:', status.last_fetch_inserted, '(去重后新增)')
    if (status.last_fetch_error) console.log(' last_fetch_error:', status.last_fetch_error)
    console.log('')
  } else {
    console.log('--- 爬虫服务 ---')
    console.log(' 未启动或不可达:', CRAWLER_URL)
    console.log('')
  }
  // ---------- situation_update: feeds the "recent updates" timeline panel ----------
  let situationUpdateRows = []
  let situationUpdateCount = 0
  try {
    situationUpdateCount = db.prepare('SELECT COUNT(*) as c FROM situation_update').get().c
    situationUpdateRows = db
      .prepare(
        'SELECT id, timestamp, category, summary, severity FROM situation_update ORDER BY timestamp DESC LIMIT ?'
      )
      .all(SHOW_ROWS)
  } catch (e) {
    console.log('situation_update 表读取失败:', e.message)
  }
  console.log('--- situation_update事件脉络---')
  console.log(' 总条数:', situationUpdateCount)
  if (situationUpdateRows.length > 0) {
    console.log(' 最近', situationUpdateRows.length, '条:')
    situationUpdateRows.forEach((r, i) => {
      const summary = (r.summary || '').slice(0, 50)
      console.log(` ${i + 1}. [${r.timestamp}] ${r.category}/${r.severity} ${summary}${summary.length >= 50 ? '…' : ''}`)
    })
  }
  console.log('')
  // ---------- news_content: deduplicated news rows written by the crawler ----------
  let newsCount = 0
  let newsRows = []
  try {
    newsCount = db.prepare('SELECT COUNT(*) as c FROM news_content').get().c
    newsRows = db
      .prepare(
        'SELECT title, summary, source, published_at, category, severity FROM news_content ORDER BY published_at DESC LIMIT ?'
      )
      .all(SHOW_ROWS)
  } catch (e) {
    console.log('news_content 表读取失败:', e.message)
  }
  console.log('--- news_content资讯表---')
  console.log(' 总条数:', newsCount)
  if (newsRows.length > 0) {
    console.log(' 最近', newsRows.length, '条:')
    newsRows.forEach((r, i) => {
      const title = (r.title || '').slice(0, 45)
      console.log(` ${i + 1}. [${r.published_at || ''}] ${r.source || ''} ${title}${title.length >= 45 ? '…' : ''}`)
      if (r.summary) console.log(` summary: ${(r.summary || '').slice(0, 60)}`)
    })
  }
  console.log('')
  // ---------- gdelt_events: conflict points shown on the map ----------
  let gdeltCount = 0
  let gdeltRows = []
  try {
    gdeltCount = db.prepare('SELECT COUNT(*) as c FROM gdelt_events').get().c
    gdeltRows = db
      .prepare('SELECT event_id, event_time, title, impact_score FROM gdelt_events ORDER BY event_time DESC LIMIT 5')
      .all()
  } catch (e) {
    console.log('gdelt_events 表读取失败:', e.message)
  }
  console.log('--- gdelt_events地图冲突点---')
  console.log(' 总条数:', gdeltCount)
  if (gdeltRows.length > 0) {
    console.log(' 最近 5 条:')
    gdeltRows.forEach((r, i) => {
      const title = (r.title || '').slice(0, 50)
      console.log(` ${i + 1}. [${r.event_time}] impact=${r.impact_score} ${title}${title.length >= 50 ? '…' : ''}`)
    })
  }
  console.log('========================================')
}

db.initDb().then(() => run()).catch((err) => {
  console.error('失败:', err.message)
  process.exit(1)
})

View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Check that crawler data and the dashboard panels are wired together.
# Usage: ./scripts/check-crawler-panel-connectivity.sh
# Requires the API running (npm run api); optionally the crawler (npm run gdelt).
set -e
API_URL="${API_URL:-http://localhost:3001}"
CRAWLER_URL="${CRAWLER_URL:-http://localhost:8000}"
echo "=========================================="
echo "爬虫 ↔ 面板 联通检查"
echo "API: $API_URL | Crawler: $CRAWLER_URL"
echo "=========================================="
# 1. Crawler side: situation_update row count.
# BUG FIX: fetch the status payload once. The old code hit the endpoint
# twice; if the second request failed after the first succeeded, the
# pipeline's non-zero status aborted the whole script under `set -e`.
CRAWLER_STATUS=$(curl -sf "$CRAWLER_URL/crawler/status" 2>/dev/null || true)
if [[ -n "$CRAWLER_STATUS" ]]; then
  if command -v jq &>/dev/null; then
    CRAWLER_SU_COUNT=$(printf '%s' "$CRAWLER_STATUS" | jq -r '.situation_update_count // "?"')
  else
    CRAWLER_SU_COUNT="(需 jq 查看)"
  fi
  echo "[爬虫] situation_update 条数: $CRAWLER_SU_COUNT"
else
  echo "[爬虫] 未启动或不可达 (curl $CRAWLER_URL/crawler/status 失败)"
fi
# 2. Panel side: recentUpdates length and lastUpdated from the API.
if ! curl -sf "$API_URL/api/health" >/dev/null 2>&1; then
  echo "[API] 未启动,请先运行: npm run api"
  exit 1
fi
SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}")
if command -v jq &>/dev/null; then
  RU_LEN=$(echo "$SIT" | jq '.recentUpdates | length')
  LAST=$(echo "$SIT" | jq -r '.lastUpdated // "?"')
  echo "[面板] recentUpdates 条数: $RU_LEN | lastUpdated: $LAST"
else
  echo "[面板] 态势数据已获取 (安装 jq 可显示条数)"
fi
# 3. Consistency: the crawler writes server/data.db; Node reloads it on notify.
echo ""
echo "--- 联动说明 ---"
echo " • 事件脉络 (recentUpdates) ← situation_update 表,由爬虫 write_updates() 写入"
echo " • 爬虫每次抓取后会 POST $API_URL/api/crawler/notifyNode 会 reloadFromFile() 后广播"
echo " • 若爬虫有数据但面板 recentUpdates 很少/为空:检查 Node 终端是否出现 [crawler/notify] DB 已重载"
echo " • 若从未出现:检查 API_BASE 是否指向当前 API默认 http://localhost:3001"
echo " • 战损/基地/力量指数:仅当 AI/规则从新闻中提取到数字时才会更新,多数新闻不会触发"
echo "=========================================="
# 4. Optional: fire one notify to see whether Node reloads (useful without the crawler).
# Skipped when non-interactive; interactively: echo y | ./scripts/check-crawler-panel-connectivity.sh
if [[ -t 0 ]]; then
  echo ""
  read -r -p "是否发送一次 POST /api/crawler/notify 测试 Node 重载? [y/N] " ans
  # Portable y/Y match: ${ans,,} is bash 4+ and breaks on macOS's bash 3.2.
  if [[ "$ans" == [yY] ]]; then
    curl -sf -X POST "$API_URL/api/crawler/notify" && echo " 已发送 notify请看 Node 终端是否打印 [crawler/notify] DB 已重载"
  fi
fi

39
scripts/check-db-and-crawler.sh Executable file
View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Show the DB's lastUpdated timestamp and row counts, plus a reminder of how
# to refresh the data with the crawler.
# Usage: ./scripts/check-db-and-crawler.sh
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"

echo "=========================================="
echo "数据库与爬虫状态"
echo "DB: $DB_PATH"
echo "=========================================="

if [[ ! -f "$DB_PATH" ]]; then
  echo "数据库文件不存在。请先执行: node server/seed.js"
  exit 1
fi

if ! command -v sqlite3 &>/dev/null; then
  echo "未安装 sqlite3无法直接查库。可安装: brew install sqlite3"
else
  # Pull the three headline numbers; "?" on any query failure.
  last_updated=$(sqlite3 "$DB_PATH" "SELECT updated_at FROM situation WHERE id = 1;" 2>/dev/null || echo "?")
  update_rows=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM situation_update;" 2>/dev/null || echo "?")
  news_rows=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM news_content;" 2>/dev/null || echo "?")
  echo "situation.updated_at (前端 lastUpdated): $last_updated"
  echo "situation_update 条数: $update_rows"
  echo "news_content 条数: $news_rows"
fi

echo ""
echo "--- 为何数据停在旧日期? ---"
echo " • lastUpdated 来自 situation.updated_at。"
echo " • 已改为:每次爬虫运行都会更新该时间(不再仅在有新资讯时更新)。"
echo " • 若从未跑爬虫或很久没跑,请执行一次爬虫:"
echo ""
echo " cd $PROJECT_ROOT && python crawler/run_once.py"
echo " 或: npm run crawler:once"
echo ""
echo " 若需定时更新,可启动常驻爬虫: python crawler/main.py"
echo "=========================================="

78
scripts/debug-panels-focus.sh Executable file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Check only three dashboard panels: combat losses, bases, and map war zones.
# Usage: ./scripts/debug-panels-focus.sh
set -e
API_URL="${API_URL:-http://localhost:3001}"
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"
echo "=========================================="
echo "战损 / 基地 / 地图战区 — 数据检查"
echo "API: $API_URL | DB: $DB_PATH"
echo "=========================================="
echo ""
# ---------- API connectivity ----------
if ! curl -sf "$API_URL/api/health" >/dev/null 2>&1; then
echo "✗ API 无响应,请先运行: npm run api"
exit 1
fi
echo "✓ API 正常"
echo ""
# Full situation payload; falls back to "{}" so jq below gets valid JSON.
SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}")
# ---------- 1. Combat losses ----------
echo "[1] 战损 (combat_losses)"
# jq is optional — without it only a hint is printed.
if command -v jq &>/dev/null; then
us_k=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.killed // "?"')
us_w=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.wounded // "?"')
ir_k=$(echo "$SIT" | jq -r '.iranForces.combatLosses.personnelCasualties.killed // "?"')
ir_w=$(echo "$SIT" | jq -r '.iranForces.combatLosses.personnelCasualties.wounded // "?"')
echo " 美军 阵亡=$us_k 受伤=$us_w | 伊朗 阵亡=$ir_k 受伤=$ir_w"
echo " conflictStats: $(echo "$SIT" | jq -c '.conflictStats')"
else
echo " (安装 jq 可显示详细数字)"
fi
# Cross-check directly against the DB when the file and sqlite3 both exist.
if [[ -f "$DB_PATH" ]] && command -v sqlite3 &>/dev/null; then
echo " 表 combat_losses:"
sqlite3 "$DB_PATH" "SELECT side, personnel_killed, personnel_wounded, bases_destroyed, bases_damaged FROM combat_losses" 2>/dev/null | while read -r line; do echo "   $line"; done
fi
echo " 数据来源: seed 初始;爬虫从新闻提取 combat_losses_delta 后 db_merge 增量叠加。不更新→检查是否跑 gdelt、提取器是否输出、新闻是否含伤亡数字。"
echo ""
# ---------- 2. Bases ----------
echo "[2] 基地 (key_location)"
if command -v jq &>/dev/null; then
us_loc=$(echo "$SIT" | jq -r '.usForces.keyLocations | length')
ir_loc=$(echo "$SIT" | jq -r '.iranForces.keyLocations | length')
us_attacked=$(echo "$SIT" | jq -r '[.usForces.keyLocations[] | select(.status == "attacked")] | length')
ir_attacked=$(echo "$SIT" | jq -r '[.iranForces.keyLocations[] | select(.status == "attacked")] | length')
echo " 美军 据点=$us_loc 遭袭=$us_attacked | 伊朗 据点=$ir_loc 遭袭=$ir_attacked"
fi
if [[ -f "$DB_PATH" ]] && command -v sqlite3 &>/dev/null; then
echo " 表 key_location 遭袭/有损伤的:"
sqlite3 "$DB_PATH" "SELECT side, name, status, damage_level FROM key_location WHERE status != 'operational' OR damage_level IS NOT NULL LIMIT 10" 2>/dev/null | while read -r line; do echo "   $line"; done
fi
echo " 数据来源: seed 写入全部据点;爬虫只更新 status/damage_level需 name_keywords 与 name LIKE 匹配。不更新→检查新闻是否提基地遭袭、关键词与 seed name 是否一致。"
echo ""
# ---------- 3. Map war zones ----------
echo "[3] 地图战区 (gdelt_events + conflict_stats)"
if command -v jq &>/dev/null; then
ev_cnt=$(echo "$SIT" | jq -r '.conflictEvents | length')
echo " conflictEvents 条数: $ev_cnt"
echo " conflictStats: $(echo "$SIT" | jq -c '.conflictStats')"
fi
if [[ -f "$DB_PATH" ]] && command -v sqlite3 &>/dev/null; then
n_ev=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM gdelt_events" 2>/dev/null || echo "0")
echo " 表 gdelt_events 行数: $n_ev"
sqlite3 "$DB_PATH" "SELECT total_events, high_impact_events, estimated_casualties, estimated_strike_count FROM conflict_stats WHERE id = 1" 2>/dev/null | while read -r line; do echo "   conflict_stats: $line"; done
fi
echo " 数据来源: GDELT API 写入;或 GDELT_DISABLED=1 时由 situation_update 回填。无点→跑 gdelt 或开启 RSS 回填。"
echo ""
echo "=========================================="
echo "详细说明与排查顺序见: docs/DEBUG_战损_基地_地图.md"
echo "=========================================="

83
scripts/debug-panels.sh Executable file
View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Quick data check for every dashboard panel: maps tables/API fields to the
# panel they feed, so each one can be debugged in isolation.
# Usage: ./scripts/debug-panels.sh
# Dependencies: curl (required); jq and sqlite3 optional for richer output.
set -e
API_URL="${API_URL:-http://localhost:3001}"
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"
echo "=========================================="
echo "看板板块数据检查 (DEBUG_PANELS)"
echo "API: $API_URL | DB: $DB_PATH"
echo "=========================================="
echo ""
# ---------- 1. API health and situation summary ----------
echo "[1] API 与态势摘要"
if ! curl -sf "$API_URL/api/health" >/dev/null 2>&1; then
echo " ✗ API 无响应,请先运行: npm run api"
echo " 后续表检查将跳过(依赖 API 或直接读 DB"
else
echo " ✓ API 正常"
SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}")
if command -v jq &>/dev/null; then
echo " lastUpdated: $(echo "$SIT" | jq -r '.lastUpdated // "?"')"
echo " recentUpdates: $(echo "$SIT" | jq -r '.recentUpdates | length') 条 → 事件脉络"
echo " conflictEvents: $(echo "$SIT" | jq -r '.conflictEvents | length') 条 → 地图冲突点"
echo " us powerIndex: $(echo "$SIT" | jq -r '.usForces.powerIndex.overall') → 顶栏/战力图"
echo " iran powerIndex: $(echo "$SIT" | jq -r '.iranForces.powerIndex.overall')"
echo " us keyLocations: $(echo "$SIT" | jq -r '.usForces.keyLocations | length') 条 → 美国基地/地图"
echo " iran keyLocations: $(echo "$SIT" | jq -r '.iranForces.keyLocations | length') 条 → 伊朗基地/地图"
echo " us combatLosses: killed=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.killed') wounded=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.wounded')"
echo " wallStreet points: $(echo "$SIT" | jq -r '.usForces.wallStreetInvestmentTrend | length') → 华尔街图"
echo " retaliation: $(echo "$SIT" | jq -r '.iranForces.retaliationSentiment') (history: $(echo "$SIT" | jq -r '.iranForces.retaliationSentimentHistory | length') 条)"
else
echo " (安装 jq 可显示详细字段) 态势已拉取,长度: ${#SIT}"
fi
fi
echo ""
# ---------- 2. Per-table row counts (read the DB directly) ----------
echo "[2] 数据库表行数(与板块对应)"
if ! [[ -f "$DB_PATH" ]]; then
echo " ✗ 数据库文件不存在: $DB_PATH"
echo " 请先 seed: node server/seed.js 或 启动 API 后由 initDb 创建"
elif ! command -v sqlite3 &>/dev/null; then
echo " (未安装 sqlite3跳过表统计。可安装后重试)"
else
# Each table maps to one dashboard panel; a failed COUNT degrades to "?".
TABLES="force_summary power_index force_asset key_location combat_losses wall_street_trend retaliation_current retaliation_history situation_update situation gdelt_events conflict_stats news_content"
for t in $TABLES; do
n=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM $t" 2>/dev/null || echo "?")
case "$t" in
force_summary) desc="力量摘要(美/伊)" ;;
power_index) desc="战力指数 → 顶栏/战力图" ;;
force_asset) desc="资产列表 → 左右侧摘要" ;;
key_location) desc="据点 → 地图/美伊基地面板" ;;
combat_losses) desc="战损 → 战损面板" ;;
wall_street_trend) desc="华尔街趋势图" ;;
retaliation_current) desc="报复当前值" ;;
retaliation_history) desc="报复历史 → 仪表盘" ;;
situation_update) desc="事件脉络 → 时间线" ;;
situation) desc="updated_at → 顶栏时间" ;;
gdelt_events) desc="冲突点 → 地图图层" ;;
conflict_stats) desc="冲突统计 → 战损区" ;;
news_content) desc="资讯表 → /api/news" ;;
*) desc="" ;;
esac
printf " %-22s %6s %s\n" "$t" "$n" "$desc"
done
fi
echo ""
# ---------- 3. Brief per-panel health verdicts ----------
echo "[3] 板块数据来源与可能问题"
echo " • 仅 seed、爬虫不写: force_summary, power_index, force_asset"
echo " • 爬虫可更新: situation_update(事件脉络), key_location(基地状态), combat_losses(战损), retaliation_*, wall_street_trend, gdelt_events"
echo " • 事件脉络不更新 → 检查爬虫是否启动、是否调用 POST /api/crawler/notify"
echo " • 战损/基地不更新 → 检查是否跑 npm run gdelt、提取器是否输出、新闻是否含相关表述"
echo " • 地图无冲突点 → 检查 gdelt_events 是否有数据、GDELT 或 RSS 回填是否执行"
echo ""
echo "详细逐板块说明见: docs/DEBUG_PANELS.md"
echo "=========================================="

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Run locally: read PRAGMA table_info for every table in server/data.db and
# generate scripts/align-production-schema.sh to be executed in production.
# Usage (from the project root): ./scripts/gen-align-schema-from-local.sh
#
# Fix: the per-table header comment is now written BEFORE the table's first
# ALTER statement (it was previously emitted after it, so the generated file
# labelled each table one line too late).
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"
OUT_PATH="$PROJECT_ROOT/scripts/align-production-schema.sh"
if [[ ! -f "$DB_PATH" ]]; then
echo "本地库不存在: $DB_PATH"
exit 1
fi
tables=$(sqlite3 "$DB_PATH" "SELECT name FROM sqlite_master WHERE type='table' AND name NOT IN ('sqlite_sequence') ORDER BY name;")
cat > "$OUT_PATH" << 'HEAD'
#!/usr/bin/env bash
# 由 scripts/gen-align-schema-from-local.sh 根据本地 server/data.db 表结构生成,供生产执行。
# 用法:在生产目录执行 DB_PATH=server/data.db ./scripts/align-production-schema.sh
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DB_PATH="${DB_PATH:-$PROJECT_ROOT/server/data.db}"
run() { sqlite3 "$DB_PATH" "$1" 2>/dev/null || true; }
echo "=== 对齐生产库表结构(与本地 data.db 一致):$DB_PATH ==="
HEAD
while IFS= read -r table; do
[[ -z "$table" ]] && continue
# first=1 until at least one column of this table has been emitted.
first=1
while IFS='|' read -r cid name type notnull dflt pk; do
# Skip cid=0: the first column normally exists from the original CREATE TABLE.
[[ -z "$cid" || "$cid" -eq 0 ]] && continue
# Emit the per-table header ahead of the table's first ALTER line.
if [[ "$first" -eq 1 ]]; then
echo "# $table(本地列)" >> "$OUT_PATH"
first=0
fi
# Non-constant defaults (e.g. datetime('now')) are dropped: SQLite rejects
# ADD COLUMN with a non-constant default.
def="$type"
[[ "$notnull" == "1" ]] && def="$def NOT NULL"
if [[ -n "$dflt" && "$dflt" != *"("* ]]; then
# Known string defaults are hard-coded to dodge shell-escaping problems.
case "$dflt" in
'"operational"') def="${def} DEFAULT 'operational'" ;;
'"other"') def="${def} DEFAULT 'other'" ;;
'"medium"') def="${def} DEFAULT 'medium'" ;;
"''") def="${def} DEFAULT ''" ;;
*) dflt_sql="${dflt//\"/\'}"; def="$def DEFAULT $dflt_sql" ;;
esac
fi
if [[ "$def" == *\'* ]]; then
# def contains a single quote: write it in run '...'\''...'\'' form.
safe_def=$(echo "$def" | sed "s/'/'\\\\''/g")
printf "run 'ALTER TABLE %s ADD COLUMN %s %s;'\n" "$table" "$name" "$safe_def" >> "$OUT_PATH"
else
printf 'run "ALTER TABLE %s ADD COLUMN %s %s;"\n' "$table" "$name" "$def" >> "$OUT_PATH"
fi
# Backfill carriers from tanks immediately after the column is introduced.
if [[ "$table" == "combat_losses" && "$name" == "carriers" ]]; then
echo 'run "UPDATE combat_losses SET carriers = COALESCE(tanks, 0) WHERE carriers = 0;"' >> "$OUT_PATH"
fi
done < <(sqlite3 -separator '|' "$DB_PATH" "PRAGMA table_info($table);")
if [[ "$first" -eq 0 ]]; then
echo "echo \" $table done\"" >> "$OUT_PATH"
fi
done <<< "$tables"
echo "" >> "$OUT_PATH"
echo "echo \"=== 完成。核对示例: ===\"" >> "$OUT_PATH"
echo "echo \" sqlite3 \$DB_PATH \\\"PRAGMA table_info(key_location);\\\"\"" >> "$OUT_PATH"
echo "echo \" sqlite3 \$DB_PATH \\\"PRAGMA table_info(combat_losses);\\\"\"" >> "$OUT_PATH"
chmod +x "$OUT_PATH"
echo "已生成: $OUT_PATH"
echo "请将该文件推到生产后执行DB_PATH=server/data.db ./scripts/align-production-schema.sh"

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Production one-shot: build the crawler image, start it in "crawler-only
# Docker, API on host" mode, then print the data-alignment notes.
# Prerequisites: the API is already running on the host on port 3001 (e.g.
# via PM2) and server/data.db exists (or run npm run api:seed first).
set -e
cd "$(dirname "$0")/.."
PROJECT_ROOT="${PROJECT_ROOT:-$(pwd)}"
REGISTRY="${REGISTRY:-}"
echo "==> Building crawler image..."
# REGISTRY is only forwarded as a build-arg when set (":+").
docker build -t usa-dashboard-crawler:latest \
${REGISTRY:+--build-arg REGISTRY="$REGISTRY"} \
-f Dockerfile.crawler .
echo ""
./scripts/run-crawler-docker-standalone.sh
echo ""
echo "==> Data alignment (生产数据对齐)"
echo " API (host) DB_PATH = $PROJECT_ROOT/server/data.db (或 env DB_PATH)"
echo " Crawler /data/data.db = 挂载自上述同一文件"
echo " 二者必须指向同一 SQLite 文件,前端/API 与爬虫才能数据一致。"

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
# Production: run ONLY the crawler in Docker while the API stays on the host
# (e.g. under PM2). Guarantees the crawler and the API share the same SQLite
# file (data alignment).
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd)}"
DB_FILE="${DB_FILE:-$PROJECT_ROOT/server/data.db}"
API_BASE="${API_BASE:-http://host.docker.internal:3001}"
CRAWLER_IMAGE="${CRAWLER_IMAGE:-usa-dashboard-crawler:latest}"
CONTAINER_NAME="${CONTAINER_NAME:-usa-crawler}"
# Optional: load DASHSCOPE_API_KEY etc. from the project .env.
if [ -f "$PROJECT_ROOT/.env" ]; then
set -a
# shellcheck source=../.env
. "$PROJECT_ROOT/.env"
set +a
fi
# The host DB must already exist (API initialized it, or seed it first).
if [ ! -f "$DB_FILE" ]; then
echo "ERROR: DB file not found: $DB_FILE"
echo " Create it first: DB_PATH=$DB_FILE node server/seed.js"
exit 1
fi
# Extra docker-run args. On Linux, host.docker.internal must be mapped
# explicitly to the host gateway.
DOCKER_EXTRA=()
if [ "$(uname -s)" = "Linux" ]; then
DOCKER_EXTRA+=(--add-host=host.docker.internal:host-gateway)
fi
# Pass the API key through the quoted args array. Fix: the previous unquoted
# ${DASHSCOPE_API_KEY:+ -e ...} expansion was subject to word splitting and
# globbing on the key's value.
if [ -n "${DASHSCOPE_API_KEY:-}" ]; then
DOCKER_EXTRA+=(-e "DASHSCOPE_API_KEY=$DASHSCOPE_API_KEY")
fi
# Remove a stale container with the same name, if any.
docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
echo "==> Starting crawler container (standalone)"
echo " DB: $DB_FILE -> /data/data.db"
echo " API_BASE: $API_BASE"
echo " Image: $CRAWLER_IMAGE"
docker run -d \
--name "$CONTAINER_NAME" \
--restart unless-stopped \
-p 8000:8000 \
-v "$DB_FILE:/data/data.db" \
-e DB_PATH=/data/data.db \
-e API_BASE="$API_BASE" \
-e GDELT_DISABLED=1 \
-e RSS_INTERVAL_SEC=60 \
"${DOCKER_EXTRA[@]}" \
"$CRAWLER_IMAGE"
echo " Container: $CONTAINER_NAME"
echo " Logs: docker logs -f $CONTAINER_NAME"
echo " Status: curl -s http://localhost:8000/crawler/status | jq ."

17
scripts/run-crawler-range.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Run one crawler pass limited to a time range (RSS keeps only entries
# published after the given start time).
# Usage:
#   ./scripts/run-crawler-range.sh                      # default: from 2026-02-28 00:00 to now
#   ./scripts/run-crawler-range.sh 2026-02-25T00:00:00
#
# The GDELT time range must be set when starting the gdelt service, e.g.:
#   GDELT_TIMESPAN=3d npm run gdelt
set -e
START="${1:-2026-02-28T00:00:00}"
cd "$(dirname "$0")/.."
echo "RSS 抓取时间范围: 仅保留 ${START} 之后"
# Fix: the printed command now matches the interpreter actually used below
# (python3, not python).
echo "运行: cd crawler && CRAWL_START_DATE=${START} python3 run_once.py"
echo ""
export CRAWL_START_DATE="$START"
(cd crawler && python3 run_once.py)

126
scripts/test.sh Normal file
View File

@@ -0,0 +1,126 @@
# Manual data-patch scratch script: writes hand-picked combat_losses numbers
# into the SQLite DB via the sqlite3 CLI.
# NOTE(review): the four batches below update the same two rows (side='us' /
# side='iran'); for any column set in several batches only the LAST batch's
# value survives. Run sections individually if that is not intended.
# Batch 1: civilian casualty figures only.
sqlite3 server/data.db "
UPDATE combat_losses
SET civilian_killed = 380, civilian_wounded = 1520
WHERE side = 'us';
UPDATE combat_losses
SET civilian_killed = 4120, civilian_wounded = 12030
WHERE side = 'iran';
"
# Hard-coded production path — adjust on other machines.
cd /root/usa
# Batch 2: full loss matrix for both sides.
sqlite3 server/data.db "
UPDATE combat_losses
SET bases_destroyed = 15,
bases_damaged = 57,
personnel_killed = 327,
personnel_wounded = 984,
civilian_killed = 380,
civilian_wounded = 1520,
aircraft = 24,
warships = 1,
armor = 18,
vehicles = 42,
drones = 28,
missiles = 756,
helicopters = 8,
submarines = 2,
tanks = 0,
carriers = 0,
civilian_ships = 100,
airport_port = 5
WHERE side = 'us';
UPDATE combat_losses
SET bases_destroyed = 2100,
bases_damaged = 8400,
personnel_killed = 2847,
personnel_wounded = 5620,
civilian_killed = 4120,
civilian_wounded = 12030,
aircraft = 240,
warships = 120,
armor = 18,
vehicles = 420,
drones = 28,
missiles = 4560,
helicopters = 20,
submarines = 2,
tanks = 50,
carriers = 0,
civilian_ships = 50,
airport_port = 42
WHERE side = 'iran';
"
# Batch 3: partial refresh (bases, personnel, aircraft/warships/armor/vehicles,
# civilians); overrides the matching columns of batch 2.
sqlite3 server/data.db "
UPDATE combat_losses
SET bases_destroyed = 15,
bases_damaged = 57,
personnel_killed = 327,
personnel_wounded = 984,
aircraft = 4,
warships = 0,
armor = 3,
vehicles = 76,
civilian_killed = 380,
civilian_wounded = 1520
WHERE side = 'us';
UPDATE combat_losses
SET bases_destroyed = 2100,
bases_damaged = 8400,
personnel_killed = 2847,
personnel_wounded = 5620,
aircraft = 70,
warships = 120,
armor = 18,
vehicles = 420,
civilian_killed = 4120,
civilian_wounded = 12030
WHERE side = 'iran';
"
cd /root/usa
# Batch 4: final full loss matrix — these values win for every column they set.
sqlite3 server/data.db "
UPDATE combat_losses
SET bases_destroyed = 15,
bases_damaged = 57,
personnel_killed = 327,
personnel_wounded = 984,
civilian_killed = 380,
civilian_wounded = 1520,
aircraft = 4,
warships = 1,
armor = 18,
vehicles = 42,
drones = 68,
missiles = 1756,
helicopters = 8,
submarines = 0,
tanks = 0,
carriers = 0,
civilian_ships = 172,
airport_port = 7
WHERE side = 'us';
UPDATE combat_losses
SET bases_destroyed = 2100,
bases_damaged = 8400,
personnel_killed = 2847,
personnel_wounded = 5620,
civilian_killed = 4120,
civilian_wounded = 12030,
aircraft = 106,
warships = 107,
armor = 72,
vehicles = 506,
drones = 1428,
missiles = 6620,
helicopters = 20,
submarines = 4,
tanks = 50,
carriers = 1,
civilian_ships = 42,
airport_port = 31
WHERE side = 'iran';
"

81
scripts/verify-panels.cjs Normal file
View File

@@ -0,0 +1,81 @@
#!/usr/bin/env node
/**
 * Code-level dashboard verification: calls getSituation() and the DB directly
 * and prints the combat-loss / base / map-war-zone results.
 * Usage (from the project root): node scripts/verify-panels.cjs
 */
const path = require('path')
const projectRoot = path.resolve(__dirname, '..')
// Run from the project root so relative paths (e.g. the default DB location)
// resolve the same way as when the API server starts.
process.chdir(projectRoot)
const db = require('../server/db')
const { getSituation } = require('../server/situationData')
// Prints each panel's backing data in the exact shape the API serves.
function run() {
const s = getSituation()
console.log('========================================')
console.log('看板数据验证(与 API getSituation 一致)')
console.log('========================================\n')
console.log('lastUpdated:', s.lastUpdated)
console.log('')
// ---------- 1. Combat losses ----------
console.log('--- [1] 战损 combat_losses ---')
const us = s.usForces.combatLosses
const ir = s.iranForces.combatLosses
console.log('美军 阵亡:', us.personnelCasualties.killed, '受伤:', us.personnelCasualties.wounded)
console.log('美军 基地毁/损:', us.bases.destroyed, '/', us.bases.damaged)
console.log('美军 战机/舰艇/装甲/车辆:', us.aircraft, us.warships, us.armor, us.vehicles)
console.log('伊朗 阵亡:', ir.personnelCasualties.killed, '受伤:', ir.personnelCasualties.wounded)
console.log('伊朗 基地毁/损:', ir.bases.destroyed, '/', ir.bases.damaged)
console.log('平民合计 killed/wounded:', s.civilianCasualtiesTotal.killed, s.civilianCasualtiesTotal.wounded)
console.log('conflictStats:', JSON.stringify(s.conflictStats))
console.log('')
// ---------- 2. Bases (same filter as the dashboard: US counts only
// type === 'Base'; Iran counts Base/Port/Nuclear/Missile) ----------
console.log('--- [2] 基地 key_location ---')
const usLoc = s.usForces.keyLocations || []
const irLoc = s.iranForces.keyLocations || []
const usBases = usLoc.filter((l) => l.type === 'Base')
const irBases = irLoc.filter((l) => ['Base', 'Port', 'Nuclear', 'Missile'].includes(l.type))
const usAttacked = usBases.filter((l) => l.status === 'attacked')
const irAttacked = irBases.filter((l) => l.status === 'attacked')
console.log('美军 总基地数(仅Base):', usBases.length, '| 遭袭:', usAttacked.length, '(与看板「美军基地态势」一致)')
console.log('伊朗 总基地数(Base/Port/Nuclear/Missile):', irBases.length, '| 遭袭:', irAttacked.length, '(与看板「伊朗基地态势」一致)')
// NOTE(review): assumes keyLocation rows expose snake_case `damage_level`
// like the DB column — confirm against situationData's field mapping.
if (usAttacked.length > 0) {
console.log('美军遭袭示例:', usAttacked.slice(0, 3).map((l) => `${l.name}(${l.status},damage=${l.damage_level})`).join(', '))
}
if (irAttacked.length > 0) {
console.log('伊朗遭袭示例:', irAttacked.slice(0, 3).map((l) => `${l.name}(${l.status},damage=${l.damage_level})`).join(', '))
}
console.log('')
// ---------- 3. Map war zones ----------
console.log('--- [3] 地图战区 gdelt_events + conflict_stats ---')
const events = s.conflictEvents || []
console.log('conflictEvents 条数:', events.length)
console.log('conflictStats:', JSON.stringify(s.conflictStats))
if (events.length > 0) {
console.log('最近 3 条:', events.slice(0, 3).map((e) => `${e.event_time} ${(e.title || '').slice(0, 40)} impact=${e.impact_score}`))
}
console.log('')
// ---------- Extra: event timeline ----------
const updates = s.recentUpdates || []
console.log('--- [附] 事件脉络 situation_update ---')
console.log('recentUpdates 条数:', updates.length)
if (updates.length > 0) {
console.log('最新 1 条:', updates[0].timestamp, (updates[0].summary || '').slice(0, 50))
}
console.log('========================================')
}
// initDb() loads the SQLite file asynchronously (sql.js wrapper), so run()
// must wait for it; on failure print the message and exit non-zero.
db
.initDb()
.then(() => run())
.catch((err) => {
console.error('验证失败:', err.message)
process.exit(1)
})

124
scripts/verify-pipeline.sh Executable file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env bash
# Verify the full pipeline: crawler → database → API → frontend.
# Usage: ./scripts/verify-pipeline.sh [--start-crawler]
set -e
API_URL="${API_URL:-http://localhost:3001}"
CRAWLER_URL="${CRAWLER_URL:-http://localhost:8000}"
START_CRAWLER=false
[[ "${1:-}" = "--start-crawler" ]] && START_CRAWLER=true
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
echo "=========================================="
echo "US-Iran 态势面板 链路验证"
echo "API: $API_URL | Crawler: $CRAWLER_URL"
echo "=========================================="
echo ""
# Optional: start the crawler service first.
if $START_CRAWLER; then
echo "[0/6] 启动爬虫..."
if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
echo " ✓ 爬虫已在运行"
else
cd "$PROJECT_ROOT/crawler"
python3 -c "import uvicorn" 2>/dev/null || { echo " 需安装: pip install uvicorn"; exit 1; }
# NOTE(review): uvicorn is left running in the background after this script
# exits (no trap/kill) — presumably intentional so the crawler keeps feeding
# data; confirm.
uvicorn realtime_conflict_service:app --host 127.0.0.1 --port 8000 &
echo " 等待爬虫就绪..."
# Poll readiness for up to ~30s (15 × 2s).
for i in $(seq 1 15); do
sleep 2
if curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
echo " ✓ 爬虫已启动"
echo " 等待首次 RSS 抓取(约 70 秒)..."
sleep 70
break
fi
done
if ! curl -sf "$CRAWLER_URL/crawler/status" >/dev/null 2>&1; then
echo " ✗ 爬虫启动超时"
exit 1
fi
fi
echo ""
fi
# 1. API health check.
echo "[1/6] API 健康检查..."
if curl -sf "$API_URL/api/health" > /dev/null; then
echo " ✓ API 正常"
else
echo " ✗ API 无响应,请先运行: npm run api"
exit 1
fi
# 2. Situation payload readable and carries lastUpdated.
echo "[2/6] 态势数据..."
SIT=$(curl -sf "$API_URL/api/situation" 2>/dev/null || echo "{}")
if echo "$SIT" | grep -q "lastUpdated"; then
echo " ✓ 态势数据可读"
LAST=$(echo "$SIT" | grep -o '"lastUpdated":"[^"]*"' | head -1)
echo " $LAST"
else
echo " ✗ 态势数据异常"
exit 1
fi
# 3. Crawler status endpoint (optional component — warn, don't fail).
echo "[3/6] 爬虫状态..."
CRAWLER=$(curl -sf "$CRAWLER_URL/crawler/status" 2>/dev/null || echo "{}")
if echo "$CRAWLER" | grep -q "db_path\|db_exists"; then
echo " ✓ 爬虫服务可访问"
if command -v jq &>/dev/null; then
CNT=$(echo "$CRAWLER" | jq -r '.situation_update_count // "?"')
echo " situation_update 条数: $CNT"
fi
else
echo " ⚠ 爬虫未启动或不可达(可选,需单独运行爬虫)"
fi
# 4. News table via the API.
echo "[4/6] 资讯表 news_content..."
NEWS=$(curl -sf "$API_URL/api/news?limit=3" 2>/dev/null || echo '{"items":[]}')
if echo "$NEWS" | grep -q '"items"'; then
if command -v jq &>/dev/null; then
N=$(echo "$NEWS" | jq '.items | length')
echo " ✓ 最近 $N 条资讯"
else
echo " ✓ 资讯接口可读"
fi
else
echo " ⚠ news_content 可能为空(爬虫未跑或刚启动)"
fi
# 5. Combat-loss fields present in the situation payload.
echo "[5/6] 战损数据 combat_losses..."
if echo "$SIT" | grep -q "personnelCasualties"; then
echo " ✓ 战损字段存在"
if command -v jq &>/dev/null; then
US_K=$(echo "$SIT" | jq -r '.usForces.combatLosses.personnelCasualties.killed // "?"')
IR_K=$(echo "$SIT" | jq -r '.iranForces.combatLosses.personnelCasualties.killed // "?"')
echo " 美军阵亡: $US_K | 伊朗阵亡: $IR_K"
fi
else
echo " ✗ 战损结构异常"
fi
# 6. Notify endpoint (only verifies it is callable).
echo "[6/6] 通知接口 POST /api/crawler/notify..."
NOTIFY=$(curl -sf -X POST "$API_URL/api/crawler/notify" 2>/dev/null || echo "{}")
if echo "$NOTIFY" | grep -q '"ok"'; then
echo " ✓ 通知接口正常"
else
echo " ⚠ 通知接口可能异常"
fi
echo ""
echo "=========================================="
echo "验证完成。"
echo ""
echo "建议:"
echo " - 访问 $API_URL/db 查看各表数据"
echo " - 爬虫未启动时: ./scripts/verify-pipeline.sh --start-crawler"
echo " - 或手动启动: cd crawler && uvicorn realtime_conflict_service:app --port 8000"
echo "=========================================="

171
server/README.md Normal file
View File

@@ -0,0 +1,171 @@
# 后端运行逻辑
后端是 **Node.js Express + SQLite + WebSocket**,与 Python 爬虫共用同一数据库文件负责提供「态势数据」API、实时推送和简单统计。
---
## 一、启动方式
```bash
npm run api # 启动 server/index.js(默认端口 3001)
```
- 端口:`process.env.API_PORT || 3001`
- 数据库:`process.env.DB_PATH` 或 `server/data.db`(与爬虫共用)
---
## 二、整体架构
```
┌─────────────────────────────────────────┐
│ server/index.js │
│ (HTTP Server + WebSocket Server) │
└─────────────────────────────────────────┘
┌───────────────────────────────┼───────────────────────────────┐
│ │ │
▼ ▼ ▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ /api/* │ │ /ws │ │ 静态 dist │
│ routes.js │ │ WebSocket │ │ (生产) │
└──────┬──────┘ └──────┬──────┘ └─────────────┘
│ │
│ 读/写 │ 广播 situation + stats
▼ │
┌─────────────┐ │
│ db.js │◄─────────────────────┘
│ (SQLite) │ getSituation() / getStats()
└──────┬──────┘
│ 同文件 data.db
┌─────────────┐
│ Python 爬虫 │ 抓取 → 去重 → AI 清洗 → 映射到库字段 → 写表 → POST /api/crawler/notify
│ situation_ │ (main.py 或 gdelt 服务;写 situation_update / news_content / combat_losses 等)
│ update 等 │
└─────────────┘
```
---
## 三、核心模块
| 文件 | 作用 |
|------|------|
| **index.js** | 创建 HTTP + WebSocket 服务,挂载路由、静态资源、定时广播、爬虫通知回调 |
| **routes.js** | 所有 `/api/*` 接口situation、db/dashboard、visit、feedback、share、stats、events、news 等 |
| **situationData.js** | `getSituation()`从多张表聚合为前端所需的「态势」JSON军力、基地、战损、事件脉络、GDELT 等) |
| **db.js** | SQLite 连接、建表、迁移better-sqlite3WAL 模式) |
| **stats.js** | `getStats()`:在看人数、累计访问、留言数、分享数 |
| **openapi.js** | Swagger/OpenAPI 文档定义 |
| **seed.js** | 初始化/重置种子数据(可单独运行 `npm run api:seed` |
---
## 四、数据流(读)
1. **前端要「整页态势」**
- 请求 `GET /api/situation``routes.js` 调用 `getSituation()`
- `situationData.js` 从 db 读:`force_summary``power_index``force_asset``key_location``combat_losses``wall_street_trend``retaliation_*``situation_update`(最近 50 条)、`gdelt_events``conflict_stats`
- 组装成 `{ lastUpdated, usForces, iranForces, recentUpdates, conflictEvents, conflictStats, civilianCasualtiesTotal }` 返回。
2. **前端要「事件列表」**
- `GET /api/events` 返回 `conflictEvents` + `conflict_stats` + `updated_at`(同样来自 getSituation 的数据)。
3. **前端要「原始表数据」**
- `GET /api/db/dashboard` 返回多张表的 `SELECT *` 结果(含 `situation_update`),供 `/db` 调试页使用。
4. **WebSocket**
- 连接 `ws://host/ws` 时立即收到一条 `{ type: 'situation', data: getSituation(), stats: getStats() }`
- 之后服务端按 `BROADCAST_INTERVAL_MS`(默认 30 秒)轮询,数据有变化时推送同结构数据,前端可据此做实时刷新(详见「七、WebSocket 行为」)。
---
## 五、数据流(写)
### 5.1 爬虫侧写库链路(推荐理解顺序)
爬虫写入前端库的完整链路如下,**不是**「抓完直接写表」而是经过去重、AI 清洗、字段映射后再落库:
1. **爬虫抓取实时数据**
- RSS 等源抓取(`scrapers/rss_scraper.fetch_all`),得到原始条目列表。
2. **数据去重**
- 抓取阶段RSS 内按 (title, url) 去重。
- 落库前:按 `content_hash(title, summary, url)``news_content` 表中去重,仅**未出现过**的条目进入后续流程(`news_storage.save_and_dedup`)。
3. **去重后按批次推送给 AI 清洗**
- 对通过去重的每条/每批数据:
- **展示用清洗**:标题/摘要翻译、`clean_news_for_panel` 提炼为符合面板的纯文本与长度(如 summary ≤120 字),`ensure_category` / `ensure_severity` 规范为前端枚举(`cleaner_ai`)。
- **结构化提取**(可选):`extractor_ai` / `extractor_dashscope` / `extractor_rules` 从新闻文本中抽取战损、基地状态等,输出符合 `panel_schema` 的结构。
- 得到「有效数据」:既有人读的 summary/category/severity也有可落库的 combat_losses_delta、key_location 等。
4. **有效数据映射回前端数据库字段**
- 事件脉络:清洗后的条目写入 `situation_update``db_writer.write_updates`)。
- 资讯存档:去重后的新数据写入 `news_content`(已在步骤 2 完成)。
- 结构化数据AI 提取结果通过 `db_merge.merge` 映射到前端表结构,更新 `combat_losses``key_location``retaliation_*``wall_street_trend` 等(与 `situationData.getSituation` 所用字段一致)。
5. **更新数据库表并通知后端**
- 上述表更新完成后,爬虫请求 **POST /api/crawler/notify**
- 后端index.js更新 `situation.updated_at` 并调用 `broadcastSituation()`,前端通过 WebSocket 拿到最新态势。
实现上,**gdelt 服务**`realtime_conflict_service`)里:先对抓取结果做翻译与清洗,再 `save_and_dedup` 去重落库 `news_content`,用去重后的新项写 `situation_update`,再按批次对这批新项做 AI 提取并 `db_merge.merge` 写战损/基地等表。
### 5.2 用户行为写入
- **POST /api/visit**:记 IP 到 `visits``visitor_count.total` +1并触发一次广播。
- **POST /api/feedback**:插入 `feedback`
- **POST /api/share**`share_count.total` +1。
这些写操作在 `routes.js` 中通过 `db.prepare().run()` 完成。
---
## 六、API 一览
| 方法 | 路径 | 说明 |
|------|------|------|
| GET | /api/health | 健康检查 |
| GET | /api/situation | 完整态势(供主面板) |
| GET | /api/events | 冲突事件 + 统计 |
| GET | /api/db/dashboard | 各表原始数据(供 /db 页) |
| GET | /api/news | 资讯列表news_content 表) |
| GET | /api/stats | 在看/累计/留言/分享数 |
| POST | /api/visit | 记录访问并返回 stats |
| POST | /api/feedback | 提交留言 |
| POST | /api/share | 分享计数 +1 |
| POST | /api/crawler/notify | 爬虫通知:更新 updated_at 并广播(内部用) |
- **Swagger**`http://localhost:3001/api-docs`
---
## 七、WebSocket 行为
- **路径**`/ws`(与 HTTP 同端口)。
- **连接时**:服务端发送一条 `{ type: 'situation', data, stats }`
- **定时广播**:按 `BROADCAST_INTERVAL_MS`(默认 30 秒)轮询;**仅当数据有变化**(以 `situation.updated_at` + `situation_update` 条数为版本)时才执行 `getSituation()` + `getStats()` 并推送,避免无变更时重复查库和推送、降低负载。
- **即时广播**:以下情况会立即推送一次(不等待定时间隔):爬虫 POST `/api/crawler/notify`、修订页保存PUT/PATCH/POST/DELETE `/api/edit/*`)。
- **环境变量**`BROADCAST_INTERVAL_MS=0` 可关闭定时轮询,仅依赖即时广播;设为 `3000` 可恢复为每 3 秒检查一次(仍仅在数据变化时推送)。
---
## 八、与爬虫的协作
- **共享 DB**:后端与爬虫都使用同一 `DB_PATH`(默认 `server/data.db`)。
- **爬虫写库链路**:爬虫抓取 → 去重 → AI 清洗出有效数据 → 映射到前端库字段 → 更新 `situation_update``news_content``combat_losses``key_location``gdelt_events` 等表 → 调用 POST `/api/crawler/notify` 通知后端。
- **后端角色**:只读这些表(`getSituation()` 等)并推送;不参与抓取、去重或 AI 清洗,不调度爬虫。
整体上,后端是「读库 + 聚合 + 推送」的服务;写库来自**爬虫(经过去重与 AI 清洗、字段映射后)**以及**用户行为**(访问/留言/分享)。
---
## 九、本地验证链路
1. **启动后端**`npm run api`(默认 3001
2. **检查读库**`curl -s http://localhost:3001/api/situation` 应返回含 `lastUpdated``recentUpdates` 的 JSON。
3. **检查写库与通知**:爬虫跑完流水线后会 POST `/api/crawler/notify`,后端会更新 `situation.updated_at` 并广播;可再请求 `/api/situation``lastUpdated` 是否更新。
4. **查原始表**:浏览器打开 `http://localhost:3001/api/db/dashboard` 或前端 `/db` 页,查看 `situation_update``news_content` 等表。
爬虫侧完整验证步骤见 **crawler/README.md** 的「本地验证链路」;项目根目录可执行 `./scripts/verify-pipeline.sh` 做一键检查。

View File

@@ -1,20 +1,69 @@
const Database = require('better-sqlite3')
/**
* SQLite 封装:使用 sql.js纯 JS/WebAssembly无需 node-gyp
* 对外接口与 better-sqlite3 兼容db.prepare().get/all/run、db.exec
*/
const path = require('path')
const fs = require('fs')
const dbPath = path.join(__dirname, 'data.db')
const db = new Database(dbPath)
const dbPath = process.env.DB_PATH || path.join(__dirname, 'data.db')
let _db = null
/** sql.js 构造函数initDb 时注入,供 reloadFromFile 使用 */
let _sqlJs = null
// 启用外键
db.pragma('journal_mode = WAL')
/** Return the initialized DB wrapper, or throw if initDb() has not run yet. */
function getDb() {
  if (!_db) {
    throw new Error('DB not initialized. Call initDb() first.')
  }
  return _db
}
// 建表
db.exec(`
/**
 * Wrap a raw sql.js Database in a minimal better-sqlite3-compatible facade.
 *
 * Supported surface: db.prepare(sql).get()/.all()/.run(), db.exec(sql),
 * db.pragma(str). A fresh statement is prepared per call and freed in a
 * `finally` block, so sql.js WASM statement memory is released even when a
 * query throws (the previous version leaked the statement on error).
 *
 * @param {object} nativeDb - sql.js Database instance.
 * @param {Function} persist - callback invoked after every write (run/exec)
 *   so the in-memory DB can be flushed back to the on-disk file.
 * @returns {object} better-sqlite3-like handle.
 */
function wrapDatabase(nativeDb, persist) {
  // sql.js expects null (not an empty array) when there are no parameters.
  const bindArgs = (stmt, args) => stmt.bind(args.length ? args : null)
  return {
    prepare(sql) {
      return {
        get(...args) {
          const stmt = nativeDb.prepare(sql)
          try {
            bindArgs(stmt, args)
            return stmt.step() ? stmt.getAsObject() : undefined
          } finally {
            stmt.free()
          }
        },
        all(...args) {
          const stmt = nativeDb.prepare(sql)
          try {
            bindArgs(stmt, args)
            const rows = []
            while (stmt.step()) rows.push(stmt.getAsObject())
            return rows
          } finally {
            stmt.free()
          }
        },
        run(...args) {
          const stmt = nativeDb.prepare(sql)
          try {
            bindArgs(stmt, args)
            while (stmt.step());
          } finally {
            stmt.free()
          }
          persist()
          // better-sqlite3 compat: report how many rows the statement changed.
          const changes =
            typeof nativeDb.getRowsModified === 'function' ? nativeDb.getRowsModified() : 0
          return { changes }
        },
      }
    },
    exec(sql) {
      // NOTE: naive split — breaks on ';' inside string literals or trigger
      // bodies; adequate for the simple DDL/DML this project feeds it.
      const statements = sql.split(';').map((s) => s.trim()).filter(Boolean)
      statements.forEach((s) => nativeDb.run(s))
      persist()
    },
    pragma(str) {
      // Fire-and-forget; unlike better-sqlite3 this returns no result rows.
      nativeDb.run('PRAGMA ' + str)
    },
  }
}
function runMigrations(db) {
const exec = (sql) => db.exec(sql)
const prepare = (sql) => db.prepare(sql)
exec(`
CREATE TABLE IF NOT EXISTS situation (
id INTEGER PRIMARY KEY CHECK (id = 1),
data TEXT NOT NULL,
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS force_summary (
side TEXT PRIMARY KEY CHECK (side IN ('us', 'iran')),
total_assets INTEGER NOT NULL,
@@ -26,7 +75,6 @@ db.exec(`
missile_consumed INTEGER NOT NULL,
missile_stock INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS power_index (
side TEXT PRIMARY KEY CHECK (side IN ('us', 'iran')),
overall INTEGER NOT NULL,
@@ -34,7 +82,6 @@ db.exec(`
economic_power INTEGER NOT NULL,
geopolitical_influence INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS force_asset (
id TEXT PRIMARY KEY,
side TEXT NOT NULL CHECK (side IN ('us', 'iran')),
@@ -45,7 +92,6 @@ db.exec(`
lat REAL,
lng REAL
);
CREATE TABLE IF NOT EXISTS key_location (
id INTEGER PRIMARY KEY AUTOINCREMENT,
side TEXT NOT NULL CHECK (side IN ('us', 'iran')),
@@ -55,7 +101,6 @@ db.exec(`
type TEXT,
region TEXT
);
CREATE TABLE IF NOT EXISTS combat_losses (
side TEXT PRIMARY KEY CHECK (side IN ('us', 'iran')),
bases_destroyed INTEGER NOT NULL,
@@ -67,24 +112,20 @@ db.exec(`
armor INTEGER NOT NULL,
vehicles INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS wall_street_trend (
id INTEGER PRIMARY KEY AUTOINCREMENT,
time TEXT NOT NULL,
value INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS retaliation_current (
id INTEGER PRIMARY KEY CHECK (id = 1),
value INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS retaliation_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
time TEXT NOT NULL,
value INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS situation_update (
id TEXT PRIMARY KEY,
timestamp TEXT NOT NULL,
@@ -92,16 +133,248 @@ db.exec(`
summary TEXT NOT NULL,
severity TEXT NOT NULL
);
`)
CREATE TABLE IF NOT EXISTS gdelt_events (
event_id TEXT PRIMARY KEY,
event_time TEXT NOT NULL,
title TEXT NOT NULL,
lat REAL NOT NULL,
lng REAL NOT NULL,
impact_score INTEGER NOT NULL,
url TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS conflict_stats (
id INTEGER PRIMARY KEY CHECK (id = 1),
total_events INTEGER NOT NULL DEFAULT 0,
high_impact_events INTEGER NOT NULL DEFAULT 0,
estimated_casualties INTEGER NOT NULL DEFAULT 0,
estimated_strike_count INTEGER NOT NULL DEFAULT 0,
updated_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS news_content (
id TEXT PRIMARY KEY,
content_hash TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
summary TEXT NOT NULL,
url TEXT NOT NULL DEFAULT '',
source TEXT NOT NULL DEFAULT '',
published_at TEXT NOT NULL,
category TEXT NOT NULL DEFAULT 'other',
severity TEXT NOT NULL DEFAULT 'medium',
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`)
try { exec('CREATE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)') } catch (_) {}
try { exec('CREATE INDEX IF NOT EXISTS idx_news_content_published ON news_content(published_at DESC)') } catch (_) {}
// 迁移:为已有 key_location 表添加 type、region、status、damage_level 列
try {
const cols = db.prepare('PRAGMA table_info(key_location)').all()
const names = cols.map((c) => c.name)
if (!names.includes('type')) db.exec('ALTER TABLE key_location ADD COLUMN type TEXT')
if (!names.includes('region')) db.exec('ALTER TABLE key_location ADD COLUMN region TEXT')
if (!names.includes('status')) db.exec('ALTER TABLE key_location ADD COLUMN status TEXT DEFAULT "operational"')
if (!names.includes('damage_level')) db.exec('ALTER TABLE key_location ADD COLUMN damage_level INTEGER')
} catch (_) {}
// Backfill newer key_location columns on database files created before the
// columns existed; a fresh table already has them, so each ALTER is gated
// on PRAGMA table_info. Errors (e.g. missing table) are deliberately ignored.
try {
  const existing = new Set(prepare('PRAGMA table_info(key_location)').all().map((c) => c.name))
  const pending = [
    ['type', 'ALTER TABLE key_location ADD COLUMN type TEXT'],
    ['region', 'ALTER TABLE key_location ADD COLUMN region TEXT'],
    ['status', 'ALTER TABLE key_location ADD COLUMN status TEXT DEFAULT "operational"'],
    ['damage_level', 'ALTER TABLE key_location ADD COLUMN damage_level INTEGER'],
    ['attacked_at', 'ALTER TABLE key_location ADD COLUMN attacked_at TEXT'],
  ]
  for (const [column, ddl] of pending) {
    if (!existing.has(column)) exec(ddl)
  }
} catch (_) {}
// Backfill combat_losses columns that were added over time. Columns are
// applied in their historical order so the resulting column layout matches
// databases migrated incrementally. `carriers` additionally seeds its
// initial values from the pre-existing `tanks` column (it was split out of
// that counter). Errors are deliberately swallowed.
try {
  const have = new Set(prepare('PRAGMA table_info(combat_losses)').all().map((c) => c.name))
  const columns = [
    ['civilian_killed', 'ALTER TABLE combat_losses ADD COLUMN civilian_killed INTEGER NOT NULL DEFAULT 0'],
    ['civilian_wounded', 'ALTER TABLE combat_losses ADD COLUMN civilian_wounded INTEGER NOT NULL DEFAULT 0'],
    ['updated_at', 'ALTER TABLE combat_losses ADD COLUMN updated_at TEXT DEFAULT (datetime("now"))'],
    ['drones', 'ALTER TABLE combat_losses ADD COLUMN drones INTEGER NOT NULL DEFAULT 0'],
    ['missiles', 'ALTER TABLE combat_losses ADD COLUMN missiles INTEGER NOT NULL DEFAULT 0'],
    ['helicopters', 'ALTER TABLE combat_losses ADD COLUMN helicopters INTEGER NOT NULL DEFAULT 0'],
    ['submarines', 'ALTER TABLE combat_losses ADD COLUMN submarines INTEGER NOT NULL DEFAULT 0'],
    ['tanks', 'ALTER TABLE combat_losses ADD COLUMN tanks INTEGER NOT NULL DEFAULT 0'],
    ['carriers', 'ALTER TABLE combat_losses ADD COLUMN carriers INTEGER NOT NULL DEFAULT 0'],
    ['civilian_ships', 'ALTER TABLE combat_losses ADD COLUMN civilian_ships INTEGER NOT NULL DEFAULT 0'],
    ['airport_port', 'ALTER TABLE combat_losses ADD COLUMN airport_port INTEGER NOT NULL DEFAULT 0'],
  ]
  for (const [column, ddl] of columns) {
    if (have.has(column)) continue
    exec(ddl)
    // carriers was split out of the tanks counter; copy the old value once.
    if (column === 'carriers') exec('UPDATE combat_losses SET carriers = tanks')
  }
} catch (_) {}
module.exports = db
// Guarantee an `updated_at` column on each table listed below. No-op when
// the column already exists; errors (e.g. missing table) are ignored.
const ensureUpdatedAt = (table) => {
  try {
    const hasColumn = prepare(`PRAGMA table_info(${table})`)
      .all()
      .some((c) => c.name === 'updated_at')
    if (!hasColumn) {
      exec(`ALTER TABLE ${table} ADD COLUMN updated_at TEXT DEFAULT (datetime("now"))`)
    }
  } catch (_) {}
}
for (const table of ['force_summary', 'power_index', 'force_asset', 'key_location', 'retaliation_current']) {
  ensureUpdatedAt(table)
}
// ---- Auxiliary tables: visitor tracking, feedback, share/like counters,
// ---- display overrides, map strike data, animation / war-map config. ----
// Each DDL batch runs independently so one failure cannot block the others.
const safeExec = (sql) => {
  try { exec(sql) } catch (_) {}
}
// Add `column` to `table` when missing; errors are ignored on purpose
// (the table itself may not exist yet in older database files).
const ensureColumn = (table, column, ddl) => {
  try {
    const cols = prepare(`PRAGMA table_info(${table})`).all()
    if (!cols.some((c) => c.name === column)) exec(ddl)
  } catch (_) {}
}
safeExec(`
  CREATE TABLE IF NOT EXISTS visits (
    ip TEXT PRIMARY KEY,
    last_seen TEXT NOT NULL DEFAULT (datetime('now'))
  );
  CREATE TABLE IF NOT EXISTS visitor_count (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    total INTEGER NOT NULL DEFAULT 0
  );
  INSERT OR IGNORE INTO visitor_count (id, total) VALUES (1, 0);
`)
safeExec(`
  CREATE TABLE IF NOT EXISTS feedback (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    content TEXT NOT NULL,
    ip TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
  )
`)
safeExec(`
  CREATE TABLE IF NOT EXISTS share_count (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    total INTEGER NOT NULL DEFAULT 0
  );
  INSERT OR IGNORE INTO share_count (id, total) VALUES (1, 0);
`)
safeExec(`
  CREATE TABLE IF NOT EXISTS like_count (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    total INTEGER NOT NULL DEFAULT 0
  );
  INSERT OR IGNORE INTO like_count (id, total) VALUES (1, 0);
`)
safeExec(`
  CREATE TABLE IF NOT EXISTS display_stats (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    viewers INTEGER NULL,
    cumulative INTEGER NULL,
    share_count INTEGER NULL,
    like_count INTEGER NULL,
    feedback_count INTEGER NULL
  );
  INSERT OR IGNORE INTO display_stats (id) VALUES (1);
`)
ensureColumn(
  'display_stats',
  'override_enabled',
  'ALTER TABLE display_stats ADD COLUMN override_enabled INTEGER NOT NULL DEFAULT 0'
)
safeExec(`
  CREATE TABLE IF NOT EXISTS map_strike_source (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    lng REAL NOT NULL,
    lat REAL NOT NULL
  );
  CREATE TABLE IF NOT EXISTS map_strike_line (
    source_id TEXT NOT NULL,
    target_lng REAL NOT NULL,
    target_lat REAL NOT NULL,
    target_name TEXT,
    struck_at TEXT,
    FOREIGN KEY (source_id) REFERENCES map_strike_source(id)
  );
  CREATE INDEX IF NOT EXISTS idx_map_strike_line_source ON map_strike_line(source_id);
`)
ensureColumn('map_strike_line', 'struck_at', 'ALTER TABLE map_strike_line ADD COLUMN struck_at TEXT')
safeExec(`
  CREATE TABLE IF NOT EXISTS animation_config (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    strike_cutoff_days INTEGER NOT NULL DEFAULT 5,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
  INSERT OR IGNORE INTO animation_config (id, strike_cutoff_days) VALUES (1, 5);
`)
safeExec(`
  CREATE TABLE IF NOT EXISTS war_map_config (
    id INTEGER PRIMARY KEY CHECK (id = 1),
    config TEXT NOT NULL,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`)
}
/**
 * Initialize the sql.js database: load the on-disk file when present
 * (otherwise start empty), wrap the native handle, run migrations, and
 * install the result as the module-wide `_db`.
 *
 * Now idempotent: a repeated call returns the already-initialized handle
 * instead of re-reading the file and clobbering in-memory state that has
 * not been persisted yet.
 *
 * @returns {Promise<object>} the wrapped database handle
 */
async function initDb() {
  if (_db) return _db // guard: never discard an already-initialized DB
  const initSqlJs = require('sql.js')
  const SQL = await initSqlJs()
  _sqlJs = SQL
  // Seed from the persisted file when it exists, otherwise an empty DB.
  let data = new Uint8Array(0)
  if (fs.existsSync(dbPath)) {
    data = new Uint8Array(fs.readFileSync(dbPath))
  }
  const nativeDb = new SQL.Database(data)
  // Flush the in-memory DB back to disk; best-effort (logged, never thrown).
  function persist() {
    try {
      const buf = nativeDb.export()
      fs.writeFileSync(dbPath, Buffer.from(buf))
    } catch (e) {
      console.error('[db] persist error:', e.message)
    }
  }
  // NOTE(review): sql.js runs fully in memory, so this PRAGMA is presumably
  // a no-op kept for parity with better-sqlite3 — confirm before removing.
  nativeDb.run('PRAGMA journal_mode = WAL')
  const wrapped = wrapDatabase(nativeDb, persist)
  runMigrations(wrapped)
  _db = wrapped
  return _db
}
/**
 * 从磁盘重新加载 DB(爬虫进程写入同一文件后调用,使 Node 内存中的数据库与文件内容保持一致)
 */
// Re-read the database file from disk and swap the module-wide handle so the
// in-memory DB reflects writes made to the same file by another process.
// Throws when initDb() has not completed yet.
function reloadFromFile() {
  if (!_sqlJs || !_db) throw new Error('DB not initialized. Call initDb() first.')
  const bytes = fs.existsSync(dbPath)
    ? new Uint8Array(fs.readFileSync(dbPath))
    : new Uint8Array(0)
  const nativeDb = new _sqlJs.Database(bytes)
  // Best-effort flush of the in-memory DB back to the file (logged on error).
  const persist = () => {
    try {
      fs.writeFileSync(dbPath, Buffer.from(nativeDb.export()))
    } catch (e) {
      console.error('[db] persist error:', e.message)
    }
  }
  nativeDb.run('PRAGMA journal_mode = WAL')
  const reloaded = wrapDatabase(nativeDb, persist)
  runMigrations(reloaded)
  _db = reloaded
}
/**
 * Lazy proxy over the active database handle, so callers can `require` this
 * module before `initDb()` has resolved; every call resolves the handle via
 * `getDb()` at invocation time.
 */
const proxy = {
  /** Prepare a statement on the current database. */
  prepare(sql) {
    return getDb().prepare(sql)
  },
  /** Execute raw SQL on the current database. */
  exec(sql) {
    return getDb().exec(sql)
  },
  /**
   * Run a PRAGMA on the current database.
   * Fix: forward the underlying return value — it was silently dropped,
   * inconsistent with `prepare`/`exec`, so callers could never read
   * PRAGMA results through the proxy.
   */
  pragma(str) {
    return getDb().pragma(str)
  },
}
// Public surface: the lazy DB proxy plus the lifecycle helpers.
Object.assign(proxy, { initDb, getDb, reloadFromFile })
module.exports = proxy

// (diff-viewer artifact, not source code) Some files were not shown because too many files have changed in this diff Show More