fix: optimize crawler configuration, run the crawler in a standalone Docker container

Daniel
2026-03-05 20:19:24 +08:00
parent bbb9a5e1e1
commit 07454b73c2
9 changed files with 180 additions and 17 deletions

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Production one-shot: build the crawler image, start it in "crawler only in Docker, API on the host" mode, and print data-alignment notes.
# Prerequisites: the API is already running on the host on port 3001 (e.g. via PM2), and server/data.db exists (or run npm run api:seed first).
set -e
cd "$(dirname "$0")/.."
PROJECT_ROOT="${PROJECT_ROOT:-$(pwd)}"
REGISTRY="${REGISTRY:-}"
echo "==> Building crawler image..."
docker build -t usa-dashboard-crawler:latest \
  ${REGISTRY:+--build-arg REGISTRY="$REGISTRY"} \
  -f Dockerfile.crawler .
echo ""
./scripts/run-crawler-docker-standalone.sh
echo ""
echo "==> Data alignment (生产数据对齐)"
echo " API (host) DB_PATH = $PROJECT_ROOT/server/data.db (或 env DB_PATH)"
echo " Crawler /data/data.db = 挂载自上述同一文件"
echo " 二者必须指向同一 SQLite 文件,前端/API 与爬虫才能数据一致。"

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
# Production: use this when only the crawler runs in Docker and the API stays on the host (e.g. under PM2).
# Ensures the crawler and the API use the same SQLite file (data alignment).
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd)}"
DB_FILE="${DB_FILE:-$PROJECT_ROOT/server/data.db}"
API_BASE="${API_BASE:-http://host.docker.internal:3001}"
CRAWLER_IMAGE="${CRAWLER_IMAGE:-usa-dashboard-crawler:latest}"
CONTAINER_NAME="${CONTAINER_NAME:-usa-crawler}"
# Optional: load DASHSCOPE_API_KEY and other variables from .env
if [ -f "$PROJECT_ROOT/.env" ]; then
set -a
# shellcheck source=../.env
. "$PROJECT_ROOT/.env"
set +a
fi
# The host DB must already exist (initialized by the API, or seeded first)
if [ ! -f "$DB_FILE" ]; then
echo "ERROR: DB file not found: $DB_FILE"
echo " Create it first: DB_PATH=$DB_FILE node server/seed.js"
exit 1
fi
# On Linux, Docker does not provide host.docker.internal by default; add it explicitly
DOCKER_EXTRA=()
if [ "$(uname -s)" = "Linux" ]; then
DOCKER_EXTRA+=(--add-host=host.docker.internal:host-gateway)
fi
# Remove any existing container with the same name first
docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
echo "==> Starting crawler container (standalone)"
echo " DB: $DB_FILE -> /data/data.db"
echo " API_BASE: $API_BASE"
echo " Image: $CRAWLER_IMAGE"
docker run -d \
  --name "$CONTAINER_NAME" \
  --restart unless-stopped \
  -p 8000:8000 \
  -v "$DB_FILE:/data/data.db" \
  -e DB_PATH=/data/data.db \
  -e API_BASE="$API_BASE" \
  -e GDELT_DISABLED=1 \
  -e RSS_INTERVAL_SEC=60 \
  ${DASHSCOPE_API_KEY:+ -e DASHSCOPE_API_KEY="$DASHSCOPE_API_KEY"} \
  "${DOCKER_EXTRA[@]}" \
  "$CRAWLER_IMAGE"
echo " Container: $CONTAINER_NAME"
echo " Logs: docker logs -f $CONTAINER_NAME"
echo " Status: curl -s http://localhost:8000/crawler/status | jq ."