fix:优化wsl环境下的边缘设备执行逻辑

This commit is contained in:
Daniel
2026-04-07 02:06:43 +08:00
parent afec0980d4
commit 6b9b9d60b5
7 changed files with 439 additions and 4 deletions

View File

@@ -6,6 +6,8 @@ WORKER_BASE_URL=http://127.0.0.1:8000
WS_POLL_INTERVAL_SEC=1.0 WS_POLL_INTERVAL_SEC=1.0
DISPATCH_WS_URL=ws://127.0.0.1:8060/ws/edge/edge-a4000-01 DISPATCH_WS_URL=ws://127.0.0.1:8060/ws/edge/edge-a4000-01
EDGE_POLL_INTERVAL_SEC=1.0 EDGE_POLL_INTERVAL_SEC=1.0
EDGE_ALLOW_REMOTE_UPDATE=true
EDGE_ALLOW_REMOTE_RESTART=true
EDGE_DISPATCH_HOST=0.0.0.0 EDGE_DISPATCH_HOST=0.0.0.0
EDGE_DISPATCH_PORT=8020 EDGE_DISPATCH_PORT=8020
EDGE_MAX_DISPATCH_RECORDS=2000 EDGE_MAX_DISPATCH_RECORDS=2000

View File

@@ -76,6 +76,34 @@ cd video_worker
bash scripts/start_edge_device_local.sh bash scripts/start_edge_device_local.sh
``` ```
Windows + WSL 一键边缘设备启动(推荐):
```powershell
cd video_worker
.\scripts\edge_device_wsl.ps1 -Action start
```
常用操作:
```powershell
.\scripts\edge_device_wsl.ps1 -Action status
.\scripts\edge_device_wsl.ps1 -Action restart
.\scripts\edge_device_wsl.ps1 -Action stop
```
多发行版时可指定:
```powershell
.\scripts\edge_device_wsl.ps1 -Action start -Distro Ubuntu-22.04
```
边缘设备停止 / 重启:
```bash
bash scripts/stop_edge_device_local.sh
bash scripts/restart_edge_device_local.sh
```
独立 WS 网关服务(远程推荐): 独立 WS 网关服务(远程推荐):
```bash ```bash
@@ -223,6 +251,46 @@ video_worker/
- 查看在线边缘设备 - 查看在线边缘设备
- `WS /ws/edge/{device_id}`edge_dispatch_service - `WS /ws/edge/{device_id}`edge_dispatch_service
- 边缘设备接入通道 - 边缘设备接入通道
- `POST /devices/{device_id}/command`edge_dispatch_service
- 通过 HTTP 下发设备运维指令(中心自动转 WS
- `GET /commands/{dispatch_id}`edge_dispatch_service
- 查询设备指令执行状态和结果
边缘设备 WS 控制指令(由上游下发到 `edge_device_client.py`
- `generate`: 下发生成任务
- `update_code`: 拉取最新代码(默认执行 `git fetch --all && git checkout <branch> && git pull --ff-only origin <branch>`,可通过 `command` 自定义)
- `restart_service`: 执行边缘本地重启脚本
- `ping`: 心跳探活,设备回 `pong`
HTTP 下发设备指令示例(推荐上游系统使用):
```bash
# 1) 更新边缘设备代码
curl -X POST http://<dispatch-host>:8020/devices/edge-a4000-01/command \
-H "Content-Type: application/json" \
-d '{
"command": "update_code",
"branch": "master"
}'
# 2) 重启边缘设备服务
curl -X POST http://<dispatch-host>:8020/devices/edge-a4000-01/command \
-H "Content-Type: application/json" \
-d '{
"command": "restart_service"
}'
# 3) 设备心跳检查
curl -X POST http://<dispatch-host>:8020/devices/edge-a4000-01/command \
-H "Content-Type: application/json" \
-d '{
"command": "ping"
}'
# 4) 查询指令执行状态(使用上一步返回的 dispatch_id
curl http://<dispatch-host>:8020/commands/<dispatch_id>
```
参数限制: 参数限制:

View File

@@ -29,6 +29,22 @@ class DispatchResponse(BaseModel):
created_at: str created_at: str
class DeviceCommandRequest(BaseModel):
command: str
dispatch_id: Optional[str] = None
branch: Optional[str] = None
shell_command: Optional[str] = None
payload: Optional[dict[str, Any]] = None
class DeviceCommandResponse(BaseModel):
dispatch_id: str
device_id: str
command: str
status: str
created_at: str
@dataclass @dataclass
class EdgeConnection: class EdgeConnection:
device_id: str device_id: str
@@ -42,6 +58,7 @@ class EdgeDispatchManager:
def __init__(self) -> None: def __init__(self) -> None:
self.connections: dict[str, EdgeConnection] = {} self.connections: dict[str, EdgeConnection] = {}
self.dispatches: dict[str, dict[str, Any]] = {} self.dispatches: dict[str, dict[str, Any]] = {}
self.commands: dict[str, dict[str, Any]] = {}
self.lock = asyncio.Lock() self.lock = asyncio.Lock()
async def register(self, device_id: str, websocket: WebSocket) -> EdgeConnection: async def register(self, device_id: str, websocket: WebSocket) -> EdgeConnection:
@@ -130,17 +147,68 @@ class EdgeDispatchManager:
raise raise
return record return record
async def create_command(self, conn: EdgeConnection, body: DeviceCommandRequest) -> dict[str, Any]:
await self._prune_if_needed()
command = body.command.strip().lower()
allowed = {"update_code", "restart_service", "ping"}
if command not in allowed:
raise HTTPException(status_code=400, detail=f"unsupported command: {command}")
dispatch_id = body.dispatch_id or uuid4().hex
now = utc_now_iso()
record = {
"dispatch_id": dispatch_id,
"device_id": conn.device_id,
"command": command,
"status": "DISPATCHED",
"created_at": now,
"updated_at": now,
"request": body.model_dump(),
"result": None,
"error": None,
}
payload: dict[str, Any] = {
"event": command,
"dispatch_id": dispatch_id,
}
if body.branch:
payload["branch"] = body.branch
if body.shell_command:
payload["command"] = body.shell_command
if body.payload:
payload.update(body.payload)
async with self.lock:
self.commands[dispatch_id] = record
target = self.connections.get(conn.device_id)
if target is None:
raise HTTPException(status_code=404, detail=f"device disconnected: {conn.device_id}")
target.last_seen = now
try:
await conn.websocket.send_json(payload)
except Exception as exc:
async with self.lock:
record["status"] = "FAILED"
record["error"] = f"command send failed: {exc}"
record["updated_at"] = utc_now_iso()
raise
return record
async def _prune_if_needed(self) -> None: async def _prune_if_needed(self) -> None:
max_records = max(100, int(settings.edge_max_dispatch_records)) max_records = max(100, int(settings.edge_max_dispatch_records))
async with self.lock: async with self.lock:
total = len(self.dispatches) total = len(self.dispatches) + len(self.commands)
if total < max_records: if total < max_records:
return return
over = total - max_records + 1 over = total - max_records + 1
done = [v for v in self.dispatches.values() if v.get("status") in {"SUCCEEDED", "FAILED"}] done = [v for v in self.dispatches.values() if v.get("status") in {"SUCCEEDED", "FAILED"}]
done.extend([v for v in self.commands.values() if v.get("status") in {"SUCCEEDED", "FAILED"}])
done.sort(key=lambda x: x.get("updated_at", "")) done.sort(key=lambda x: x.get("updated_at", ""))
for rec in done[:over]: for rec in done[:over]:
self.dispatches.pop(rec["dispatch_id"], None) self.dispatches.pop(rec["dispatch_id"], None)
self.commands.pop(rec["dispatch_id"], None)
async def mark_event(self, device_id: str, event: dict[str, Any]) -> None: async def mark_event(self, device_id: str, event: dict[str, Any]) -> None:
now = utc_now_iso() now = utc_now_iso()
@@ -154,6 +222,8 @@ class EdgeDispatchManager:
return return
record = self.dispatches.get(dispatch_id) record = self.dispatches.get(dispatch_id)
if record is None:
record = self.commands.get(dispatch_id)
if record is None: if record is None:
return return
@@ -175,6 +245,18 @@ class EdgeDispatchManager:
record["error"] = event.get("error") record["error"] = event.get("error")
if conn: if conn:
conn.busy = False conn.busy = False
elif evt == "command_status":
status_val = event.get("status") or "RUNNING"
record["status"] = status_val
elif evt == "command_result":
status_val = event.get("status") or "SUCCEEDED"
record["status"] = status_val
record["result"] = event
if status_val == "FAILED":
record["error"] = event.get("error") or event.get("stderr")
elif evt == "pong":
record["status"] = "SUCCEEDED"
record["result"] = event
record["updated_at"] = now record["updated_at"] = now
@@ -186,6 +268,11 @@ class EdgeDispatchManager:
record["status"] = "FAILED" record["status"] = "FAILED"
record["error"] = f"device disconnected: {device_id}" record["error"] = f"device disconnected: {device_id}"
record["updated_at"] = now record["updated_at"] = now
for record in self.commands.values():
if record["device_id"] == device_id and record["status"] in {"DISPATCHED", "RUNNING", "PENDING"}:
record["status"] = "FAILED"
record["error"] = f"device disconnected: {device_id}"
record["updated_at"] = now
manager = EdgeDispatchManager() manager = EdgeDispatchManager()
@@ -234,6 +321,34 @@ async def get_dispatch(dispatch_id: str) -> dict[str, Any]:
return record return record
@app.post("/devices/{device_id}/command", response_model=DeviceCommandResponse)
async def device_command(device_id: str, body: DeviceCommandRequest) -> DeviceCommandResponse:
conn = await manager.select_device(device_id)
try:
record = await manager.create_command(conn, body)
except WebSocketDisconnect as exc:
await manager.unregister(conn.device_id)
raise HTTPException(status_code=503, detail=f"device disconnected during command: {conn.device_id}") from exc
except RuntimeError as exc:
raise HTTPException(status_code=503, detail=str(exc)) from exc
return DeviceCommandResponse(
dispatch_id=record["dispatch_id"],
device_id=record["device_id"],
command=record["command"],
status=record["status"],
created_at=record["created_at"],
)
@app.get("/commands/{dispatch_id}")
async def get_command(dispatch_id: str) -> dict[str, Any]:
record = manager.commands.get(dispatch_id)
if record is None:
raise HTTPException(status_code=404, detail=f"command not found: {dispatch_id}")
return record
@app.post("/dispatch/{dispatch_id}/artifacts") @app.post("/dispatch/{dispatch_id}/artifacts")
async def upload_artifacts( async def upload_artifacts(
dispatch_id: str, dispatch_id: str,

View File

@@ -2,6 +2,7 @@ import asyncio
import json import json
import os import os
from pathlib import Path from pathlib import Path
import subprocess
import sys import sys
import time import time
from urllib.parse import urlparse from urllib.parse import urlparse
@@ -13,6 +14,12 @@ DISPATCH_WS_URL = os.getenv("DISPATCH_WS_URL", "ws://127.0.0.1:8020/ws/edge/edge
WORKER_BASE_URL = os.getenv("WORKER_BASE_URL", "http://127.0.0.1:8000") WORKER_BASE_URL = os.getenv("WORKER_BASE_URL", "http://127.0.0.1:8000")
POLL_INTERVAL = float(os.getenv("EDGE_POLL_INTERVAL_SEC", "1.0")) POLL_INTERVAL = float(os.getenv("EDGE_POLL_INTERVAL_SEC", "1.0"))
DISPATCH_HTTP_BASE = os.getenv("DISPATCH_HTTP_BASE", "") DISPATCH_HTTP_BASE = os.getenv("DISPATCH_HTTP_BASE", "")
ALLOW_REMOTE_UPDATE = os.getenv("EDGE_ALLOW_REMOTE_UPDATE", "true").lower() in {"1", "true", "yes", "on"}
ALLOW_REMOTE_RESTART = os.getenv("EDGE_ALLOW_REMOTE_RESTART", "true").lower() in {"1", "true", "yes", "on"}
SCRIPT_DIR = Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent
RESTART_SCRIPT = ROOT_DIR / "scripts" / "restart_edge_device_local.sh"
if len(sys.argv) > 1: if len(sys.argv) > 1:
DISPATCH_WS_URL = sys.argv[1] DISPATCH_WS_URL = sys.argv[1]
@@ -41,6 +48,18 @@ def worker_get(path: str):
return r.json() return r.json()
def run_shell(command: str, timeout_sec: int = 1200) -> tuple[int, str, str]:
proc = subprocess.run(
command,
cwd=str(ROOT_DIR),
shell=True,
text=True,
capture_output=True,
timeout=timeout_sec,
)
return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
def upload_artifacts(dispatch_id: str, task_id: str, result: dict) -> dict: def upload_artifacts(dispatch_id: str, task_id: str, result: dict) -> dict:
candidate_fields = ["video_path", "first_frame_path", "metadata_path", "log_path"] candidate_fields = ["video_path", "first_frame_path", "metadata_path", "log_path"]
existing_paths = [] existing_paths = []
@@ -78,6 +97,12 @@ def upload_artifacts(dispatch_id: str, task_id: str, result: dict) -> dict:
fh.close() fh.close()
def _short_text(text: str, limit: int = 1500) -> str:
if len(text) <= limit:
return text
return text[:limit] + "...(truncated)"
async def handle_generate(ws, data: dict): async def handle_generate(ws, data: dict):
dispatch_id = data["dispatch_id"] dispatch_id = data["dispatch_id"]
req = data["request"] req = data["request"]
@@ -114,13 +139,126 @@ async def handle_generate(ws, data: dict):
result_payload["artifact_urls"] = artifact_urls result_payload["artifact_urls"] = artifact_urls
else: else:
result_payload["error"] = result.get("error") result_payload["error"] = result.get("error")
await ws.send( await ws.send(json.dumps(result_payload, ensure_ascii=False))
json.dumps(result_payload, ensure_ascii=False)
)
return return
await asyncio.sleep(POLL_INTERVAL) await asyncio.sleep(POLL_INTERVAL)
async def handle_update_code(ws, data: dict):
dispatch_id = data.get("dispatch_id", "")
if not ALLOW_REMOTE_UPDATE:
await ws.send(
json.dumps(
{
"event": "command_result",
"dispatch_id": dispatch_id,
"command": "update_code",
"status": "FAILED",
"error": "EDGE_ALLOW_REMOTE_UPDATE=false",
},
ensure_ascii=False,
)
)
return
branch = data.get("branch", "master")
git_command = data.get("command") or f"git fetch --all && git checkout {branch} && git pull --ff-only origin {branch}"
await ws.send(
json.dumps(
{"event": "command_status", "dispatch_id": dispatch_id, "command": "update_code", "status": "RUNNING"},
ensure_ascii=False,
)
)
code, out, err = await asyncio.to_thread(run_shell, git_command, 1800)
payload = {
"event": "command_result",
"dispatch_id": dispatch_id,
"command": "update_code",
"status": "SUCCEEDED" if code == 0 else "FAILED",
"exit_code": code,
"stdout": _short_text(out),
"stderr": _short_text(err),
}
await ws.send(json.dumps(payload, ensure_ascii=False))
async def handle_restart_service(ws, data: dict):
dispatch_id = data.get("dispatch_id", "")
if not ALLOW_REMOTE_RESTART:
await ws.send(
json.dumps(
{
"event": "command_result",
"dispatch_id": dispatch_id,
"command": "restart_service",
"status": "FAILED",
"error": "EDGE_ALLOW_REMOTE_RESTART=false",
},
ensure_ascii=False,
)
)
return
if not RESTART_SCRIPT.exists():
await ws.send(
json.dumps(
{
"event": "command_result",
"dispatch_id": dispatch_id,
"command": "restart_service",
"status": "FAILED",
"error": f"restart script missing: {RESTART_SCRIPT}",
},
ensure_ascii=False,
)
)
return
await ws.send(
json.dumps(
{"event": "command_status", "dispatch_id": dispatch_id, "command": "restart_service", "status": "RUNNING"},
ensure_ascii=False,
)
)
await ws.send(
json.dumps(
{
"event": "command_result",
"dispatch_id": dispatch_id,
"command": "restart_service",
"status": "SUCCEEDED",
"message": "restart script launched",
},
ensure_ascii=False,
)
)
subprocess.Popen(
["bash", str(RESTART_SCRIPT)],
cwd=str(ROOT_DIR),
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
raise SystemExit(0)
async def handle_ping(ws, data: dict):
await ws.send(
json.dumps(
{
"event": "pong",
"dispatch_id": data.get("dispatch_id", ""),
"status": "ok",
"worker_base_url": WORKER_BASE_URL,
},
ensure_ascii=False,
)
)
async def main() -> None: async def main() -> None:
while True: while True:
try: try:
@@ -132,8 +270,16 @@ async def main() -> None:
event = data.get("event") event = data.get("event")
if event == "generate": if event == "generate":
await handle_generate(ws, data) await handle_generate(ws, data)
elif event == "update_code":
await handle_update_code(ws, data)
elif event == "restart_service":
await handle_restart_service(ws, data)
elif event == "ping":
await handle_ping(ws, data)
elif event == "registered": elif event == "registered":
print("registered", data) print("registered", data)
except SystemExit:
return
except Exception as exc: except Exception as exc:
print("connection error, retry in 3s:", exc) print("connection error, retry in 3s:", exc)
time.sleep(3) time.sleep(3)

View File

@@ -0,0 +1,54 @@
param(
[ValidateSet("start", "stop", "restart", "status")]
[string]$Action = "start",
[string]$Distro = ""
)
$ErrorActionPreference = "Stop"
$Root = Split-Path -Parent (Split-Path -Parent $MyInvocation.MyCommand.Path)
Set-Location $Root
if (!(Get-Command wsl -ErrorAction SilentlyContinue)) {
throw "WSL command not found. Please install WSL first."
}
if ([string]::IsNullOrWhiteSpace($Distro)) {
$linuxPath = (wsl -- wslpath -a "$Root").Trim()
} else {
$linuxPath = (wsl -d $Distro -- wslpath -a "$Root").Trim()
}
if ([string]::IsNullOrWhiteSpace($linuxPath)) {
throw "Failed to resolve WSL path for project root: $Root"
}
switch ($Action) {
"start" { $targetScript = "scripts/start_edge_device_local.sh" }
"stop" { $targetScript = "scripts/stop_edge_device_local.sh" }
"restart" { $targetScript = "scripts/restart_edge_device_local.sh" }
"status" { $targetScript = "" }
}
if ($Action -eq "status") {
$bashCommand = @"
cd '$linuxPath' && \
if [ -f runtime/pids/worker.pid ] && kill -0 `$(cat runtime/pids/worker.pid) >/dev/null 2>&1; then
echo "[OK] worker running pid=`$(cat runtime/pids/worker.pid)"
else
echo "[INFO] worker not running"
fi && \
if [ -f runtime/pids/edge_client.pid ] && kill -0 `$(cat runtime/pids/edge_client.pid) >/dev/null 2>&1; then
echo "[OK] edge_client running pid=`$(cat runtime/pids/edge_client.pid)"
else
echo "[INFO] edge_client not running"
fi
"@
} else {
$bashCommand = "cd '$linuxPath' && chmod +x scripts/install_wsl_env.sh scripts/start_edge_device_local.sh scripts/stop_edge_device_local.sh scripts/restart_edge_device_local.sh && bash $targetScript"
}
if ([string]::IsNullOrWhiteSpace($Distro)) {
wsl -- bash -lc "$bashCommand"
} else {
wsl -d $Distro -- bash -lc "$bashCommand"
}

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
bash scripts/stop_edge_device_local.sh
sleep 1
bash scripts/start_edge_device_local.sh

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
PID_DIR="runtime/pids"
WORKER_PID_FILE="${PID_DIR}/worker.pid"
EDGE_CLIENT_PID_FILE="${PID_DIR}/edge_client.pid"
stop_by_pid_file() {
local name="$1"
local pid_file="$2"
if [ ! -f "$pid_file" ]; then
echo "[INFO] ${name} not running (no pid file)"
return
fi
local pid
pid="$(cat "$pid_file" 2>/dev/null || true)"
if [ -z "$pid" ]; then
rm -f "$pid_file"
echo "[INFO] ${name} pid file empty, cleaned"
return
fi
if kill -0 "$pid" >/dev/null 2>&1; then
kill "$pid" >/dev/null 2>&1 || true
sleep 1
if kill -0 "$pid" >/dev/null 2>&1; then
kill -9 "$pid" >/dev/null 2>&1 || true
fi
echo "[OK] ${name} stopped (pid ${pid})"
else
echo "[INFO] ${name} already stopped (stale pid ${pid})"
fi
rm -f "$pid_file"
}
stop_by_pid_file "edge client" "$EDGE_CLIENT_PID_FILE"
stop_by_pid_file "worker" "$WORKER_PID_FILE"