fix: 优化内容

2026-03-25 13:33:48 +08:00
parent f99098ec58
commit 8991f2a2d7
14 changed files with 1417 additions and 277 deletions
--- a/62
+++ b/62
@@ -1,6 +1,7 @@
-FROM python:3.10-slim
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 AS builder
-ENV PYTHONDONTWRITEBYTECODE=1 \
+ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
@@ -10,26 +11,53 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
-# ffmpeg is required for MoviePy (audio duration + encoding).
+# Base deps + Python 3.10 + Node.js 20.x
-RUN if [ -f /etc/apt/sources.list ]; then \
+RUN sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g; s|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list \
-      sed -i 's|http://deb.debian.org/debian|https://mirrors.tuna.tsinghua.edu.cn/debian|g; s|http://security.debian.org/debian-security|https://mirrors.tuna.tsinghua.edu.cn/debian-security|g' /etc/apt/sources.list; \
+    && apt-get -o Acquire::Retries=5 update \
-    fi \
+    && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing \
-  && if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
+      ca-certificates curl gnupg \
-      sed -i 's|http://deb.debian.org/debian|https://mirrors.tuna.tsinghua.edu.cn/debian|g; s|http://security.debian.org/debian-security|https://mirrors.tuna.tsinghua.edu.cn/debian-security|g' /etc/apt/sources.list.d/debian.sources; \
+      python3.10 python3.10-distutils python3-pip \
-    fi \
+      ffmpeg fonts-dejavu-core \
-  && apt-get update && apt-get install -y --no-install-recommends \
+    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
-    ffmpeg \
+    && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing nodejs \
-    fonts-dejavu-core \
+    && ln -sf /usr/bin/python3.10 /usr/local/bin/python \
-    nodejs \
+    && rm -rf /var/lib/apt/lists/*
    npm \
  && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt /app/requirements.txt
-RUN pip install -r /app/requirements.txt
+RUN python3.10 -m pip install -r /app/requirements.txt
 COPY server/package.json server/package-lock.json /app/server/
 RUN cd /app/server && npm ci --omit=dev
 COPY . /app
-RUN cd /app/server && npm i --omit=dev
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 AS runtime
 # Runtime stage: installs the OS-level runtimes (Python 3.10, Node.js 20.x,
 # ffmpeg, fonts) and then copies /usr/local/lib/python3.10, /usr/local/bin and
 # /app from the builder stage.
 ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple \
    PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn \
    NPM_CONFIG_REGISTRY=https://registry.npmmirror.com
 WORKDIR /app
 # curl and gnupg are installed here (as in the builder stage) because the
 # NodeSource setup script is fetched with curl further down and nothing earlier
 # in this stage installs it. Without curl the `curl | bash -` pipeline would
 # still exit 0 (the default shell has no pipefail), and apt would silently
 # install the distribution's stock nodejs instead of 20.x.
 RUN sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g; s|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list \
    && apt-get -o Acquire::Retries=5 update \
    && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing \
      ca-certificates curl gnupg \
      python3.10 python3.10-distutils python3-pip \
      ffmpeg fonts-dejavu-core \
    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
    && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing nodejs \
    && ln -sf /usr/bin/python3.10 /usr/local/bin/python \
    && rm -rf /var/lib/apt/lists/*
 # Bring over what the builder stage produced under /usr/local and /app.
 COPY --from=builder /usr/local/lib/python3.10 /usr/local/lib/python3.10
 COPY --from=builder /usr/local/bin /usr/local/bin
 COPY --from=builder /app /app
 # The Node server started below is expected to listen on port 3000.
 EXPOSE 3000
 CMD ["node", "/app/server/index.js"]
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@
 - Output: a 3-scene narrated video `final_poc.mp4` (mock mode supported)
 ## Quick start (Docker)
 `docker compose up` includes a **ComfyUI** service (default image `jamesbrink/comfyui:latest` from Docker Hub). If you use another registry image, set `COMFYUI_IMAGE` in the environment.
 Build:
 ```bash
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,6 +1,6 @@
 app:
-  # ComfyUI base url (local)
+  # ComfyUI base url (docker internal service)
-  comfy_base_url: "http://127.0.0.1:8188"
+  comfy_base_url: "http://comfyui:8188"
  # ComfyUI output directory on the same machine running this code
  comfy_output_dir: "./ComfyUI/output"
@@ -26,7 +26,7 @@ tts:
 video:
  # Final output path
-  final_output: "./final_poc.mp4"
+  final_output: "./outputs/final_poc.mp4"
  # If ComfyUI is not ready, generate mock clips with this size & fps
  mock_size: [1024, 576]
  mock_fps: 24
--- a/dev.sh
+++ b/dev.sh
@@ -18,7 +18,29 @@ shift || true
 case "$CMD" in
  up)
-    docker compose up --build "$@"
+    # Start in background, then wait for Node self-check + health endpoint.
    docker compose up -d --build "$@"
    echo "[dev] waiting for server health..."
    deadline=$((SECONDS + 90))
    ok=0
    while [ $SECONDS -lt $deadline ]; do
      if curl -fsS "http://127.0.0.1:3000/api/health" >/dev/null 2>&1; then
        ok=1
        break
      fi
      # If container exited, fail fast.
      if ! docker compose ps --status running | grep -q "aivideo"; then
        break
      fi
      sleep 1
    done
    if [ "$ok" -ne 1 ]; then
      echo "[dev] server failed to become healthy (self-check likely failed)." >&2
      docker compose logs --tail=200 aivideo || true
      exit 1
    fi
    echo "[dev] server ready: http://127.0.0.1:3000"
    docker compose logs -f --tail=50 aivideo
    ;;
  rebuild)
    docker compose build "$@"
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,6 +2,8 @@ services:
  aivideo:
    build: .
    working_dir: /app
    depends_on:
      - comfyui
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
@@ -10,5 +12,18 @@ services:
      - ./:/app
    ports:
      - "3000:3000"
-    # On macOS, use host.docker.internal to reach host services like ComfyUI.
+
-    # Example: set app.comfy_base_url in configs/config.yaml to http://host.docker.internal:8188
+  # Default: Docker Hub (anonymous pull). GHCR comfyanonymous image often returns "denied" without login.
  # Override: COMFYUI_IMAGE=ghcr.io/... after `docker login ghcr.io`
  comfyui:
    image: ${COMFYUI_IMAGE:-jamesbrink/comfyui:latest}
    environment:
      - CLI_ARGS=--listen 0.0.0.0 --port 8188
    ports:
      - "8188:8188"
    volumes:
      - ./ComfyUI/user:/comfyui/user
      - ./ComfyUI/models:/comfyui/models
      - ./ComfyUI/custom_nodes:/comfyui/custom_nodes
      - ./ComfyUI/output:/comfyui/output
      - ./ComfyUI/input:/comfyui/input
--- a/engine/main.py
+++ b/engine/main.py
@@ -0,0 +1,354 @@
 from __future__ import annotations
 import argparse
 import asyncio
 import json
 import os
 import random
 import sys
 from pathlib import Path
 from typing import Any
 from moviepy import ImageClip
 from PIL import Image, ImageDraw, ImageFont
 from engine.audio_gen import synthesize_scenes
 from engine.comfy_client import ComfyClient
 from engine.config import AppConfig
 from engine.script_gen import generate_scenes, refine_scene
 from engine.types import Scene
 from engine.video_editor import Segment, render_final
 def _emit(line: str) -> None:
    print(line, flush=True)
 def _emit_scene(scene_idx: int, scene: Scene) -> None:
    """Emit one scene as a ``SCENE_JSON``-prefixed JSON line.

    The JSON object holds ``index`` (``scene_idx``) followed by the scene's
    ``image_prompt``, ``video_motion`` and ``narration``. Non-ASCII text is
    written as-is rather than ``\\u``-escaped.
    """
    record = {"index": scene_idx}
    for field in ("image_prompt", "video_motion", "narration"):
        record[field] = getattr(scene, field)
    _emit("SCENE_JSON " + json.dumps(record, ensure_ascii=False))
 def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path:
    if path.exists():
        return path
    path.parent.mkdir(parents=True, exist_ok=True)
    img = Image.new("RGB", size, color=(20, 24, 33))
    draw = ImageDraw.Draw(img)
    text = "MOCK"
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None
    draw.text((size[0] // 2 - 30, size[1] // 2 - 10), text, fill=(240, 240, 240), font=font)
    img.save(path)
    return path
 def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path:
    """Encode a still image into a silent libx264 video and return ``out_path``.

    The clip lasts ``duration_s`` seconds, but never less than 0.5 s. Parent
    directories of ``out_path`` are created when needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    length_s = max(0.5, duration_s)
    still = ImageClip(str(image_path)).with_duration(length_s).with_fps(fps)
    try:
        still.write_videofile(
            str(out_path),
            codec="libx264",
            audio=False,
            fps=fps,
            preset="veryfast",
        )
    finally:
        # Close the clip even when encoding fails.
        still.close()
    return out_path
 def _prog(p: float, msg: str) -> None:
    p2 = max(0.0, min(1.0, float(p)))
    _emit("PROG " + json.dumps({"p": p2, "msg": msg}, ensure_ascii=False))
 def _normalize_style(style: str | None) -> str:
    s = (style or "").strip()
    if not s:
        return ""
    # Allow both Chinese labels and simple aliases
    mapping = {
        "电影感": "电影感",
        "cinema": "电影感",
        "二次元": "二次元",
        "anime": "二次元",
        "写实": "写实",
        "real": "写实",
    }
    return mapping.get(s, s)
 def _inject_globals_into_prompt(prompt: str, *, style: str | None, character: str | None) -> str:
    style_n = _normalize_style(style)
    character_n = (character or "").strip()
    if not style_n and not character_n:
        return prompt
    parts: list[str] = [prompt.strip(), "\n\n[Global Constraints]"]
    if style_n:
        parts.append(f"- Global Style: {style_n}")
    if character_n:
        parts.append(f"- Character Preset: {character_n}")
    parts.append("请严格遵守上述全局信息，并保持三分镜主角一致。")
    return "\n".join(parts).strip()
 def _decorate_image_prompt(image_prompt: str, *, style: str | None, character: str | None) -> str:
    # Industrial rule: final_prompt = f"{global_character}, {global_style}, {scene_prompt}"
    style_n = _normalize_style(style)
    character_n = (character or "").strip()
    parts = []
    if character_n:
        parts.append(character_n)
    if style_n:
        parts.append(style_n)
    parts.append(image_prompt)
    return ", ".join([p for p in parts if p]).strip(", ")
 def _fallback_scenes(prompt: str) -> list[Scene]:
    """Build three canned scenes around ``prompt`` without calling the LLM.

    Each image prompt is ``prompt`` followed by a full-width comma and a fixed
    setting description; motion and narration texts are fixed.
    """
    presets = (
        ("城市夜景，霓虹灯，电影感", "缓慢推进镜头，轻微摇镜", "夜色温柔落在街灯上"),
        ("咖啡店窗边，暖光，细雨", "侧向平移，人物轻轻抬头", "雨声里藏着一段回忆"),
        ("桥上远景，车流光轨，温暖", "拉远全景，光轨流动", "我们在光里学会告别"),
    )
    return [
        Scene(
            image_prompt=f"{prompt}，{setting}",
            video_motion=motion,
            narration=voice_over,
        )
        for setting, motion, voice_over in presets
    ]
 def _has_llm_key(cfg: AppConfig) -> bool:
    """Tell whether the environment variable holding the LLM API key is non-empty.

    The variable name is read from config key ``openai.api_key_env`` and
    defaults to ``OPENAI_API_KEY``.
    """
    env_name = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
    return True if os.environ.get(env_name) else False
 def _parse_scenes_from_obj(obj: Any) -> list[Scene]:
    """Convert a decoded JSON payload into a list of scenes.

    Two shapes are accepted: ``{"scene": {...}}`` (yields a one-element list)
    and ``{"scenes": [{...}, ...]}``. A non-null ``scene`` key takes
    precedence. Missing text fields become empty strings; all values are
    stringified and stripped.

    Raises:
        ValueError: if the payload, ``scene`` or any ``scenes`` entry is not
            an object, or ``scenes`` is not a non-empty array.
    """

    def to_scene(raw):
        # Shared conversion for both payload shapes.
        return Scene(
            image_prompt=str(raw.get("image_prompt", "")).strip(),
            video_motion=str(raw.get("video_motion", "")).strip(),
            narration=str(raw.get("narration", "")).strip(),
        )

    if not isinstance(obj, dict):
        raise ValueError("payload must be object")

    single = obj.get("scene")
    if single is not None:
        if not isinstance(single, dict):
            raise ValueError("payload.scene must be object")
        return [to_scene(single)]

    entries = obj.get("scenes")
    if not (isinstance(entries, list) and entries):
        raise ValueError("payload.scenes must be non-empty array")

    parsed: list[Scene] = []
    for position, entry in enumerate(entries, start=1):
        if not isinstance(entry, dict):
            raise ValueError(f"scenes[{position}] must be object")
        parsed.append(to_scene(entry))
    return parsed
 async def _render_from_scenes(
    prompt: str,
    scenes: list[Scene],
    cfg: AppConfig,
    mock: bool,
    *,
    style: str | None,
    character: str | None,
    out_dir: Path,
 ) -> Path:
    """Render ``scenes`` into ``out_dir / "final.mp4"`` and return render_final's result.

    Steps: decorate every image prompt with the global character/style,
    synthesize narration audio, produce one video per scene (a still-image
    mock clip when ``mock`` is true, otherwise a ComfyUI workflow run), then
    composite everything. Progress is reported through ``_prog``.
    ``prompt`` is accepted but not used by this function.
    """
    decorated: list[Scene] = []
    for original in scenes:
        decorated.append(
            Scene(
                image_prompt=_decorate_image_prompt(original.image_prompt, style=style, character=character),
                video_motion=original.video_motion,
                narration=original.narration,
            )
        )

    _prog(0.15, "Generating TTS")
    narrations = [item.narration for item in decorated]
    audios = await synthesize_scenes(narrations, cfg)

    fps = int(cfg.get("video.mock_fps", 24))
    mock_size = cfg.get("video.mock_size", [1024, 576])
    width, height = int(mock_size[0]), int(mock_size[1])
    # The placeholder image is ensured in both modes, as before.
    mock_image = _ensure_mock_image(Path("./assets/mock.png"), (width, height))

    segments: list[Segment] = []
    if mock:
        _prog(0.35, "Generating mock videos")
        for number, (scene, audio) in enumerate(zip(decorated, audios), start=1):
            clip_path = Path("./assets/mock_videos") / f"scene_{number:02d}.mp4"
            _make_mock_video(clip_path, mock_image, audio.duration_s, fps=fps)
            segments.append(Segment(video_path=clip_path, audio_path=audio.path, narration=scene.narration))
    else:
        comfy = ComfyClient(cfg)
        workflow = comfy.load_workflow()
        video_suffixes = {".mp4", ".mov", ".webm"}
        for number, (scene, audio) in enumerate(zip(decorated, audios), start=1):
            _prog(0.25 + 0.45 * (number - 1) / max(1, len(decorated)), f"Rendering scene {number} with ComfyUI")
            seed = random.randint(1, 2_147_483_647)
            prepared = comfy.inject_params(
                workflow,
                image_prompt=scene.image_prompt,
                seed=seed,
                motion_prompt=scene.video_motion or None,
            )
            result = await comfy.run_workflow(prepared)
            # Prefer the first video-like output; otherwise take the first file.
            videos = [f for f in result.output_files if f.suffix.lower() in video_suffixes]
            chosen = videos[0] if videos else result.output_files[0]
            segments.append(Segment(video_path=chosen, audio_path=audio.path, narration=scene.narration))

    _prog(0.85, "Compositing final video")
    return render_final(segments, cfg, output_path=out_dir / "final.mp4")
 def _read_stdin_json() -> Any:
    raw = sys.stdin.read()
    if not raw.strip():
        return None
    return json.loads(raw)
 def step_script(prompt: str, cfg: AppConfig, mock: bool, *, style: str | None, character: str | None, out_dir: Path) -> int:
    """Generate scenes, stream them to stdout and save them to ``scenes.json``.

    Canned fallback scenes are used when ``mock`` is set and no LLM key is
    available; otherwise the LLM is called with the globals-injected prompt.
    The scenes emitted between ``SCRIPT_BEGIN`` and ``SCRIPT_END`` carry
    decorated image prompts, while ``scenes.json`` stores the prompts exactly
    as the generator returned them. Always returns 0.
    """
    llm_prompt = _inject_globals_into_prompt(prompt, style=style, character=character)
    use_fallback = mock and not _has_llm_key(cfg)
    scenes = _fallback_scenes(prompt) if use_fallback else generate_scenes(llm_prompt, cfg)

    out_dir.mkdir(parents=True, exist_ok=True)

    _emit("SCRIPT_BEGIN")
    for number, scene in enumerate(scenes, start=1):
        _emit_scene(
            number,
            Scene(
                image_prompt=_decorate_image_prompt(scene.image_prompt, style=style, character=character),
                video_motion=scene.video_motion,
                narration=scene.narration,
            ),
        )
    _emit("SCRIPT_END")

    saved = {
        "scenes": [
            {"image_prompt": scene.image_prompt, "video_motion": scene.video_motion, "narration": scene.narration}
            for scene in scenes
        ]
    }
    target = out_dir / "scenes.json"
    target.write_text(json.dumps(saved, ensure_ascii=False, indent=2), encoding="utf-8")
    return 0
 def step_refine(
    prompt: str,
    cfg: AppConfig,
    mock: bool,
    scene_index: int,
    *,
    style: str | None,
    character: str | None,
    out_dir: Path,
 ) -> int:
    """Refine one scene read from stdin, emit it and save it as JSON.

    When the payload holds a single scene, that scene is the target; otherwise
    ``scene_index`` (1-based) selects it and must be in range. In mock mode
    without an LLM key the narration just gets a short suffix and is cut to
    30 characters; otherwise ``refine_scene`` is called. The emitted and saved
    scene always reports the caller's ``scene_index``. Always returns 0.

    Raises:
        ValueError: if ``scene_index`` is out of range for a multi-scene payload.
    """
    llm_prompt = _inject_globals_into_prompt(prompt, style=style, character=character)
    scenes = _parse_scenes_from_obj(_read_stdin_json())

    if len(scenes) == 1:
        # A lone scene is treated as the target whatever index was passed.
        target_index = 1
    elif 1 <= scene_index <= len(scenes):
        target_index = scene_index
    else:
        raise ValueError("scene_index out of range")

    if mock and not _has_llm_key(cfg):
        # Offline fallback: keep the scene, tag and truncate the narration.
        source = scenes[target_index - 1]
        base_prompt = source.image_prompt
        motion = source.video_motion
        narration = (source.narration + "（更凝练）")[:30]
    else:
        llm_scene = refine_scene(prompt=llm_prompt, scenes=scenes, target_index=target_index, cfg=cfg)
        base_prompt = llm_scene.image_prompt
        motion = llm_scene.video_motion
        narration = llm_scene.narration

    refined = Scene(
        image_prompt=_decorate_image_prompt(base_prompt, style=style, character=character),
        video_motion=motion,
        narration=narration,
    )

    # Report under the caller's index so the frontend can replace the right scene.
    _emit_scene(scene_index, refined)

    out_dir.mkdir(parents=True, exist_ok=True)
    record = {
        "index": scene_index,
        "image_prompt": refined.image_prompt,
        "video_motion": refined.video_motion,
        "narration": refined.narration,
    }
    target = out_dir / f"refine_scene_{scene_index}.json"
    target.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
    return 0
 def step_render(prompt: str, cfg: AppConfig, mock: bool, *, style: str | None, character: str | None, out_dir: Path) -> int:
    """Render the scenes supplied on stdin and announce the output path.

    Emits progress at start and finish and a final ``RENDER_DONE`` line whose
    JSON body holds the output path as a string. Always returns 0.
    """
    scenes = _parse_scenes_from_obj(_read_stdin_json())
    out_dir.mkdir(parents=True, exist_ok=True)

    _prog(0.05, "Start render")
    render_job = _render_from_scenes(
        prompt,
        scenes,
        cfg,
        mock=mock,
        style=style,
        character=character,
        out_dir=out_dir,
    )
    final_path = asyncio.run(render_job)
    _prog(1.0, "Render finished")

    _emit("RENDER_DONE " + json.dumps({"output": str(final_path)}, ensure_ascii=False))
    return 0
 def main() -> int:
    """Parse command-line arguments and run the requested pipeline step.

    Outputs are placed under ``./outputs/<task-id>``. Returns the selected
    step's exit code; an unrecognised step raises ``SystemExit(2)``.
    """
    parser = argparse.ArgumentParser(description="AIGC interactive POC entry")
    add = parser.add_argument
    add("--prompt", required=True, help="User creative prompt")
    add("--config", default="./configs/config.yaml", help="Config yaml path")
    add("--mock", action="store_true", help="Mock mode (no ComfyUI needed)")
    add("--step", default="script", choices=["script", "render", "refine"])
    add("--scene-index", type=int, default=1, help="For --step=refine only (1-based)")
    add("--global-style", default="", help="Global style lock (e.g. 电影感/二次元/写实)")
    add("--character", default="", help="Character preset lock (main character description)")
    add("--task-id", required=True, help="Task id (UUID). Outputs go to outputs/{task_id}/")
    args = parser.parse_args()

    cfg = AppConfig.load(args.config)
    out_dir = Path("./outputs") / str(args.task_id)

    # Keyword arguments shared by every step.
    shared = dict(mock=args.mock, style=args.global_style, character=args.character, out_dir=out_dir)

    if args.step == "refine":
        return step_refine(args.prompt, cfg, scene_index=args.scene_index, **shared)

    handlers = {"script": step_script, "render": step_render}
    handler = handlers.get(args.step)
    if handler is None:
        raise SystemExit(2)
    return handler(args.prompt, cfg, **shared)
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/engine/script_gen.py
+++ b/engine/script_gen.py
@@ -33,6 +33,24 @@ def _system_prompt(scene_count: int, min_chars: int, max_chars: int) -> str:
 """
 def _refine_system_prompt(min_chars: int, max_chars: int) -> str:
    """Return the Chinese system prompt used when refining a single scene.

    ``min_chars`` and ``max_chars`` are interpolated into the narration-length
    rule. The doubled braces in the template render as literal ``{`` and ``}``
    in the JSON schema example shown to the model.
    """
    return f"""你是短视频分镜润色助手。
 你会收到用户的原始创意 prompt、以及一组三分镜（其中主角设定需一致）。
 你的任务：只润色指定的一个 Scene，使其更具体、更镜头化、更适合生成视频，同时保持主角描述与其它分镜一致。
 硬性约束：
 1) 只修改目标 Scene，不要改其它 Scene。
 2) 目标 Scene 必须包含：image_prompt, video_motion, narration。
 3) narration 为中文旁白，每段控制在约 {min_chars}-{max_chars} 字左右。
 4) 输出只允许 JSON，不要解释、不要 markdown。
 输出 JSON Schema：
 {{
  "scene": {{"image_prompt":"...","video_motion":"...","narration":"..."}}
 }}
 """
 def generate_scenes(user_prompt: str, cfg: AppConfig) -> list[Scene]:
    scene_count = int(cfg.get("script_gen.scene_count", 3))
    min_chars = int(cfg.get("script_gen.narration_min_chars", 15))
@@ -78,3 +96,56 @@ def generate_scenes(user_prompt: str, cfg: AppConfig) -> list[Scene]:
            raise ValueError(f"Scene[{i}] missing required fields")
        scenes.append(Scene(image_prompt=image_prompt, video_motion=video_motion, narration=narration))
    return scenes
 def refine_scene(*, prompt: str, scenes: list[Scene], target_index: int, cfg: AppConfig) -> Scene:
    """Ask the chat model to polish one scene and return the refined scene.

    Args:
        prompt: The user's creative prompt, sent to the model as context.
        scenes: All scenes; every one is sent so the model sees the others.
        target_index: 1-based index of the scene to refine.
        cfg: Configuration supplying narration length bounds, the names of the
            API-key and base-URL environment variables, and the model name.

    Returns:
        A ``Scene`` built from the model's ``scene`` object.

    Raises:
        ValueError: if ``target_index`` is out of range, the reply has no
            ``scene`` object, or ``image_prompt``/``narration`` come back empty.
        RuntimeError: if the API-key environment variable is unset or empty.
    """
    if target_index < 1 or target_index > len(scenes):
        raise ValueError("target_index out of range")

    narration_min = int(cfg.get("script_gen.narration_min_chars", 15))
    narration_max = int(cfg.get("script_gen.narration_max_chars", 20))
    key_var = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
    url_var = str(cfg.get("openai.base_url_env", "OPENAI_BASE_URL"))
    model_name = str(cfg.get("openai.model", "gpt-4o-mini"))

    secret = os.environ.get(key_var)
    if not secret:
        raise RuntimeError(f"Missing env var {key_var} for OpenAI API key")

    # An empty base-URL variable is passed on as None.
    client = OpenAI(api_key=secret, base_url=os.environ.get(url_var) or None)

    request = {
        "prompt": prompt,
        "target_index": target_index,
        "scenes": [
            {"image_prompt": item.image_prompt, "video_motion": item.video_motion, "narration": item.narration}
            for item in scenes
        ],
    }
    messages = [
        {"role": "system", "content": _refine_system_prompt(narration_min, narration_max)},
        {"role": "user", "content": json.dumps(request, ensure_ascii=False)},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.6,
    )

    reply_text = completion.choices[0].message.content or "{}"
    reply: Any = json.loads(reply_text)
    scene_obj = reply.get("scene")
    if not isinstance(scene_obj, dict):
        raise ValueError("Model refine output missing scene")

    def text_of(field):
        # Missing fields become "", everything is stringified and stripped.
        return str(scene_obj.get(field, "")).strip()

    image_prompt = text_of("image_prompt")
    video_motion = text_of("video_motion")
    narration = text_of("narration")
    if not (image_prompt and narration):
        raise ValueError("Refined scene missing required fields")
    return Scene(image_prompt=image_prompt, video_motion=video_motion, narration=narration)
--- a/final_poc.mp4
+++ b/final_poc.mp4
--- a/main.py
+++ b/main.py
@@ -7,154 +7,11 @@ import os
 import random
 from pathlib import Path
 from fastapi import FastAPI
 from moviepy import ImageClip
 from PIL import Image, ImageDraw, ImageFont
 from engine.audio_gen import synthesize_scenes
 from engine.comfy_client import ComfyClient
 from engine.config import AppConfig
 from engine.script_gen import generate_scenes
 from engine.types import Scene
 from engine.video_editor import Segment, render_final
 app = FastAPI(title="AiVideo POC")
 def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path:
    if path.exists():
        return path
    path.parent.mkdir(parents=True, exist_ok=True)
    img = Image.new("RGB", size, color=(20, 24, 33))
    draw = ImageDraw.Draw(img)
    text = "MOCK"
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None
    draw.text((size[0] // 2 - 30, size[1] // 2 - 10), text, fill=(240, 240, 240), font=font)
    img.save(path)
    return path
 def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path:
    """Encode a still image into a silent libx264 video and return ``out_path``.

    The clip lasts ``duration_s`` seconds, but never less than 0.5 s. Parent
    directories of ``out_path`` are created when needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    length_s = max(0.5, duration_s)
    still = ImageClip(str(image_path)).with_duration(length_s).with_fps(fps)
    try:
        still.write_videofile(
            str(out_path),
            codec="libx264",
            audio=False,
            fps=fps,
            preset="veryfast",
        )
    finally:
        # Close the clip even when encoding fails.
        still.close()
    return out_path
 def _emit(line: str) -> None:
    print(line, flush=True)
 def _emit_scene(scene_idx: int, scene: Scene) -> None:
    """Emit one scene as a ``SCENE_JSON``-prefixed JSON line.

    The JSON object holds ``index`` (``scene_idx``) followed by the scene's
    ``image_prompt``, ``video_motion`` and ``narration``. Non-ASCII text is
    written as-is rather than ``\\u``-escaped.
    """
    record = {"index": scene_idx}
    for field in ("image_prompt", "video_motion", "narration"):
        record[field] = getattr(scene, field)
    _emit("SCENE_JSON " + json.dumps(record, ensure_ascii=False))
 def _fallback_scenes(prompt: str) -> list[Scene]:
    """Build three canned scenes around ``prompt`` without calling the LLM.

    Each image prompt is ``prompt`` followed by a full-width comma and a fixed
    setting description; motion and narration texts are fixed.
    """
    presets = (
        ("城市夜景，霓虹灯，电影感", "缓慢推进镜头，轻微摇镜", "夜色温柔落在街灯上"),
        ("咖啡店窗边，暖光，细雨", "侧向平移，人物轻轻抬头", "雨声里藏着一段回忆"),
        ("桥上远景，车流光轨，温暖", "拉远全景，光轨流动", "我们在光里学会告别"),
    )
    return [
        Scene(
            image_prompt=f"{prompt}，{setting}",
            video_motion=motion,
            narration=voice_over,
        )
        for setting, motion, voice_over in presets
    ]
 def _should_allow_llm_without_key(cfg: AppConfig) -> bool:
    """Tell whether the environment variable holding the LLM API key is non-empty.

    Despite the name, the result is true only when a key IS present. The
    variable name is read from config key ``openai.api_key_env`` and defaults
    to ``OPENAI_API_KEY``.
    """
    env_name = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
    return True if os.environ.get(env_name) else False
 def _generate_scenes_for_run(prompt: str, cfg: AppConfig, mock: bool) -> list[Scene]:
    """Produce scenes for a run, using canned scenes where mock mode allows.

    Outside mock mode the LLM generator is called and its errors propagate.
    In mock mode the canned scenes are used when no API key is set or when the
    generator raises.
    """
    if not mock:
        return generate_scenes(prompt, cfg)
    if _should_allow_llm_without_key(cfg):
        try:
            return generate_scenes(prompt, cfg)
        except Exception:
            # Mock runs tolerate generator failures and use the canned scenes below.
            pass
    return _fallback_scenes(prompt)
 async def run_pipeline(prompt: str, cfg: AppConfig, mock: bool) -> Path:
    """Run the whole pipeline for ``prompt`` and return render_final's result.

    Scenes are generated, narration audio is synthesized, one video per scene
    is produced (a still-image mock clip when ``mock`` is true, otherwise a
    ComfyUI workflow run), and the segments are composited.
    """
    scenes = _generate_scenes_for_run(prompt, cfg, mock=mock)
    narrations = [item.narration for item in scenes]
    audios = await synthesize_scenes(narrations, cfg)

    fps = int(cfg.get("video.mock_fps", 24))
    mock_size = cfg.get("video.mock_size", [1024, 576])
    width, height = int(mock_size[0]), int(mock_size[1])
    # The placeholder image is ensured in both modes, as before.
    mock_image = _ensure_mock_image(Path("./assets/mock.png"), (width, height))

    segments: list[Segment] = []
    if mock:
        for number, (scene, audio) in enumerate(zip(scenes, audios), start=1):
            clip_path = Path("./assets/mock_videos") / f"scene_{number:02d}.mp4"
            _make_mock_video(clip_path, mock_image, audio.duration_s, fps=fps)
            segments.append(Segment(video_path=clip_path, audio_path=audio.path, narration=scene.narration))
    else:
        comfy = ComfyClient(cfg)
        workflow = comfy.load_workflow()
        video_suffixes = {".mp4", ".mov", ".webm"}
        for number, (scene, audio) in enumerate(zip(scenes, audios), start=1):
            seed = random.randint(1, 2_147_483_647)
            prepared = comfy.inject_params(
                workflow,
                image_prompt=scene.image_prompt,
                seed=seed,
                motion_prompt=scene.video_motion or None,
            )
            result = await comfy.run_workflow(prepared)
            # Prefer the first video-like output; otherwise take the first file.
            videos = [f for f in result.output_files if f.suffix.lower() in video_suffixes]
            chosen = videos[0] if videos else result.output_files[0]
            segments.append(Segment(video_path=chosen, audio_path=audio.path, narration=scene.narration))

    return render_final(segments, cfg)
 def script_only(prompt: str, cfg: AppConfig, mock: bool) -> int:
    """Generate scenes and stream them to stdout without rendering; returns 0.

    Output is ``SCRIPT_BEGIN``, one scene line per scene numbered from 1,
    then ``SCRIPT_END``.
    """
    generated = _generate_scenes_for_run(prompt, cfg, mock=mock)
    _emit("SCRIPT_BEGIN")
    number = 0
    for scene in generated:
        number += 1
        _emit_scene(number, scene)
    _emit("SCRIPT_END")
    return 0
 def main() -> int:
-    parser = argparse.ArgumentParser(description="AIGC auto video generation POC")
+    # Backward-compatible entry: delegate to engine/main.py
-    parser.add_argument("--prompt", required=True, help="User creative prompt")
+    from engine.main import main as engine_main
    parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path")
    parser.add_argument("--mock", action="store_true", help="Mock mode (no ComfyUI needed)")
    parser.add_argument(
        "--script-only",
        action="store_true",
        help="Only generate script/scenes and print to stdout (for Node.js streaming)",
    )
    args = parser.parse_args()
-    cfg = AppConfig.load(args.config)
+    return engine_main()
    if args.script_only:
        return script_only(args.prompt, cfg, mock=args.mock)
    out = asyncio.run(run_pipeline(args.prompt, cfg, mock=args.mock))
    print(str(out))
    return 0
 if __name__ == "__main__":
--- a/outputs/final_poc.mp4
+++ b/outputs/final_poc.mp4
--- a/scripts/check_comfy.py
+++ b/scripts/check_comfy.py
@@ -3,9 +3,11 @@ from __future__ import annotations
 import argparse
 import json
 import sys
 from pathlib import Path
 from typing import Any
 import httpx
 import yaml
 def fetch_object_info(base_url: str, timeout_s: float = 5.0) -> dict[str, Any]:
@@ -19,19 +21,40 @@ def fetch_object_info(base_url: str, timeout_s: float = 5.0) -> dict[str, Any]:
        return data
 def read_base_url_from_config(config_path: str) -> str | None:
    p = Path(config_path)
    if not p.exists():
        return None
    try:
        raw = yaml.safe_load(p.read_text(encoding="utf-8"))
    except Exception:
        return None
    if not isinstance(raw, dict):
        return None
    app = raw.get("app")
    if not isinstance(app, dict):
        return None
    v = app.get("comfy_base_url")
    if isinstance(v, str) and v.strip():
        return v.strip()
    return None
 def main() -> int:
    parser = argparse.ArgumentParser(description="Check ComfyUI API connectivity")
    parser.add_argument(
        "--base-url",
-        default="http://127.0.0.1:8188",
+        default="",
-        help="ComfyUI base URL (default: http://127.0.0.1:8188)",
+        help="ComfyUI base URL (if empty, read from config app.comfy_base_url)",
    )
    parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path")
    parser.add_argument("--timeout", type=float, default=5.0, help="Request timeout seconds")
    parser.add_argument("--pretty", action="store_true", help="Pretty print JSON")
    args = parser.parse_args()
    try:
-        data = fetch_object_info(args.base_url, timeout_s=args.timeout)
+        base_url = args.base_url.strip() or read_base_url_from_config(args.config) or "http://127.0.0.1:8188"
        data = fetch_object_info(base_url, timeout_s=args.timeout)
        out = json.dumps(data, ensure_ascii=False, indent=2 if args.pretty else None)
        sys.stdout.write(out + "\n")
        return 0
--- a/scripts/inspect_comfy_node.py
+++ b/scripts/inspect_comfy_node.py
@@ -0,0 +1,331 @@
 from __future__ import annotations
 import argparse
 import json
 import sys
 from pathlib import Path
 from typing import Any, Iterable
 import httpx
 import yaml
 def fetch_object_info(base_url: str, timeout_s: float = 5.0) -> dict[str, Any]:
    """GET ``<base_url>/object_info`` and return the decoded JSON object.

    Args:
        base_url: ComfyUI base URL; trailing slashes are stripped before the
            ``/object_info`` path is appended.
        timeout_s: Timeout handed to the ``httpx.Client``.

    Returns:
        The decoded JSON body, which must be a dict.

    Raises:
        RuntimeError: If the decoded body is not a dict.
        Exception: Whatever the HTTP client or ``raise_for_status()`` raises.
    """
    endpoint = f"{base_url.rstrip('/')}/object_info"
    with httpx.Client(timeout=timeout_s) as session:
        response = session.get(endpoint)
        response.raise_for_status()
        payload = response.json()
    if isinstance(payload, dict):
        return payload
    raise RuntimeError(f"Unexpected object_info type: {type(payload)}")
 def load_yaml(path: str | Path) -> dict[str, Any]:
    p = Path(path)
    if not p.exists():
        return {}
    raw = yaml.safe_load(p.read_text(encoding="utf-8"))
    return raw if isinstance(raw, dict) else {}
 def load_json(path: str | Path) -> Any:
    p = Path(path)
    if not p.exists():
        return None
    return json.loads(p.read_text(encoding="utf-8"))
 def iter_node_class_types(object_info: dict[str, Any]) -> Iterable[str]:
    """Lazily yield every string key of ``object_info``, skipping non-string keys."""
    yield from (name for name in object_info.keys() if isinstance(name, str))
 def find_ckpt_values(object_info: dict[str, Any]) -> list[str]:
    """
    Heuristically collect checkpoint names offered by any node.

    For every node, the ``input.required`` mapping is probed for the keys
    ``ckpt_name``, ``checkpoint`` and ``model_name``. An entry is used only
    when it is a non-empty list whose first element is itself a list; the
    string members of that inner list are collected. ComfyUI commonly exposes
    CheckpointLoaderSimple.inputs.required.ckpt_name = [[...values...]].

    Returns:
        The collected names, de-duplicated, in first-seen order.
    """
    selector_keys = ("ckpt_name", "checkpoint", "model_name")
    collected: list[str] = []
    for node_info in object_info.values():
        inputs = node_info.get("input") if isinstance(node_info, dict) else None
        required = inputs.get("required") if isinstance(inputs, dict) else None
        if not isinstance(required, dict):
            continue
        for key in selector_keys:
            entry = required.get(key)
            # expected shape: [ [values...], {meta...} ] or [ [values...] ]
            if not (isinstance(entry, list) and entry):
                continue
            choices = entry[0]
            if isinstance(choices, list):
                collected.extend(v for v in choices if isinstance(v, str))
    # dicts keep insertion order, so this drops duplicates while preserving first-seen order
    return list(dict.fromkeys(collected))
 def has_ksampler_seed(object_info: dict[str, Any], ks_classes: list[str], seed_key: str) -> bool:
    """Report whether any listed node class exposes ``seed_key`` as an input.

    Both the ``required`` and ``optional`` sections of each class's ``input``
    mapping are checked; classes that are missing or malformed are skipped.
    """
    for class_name in ks_classes:
        spec = object_info.get(class_name)
        inputs = spec.get("input") if isinstance(spec, dict) else None
        if not isinstance(inputs, dict):
            continue
        for section in ("required", "optional"):
            group = inputs.get(section)
            if isinstance(group, dict) and seed_key in group:
                return True
    return False
 def resolve_seed_target_from_workflow(workflow: Any, seed_class_types: list[str]) -> tuple[str | None, str | None]:
    """
    Returns (node_id, class_type) by scanning workflow dict for first matching class_type.
    workflow_api.json is typically { node_id: {class_type, inputs, ...}, ... }
    """
    if not isinstance(workflow, dict):
        return (None, None)
    want = set(seed_class_types)
    for node_id, node in workflow.items():
        if not isinstance(node, dict):
            continue
        ct = node.get("class_type")
        if isinstance(ct, str) and ct in want:
            return (str(node_id), ct)
    return (None, None)
 def _workflow_nodes(workflow: Any) -> dict[str, Any]:
    if not isinstance(workflow, dict):
        raise ValueError("workflow_api.json root must be an object mapping node_id -> node")
    return workflow
 def _get_node(workflow: dict[str, Any], node_id: str) -> dict[str, Any]:
    node = workflow.get(str(node_id))
    if not isinstance(node, dict):
        raise KeyError(f"workflow missing node_id={node_id}")
    return node
 def _validate_configured_node_id(
    *,
    workflow: dict[str, Any],
    node_id: Any,
    allowed_class_types: list[str],
    name: str,
 ) -> list[str]:
    errs: list[str] = []
    if node_id is None or not str(node_id).strip():
        return errs
    nid = str(node_id).strip()
    try:
        node = _get_node(workflow, nid)
    except Exception as e:
        return [f"{name}: configured node_id={nid} not found in workflow ({e})"]
    ct = node.get("class_type")
    if allowed_class_types and isinstance(ct, str) and ct not in set(allowed_class_types):
        errs.append(f"{name}: node_id={nid} class_type={ct} not in allowed {allowed_class_types}")
    return errs
 def _workflow_has_ltx_node(workflow: dict[str, Any], keyword: str) -> bool:
    kw = keyword.lower()
    for _nid, node in workflow.items():
        if not isinstance(node, dict):
            continue
        ct = node.get("class_type")
        if isinstance(ct, str) and kw in ct.lower():
            return True
    return False
 def main() -> int:
    """CLI entry point: check ComfyUI ``/object_info`` and the local workflow for pipeline readiness.

    A JSON report is written to stdout, and one ``[inspect] FAIL`` / ``ERROR``
    line per failed check is written to stderr.

    Returns:
        0 when every check passes;
        2 when ``/object_info`` cannot be fetched;
        3 when the config cannot be parsed, the workflow file is missing or
        invalid, or at least one check fails.
    """
    p = argparse.ArgumentParser(description="Inspect ComfyUI /object_info for LTX + checkpoints + sampler override readiness")
    p.add_argument("--base-url", default="")
    p.add_argument("--timeout", type=float, default=8.0)
    p.add_argument("--config", default="./configs/config.yaml")
    p.add_argument("--workflow", default="./workflow_api.json")
    p.add_argument(
        "--expected-checkpoint",
        action="append",
        default=[],
        help="Expected checkpoint name (can repeat). Exact match against ckpt list.",
    )
    p.add_argument(
        "--ltx-keyword",
        default="LTX",
        help="Keyword to detect LTX-Video nodes in object_info keys (default: LTX)",
    )
    args = p.parse_args()

    # A malformed config should produce a diagnosable failure, not a traceback.
    try:
        cfg = load_yaml(args.config)
    except Exception as e:
        sys.stderr.write(f"[inspect] FAIL: cannot parse config {args.config}: {e}\n")
        return 3
    if not isinstance(cfg, dict):
        cfg = {}

    # Base URL precedence: CLI flag > config app.comfy_base_url > local default.
    base_url = (args.base_url or "").strip()
    if not base_url:
        app_cfg = cfg.get("app")
        if isinstance(app_cfg, dict):
            # `or ""` keeps a YAML null from turning into the literal string "None".
            base_url = str(app_cfg.get("comfy_base_url") or "").strip()
    if not base_url:
        base_url = "http://127.0.0.1:8188"

    comfy_cfg = cfg.get("comfy_workflow")
    if not isinstance(comfy_cfg, dict):
        # Missing or non-mapping section: fall back to defaults instead of failing on .get().
        comfy_cfg = {}
    seed_key = str(comfy_cfg.get("seed_input_key") or "seed")
    seed_class_types = comfy_cfg.get("seed_node_class_types") or ["KSampler", "KSamplerAdvanced"]
    if not isinstance(seed_class_types, list):
        seed_class_types = ["KSampler", "KSamplerAdvanced"]
    seed_class_types = [str(x) for x in seed_class_types]

    # Industrial hard requirement: workflow must exist for ID matching checks
    wf_path = Path(args.workflow)
    if not wf_path.exists():
        sys.stderr.write(f"[inspect] FAIL: workflow_api.json not found at {wf_path}\n")
        return 3

    try:
        object_info = fetch_object_info(base_url, timeout_s=args.timeout)
    except Exception as e:
        sys.stderr.write(f"[inspect] ERROR fetch /object_info: {e}\n")
        return 2

    # 1) LTX-Video plugin activated? (heuristic)
    keyword = str(args.ltx_keyword or "LTX")
    ltx_hits = sorted([k for k in iter_node_class_types(object_info) if keyword.lower() in k.lower()])
    ltx_ok = len(ltx_hits) > 0

    # 2) checkpoint list includes expected
    ckpts = find_ckpt_values(object_info)
    expected = list(args.expected_checkpoint or [])
    missing = [x for x in expected if x not in ckpts]
    ckpt_ok = len(missing) == 0 if expected else True

    # 3) KSampler defaults overridden by our python? (readiness check)
    # /object_info cannot prove runtime override happened, but we can validate:
    # - ComfyUI exposes a sampler node class with a 'seed' input key
    # - our config intends to override that same key
    ks_ok = has_ksampler_seed(object_info, seed_class_types, seed_key)

    # Decoding happens inside the try so malformed JSON is reported like any other invalid workflow.
    try:
        wf_nodes = _workflow_nodes(load_json(args.workflow))
    except Exception as e:
        sys.stderr.write(f"[inspect] FAIL: invalid workflow format: {e}\n")
        return 3

    seed_node_id, seed_node_class = resolve_seed_target_from_workflow(wf_nodes, seed_class_types)
    wf_ok = seed_node_id is not None

    # Hard validation: configured node IDs must exist and match expected class_type families
    prompt_allowed = [str(x) for x in (comfy_cfg.get("prompt_node_class_types") or []) if str(x).strip()]
    seed_allowed = [str(x) for x in (comfy_cfg.get("seed_node_class_types") or []) if str(x).strip()]
    save_allowed = [str(x) for x in (comfy_cfg.get("save_node_class_types") or []) if str(x).strip()]

    errs: list[str] = []
    # motion_node_id has no class whitelist: only its existence is verified.
    for cfg_key, allowed in (
        ("prompt_node_id", prompt_allowed),
        ("seed_node_id", seed_allowed),
        ("save_node_id", save_allowed),
        ("motion_node_id", []),
    ):
        errs += _validate_configured_node_id(
            workflow=wf_nodes,
            node_id=comfy_cfg.get(cfg_key),
            allowed_class_types=allowed,
            name=cfg_key,
        )

    # Hard validation: workflow must contain LTX node(s) if we're using LTX-Video pipeline
    wf_ltx_ok = _workflow_has_ltx_node(wf_nodes, keyword)

    # Hard validation: seed node in workflow must expose the seed input key (so it can be overridden)
    wf_seed_key_ok = False
    if wf_ok:
        try:
            node = _get_node(wf_nodes, str(seed_node_id))
            inputs = node.get("inputs")
            wf_seed_key_ok = isinstance(inputs, dict) and seed_key in inputs
        except Exception:
            wf_seed_key_ok = False

    report = {
        "base_url": base_url,
        "ltx": {
            "keyword": keyword,
            "activated": ltx_ok,
            "matching_nodes": ltx_hits[:50],
            "match_count": len(ltx_hits),
        },
        "checkpoints": {
            "expected": expected,
            "found_count": len(ckpts),
            "missing": missing,
            "ok": ckpt_ok,
            "sample": ckpts[:50],
        },
        "sampler_override_readiness": {
            "seed_input_key_from_config": seed_key,
            "seed_node_class_types_from_config": seed_class_types,
            "comfy_has_seed_input": ks_ok,
            "workflow_path": args.workflow,
            "workflow_seed_node_detected": wf_ok,
            "workflow_seed_node_id": seed_node_id,
            "workflow_seed_node_class_type": seed_node_class,
            "workflow_seed_node_has_seed_key": wf_seed_key_ok,
            "note": "object_info cannot prove runtime override; this enforces key alignment + workflow ID/class checks.",
        },
        "workflow_validation": {
            "ltx_node_in_workflow": wf_ltx_ok,
            "configured_node_id_errors": errs,
        },
        "ok": bool(ltx_ok and ckpt_ok and ks_ok and wf_ok and wf_ltx_ok and wf_seed_key_ok and not errs),
    }

    sys.stdout.write(json.dumps(report, ensure_ascii=False, indent=2) + "\n")

    if not ltx_ok:
        sys.stderr.write(f"[inspect] FAIL: no node matched keyword '{keyword}' (LTX plugin may be missing)\n")
    if not ckpt_ok:
        sys.stderr.write(f"[inspect] FAIL: missing checkpoints: {missing}\n")
    if not ks_ok:
        sys.stderr.write(f"[inspect] FAIL: ComfyUI sampler classes {seed_class_types} do not expose input '{seed_key}'\n")
    if not wf_ok:
        sys.stderr.write(f"[inspect] FAIL: workflow does not contain a seed node of class types {seed_class_types}\n")
    if not wf_ltx_ok:
        # Report the keyword that was actually used for matching (falls back to "LTX" when the flag is empty).
        sys.stderr.write(f"[inspect] FAIL: workflow has no node with class_type containing '{keyword}'\n")
    if wf_ok and not wf_seed_key_ok:
        sys.stderr.write(f"[inspect] FAIL: workflow seed node {seed_node_id} does not expose inputs['{seed_key}']\n")
    for e in errs:
        sys.stderr.write(f"[inspect] FAIL: {e}\n")

    return 0 if report["ok"] else 3
 # Script entry point: main()'s integer return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/server/index.js
+++ b/server/index.js
@@ -2,14 +2,34 @@ import express from "express";
 import { spawn } from "node:child_process";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 import fs from "node:fs";
 import crypto from "node:crypto";
 // Derive this module's file and directory paths from import.meta.url.
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
 // The repository root is the parent of this file's directory; generated artifacts go under <repo>/outputs.
 const repoRoot = path.resolve(__dirname, "..");
 const outputsDir = path.join(repoRoot, "outputs");
 // Make sure the outputs directory exists before it is served or written to.
 fs.mkdirSync(outputsDir, { recursive: true });
 const app = express();
 // JSON request bodies are accepted up to 2mb.
 app.use(express.json({ limit: "2mb" }));
 // Serve generated files from outputsDir under /api/static.
 app.use(
  "/api/static",
  express.static(outputsDir, {
    fallthrough: true,
    setHeaders: (res) => {
      // Important: avoid stale video preview.
      res.setHeader("Cache-Control", "no-cache, no-transform");
    },
  })
 );
 // Serve the frontend assets from ./public next to this file.
 app.use(express.static(path.join(__dirname, "public")));
 // Health endpoint: always answers 200 {ok: true}.
 app.get("/api/health", (_req, res) => {
  res.setHeader("Cache-Control", "no-cache");
  res.status(200).json({ ok: true });
 });
 function sseHeaders(res) {
  res.setHeader("Content-Type", "text/event-stream; charset=utf-8");
  res.setHeader("Cache-Control", "no-cache, no-transform");
@@ -25,9 +45,46 @@ function sseSend(res, event, data) {
  res.write("\n");
 }
-app.get("/api/run", (req, res) => {
+function newTaskId() {
  return crypto.randomUUID();
 }
 /**
  * Resolve the output directory for a task.
  *
  * Request handlers may pass a client-supplied task id, so the joined path is
  * checked to stay inside outputsDir; ids such as "../../etc" are rejected.
  *
  * @param {string} taskId Task identifier used as the directory name.
  * @returns {string} Path of the task directory under outputsDir.
  * @throws {Error} If the id resolves to a location outside outputsDir.
  */
 function taskDir(taskId) {
  const dir = path.join(outputsDir, String(taskId));
  // path.join normalizes ".." segments, so an escaping id shows up as a relative path starting with "..".
  const rel = path.relative(outputsDir, dir);
  if (rel === ".." || rel.startsWith(".." + path.sep) || path.isAbsolute(rel)) {
    throw new Error(`invalid task id: ${taskId}`);
  }
  return dir;
 }
 // Create the task's output directory (including missing parents) and return its path.
 function ensureTaskDir(taskId) {
  const target = taskDir(taskId);
  fs.mkdirSync(target, { recursive: true });
  return target;
 }
 /**
  * Spawn `python -m engine.main` for one pipeline step.
  *
  * The interpreter comes from PYTHON_BIN (default "python3.10"). The child runs
  * with cwd = repoRoot, inherits the server environment, and has all three
  * stdio streams piped. Optional flags are appended only when their value is truthy.
  *
  * @returns {import("node:child_process").ChildProcess} The spawned child.
  */
 function spawnPythonStep({ step, prompt, configPath, mock, globalStyle, character, taskId, sceneIndex }) {
  const interpreter = process.env.PYTHON_BIN || "python3.10";
  const optionalFlags = [
    sceneIndex ? ["--scene-index", String(sceneIndex)] : [],
    globalStyle ? ["--global-style", globalStyle] : [],
    character ? ["--character", character] : [],
    mock ? ["--mock"] : [],
  ];
  const argv = [
    "-m",
    "engine.main",
    "--prompt",
    prompt,
    "--config",
    configPath,
    "--step",
    step,
    "--task-id",
    taskId,
    ...optionalFlags.flat(),
  ];
  return spawn(interpreter, argv, { cwd: repoRoot, env: process.env, stdio: ["pipe", "pipe", "pipe"] });
 }
 app.get("/api/script", (req, res) => {
  const prompt = String(req.query.prompt || "").trim();
  const mock = String(req.query.mock || "1") === "1";
  const globalStyle = String(req.query.global_style || "").trim();
  const character = String(req.query.character || "").trim();
  const configPath = String(req.query.config || "./configs/config.yaml");
  if (!prompt) {
@@ -35,25 +92,21 @@ app.get("/api/run", (req, res) => {
    return;
  }
  const taskId = newTaskId();
  ensureTaskDir(taskId);
  sseHeaders(res);
  sseSend(res, "task", JSON.stringify({ task_id: taskId }));
  sseSend(res, "status", "starting");
-  // Unified in-container execution: Node spawns python directly.
+  const child = spawnPythonStep({
-  const py = process.env.PYTHON_BIN || "python";
+    step: "script",
  const args = [
    path.join(repoRoot, "main.py"),
    "--prompt",
    prompt,
    "--config",
    configPath,
-    "--script-only",
+    mock,
-  ];
+    globalStyle,
-  if (mock) args.push("--mock");
+    character,
-
+    taskId,
  const child = spawn(py, args, {
    cwd: repoRoot,
    env: process.env,
    stdio: ["ignore", "pipe", "pipe"],
  });
  let buf = "";
@@ -64,14 +117,15 @@ app.get("/api/run", (req, res) => {
    buf = parts.pop() || "";
    for (const line of parts) {
      if (!line) continue;
-      // Forward raw lines. Frontend will parse SCENE_JSON.
+      if (line.startsWith("SCENE_JSON ")) sseSend(res, "scene", line.slice("SCENE_JSON ".length));
-      sseSend(res, "line", line);
+      else if (line.startsWith("PROG ")) sseSend(res, "prog", line.slice("PROG ".length));
      else sseSend(res, "line", line);
    }
  });
  child.stderr.setEncoding("utf8");
  child.stderr.on("data", (chunk) => {
-    sseSend(res, "stderr", chunk);
+    sseSend(res, "error", chunk);
  });
  req.on("close", () => {
@@ -80,13 +134,177 @@ app.get("/api/run", (req, res) => {
  child.on("exit", (code) => {
    if (buf.trim()) sseSend(res, "line", buf.trim());
-    sseSend(res, "done", String(code ?? 0));
+    sseSend(res, "done", String(code != null ? code : 0));
    res.end();
  });
 });
-const port = Number(process.env.PORT || 3000);
+app.post("/api/refine", (req, res) => {
-app.listen(port, () => {
+  const prompt = String((req.body && req.body.prompt) || "").trim();
-  console.log(`[server] http://127.0.0.1:${port}`);
+  const sceneIndex = Number((req.body && req.body.scene_index) || 1);
  const scenes = req.body && req.body.scenes;
  const scene = req.body && req.body.scene;
  const mock = Boolean((req.body && req.body.mock) != null ? req.body.mock : true);
  const globalStyle = String((req.body && req.body.global_style) || "").trim();
  const character = String((req.body && req.body.character) || "").trim();
  const configPath = String((req.body && req.body.config) || "./configs/config.yaml");
  const taskId = String((req.body && req.body.task_id) || "").trim() || newTaskId();
  if (!prompt) return res.status(400).json({ error: "missing prompt" });
  if (!Number.isFinite(sceneIndex) || sceneIndex < 1) return res.status(400).json({ error: "bad scene_index" });
  if (!Array.isArray(scenes) && (!scene || typeof scene !== "object")) {
    return res.status(400).json({ error: "missing scene or scenes[]" });
  }
  ensureTaskDir(taskId);
  const child = spawnPythonStep({
    step: "refine",
    prompt,
    configPath,
    mock,
    globalStyle,
    character,
    taskId,
    sceneIndex,
  });
  if (Array.isArray(scenes)) {
    child.stdin.end(JSON.stringify({ scenes }));
  } else {
    child.stdin.end(JSON.stringify({ scene }));
  }
  let out = "";
  let err = "";
  child.stdout.setEncoding("utf8");
  child.stderr.setEncoding("utf8");
  child.stdout.on("data", (c) => (out += c));
  child.stderr.on("data", (c) => (err += c));
  child.on("exit", (code) => {
    if (code !== 0) return res.status(500).json({ error: "python failed", stderr: err, stdout: out });
    const line = out
      .split(/\r?\n/)
      .map((s) => s.trim())
      .find((s) => s.startsWith("SCENE_JSON "));
    if (!line) return res.status(500).json({ error: "no SCENE_JSON", stderr: err, stdout: out });
    const payload = JSON.parse(line.slice("SCENE_JSON ".length));
    return res.json({ task_id: taskId, scene: payload, stderr: err });
  });
 });
 // Single-slot guard: only one render job runs at a time; concurrent requests get HTTP 429.
 let isBusy = false;
 /**
  * POST /api/render — run the python "render" step and stream its progress as SSE.
  *
  * Body: { prompt, scenes[], mock?, global_style?, character?, config?, task_id? }.
  * Emits SSE events: task, status, prog, line, error, done.
  * Responds 400 on a missing prompt/scenes and 429 while another render is running.
  */
 app.post("/api/render", (req, res) => {
  const prompt = String((req.body && req.body.prompt) || "").trim();
  const scenes = req.body && req.body.scenes;
  const mock = Boolean((req.body && req.body.mock) != null ? req.body.mock : false);
  const globalStyle = String((req.body && req.body.global_style) || "").trim();
  const character = String((req.body && req.body.character) || "").trim();
  const configPath = String((req.body && req.body.config) || "./configs/config.yaml");
  const taskId = String((req.body && req.body.task_id) || "").trim() || newTaskId();
  if (!prompt) return res.status(400).json({ error: "missing prompt" });
  if (!Array.isArray(scenes)) return res.status(400).json({ error: "missing scenes[]" });
  // Check the lock before touching the filesystem so a refused request leaves no task directory behind.
  if (isBusy) {
    return res.status(429).json({ error: "busy", msg: "GPU is busy, try later" });
  }
  ensureTaskDir(taskId);
  isBusy = true;
  sseHeaders(res);
  sseSend(res, "task", JSON.stringify({ task_id: taskId }));
  sseSend(res, "status", "render_start");

  // Single exit path: releases the lock and ends the stream exactly once,
  // whichever of "error" / "close" / a synchronous spawn failure happens first.
  let finished = false;
  const finish = (errMsg) => {
    if (finished) return;
    finished = true;
    isBusy = false;
    if (errMsg) sseSend(res, "error", errMsg);
    res.end();
  };

  let child;
  try {
    child = spawnPythonStep({
      step: "render",
      prompt,
      configPath,
      mock,
      globalStyle,
      character,
      taskId,
    });
  } catch (e) {
    // spawn() throws synchronously on invalid arguments (e.g. a NUL byte in the prompt).
    finish(`[ERROR] failed to start python: ${e}`);
    return;
  }
  // Without an "error" listener a failed spawn (e.g. missing interpreter) would be an
  // uncaught exception, and the busy flag would never be released.
  child.on("error", (e) => finish(`[ERROR] failed to run python: ${e}`));
  // The child may die before reading stdin; swallow the resulting stream error instead of crashing.
  child.stdin.on("error", () => {});
  child.stdin.end(JSON.stringify({ scenes }));
  let buf = "";
  child.stdout.setEncoding("utf8");
  child.stderr.setEncoding("utf8");
  child.stdout.on("data", (chunk) => {
    buf += chunk;
    const parts = buf.split(/\r?\n/);
    buf = parts.pop() || "";
    for (const line of parts) {
      if (!line) continue;
      if (line.startsWith("PROG ")) sseSend(res, "prog", line.slice("PROG ".length));
      else if (line.startsWith("RENDER_DONE ")) sseSend(res, "done", line.slice("RENDER_DONE ".length));
      else sseSend(res, "line", line);
    }
  });
  child.stderr.on("data", (chunk) => {
    sseSend(res, "error", chunk);
  });
  // Listen on the response rather than the request: it closes when the client goes away,
  // whereas the request stream of a POST can close as soon as its body has been consumed.
  res.on("close", () => {
    if (!finished) child.kill("SIGTERM");
  });
  // "close" (not "exit") fires only after stdout/stderr are drained, so trailing
  // lines such as RENDER_DONE are forwarded before the stream ends.
  child.on("close", (code) => {
    if (buf.trim()) sseSend(res, "line", buf.trim());
    finish(code !== 0 ? `[ERROR] python exit_code=${code}` : "");
  });
 });
 // Run one check script to completion: resolves on exit code 0, rejects on a
 // non-zero exit or when the process cannot be started at all.
 function runCheckOnce(py, check) {
  return new Promise((resolve, reject) => {
    const child = spawn(py, check.args, { cwd: repoRoot, env: process.env, stdio: ["ignore", "pipe", "pipe"] });
    let out = "";
    let err = "";
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (d) => (out += d));
    child.stderr.on("data", (d) => (err += d));
    // A spawn failure (e.g. interpreter not found) is reported via "error"; without this
    // listener it would be an uncaught exception and the promise would never settle.
    child.on("error", (e) => reject(new Error(`${check.name} failed to start: ${e.message}`)));
    // "close" fires after the stdio streams are drained, so err/out are complete here.
    child.on("close", (code) => {
      if (code === 0) return resolve(true);
      reject(new Error(`${check.name} failed (code=${code})\n${err || out}`));
    });
  });
 }

 /**
  * Startup self-check: run each check script in order, retrying a failing
  * script every 2 seconds for up to 90 seconds.
  *
  * @throws {Error} The last failure of the first script that never succeeded in time.
  */
 async function runSelfCheck() {
  const py = process.env.PYTHON_BIN || "python3.10";
  const checks = [
    { name: "check_comfy", args: ["scripts/check_comfy.py"] },
    { name: "inspect_comfy_node", args: ["scripts/inspect_comfy_node.py"] },
  ];
  for (const c of checks) {
    const deadline = Date.now() + 90_000;
    let lastErr = "";
    while (Date.now() < deadline) {
      try {
        await runCheckOnce(py, c);
        lastErr = "";
        break;
      } catch (e) {
        lastErr = String(e);
        // Back off briefly before retrying.
        await new Promise((r) => setTimeout(r, 2000));
      }
    }
    if (lastErr) {
      throw new Error(lastErr);
    }
  }
 }
 const port = Number(process.env.PORT || 3000);
 // Start listening only after the self-check passes; any failure is logged and exits with status 1.
 runSelfCheck()
  .then(() => {
    app.listen(port, () => {
      console.log(`[server] http://127.0.0.1:${port}`);
    });
  })
  .catch((e) => {
    console.error(String(e));
    process.exit(1);
  });
--- a/server/public/index.html
+++ b/server/public/index.html
@@ -3,7 +3,7 @@
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <title>AiVideo POC - Script Stream Test</title>
+    <title>AiVideo POC - Interactive</title>
    <style>
      body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "PingFang SC", "Noto Sans CJK SC", "Microsoft YaHei", sans-serif; margin: 24px; }
      .row { display: flex; gap: 12px; align-items: center; flex-wrap: wrap; }
@@ -15,108 +15,327 @@
      .k { color: #6b7280; font-size: 12px; margin: 8px 0 2px; }
      .v { white-space: pre-wrap; }
      .muted { color: #6b7280; }
      .videoBox { margin-top: 16px; border-top: 1px solid #e5e7eb; padding-top: 16px; }
      video { width: min(980px, 100%); background: #000; border-radius: 10px; }
      .toast {
        position: fixed;
        right: 18px;
        bottom: 18px;
        max-width: min(520px, calc(100vw - 36px));
        background: rgba(20, 24, 33, 0.96);
        color: #fff;
        border: 1px solid rgba(255,255,255,0.12);
        border-radius: 12px;
        padding: 12px 14px;
        box-shadow: 0 10px 30px rgba(0,0,0,0.35);
        z-index: 9999;
      }
      .toast .title { font-weight: 700; margin-bottom: 6px; }
      .toast .msg { white-space: pre-wrap; font-size: 13px; opacity: 0.95; }
      .toast .close { float: right; cursor: pointer; opacity: 0.8; }
    </style>
  </head>
  <body>
-    <h2>AiVideo POC：实时分镜脚本流测试</h2>
+    <div id="root"></div>
    <p class="muted">点击运行后，页面会通过 SSE 实时接收 Python stdout，并把分镜渲染到下方。</p>
-    <div class="row">
+    <script crossorigin src="https://unpkg.com/react@18/umd/react.production.min.js"></script>
-      <input id="prompt" type="text" value="写一个温暖的城市夜景故事" />
+    <script crossorigin src="https://unpkg.com/react-dom@18/umd/react-dom.production.min.js"></script>
-      <label class="row" style="gap:6px;">
+    <script crossorigin src="https://unpkg.com/babel-standalone@6/babel.min.js"></script>
        <input id="mock" type="checkbox" checked />
        mock（无 ComfyUI / 无 Key 也能跑）
      </label>
      <button id="run">运行</button>
      <button id="stop">停止</button>
    </div>
-    <div class="scenes" id="scenes"></div>
+    <script type="text/babel">
      const { useEffect, useMemo, useRef, useState } = React;
-    <h3>原始日志（stdout/stderr）</h3>
+      function App() {
-    <pre id="log"></pre>
+        const [prompt, setPrompt] = useState("写一个温暖的城市夜景故事");
        const [globalStyle, setGlobalStyle] = useState("电影感");
        const [characterPreset, setCharacterPreset] = useState("");
        const [mock, setMock] = useState(true);
        const [logs, setLogs] = useState("");
        const [scenes, setScenes] = useState([null, null, null]);
        const [canRender, setCanRender] = useState(false);
        const [finalVideoUrl, setFinalVideoUrl] = useState("");
        const [taskId, setTaskId] = useState("");
        const [toast, setToast] = useState("");
-    <script>
+        const esRef = useRef(null);
-      const $ = (id) => document.getElementById(id);
+        const logRef = useRef(null);
      const logEl = $("log");
      const scenesEl = $("scenes");
      let es = null;
-      function log(line) {
+        const appendLog = (line) => {
-        logEl.textContent += line + "\n";
+          setLogs((prev) => prev + line + "\n");
-        logEl.scrollTop = logEl.scrollHeight;
+        };
      }
-      function upsertScene(scene) {
+        const showToast = (msg) => {
-        const id = "scene-" + scene.index;
+          setToast(String(msg || "发生错误"));
-        let card = document.getElementById(id);
+          // auto hide
-        if (!card) {
+          setTimeout(() => setToast(""), 6000);
-          card = document.createElement("div");
+        };
          card.className = "card";
          card.id = id;
          scenesEl.appendChild(card);
        }
        card.innerHTML = `
          <div><strong>Scene ${scene.index}</strong></div>
          <div class="k">image_prompt</div><div class="v">${escapeHtml(scene.image_prompt)}</div>
          <div class="k">video_motion</div><div class="v">${escapeHtml(scene.video_motion || "")}</div>
          <div class="k">narration</div><div class="v">${escapeHtml(scene.narration)}</div>
        `;
      }
-      function escapeHtml(s) {
+        useEffect(() => {
-        return String(s)
+          if (!logRef.current) return;
-          .replaceAll("&", "&amp;")
+          logRef.current.scrollTop = logRef.current.scrollHeight;
-          .replaceAll("<", "&lt;")
+        }, [logs]);
          .replaceAll(">", "&gt;")
          .replaceAll('"', "&quot;")
          .replaceAll("'", "&#039;");
      }
-      function start() {
+        const startScript = () => {
-        stop();
+          stopScript();
-        logEl.textContent = "";
+          setLogs("");
-        scenesEl.innerHTML = "";
+          setScenes([null, null, null]);
          setCanRender(false);
          setFinalVideoUrl("");
          setTaskId("");
-        const prompt = $("prompt").value.trim();
+          const url = `/api/script?prompt=${encodeURIComponent(prompt.trim())}&mock=${mock ? "1" : "0"}&global_style=${encodeURIComponent(globalStyle)}&character=${encodeURIComponent(characterPreset)}`;
-        const mock = $("mock").checked ? "1" : "0";
+          const es = new EventSource(url);
-        if (!prompt) return;
+          esRef.current = es;
-        const url = `/api/run?prompt=${encodeURIComponent(prompt)}&mock=${mock}`;
+          es.addEventListener("status", (e) => appendLog("[status] " + e.data));
-        es = new EventSource(url);
+          es.addEventListener("error", (e) => {
-
+            const m = (e && e.data) ? e.data : "连接或后端错误";
-        es.addEventListener("status", (e) => log("[status] " + e.data));
+            appendLog("[ERROR] " + m);
-        es.addEventListener("stderr", (e) => log("[stderr] " + e.data));
+            showToast(m);
-        es.addEventListener("done", (e) => {
+          });
-          log("[done] exit_code=" + e.data);
+          es.addEventListener("task", (e) => {
-          stop();
+            try { setTaskId(JSON.parse(e.data).task_id || ""); } catch { }
-        });
+          });
-        es.addEventListener("line", (e) => {
+          es.addEventListener("done", (e) => {
-          const line = e.data;
+            appendLog("[done] exit_code=" + e.data);
-          log(line);
+            stopScript();
-          if (line.startsWith("SCENE_JSON ")) {
+          });
          es.addEventListener("scene", (e) => {
            try {
-              const obj = JSON.parse(line.slice("SCENE_JSON ".length));
+              const obj = JSON.parse(e.data);
-              upsertScene(obj);
+              setScenes((prev) => {
                const next = [...prev];
                next[obj.index - 1] = {
                  index: obj.index,
                  image_prompt: obj.image_prompt || "",
                  video_motion: obj.video_motion || "",
                  narration: obj.narration || "",
                };
                return next;
              });
            } catch (err) {
-              log("[parse_error] " + err);
+              appendLog("[parse_error] " + err);
            }
          });
          es.addEventListener("line", (e) => {
            appendLog(e.data);
            if (e.data === "SCRIPT_END") setCanRender(true);
          });
          es.onerror = () => appendLog("[error] connection error");
        };
        // Close the active script EventSource, if there is one, and clear the ref.
        const stopScript = () => {
          const source = esRef.current;
          if (!source) return;
          source.close();
          esRef.current = null;
        };
        // Update one field of the scene at position idx, creating a blank scene there if the slot is empty.
        const onEdit = (idx, field, value) => {
          setScenes((prev) => {
            const updated = prev.slice();
            const blank = { index: idx + 1, image_prompt: "", video_motion: "", narration: "" };
            updated[idx] = Object.assign({}, updated[idx] || blank, { [field]: value });
            return updated;
          });
        };
        // Send one scene to POST /api/refine and replace it with the refined result.
        // Network failures, non-JSON responses and responses without a scene are
        // logged and shown as a toast instead of surfacing as unhandled rejections.
        const refineOne = async (sceneIndex) => {
          appendLog(`[refine] scene ${sceneIndex}...`);
          const s0 = scenes[sceneIndex - 1] || {};
          const payloadScene = {
            image_prompt: s0.image_prompt || "",
            video_motion: s0.video_motion || "",
            narration: s0.narration || "",
          };
          try {
            const resp = await fetch("/api/refine", {
              method: "POST",
              headers: { "Content-Type": "application/json" },
              body: JSON.stringify({ prompt, scene: payloadScene, scene_index: sceneIndex, mock, global_style: globalStyle, character: characterPreset, task_id: taskId }),
            });
            // An error response is not guaranteed to carry a JSON body.
            const data = await resp.json().catch(() => null);
            if (!resp.ok || !data || !data.scene) {
              appendLog("[refine_error] " + (data ? JSON.stringify(data) : "http " + resp.status));
              showToast((data && (data.error || data.msg)) || "润色失败");
              return;
            }
            const s = data.scene;
            // Fall back to the requested index when the backend omits or mangles it,
            // so the result never lands in a NaN/negative slot.
            const targetIndex = Number.isInteger(s.index) && s.index >= 1 ? s.index : sceneIndex;
            setScenes((prev) => {
              const next = [...prev];
              next[targetIndex - 1] = {
                index: targetIndex,
                image_prompt: s.image_prompt || "",
                video_motion: s.video_motion || "",
                narration: s.narration || "",
              };
              return next;
            });
            appendLog(`[refine] scene ${sceneIndex} done`);
          } catch (err) {
            appendLog("[refine_error] " + err);
            showToast("润色失败");
          }
        };
        // Submit the current scenes to POST /api/render and consume the streamed
        // response as server-sent-event style text: log progress, surface
        // errors via toast, and set the final video URL on the "done" event.
        const renderVideo = async () => {
          appendLog("[render] start...");
          // Only the three text fields are sent; missing fields become "".
          const payloadScenes = scenes.map((s, i) => ({
            image_prompt: (s && s.image_prompt) || "",
            video_motion: (s && s.video_motion) || "",
            narration: (s && s.narration) || "",
          }));
          const resp = await fetch("/api/render", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ prompt, scenes: payloadScenes, mock, global_style: globalStyle, character: characterPreset, task_id: taskId }),
          });
          if (!resp.ok) {
            appendLog("[render_error] http " + resp.status);
            showToast("渲染请求失败（HTTP " + resp.status + "）");
            return;
          }
          // Parse SSE from fetch (POST): the body is read manually as a stream
          // and split into events here.
          const reader = resp.body.getReader();
          const decoder = new TextDecoder("utf-8");
          // Holds text received so far that does not yet end in a blank line.
          let buf = "";
          while (true) {
            const { value, done } = await reader.read();
            // NOTE(review): in the visible code, text still in `buf` when the
            // stream ends is not processed — confirm the server always
            // terminates the last event with a blank line.
            if (done) break;
            buf += decoder.decode(value, { stream: true });
            // Events are separated by a blank line; the last piece may be
            // incomplete, so it is kept in `buf` for the next read.
            const chunks = buf.split("\n\n");
            buf = chunks.pop() || "";
            for (const c of chunks) {
              const lines = c.split("\n").filter(Boolean);
              // Event name defaults to "message" when no "event:" line is present.
              let event = "message";
              const dataLines = [];
              for (const line of lines) {
                if (line.startsWith("event:")) event = line.slice(6).trim();
                else if (line.startsWith("data:")) dataLines.push(line.slice(5).trim());
              }
              // Multiple "data:" lines of one event are joined with newlines.
              const data = dataLines.join("\n");
              if (event === "task") {
                // Best effort: a malformed task payload is ignored silently.
                try { setTaskId(JSON.parse(data).task_id || ""); } catch { }
              } else if (event === "prog") {
                appendLog("[prog] " + data);
              } else if (event === "error") {
                appendLog("[ERROR] " + data);
                showToast(data);
              } else if (event === "done") {
                try {
                  const obj = JSON.parse(data);
                  // Keep only the last path segment of the reported output path.
                  const file = String(obj.output || "").split("/").pop() || "final.mp4";
                  // NOTE(review): `taskId` is the value captured when this
                  // function was created; presumably it is React state, so a
                  // setTaskId() call earlier in this stream would not be
                  // visible here and obj.task_id is the fallback — confirm.
                  const tid = taskId || (obj.task_id || "");
                  appendLog("[render] done: " + file);
                  // The `t` query parameter changes on every render; presumably
                  // a cache-buster — confirm.
                  if (tid) setFinalVideoUrl(`/api/static/${encodeURIComponent(tid)}/${encodeURIComponent(file)}?t=${Date.now()}`);
                } catch (e) {
                  appendLog("[render_done_parse_error] " + e);
                  showToast("渲染完成消息解析失败");
                }
              } else {
                // Unrecognized event names are logged verbatim.
                appendLog(data);
              }
            }
          }
        });
        es.onerror = () => {
          log("[error] connection error");
        };
        return (
          <div>
            <h2>AiVideo POC：双向交互手搓平台</h2>
            <p className="muted">分镜可编辑、可单条润色，渲染完成后可直接预览与下载。</p>
            <div className="row">
              <input
                type="text"
                value={prompt}
                onChange={(e) => setPrompt(e.target.value)}
              />
            </div>
            <div className="row">
              <label className="row" style={{ gap: 6 }}>
                Global Style:
                <select value={globalStyle} onChange={(e) => setGlobalStyle(e.target.value)} style={{ padding: "8px" }}>
                  <option value="电影感">电影感</option>
                  <option value="二次元">二次元</option>
                  <option value="写实">写实</option>
                </select>
              </label>
              <label className="row" style={{ gap: 6 }}>
                Character Preset:
                <input
                  type="text"
                  value={characterPreset}
                  onChange={(e) => setCharacterPreset(e.target.value)}
                  placeholder="例如：黑发短发、穿风衣的年轻侦探、冷静目光"
                  style={{ width: "min(640px, 100%)", padding: "10px 12px", fontSize: 14 }}
                />
              </label>
              <label className="row" style={{ gap: 6 }}>
                <input type="checkbox" checked={mock} onChange={(e) => setMock(e.target.checked)} />
                mock（无 ComfyUI / 无 Key 也能跑）
              </label>
              <button onClick={startScript}>生成分镜</button>
              <button onClick={stopScript}>停止</button>
              {canRender ? (
                <button onClick={renderVideo}>确认并开始渲染视频</button>
              ) : null}
              {taskId ? <span className="muted">task_id: {taskId}</span> : null}
            </div>
            <div className="scenes">
              {scenes.map((s, idx) => (
                <div className="card" key={idx}>
                  <div className="row" style={{ justifyContent: "space-between" }}>
                    <strong>Scene {idx + 1}</strong>
                    <button style={{ padding: "6px 10px" }} onClick={() => refineOne(idx + 1)}>🔄 重新润色</button>
                  </div>
                  <div className="k">image_prompt</div>
                  <textarea rows="3" style={{ width: "100%", padding: 8 }}
                    value={(s && s.image_prompt) || ""}
                    onChange={(e) => onEdit(idx, "image_prompt", e.target.value)}
                  />
                  <div className="k">video_motion</div>
                  <textarea rows="2" style={{ width: "100%", padding: 8 }}
                    value={(s && s.video_motion) || ""}
                    onChange={(e) => onEdit(idx, "video_motion", e.target.value)}
                  />
                  <div className="k">narration</div>
                  <textarea rows="2" style={{ width: "100%", padding: 8 }}
                    value={(s && s.narration) || ""}
                    onChange={(e) => onEdit(idx, "narration", e.target.value)}
                  />
                </div>
              ))}
            </div>
            <div className="videoBox">
              <h3>视频预览</h3>
              {finalVideoUrl ? (
                <div>
                  <video controls src={finalVideoUrl}></video>
                  <div className="row" style={{ marginTop: 10 }}>
                    <a href={finalVideoUrl} download>
                      <button>下载视频</button>
                    </a>
                  </div>
                  <div className="muted">URL: {finalVideoUrl}</div>
                </div>
              ) : (
                <div className="muted">尚未渲染完成。</div>
              )}
            </div>
            <h3>原始日志（stdout/stderr）</h3>
            <pre ref={logRef}>{logs}</pre>
            {toast ? (
              <div className="toast" role="alert">
                <span className="close" onClick={() => setToast("")}>✕</span>
                <div className="title">发生错误</div>
                <div className="msg">{toast}</div>
              </div>
            ) : null}
          </div>
        );
      }
-      function stop() {
+      ReactDOM.createRoot(document.getElementById("root")).render(<App />);
        if (es) {
          es.close();
          es = null;
        }
      }
      $("run").addEventListener("click", start);
      $("stop").addEventListener("click", stop);
    </script>
  </body>
 </html>