diff --git a/Dockerfile b/Dockerfile index 8699e45..6154e4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ -FROM python:3.10-slim +FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 AS builder -ENV PYTHONDONTWRITEBYTECODE=1 \ +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ PIP_NO_CACHE_DIR=1 \ @@ -10,26 +11,53 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ WORKDIR /app -# ffmpeg is required for MoviePy (audio duration + encoding). -RUN if [ -f /etc/apt/sources.list ]; then \ - sed -i 's|http://deb.debian.org/debian|https://mirrors.tuna.tsinghua.edu.cn/debian|g; s|http://security.debian.org/debian-security|https://mirrors.tuna.tsinghua.edu.cn/debian-security|g' /etc/apt/sources.list; \ - fi \ - && if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ - sed -i 's|http://deb.debian.org/debian|https://mirrors.tuna.tsinghua.edu.cn/debian|g; s|http://security.debian.org/debian-security|https://mirrors.tuna.tsinghua.edu.cn/debian-security|g' /etc/apt/sources.list.d/debian.sources; \ - fi \ - && apt-get update && apt-get install -y --no-install-recommends \ - ffmpeg \ - fonts-dejavu-core \ - nodejs \ - npm \ - && rm -rf /var/lib/apt/lists/* +# Base deps + Python 3.10 + Node.js 20.x +RUN sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g; s|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list \ + && apt-get -o Acquire::Retries=5 update \ + && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing \ + ca-certificates curl gnupg \ + python3.10 python3.10-distutils python3-pip \ + ffmpeg fonts-dejavu-core \ + && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ + && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing nodejs \ + && ln -sf /usr/bin/python3.10 /usr/local/bin/python \ + && rm -rf /var/lib/apt/lists/* COPY requirements.txt /app/requirements.txt -RUN 
pip install -r /app/requirements.txt +RUN python3.10 -m pip install -r /app/requirements.txt + +COPY server/package.json server/package-lock.json /app/server/ +RUN cd /app/server && npm ci --omit=dev COPY . /app -RUN cd /app/server && npm i --omit=dev +FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 AS runtime + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple \ + PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn \ + NPM_CONFIG_REGISTRY=https://registry.npmmirror.com + +WORKDIR /app + +RUN sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g; s|http://security.ubuntu.com/ubuntu|https://mirrors.tuna.tsinghua.edu.cn/ubuntu|g' /etc/apt/sources.list \ + && apt-get -o Acquire::Retries=5 update \ + && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing \ + ca-certificates curl gnupg \ + python3.10 python3.10-distutils python3-pip \ + ffmpeg fonts-dejavu-core \ + && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ + && apt-get -o Acquire::Retries=5 install -y --no-install-recommends --fix-missing nodejs \ + && ln -sf /usr/bin/python3.10 /usr/local/bin/python \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/local/lib/python3.10 /usr/local/lib/python3.10 +COPY --from=builder /usr/local/bin /usr/local/bin +COPY --from=builder /app /app EXPOSE 3000 CMD ["node", "/app/server/index.js"] diff --git a/README.md b/README.md index 3013f81..e293042 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ - Output: a 3-scene narrated video `final_poc.mp4` (mock mode supported) ## Quick start (Docker) +`docker compose up` includes a **ComfyUI** service (default image `jamesbrink/comfyui:latest` from Docker Hub). If you use another registry image, set `COMFYUI_IMAGE` in the environment. 
+ 

Build: ```bash diff --git a/configs/config.yaml b/configs/config.yaml index 4d4f361..be2485d 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,6 +1,6 @@ app: - # ComfyUI base url (local) - comfy_base_url: "http://127.0.0.1:8188" + # ComfyUI base url (docker internal service) + comfy_base_url: "http://comfyui:8188" # ComfyUI output directory on the same machine running this code comfy_output_dir: "./ComfyUI/output" @@ -26,7 +26,7 @@ tts: video: # Final output path - final_output: "./final_poc.mp4" + final_output: "./outputs/final_poc.mp4" # If ComfyUI is not ready, generate mock clips with this size & fps mock_size: [1024, 576] mock_fps: 24 diff --git a/dev.sh b/dev.sh index c9c6961..f9bec04 100755 --- a/dev.sh +++ b/dev.sh @@ -18,7 +18,29 @@ shift || true case "$CMD" in up) - docker compose up --build "$@" + # Start in background, then wait for Node self-check + health endpoint. + docker compose up -d --build "$@" + echo "[dev] waiting for server health..." + deadline=$((SECONDS + 90)) + ok=0 + while [ $SECONDS -lt $deadline ]; do + if curl -fsS "http://127.0.0.1:3000/api/health" >/dev/null 2>&1; then + ok=1 + break + fi + # If container exited, fail fast. + if ! docker compose ps --services --status running | grep -qx "aivideo"; then + break + fi + sleep 1 + done + if [ "$ok" -ne 1 ]; then + echo "[dev] server failed to become healthy (self-check likely failed)." >&2 + docker compose logs --tail=200 aivideo || true + exit 1 + fi + echo "[dev] server ready: http://127.0.0.1:3000" + docker compose logs -f --tail=50 aivideo ;; rebuild) docker compose build "$@" diff --git a/docker-compose.yml b/docker-compose.yml index cf09164..1eb1b46 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,6 +2,8 @@ services: aivideo: build: . 
working_dir: /app + depends_on: + - comfyui environment: - OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_BASE_URL=${OPENAI_BASE_URL} @@ -10,5 +12,18 @@ services: - ./:/app ports: - "3000:3000" - # On macOS, use host.docker.internal to reach host services like ComfyUI. - # Example: set app.comfy_base_url in configs/config.yaml to http://host.docker.internal:8188 + + # Default: Docker Hub (anonymous pull). GHCR comfyanonymous image often returns "denied" without login. + # Override: COMFYUI_IMAGE=ghcr.io/... after `docker login ghcr.io` + comfyui: + image: ${COMFYUI_IMAGE:-jamesbrink/comfyui:latest} + environment: + - CLI_ARGS=--listen 0.0.0.0 --port 8188 + ports: + - "8188:8188" + volumes: + - ./ComfyUI/user:/comfyui/user + - ./ComfyUI/models:/comfyui/models + - ./ComfyUI/custom_nodes:/comfyui/custom_nodes + - ./ComfyUI/output:/comfyui/output + - ./ComfyUI/input:/comfyui/input diff --git a/engine/main.py b/engine/main.py new file mode 100644 index 0000000..2b05295 --- /dev/null +++ b/engine/main.py @@ -0,0 +1,354 @@ +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import random +import sys +from pathlib import Path +from typing import Any + +from moviepy import ImageClip +from PIL import Image, ImageDraw, ImageFont + +from engine.audio_gen import synthesize_scenes +from engine.comfy_client import ComfyClient +from engine.config import AppConfig +from engine.script_gen import generate_scenes, refine_scene +from engine.types import Scene +from engine.video_editor import Segment, render_final + + +def _emit(line: str) -> None: + print(line, flush=True) + + +def _emit_scene(scene_idx: int, scene: Scene) -> None: + payload = { + "index": scene_idx, + "image_prompt": scene.image_prompt, + "video_motion": scene.video_motion, + "narration": scene.narration, + } + _emit("SCENE_JSON " + json.dumps(payload, ensure_ascii=False)) + + +def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path: + if path.exists(): + return path + 
path.parent.mkdir(parents=True, exist_ok=True) + img = Image.new("RGB", size, color=(20, 24, 33)) + draw = ImageDraw.Draw(img) + text = "MOCK" + try: + font = ImageFont.load_default() + except Exception: + font = None + draw.text((size[0] // 2 - 30, size[1] // 2 - 10), text, fill=(240, 240, 240), font=font) + img.save(path) + return path + + +def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path: + out_path.parent.mkdir(parents=True, exist_ok=True) + clip = ImageClip(str(image_path)).with_duration(max(0.5, duration_s)).with_fps(fps) + try: + clip.write_videofile(str(out_path), codec="libx264", audio=False, fps=fps, preset="veryfast") + finally: + clip.close() + return out_path + + +def _prog(p: float, msg: str) -> None: + p2 = max(0.0, min(1.0, float(p))) + _emit("PROG " + json.dumps({"p": p2, "msg": msg}, ensure_ascii=False)) + + +def _normalize_style(style: str | None) -> str: + s = (style or "").strip() + if not s: + return "" + # Allow both Chinese labels and simple aliases + mapping = { + "电影感": "电影感", + "cinema": "电影感", + "二次元": "二次元", + "anime": "二次元", + "写实": "写实", + "real": "写实", + } + return mapping.get(s, s) + + +def _inject_globals_into_prompt(prompt: str, *, style: str | None, character: str | None) -> str: + style_n = _normalize_style(style) + character_n = (character or "").strip() + if not style_n and not character_n: + return prompt + parts: list[str] = [prompt.strip(), "\n\n[Global Constraints]"] + if style_n: + parts.append(f"- Global Style: {style_n}") + if character_n: + parts.append(f"- Character Preset: {character_n}") + parts.append("请严格遵守上述全局信息,并保持三分镜主角一致。") + return "\n".join(parts).strip() + + +def _decorate_image_prompt(image_prompt: str, *, style: str | None, character: str | None) -> str: + # Industrial rule: final_prompt = f"{global_character}, {global_style}, {scene_prompt}" + style_n = _normalize_style(style) + character_n = (character or "").strip() + parts = [] + if character_n: + 
parts.append(character_n) + if style_n: + parts.append(style_n) + parts.append(image_prompt) + return ", ".join([p for p in parts if p]).strip(", ") + + +def _fallback_scenes(prompt: str) -> list[Scene]: + return [ + Scene( + image_prompt=f"{prompt},城市夜景,霓虹灯,电影感", + video_motion="缓慢推进镜头,轻微摇镜", + narration="夜色温柔落在街灯上", + ), + Scene( + image_prompt=f"{prompt},咖啡店窗边,暖光,细雨", + video_motion="侧向平移,人物轻轻抬头", + narration="雨声里藏着一段回忆", + ), + Scene( + image_prompt=f"{prompt},桥上远景,车流光轨,温暖", + video_motion="拉远全景,光轨流动", + narration="我们在光里学会告别", + ), + ] + + +def _has_llm_key(cfg: AppConfig) -> bool: + api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY")) + return bool(os.environ.get(api_key_env)) + + +def _parse_scenes_from_obj(obj: Any) -> list[Scene]: + if not isinstance(obj, dict): + raise ValueError("payload must be object") + if "scene" in obj and obj.get("scene") is not None: + s = obj.get("scene") + if not isinstance(s, dict): + raise ValueError("payload.scene must be object") + return [ + Scene( + image_prompt=str(s.get("image_prompt", "")).strip(), + video_motion=str(s.get("video_motion", "")).strip(), + narration=str(s.get("narration", "")).strip(), + ) + ] + scenes_raw = obj.get("scenes") + if not isinstance(scenes_raw, list) or not scenes_raw: + raise ValueError("payload.scenes must be non-empty array") + scenes: list[Scene] = [] + for i, s in enumerate(scenes_raw, start=1): + if not isinstance(s, dict): + raise ValueError(f"scenes[{i}] must be object") + scenes.append( + Scene( + image_prompt=str(s.get("image_prompt", "")).strip(), + video_motion=str(s.get("video_motion", "")).strip(), + narration=str(s.get("narration", "")).strip(), + ) + ) + return scenes + + +async def _render_from_scenes( + prompt: str, + scenes: list[Scene], + cfg: AppConfig, + mock: bool, + *, + style: str | None, + character: str | None, + out_dir: Path, +) -> Path: + # Force-inject globals into image prompts for rendering. 
+ scenes2 = [ + Scene( + image_prompt=_decorate_image_prompt(s.image_prompt, style=style, character=character), + video_motion=s.video_motion, + narration=s.narration, + ) + for s in scenes + ] + + _prog(0.15, "Generating TTS") + audios = await synthesize_scenes([s.narration for s in scenes2], cfg) + + segments: list[Segment] = [] + fps = int(cfg.get("video.mock_fps", 24)) + mock_size = cfg.get("video.mock_size", [1024, 576]) + w, h = int(mock_size[0]), int(mock_size[1]) + mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h)) + + if mock: + _prog(0.35, "Generating mock videos") + for i, (scene, audio) in enumerate(zip(scenes2, audios), start=1): + vpath = Path("./assets/mock_videos") / f"scene_{i:02d}.mp4" + _make_mock_video(vpath, mock_image, audio.duration_s, fps=fps) + segments.append(Segment(video_path=vpath, audio_path=audio.path, narration=scene.narration)) + _prog(0.85, "Compositing final video") + out_path = out_dir / "final.mp4" + return render_final(segments, cfg, output_path=out_path) + + comfy = ComfyClient(cfg) + wf = comfy.load_workflow() + for i, (scene, audio) in enumerate(zip(scenes2, audios), start=1): + _prog(0.25 + 0.45 * (i - 1) / max(1, len(scenes2)), f"Rendering scene {i} with ComfyUI") + seed = random.randint(1, 2_147_483_647) + wf_i = comfy.inject_params(wf, image_prompt=scene.image_prompt, seed=seed, motion_prompt=scene.video_motion or None) + result = await comfy.run_workflow(wf_i) + candidates = [p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}] + video_path = candidates[0] if candidates else result.output_files[0] + segments.append(Segment(video_path=video_path, audio_path=audio.path, narration=scene.narration)) + _prog(0.85, "Compositing final video") + out_path = out_dir / "final.mp4" + return render_final(segments, cfg, output_path=out_path) + + +def _read_stdin_json() -> Any: + raw = sys.stdin.read() + if not raw.strip(): + return None + return json.loads(raw) + + +def step_script(prompt: 
str, cfg: AppConfig, mock: bool, *, style: str | None, character: str | None, out_dir: Path) -> int: + prompt2 = _inject_globals_into_prompt(prompt, style=style, character=character) + if mock and not _has_llm_key(cfg): + # fallback scenes still should include global injection + scenes = _fallback_scenes(prompt) + else: + scenes = generate_scenes(prompt2, cfg) + + out_dir.mkdir(parents=True, exist_ok=True) + _emit("SCRIPT_BEGIN") + for idx, s in enumerate(scenes, start=1): + s2 = Scene( + image_prompt=_decorate_image_prompt(s.image_prompt, style=style, character=character), + video_motion=s.video_motion, + narration=s.narration, + ) + _emit_scene(idx, s2) + _emit("SCRIPT_END") + (out_dir / "scenes.json").write_text( + json.dumps( + {"scenes": [{"image_prompt": s.image_prompt, "video_motion": s.video_motion, "narration": s.narration} for s in scenes]}, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + ) + return 0 + + +def step_refine( + prompt: str, + cfg: AppConfig, + mock: bool, + scene_index: int, + *, + style: str | None, + character: str | None, + out_dir: Path, +) -> int: + prompt2 = _inject_globals_into_prompt(prompt, style=style, character=character) + payload = _read_stdin_json() + scenes = _parse_scenes_from_obj(payload) + # If client only sent one scene, treat it as the target scene. + if len(scenes) == 1: + target_index = 1 + else: + target_index = scene_index + if not (1 <= target_index <= len(scenes)): + raise ValueError("scene_index out of range") + + if mock and not _has_llm_key(cfg): + # Simple fallback: append a tiny polish hint to narration + s = scenes[target_index - 1] + refined = Scene( + image_prompt=_decorate_image_prompt(s.image_prompt, style=style, character=character), + video_motion=s.video_motion, + narration=(s.narration + "(更凝练)")[:30], + ) + else: + # Ensure globals are visible to LLM, and inject to output image prompt. 
+ refined0 = refine_scene(prompt=prompt2, scenes=scenes, target_index=target_index, cfg=cfg) + refined = Scene( + image_prompt=_decorate_image_prompt(refined0.image_prompt, style=style, character=character), + video_motion=refined0.video_motion, + narration=refined0.narration, + ) + + # Keep the original index for frontend replacement. + _emit_scene(scene_index, refined) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / f"refine_scene_{scene_index}.json").write_text( + json.dumps( + {"index": scene_index, "image_prompt": refined.image_prompt, "video_motion": refined.video_motion, "narration": refined.narration}, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + ) + return 0 + + +def step_render(prompt: str, cfg: AppConfig, mock: bool, *, style: str | None, character: str | None, out_dir: Path) -> int: + payload = _read_stdin_json() + scenes = _parse_scenes_from_obj(payload) + out_dir.mkdir(parents=True, exist_ok=True) + _prog(0.05, "Start render") + out = asyncio.run(_render_from_scenes(prompt, scenes, cfg, mock=mock, style=style, character=character, out_dir=out_dir)) + _prog(1.0, "Render finished") + _emit("RENDER_DONE " + json.dumps({"output": str(out)}, ensure_ascii=False)) + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description="AIGC interactive POC entry") + parser.add_argument("--prompt", required=True, help="User creative prompt") + parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path") + parser.add_argument("--mock", action="store_true", help="Mock mode (no ComfyUI needed)") + parser.add_argument("--step", default="script", choices=["script", "render", "refine"]) + parser.add_argument("--scene-index", type=int, default=1, help="For --step=refine only (1-based)") + parser.add_argument("--global-style", default="", help="Global style lock (e.g. 
电影感/二次元/写实)") + parser.add_argument("--character", default="", help="Character preset lock (main character description)") + parser.add_argument("--task-id", required=True, help="Task id (UUID). Outputs go to outputs/{task_id}/") + args = parser.parse_args() + + cfg = AppConfig.load(args.config) + out_dir = Path("./outputs") / str(args.task_id) + + if args.step == "script": + return step_script(args.prompt, cfg, mock=args.mock, style=args.global_style, character=args.character, out_dir=out_dir) + if args.step == "render": + return step_render(args.prompt, cfg, mock=args.mock, style=args.global_style, character=args.character, out_dir=out_dir) + if args.step == "refine": + return step_refine( + args.prompt, + cfg, + mock=args.mock, + scene_index=args.scene_index, + style=args.global_style, + character=args.character, + out_dir=out_dir, + ) + raise SystemExit(2) + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/engine/script_gen.py b/engine/script_gen.py index 2a512b8..3238cf6 100644 --- a/engine/script_gen.py +++ b/engine/script_gen.py @@ -33,6 +33,24 @@ def _system_prompt(scene_count: int, min_chars: int, max_chars: int) -> str: """ +def _refine_system_prompt(min_chars: int, max_chars: int) -> str: + return f"""你是短视频分镜润色助手。 +你会收到用户的原始创意 prompt、以及一组三分镜(其中主角设定需一致)。 +你的任务:只润色指定的一个 Scene,使其更具体、更镜头化、更适合生成视频,同时保持主角描述与其它分镜一致。 + +硬性约束: +1) 只修改目标 Scene,不要改其它 Scene。 +2) 目标 Scene 必须包含:image_prompt, video_motion, narration。 +3) narration 为中文旁白,每段控制在约 {min_chars}-{max_chars} 字左右。 +4) 输出只允许 JSON,不要解释、不要 markdown。 + +输出 JSON Schema: +{{ + "scene": {{"image_prompt":"...","video_motion":"...","narration":"..."}} +}} +""" + + def generate_scenes(user_prompt: str, cfg: AppConfig) -> list[Scene]: scene_count = int(cfg.get("script_gen.scene_count", 3)) min_chars = int(cfg.get("script_gen.narration_min_chars", 15)) @@ -78,3 +96,56 @@ def generate_scenes(user_prompt: str, cfg: AppConfig) -> list[Scene]: raise ValueError(f"Scene[{i}] missing required fields") 
scenes.append(Scene(image_prompt=image_prompt, video_motion=video_motion, narration=narration)) return scenes + + +def refine_scene(*, prompt: str, scenes: list[Scene], target_index: int, cfg: AppConfig) -> Scene: + if not (1 <= target_index <= len(scenes)): + raise ValueError("target_index out of range") + + min_chars = int(cfg.get("script_gen.narration_min_chars", 15)) + max_chars = int(cfg.get("script_gen.narration_max_chars", 20)) + + api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY")) + base_url_env = str(cfg.get("openai.base_url_env", "OPENAI_BASE_URL")) + model = str(cfg.get("openai.model", "gpt-4o-mini")) + + api_key = os.environ.get(api_key_env) + if not api_key: + raise RuntimeError(f"Missing env var {api_key_env} for OpenAI API key") + + client = OpenAI( + api_key=api_key, + base_url=os.environ.get(base_url_env) or None, + ) + + scenes_payload = [ + {"image_prompt": s.image_prompt, "video_motion": s.video_motion, "narration": s.narration} + for s in scenes + ] + user_payload = { + "prompt": prompt, + "target_index": target_index, + "scenes": scenes_payload, + } + + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": _refine_system_prompt(min_chars, max_chars)}, + {"role": "user", "content": json.dumps(user_payload, ensure_ascii=False)}, + ], + response_format={"type": "json_object"}, + temperature=0.6, + ) + + content = resp.choices[0].message.content or "{}" + data: Any = json.loads(content) + s = data.get("scene") + if not isinstance(s, dict): + raise ValueError("Model refine output missing scene") + image_prompt = str(s.get("image_prompt", "")).strip() + video_motion = str(s.get("video_motion", "")).strip() + narration = str(s.get("narration", "")).strip() + if not image_prompt or not narration: + raise ValueError("Refined scene missing required fields") + return Scene(image_prompt=image_prompt, video_motion=video_motion, narration=narration) diff --git a/final_poc.mp4 b/final_poc.mp4 index 
38d1761..8791b02 100644 Binary files a/final_poc.mp4 and b/final_poc.mp4 differ diff --git a/main.py b/main.py index c3ce198..7a55735 100644 --- a/main.py +++ b/main.py @@ -7,154 +7,11 @@ import os import random from pathlib import Path -from fastapi import FastAPI -from moviepy import ImageClip -from PIL import Image, ImageDraw, ImageFont - -from engine.audio_gen import synthesize_scenes -from engine.comfy_client import ComfyClient -from engine.config import AppConfig -from engine.script_gen import generate_scenes -from engine.types import Scene -from engine.video_editor import Segment, render_final - - -app = FastAPI(title="AiVideo POC") - - -def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path: - if path.exists(): - return path - path.parent.mkdir(parents=True, exist_ok=True) - img = Image.new("RGB", size, color=(20, 24, 33)) - draw = ImageDraw.Draw(img) - text = "MOCK" - try: - font = ImageFont.load_default() - except Exception: - font = None - draw.text((size[0] // 2 - 30, size[1] // 2 - 10), text, fill=(240, 240, 240), font=font) - img.save(path) - return path - - -def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path: - out_path.parent.mkdir(parents=True, exist_ok=True) - clip = ImageClip(str(image_path)).with_duration(max(0.5, duration_s)).with_fps(fps) - try: - clip.write_videofile(str(out_path), codec="libx264", audio=False, fps=fps, preset="veryfast") - finally: - clip.close() - return out_path - - -def _emit(line: str) -> None: - print(line, flush=True) - - -def _emit_scene(scene_idx: int, scene: Scene) -> None: - payload = { - "index": scene_idx, - "image_prompt": scene.image_prompt, - "video_motion": scene.video_motion, - "narration": scene.narration, - } - _emit("SCENE_JSON " + json.dumps(payload, ensure_ascii=False)) - - -def _fallback_scenes(prompt: str) -> list[Scene]: - return [ - Scene( - image_prompt=f"{prompt},城市夜景,霓虹灯,电影感", - video_motion="缓慢推进镜头,轻微摇镜", - narration="夜色温柔落在街灯上", - ), - Scene( 
- image_prompt=f"{prompt},咖啡店窗边,暖光,细雨", - video_motion="侧向平移,人物轻轻抬头", - narration="雨声里藏着一段回忆", - ), - Scene( - image_prompt=f"{prompt},桥上远景,车流光轨,温暖", - video_motion="拉远全景,光轨流动", - narration="我们在光里学会告别", - ), - ] - - -def _should_allow_llm_without_key(cfg: AppConfig) -> bool: - api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY")) - return bool(os.environ.get(api_key_env)) - - -def _generate_scenes_for_run(prompt: str, cfg: AppConfig, mock: bool) -> list[Scene]: - if mock and not _should_allow_llm_without_key(cfg): - return _fallback_scenes(prompt) - try: - return generate_scenes(prompt, cfg) - except Exception: - if mock: - return _fallback_scenes(prompt) - raise - - -async def run_pipeline(prompt: str, cfg: AppConfig, mock: bool) -> Path: - scenes = _generate_scenes_for_run(prompt, cfg, mock=mock) - audios = await synthesize_scenes([s.narration for s in scenes], cfg) - - segments: list[Segment] = [] - fps = int(cfg.get("video.mock_fps", 24)) - mock_size = cfg.get("video.mock_size", [1024, 576]) - w, h = int(mock_size[0]), int(mock_size[1]) - mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h)) - - if mock: - for i, (scene, audio) in enumerate(zip(scenes, audios), start=1): - vpath = Path("./assets/mock_videos") / f"scene_{i:02d}.mp4" - _make_mock_video(vpath, mock_image, audio.duration_s, fps=fps) - segments.append(Segment(video_path=vpath, audio_path=audio.path, narration=scene.narration)) - return render_final(segments, cfg) - - comfy = ComfyClient(cfg) - wf = comfy.load_workflow() - for i, (scene, audio) in enumerate(zip(scenes, audios), start=1): - seed = random.randint(1, 2_147_483_647) - wf_i = comfy.inject_params(wf, image_prompt=scene.image_prompt, seed=seed, motion_prompt=scene.video_motion or None) - result = await comfy.run_workflow(wf_i) - # pick first mp4-like output; if none, fall back to first file. 
- candidates = [p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}] - video_path = candidates[0] if candidates else result.output_files[0] - segments.append(Segment(video_path=video_path, audio_path=audio.path, narration=scene.narration)) - return render_final(segments, cfg) - - -def script_only(prompt: str, cfg: AppConfig, mock: bool) -> int: - scenes = _generate_scenes_for_run(prompt, cfg, mock=mock) - _emit("SCRIPT_BEGIN") - for idx, s in enumerate(scenes, start=1): - _emit_scene(idx, s) - _emit("SCRIPT_END") - return 0 - - def main() -> int: - parser = argparse.ArgumentParser(description="AIGC auto video generation POC") - parser.add_argument("--prompt", required=True, help="User creative prompt") - parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path") - parser.add_argument("--mock", action="store_true", help="Mock mode (no ComfyUI needed)") - parser.add_argument( - "--script-only", - action="store_true", - help="Only generate script/scenes and print to stdout (for Node.js streaming)", - ) - args = parser.parse_args() + # Backward-compatible entry: delegate to engine/main.py + from engine.main import main as engine_main - cfg = AppConfig.load(args.config) - if args.script_only: - return script_only(args.prompt, cfg, mock=args.mock) - out = asyncio.run(run_pipeline(args.prompt, cfg, mock=args.mock)) - print(str(out)) - return 0 + return engine_main() if __name__ == "__main__": diff --git a/outputs/final_poc.mp4 b/outputs/final_poc.mp4 new file mode 100644 index 0000000..38d1761 Binary files /dev/null and b/outputs/final_poc.mp4 differ diff --git a/scripts/check_comfy.py b/scripts/check_comfy.py index a03413e..66b2102 100644 --- a/scripts/check_comfy.py +++ b/scripts/check_comfy.py @@ -3,9 +3,11 @@ from __future__ import annotations import argparse import json import sys +from pathlib import Path from typing import Any import httpx +import yaml def fetch_object_info(base_url: str, timeout_s: float = 
5.0) -> dict[str, Any]: @@ -19,19 +21,40 @@ def fetch_object_info(base_url: str, timeout_s: float = 5.0) -> dict[str, Any]: return data +def read_base_url_from_config(config_path: str) -> str | None: + p = Path(config_path) + if not p.exists(): + return None + try: + raw = yaml.safe_load(p.read_text(encoding="utf-8")) + except Exception: + return None + if not isinstance(raw, dict): + return None + app = raw.get("app") + if not isinstance(app, dict): + return None + v = app.get("comfy_base_url") + if isinstance(v, str) and v.strip(): + return v.strip() + return None + + def main() -> int: parser = argparse.ArgumentParser(description="Check ComfyUI API connectivity") parser.add_argument( "--base-url", - default="http://127.0.0.1:8188", - help="ComfyUI base URL (default: http://127.0.0.1:8188)", + default="", + help="ComfyUI base URL (if empty, read from config app.comfy_base_url)", ) + parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path") parser.add_argument("--timeout", type=float, default=5.0, help="Request timeout seconds") parser.add_argument("--pretty", action="store_true", help="Pretty print JSON") args = parser.parse_args() try: - data = fetch_object_info(args.base_url, timeout_s=args.timeout) + base_url = args.base_url.strip() or read_base_url_from_config(args.config) or "http://127.0.0.1:8188" + data = fetch_object_info(base_url, timeout_s=args.timeout) out = json.dumps(data, ensure_ascii=False, indent=2 if args.pretty else None) sys.stdout.write(out + "\n") return 0 diff --git a/scripts/inspect_comfy_node.py b/scripts/inspect_comfy_node.py new file mode 100644 index 0000000..45afe5e --- /dev/null +++ b/scripts/inspect_comfy_node.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Iterable + +import httpx +import yaml + + +def fetch_object_info(base_url: str, timeout_s: float = 5.0) -> dict[str, Any]: + url = 
base_url.rstrip("/") + "/object_info" + with httpx.Client(timeout=timeout_s) as client: + r = client.get(url) + r.raise_for_status() + data = r.json() + if not isinstance(data, dict): + raise RuntimeError(f"Unexpected object_info type: {type(data)}") + return data + + +def load_yaml(path: str | Path) -> dict[str, Any]: + p = Path(path) + if not p.exists(): + return {} + raw = yaml.safe_load(p.read_text(encoding="utf-8")) + return raw if isinstance(raw, dict) else {} + + +def load_json(path: str | Path) -> Any: + p = Path(path) + if not p.exists(): + return None + return json.loads(p.read_text(encoding="utf-8")) + + +def iter_node_class_types(object_info: dict[str, Any]) -> Iterable[str]: + for k in object_info.keys(): + if isinstance(k, str): + yield k + + +def find_ckpt_values(object_info: dict[str, Any]) -> list[str]: + """ + Heuristic: locate any node input that looks like checkpoint selector. + ComfyUI commonly uses CheckpointLoaderSimple.inputs.required.ckpt_name = [[...values...]] + """ + vals: list[str] = [] + for node_name, node_info in object_info.items(): + if not isinstance(node_info, dict): + continue + inputs = node_info.get("input") + if not isinstance(inputs, dict): + continue + required = inputs.get("required") + if not isinstance(required, dict): + continue + for key in ("ckpt_name", "checkpoint", "model_name"): + entry = required.get(key) + # expected shape: [ [values...], {meta...} ] or [ [values...] 
] + if isinstance(entry, list) and entry: + first = entry[0] + if isinstance(first, list): + for v in first: + if isinstance(v, str): + vals.append(v) + # de-dup + seen: set[str] = set() + out: list[str] = [] + for v in vals: + if v not in seen: + seen.add(v) + out.append(v) + return out + + +def has_ksampler_seed(object_info: dict[str, Any], ks_classes: list[str], seed_key: str) -> bool: + for cls in ks_classes: + info = object_info.get(cls) + if not isinstance(info, dict): + continue + inputs = info.get("input") + if not isinstance(inputs, dict): + continue + required = inputs.get("required") + optional = inputs.get("optional") + if isinstance(required, dict) and seed_key in required: + return True + if isinstance(optional, dict) and seed_key in optional: + return True + return False + + +def resolve_seed_target_from_workflow(workflow: Any, seed_class_types: list[str]) -> tuple[str | None, str | None]: + """ + Returns (node_id, class_type) by scanning workflow dict for first matching class_type. + workflow_api.json is typically { node_id: {class_type, inputs, ...}, ... 
} + """ + if not isinstance(workflow, dict): + return (None, None) + want = set(seed_class_types) + for node_id, node in workflow.items(): + if not isinstance(node, dict): + continue + ct = node.get("class_type") + if isinstance(ct, str) and ct in want: + return (str(node_id), ct) + return (None, None) + + +def _workflow_nodes(workflow: Any) -> dict[str, Any]: + if not isinstance(workflow, dict): + raise ValueError("workflow_api.json root must be an object mapping node_id -> node") + return workflow + + +def _get_node(workflow: dict[str, Any], node_id: str) -> dict[str, Any]: + node = workflow.get(str(node_id)) + if not isinstance(node, dict): + raise KeyError(f"workflow missing node_id={node_id}") + return node + + +def _validate_configured_node_id( + *, + workflow: dict[str, Any], + node_id: Any, + allowed_class_types: list[str], + name: str, +) -> list[str]: + errs: list[str] = [] + if node_id is None or not str(node_id).strip(): + return errs + nid = str(node_id).strip() + try: + node = _get_node(workflow, nid) + except Exception as e: + return [f"{name}: configured node_id={nid} not found in workflow ({e})"] + ct = node.get("class_type") + if allowed_class_types and isinstance(ct, str) and ct not in set(allowed_class_types): + errs.append(f"{name}: node_id={nid} class_type={ct} not in allowed {allowed_class_types}") + return errs + + +def _workflow_has_ltx_node(workflow: dict[str, Any], keyword: str) -> bool: + kw = keyword.lower() + for _nid, node in workflow.items(): + if not isinstance(node, dict): + continue + ct = node.get("class_type") + if isinstance(ct, str) and kw in ct.lower(): + return True + return False + + +def main() -> int: + p = argparse.ArgumentParser(description="Inspect ComfyUI /object_info for LTX + checkpoints + sampler override readiness") + p.add_argument("--base-url", default="") + p.add_argument("--timeout", type=float, default=8.0) + p.add_argument("--config", default="./configs/config.yaml") + p.add_argument("--workflow", 
default="./workflow_api.json") + p.add_argument( + "--expected-checkpoint", + action="append", + default=[], + help="Expected checkpoint name (can repeat). Exact match against ckpt list.", + ) + p.add_argument( + "--ltx-keyword", + default="LTX", + help="Keyword to detect LTX-Video nodes in object_info keys (default: LTX)", + ) + args = p.parse_args() + + cfg = load_yaml(args.config) + base_url = (args.base_url or "").strip() + if not base_url: + app_cfg = (cfg.get("app") or {}) if isinstance(cfg, dict) else {} + if isinstance(app_cfg, dict): + base_url = str(app_cfg.get("comfy_base_url", "")).strip() + if not base_url: + base_url = "http://127.0.0.1:8188" + + comfy_cfg = (cfg.get("comfy_workflow") or {}) if isinstance(cfg, dict) else {} + seed_key = str(comfy_cfg.get("seed_input_key", "seed")) + seed_class_types = comfy_cfg.get("seed_node_class_types") or ["KSampler", "KSamplerAdvanced"] + if not isinstance(seed_class_types, list): + seed_class_types = ["KSampler", "KSamplerAdvanced"] + seed_class_types = [str(x) for x in seed_class_types] + + # Industrial hard requirement: workflow must exist for ID matching checks + wf_path = Path(args.workflow) + if not wf_path.exists(): + sys.stderr.write(f"[inspect] FAIL: workflow_api.json not found at {wf_path}\n") + return 3 + + try: + object_info = fetch_object_info(base_url, timeout_s=args.timeout) + except Exception as e: + sys.stderr.write(f"[inspect] ERROR fetch /object_info: {e}\n") + return 2 + + # 1) LTX-Video plugin activated? (heuristic) + keyword = str(args.ltx_keyword or "LTX") + ltx_hits = sorted([k for k in iter_node_class_types(object_info) if keyword.lower() in k.lower()]) + ltx_ok = len(ltx_hits) > 0 + + # 2) checkpoint list includes expected + ckpts = find_ckpt_values(object_info) + expected = list(args.expected_checkpoint or []) + missing = [x for x in expected if x not in ckpts] + ckpt_ok = len(missing) == 0 if expected else True + + # 3) KSampler defaults overridden by our python? 
(readiness check) + # /object_info cannot prove runtime override happened, but we can validate: + # - ComfyUI exposes a sampler node class with a 'seed' input key + # - our config intends to override that same key + ks_ok = has_ksampler_seed(object_info, seed_class_types, seed_key) + + wf = load_json(args.workflow) + try: + wf_nodes = _workflow_nodes(wf) + except Exception as e: + sys.stderr.write(f"[inspect] FAIL: invalid workflow format: {e}\n") + return 3 + + seed_node_id, seed_node_class = resolve_seed_target_from_workflow(wf_nodes, seed_class_types) + wf_ok = seed_node_id is not None + + # Hard validation: configured node IDs must exist and match expected class_type families + prompt_allowed = [str(x) for x in (comfy_cfg.get("prompt_node_class_types") or []) if str(x).strip()] + seed_allowed = [str(x) for x in (comfy_cfg.get("seed_node_class_types") or []) if str(x).strip()] + save_allowed = [str(x) for x in (comfy_cfg.get("save_node_class_types") or []) if str(x).strip()] + errs: list[str] = [] + errs += _validate_configured_node_id( + workflow=wf_nodes, + node_id=comfy_cfg.get("prompt_node_id"), + allowed_class_types=prompt_allowed, + name="prompt_node_id", + ) + errs += _validate_configured_node_id( + workflow=wf_nodes, + node_id=comfy_cfg.get("seed_node_id"), + allowed_class_types=seed_allowed, + name="seed_node_id", + ) + errs += _validate_configured_node_id( + workflow=wf_nodes, + node_id=comfy_cfg.get("save_node_id"), + allowed_class_types=save_allowed, + name="save_node_id", + ) + errs += _validate_configured_node_id( + workflow=wf_nodes, + node_id=comfy_cfg.get("motion_node_id"), + allowed_class_types=[], + name="motion_node_id", + ) + + # Hard validation: workflow must contain LTX node(s) if we're using LTX-Video pipeline + wf_ltx_ok = _workflow_has_ltx_node(wf_nodes, str(args.ltx_keyword or "LTX")) + + # Hard validation: seed node in workflow must expose the seed input key (so it can be overridden) + wf_seed_key_ok = False + if wf_ok: + try: + node 
= _get_node(wf_nodes, str(seed_node_id)) + inputs = node.get("inputs") + wf_seed_key_ok = isinstance(inputs, dict) and seed_key in inputs + except Exception: + wf_seed_key_ok = False + + report = { + "base_url": base_url, + "ltx": { + "keyword": keyword, + "activated": ltx_ok, + "matching_nodes": ltx_hits[:50], + "match_count": len(ltx_hits), + }, + "checkpoints": { + "expected": expected, + "found_count": len(ckpts), + "missing": missing, + "ok": ckpt_ok, + "sample": ckpts[:50], + }, + "sampler_override_readiness": { + "seed_input_key_from_config": seed_key, + "seed_node_class_types_from_config": seed_class_types, + "comfy_has_seed_input": ks_ok, + "workflow_path": args.workflow, + "workflow_seed_node_detected": wf_ok, + "workflow_seed_node_id": seed_node_id, + "workflow_seed_node_class_type": seed_node_class, + "workflow_seed_node_has_seed_key": wf_seed_key_ok, + "note": "object_info cannot prove runtime override; this enforces key alignment + workflow ID/class checks.", + }, + "workflow_validation": { + "ltx_node_in_workflow": wf_ltx_ok, + "configured_node_id_errors": errs, + }, + "ok": bool(ltx_ok and ckpt_ok and ks_ok and wf_ok and wf_ltx_ok and wf_seed_key_ok and not errs), + } + + sys.stdout.write(json.dumps(report, ensure_ascii=False, indent=2) + "\n") + + if not ltx_ok: + sys.stderr.write(f"[inspect] FAIL: no node matched keyword '{keyword}' (LTX plugin may be missing)\n") + if not ckpt_ok: + sys.stderr.write(f"[inspect] FAIL: missing checkpoints: {missing}\n") + if not ks_ok: + sys.stderr.write(f"[inspect] FAIL: ComfyUI sampler classes {seed_class_types} do not expose input '{seed_key}'\n") + if not wf_ok: + sys.stderr.write(f"[inspect] FAIL: workflow does not contain a seed node of class types {seed_class_types}\n") + if not wf_ltx_ok: + sys.stderr.write(f"[inspect] FAIL: workflow has no node with class_type containing '{args.ltx_keyword}'\n") + if wf_ok and not wf_seed_key_ok: + sys.stderr.write(f"[inspect] FAIL: workflow seed node {seed_node_id} does 
not expose inputs['{seed_key}']\n") + if errs: + for e in errs: + sys.stderr.write(f"[inspect] FAIL: {e}\n") + + return 0 if report["ok"] else 3 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/server/index.js b/server/index.js index d5ee17d..3ce4085 100644 --- a/server/index.js +++ b/server/index.js @@ -2,14 +2,34 @@ import express from "express"; import { spawn } from "node:child_process"; import path from "node:path"; import { fileURLToPath } from "node:url"; +import fs from "node:fs"; +import crypto from "node:crypto"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); const repoRoot = path.resolve(__dirname, ".."); +const outputsDir = path.join(repoRoot, "outputs"); +fs.mkdirSync(outputsDir, { recursive: true }); const app = express(); +app.use(express.json({ limit: "2mb" })); +app.use( + "/api/static", + express.static(outputsDir, { + fallthrough: true, + setHeaders: (res) => { + // Important: avoid stale video preview. 
+ res.setHeader("Cache-Control", "no-cache, no-transform"); + }, + }) +); app.use(express.static(path.join(__dirname, "public"))); +app.get("/api/health", (_req, res) => { + res.setHeader("Cache-Control", "no-cache"); + res.status(200).json({ ok: true }); +}); + function sseHeaders(res) { res.setHeader("Content-Type", "text/event-stream; charset=utf-8"); res.setHeader("Cache-Control", "no-cache, no-transform"); @@ -25,9 +45,46 @@ function sseSend(res, event, data) { res.write("\n"); } -app.get("/api/run", (req, res) => { +function newTaskId() { + return crypto.randomUUID(); +} + +function taskDir(taskId) { + return path.join(outputsDir, taskId); +} + +function ensureTaskDir(taskId) { + const dir = taskDir(taskId); + fs.mkdirSync(dir, { recursive: true }); + return dir; +} + +function spawnPythonStep({ step, prompt, configPath, mock, globalStyle, character, taskId, sceneIndex }) { + const py = process.env.PYTHON_BIN || "python3.10"; + const args = [ + "-m", + "engine.main", + "--prompt", + prompt, + "--config", + configPath, + "--step", + step, + "--task-id", + taskId, + ]; + if (sceneIndex) args.push("--scene-index", String(sceneIndex)); + if (globalStyle) args.push("--global-style", globalStyle); + if (character) args.push("--character", character); + if (mock) args.push("--mock"); + return spawn(py, args, { cwd: repoRoot, env: process.env, stdio: ["pipe", "pipe", "pipe"] }); +} + +app.get("/api/script", (req, res) => { const prompt = String(req.query.prompt || "").trim(); const mock = String(req.query.mock || "1") === "1"; + const globalStyle = String(req.query.global_style || "").trim(); + const character = String(req.query.character || "").trim(); const configPath = String(req.query.config || "./configs/config.yaml"); if (!prompt) { @@ -35,25 +92,21 @@ app.get("/api/run", (req, res) => { return; } + const taskId = newTaskId(); + ensureTaskDir(taskId); + sseHeaders(res); + sseSend(res, "task", JSON.stringify({ task_id: taskId })); sseSend(res, "status", 
"starting"); - // Unified in-container execution: Node spawns python directly. - const py = process.env.PYTHON_BIN || "python"; - const args = [ - path.join(repoRoot, "main.py"), - "--prompt", + const child = spawnPythonStep({ + step: "script", prompt, - "--config", configPath, - "--script-only", - ]; - if (mock) args.push("--mock"); - - const child = spawn(py, args, { - cwd: repoRoot, - env: process.env, - stdio: ["ignore", "pipe", "pipe"], + mock, + globalStyle, + character, + taskId, }); let buf = ""; @@ -64,14 +117,15 @@ app.get("/api/run", (req, res) => { buf = parts.pop() || ""; for (const line of parts) { if (!line) continue; - // Forward raw lines. Frontend will parse SCENE_JSON. - sseSend(res, "line", line); + if (line.startsWith("SCENE_JSON ")) sseSend(res, "scene", line.slice("SCENE_JSON ".length)); + else if (line.startsWith("PROG ")) sseSend(res, "prog", line.slice("PROG ".length)); + else sseSend(res, "line", line); } }); child.stderr.setEncoding("utf8"); child.stderr.on("data", (chunk) => { - sseSend(res, "stderr", chunk); + sseSend(res, "error", chunk); }); req.on("close", () => { @@ -80,13 +134,177 @@ app.get("/api/run", (req, res) => { child.on("exit", (code) => { if (buf.trim()) sseSend(res, "line", buf.trim()); - sseSend(res, "done", String(code ?? 0)); + sseSend(res, "done", String(code != null ? code : 0)); res.end(); }); }); -const port = Number(process.env.PORT || 3000); -app.listen(port, () => { - console.log(`[server] http://127.0.0.1:${port}`); +app.post("/api/refine", (req, res) => { + const prompt = String((req.body && req.body.prompt) || "").trim(); + const sceneIndex = Number((req.body && req.body.scene_index) || 1); + const scenes = req.body && req.body.scenes; + const scene = req.body && req.body.scene; + const mock = Boolean((req.body && req.body.mock) != null ? 
req.body.mock : true); + const globalStyle = String((req.body && req.body.global_style) || "").trim(); + const character = String((req.body && req.body.character) || "").trim(); + const configPath = String((req.body && req.body.config) || "./configs/config.yaml"); + const taskId = String((req.body && req.body.task_id) || "").trim() || newTaskId(); + + if (!prompt) return res.status(400).json({ error: "missing prompt" }); + if (!Number.isFinite(sceneIndex) || sceneIndex < 1) return res.status(400).json({ error: "bad scene_index" }); + if (!Array.isArray(scenes) && (!scene || typeof scene !== "object")) { + return res.status(400).json({ error: "missing scene or scenes[]" }); + } + ensureTaskDir(taskId); + + const child = spawnPythonStep({ + step: "refine", + prompt, + configPath, + mock, + globalStyle, + character, + taskId, + sceneIndex, + }); + if (Array.isArray(scenes)) { + child.stdin.end(JSON.stringify({ scenes })); + } else { + child.stdin.end(JSON.stringify({ scene })); + } + + let out = ""; + let err = ""; + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + child.stdout.on("data", (c) => (out += c)); + child.stderr.on("data", (c) => (err += c)); + child.on("exit", (code) => { + if (code !== 0) return res.status(500).json({ error: "python failed", stderr: err, stdout: out }); + const line = out + .split(/\r?\n/) + .map((s) => s.trim()) + .find((s) => s.startsWith("SCENE_JSON ")); + if (!line) return res.status(500).json({ error: "no SCENE_JSON", stderr: err, stdout: out }); + const payload = JSON.parse(line.slice("SCENE_JSON ".length)); + return res.json({ task_id: taskId, scene: payload, stderr: err }); + }); }); +let isBusy = false; + +app.post("/api/render", (req, res) => { + const prompt = String((req.body && req.body.prompt) || "").trim(); + const scenes = req.body && req.body.scenes; + const mock = Boolean((req.body && req.body.mock) != null ? 
req.body.mock : false); + const globalStyle = String((req.body && req.body.global_style) || "").trim(); + const character = String((req.body && req.body.character) || "").trim(); + const configPath = String((req.body && req.body.config) || "./configs/config.yaml"); + const taskId = String((req.body && req.body.task_id) || "").trim() || newTaskId(); + + if (!prompt) return res.status(400).json({ error: "missing prompt" }); + if (!Array.isArray(scenes)) return res.status(400).json({ error: "missing scenes[]" }); + ensureTaskDir(taskId); + + if (isBusy) { + return res.status(429).json({ error: "busy", msg: "GPU is busy, try later" }); + } + isBusy = true; + + sseHeaders(res); + sseSend(res, "task", JSON.stringify({ task_id: taskId })); + sseSend(res, "status", "render_start"); + + const child = spawnPythonStep({ + step: "render", + prompt, + configPath, + mock, + globalStyle, + character, + taskId, + }); + child.stdin.end(JSON.stringify({ scenes })); + + let buf = ""; + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + + child.stdout.on("data", (chunk) => { + buf += chunk; + const parts = buf.split(/\r?\n/); + buf = parts.pop() || ""; + for (const line of parts) { + if (!line) continue; + if (line.startsWith("PROG ")) sseSend(res, "prog", line.slice("PROG ".length)); + else if (line.startsWith("RENDER_DONE ")) sseSend(res, "done", line.slice("RENDER_DONE ".length)); + else sseSend(res, "line", line); + } + }); + + child.stderr.on("data", (chunk) => { + sseSend(res, "error", chunk); + }); + + req.on("close", () => { + child.kill("SIGTERM"); + }); + + child.on("exit", (code) => { + isBusy = false; + if (buf.trim()) sseSend(res, "line", buf.trim()); + if (code !== 0) sseSend(res, "error", `[ERROR] python exit_code=${code}`); + res.end(); + }); +}); + +async function runSelfCheck() { + const py = process.env.PYTHON_BIN || "python3.10"; + const checks = [ + { name: "check_comfy", args: ["scripts/check_comfy.py"] }, + { name: "inspect_comfy_node", 
args: ["scripts/inspect_comfy_node.py"] }, + ]; + for (const c of checks) { + const deadline = Date.now() + 90_000; + let lastErr = ""; + while (Date.now() < deadline) { + try { + await new Promise((resolve, reject) => { + const child = spawn(py, c.args, { cwd: repoRoot, env: process.env, stdio: ["ignore", "pipe", "pipe"] }); + let out = ""; + let err = ""; + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + child.stdout.on("data", (d) => (out += d)); + child.stderr.on("data", (d) => (err += d)); + child.on("exit", (code) => { + if (code === 0) return resolve(true); + reject(new Error(`${c.name} failed (code=${code})\n${err || out}`)); + }); + }); + lastErr = ""; + break; + } catch (e) { + lastErr = String(e); + await new Promise((r) => setTimeout(r, 2000)); + } + } + if (lastErr) { + throw new Error(lastErr); + } + } +} + +const port = Number(process.env.PORT || 3000); +(async () => { + try { + await runSelfCheck(); + app.listen(port, () => { + console.log(`[server] http://127.0.0.1:${port}`); + }); + } catch (e) { + console.error(String(e)); + process.exit(1); + } +})(); + diff --git a/server/public/index.html b/server/public/index.html index 1e92cea..a10cf62 100644 --- a/server/public/index.html +++ b/server/public/index.html @@ -3,7 +3,7 @@ - AiVideo POC - Script Stream Test + AiVideo POC - Interactive -

AiVideo POC:实时分镜脚本流测试

-

点击运行后,页面会通过 SSE 实时接收 Python stdout,并把分镜渲染到下方。

+
-
- - - - -
+ + + -
+