"""Command-line entry point for the AiVideo proof-of-concept pipeline.

Given a creative prompt, the pipeline generates scenes, synthesizes narration
audio for each scene, produces one video segment per scene (a still-image mock
clip in ``--mock`` mode, otherwise the output of a ComfyUI workflow) and hands
the segments to ``render_final``. With ``--script-only`` it only prints the
generated scenes to stdout as ``SCENE_JSON`` lines framed by ``SCRIPT_BEGIN``
and ``SCRIPT_END``.
"""

from __future__ import annotations

import argparse
import asyncio
import json
import logging
import os
import random
from pathlib import Path

from fastapi import FastAPI
from moviepy import ImageClip
from PIL import Image, ImageDraw, ImageFont

from engine.audio_gen import synthesize_scenes
from engine.comfy_client import ComfyClient
from engine.config import AppConfig
from engine.script_gen import generate_scenes
from engine.types import Scene
from engine.video_editor import Segment, render_final

# Diagnostics go through logging (stderr by default) so that stdout stays
# reserved for the line protocol emitted by _emit().
logger = logging.getLogger(__name__)

app = FastAPI(title="AiVideo POC")


def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path:
    """Return ``path``, creating a placeholder image there if it is missing.

    An existing file is returned untouched (its size is not checked against
    ``size``). Otherwise parent directories are created and a dark RGB image
    of ``size`` (width, height) with the text "MOCK" is saved to ``path``.
    """
    if path.exists():
        return path
    path.parent.mkdir(parents=True, exist_ok=True)
    img = Image.new("RGB", size, color=(20, 24, 33))
    draw = ImageDraw.Draw(img)
    text = "MOCK"
    try:
        font = ImageFont.load_default()
    except Exception:
        # Best effort: with font=None Pillow picks its own default font.
        font = None
    # Fixed offsets roughly center the short label around the image middle.
    draw.text(
        (size[0] // 2 - 30, size[1] // 2 - 10),
        text,
        fill=(240, 240, 240),
        font=font,
    )
    img.save(path)
    return path


def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path:
    """Write a silent still-image video of ``image_path`` to ``out_path``.

    The clip lasts ``duration_s`` seconds but never less than 0.5 s, and is
    encoded with libx264 at ``fps`` frames per second. Parent directories of
    ``out_path`` are created as needed. Returns ``out_path``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    clip = ImageClip(str(image_path)).with_duration(max(0.5, duration_s)).with_fps(fps)
    try:
        clip.write_videofile(
            str(out_path),
            codec="libx264",
            audio=False,
            fps=fps,
            preset="veryfast",
        )
    finally:
        # Release the clip's resources even if encoding fails.
        clip.close()
    return out_path


def _emit(line: str) -> None:
    """Print one protocol line to stdout and flush immediately."""
    print(line, flush=True)


def _emit_scene(scene_idx: int, scene: Scene) -> None:
    """Emit ``scene`` as a single ``SCENE_JSON <json>`` line on stdout."""
    payload = {
        "index": scene_idx,
        "image_prompt": scene.image_prompt,
        "video_motion": scene.video_motion,
        "narration": scene.narration,
    }
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    _emit("SCENE_JSON " + json.dumps(payload, ensure_ascii=False))


def _fallback_scenes(prompt: str) -> list[Scene]:
    """Return three hard-coded scenes whose image prompts start with ``prompt``."""
    return [
        Scene(
            image_prompt=f"{prompt},城市夜景,霓虹灯,电影感",
            video_motion="缓慢推进镜头,轻微摇镜",
            narration="夜色温柔落在街灯上",
        ),
        Scene(
            image_prompt=f"{prompt},咖啡店窗边,暖光,细雨",
            video_motion="侧向平移,人物轻轻抬头",
            narration="雨声里藏着一段回忆",
        ),
        Scene(
            image_prompt=f"{prompt},桥上远景,车流光轨,温暖",
            video_motion="拉远全景,光轨流动",
            narration="我们在光里学会告别",
        ),
    ]


def _should_allow_llm_without_key(cfg: AppConfig) -> bool:
    """Return True when the configured API-key environment variable is set.

    NOTE: despite the name, this reports whether a key IS available. The
    variable name is read from ``openai.api_key_env`` (default
    ``OPENAI_API_KEY``); an unset or empty variable yields False. The name is
    kept unchanged so existing callers keep working.
    """
    api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
    return bool(os.environ.get(api_key_env))


def _generate_scenes_for_run(prompt: str, cfg: AppConfig, mock: bool) -> list[Scene]:
    """Generate scenes for ``prompt``, using canned scenes where mock mode allows.

    In mock mode without an API key the fallback scenes are returned directly.
    In mock mode with a key, ``generate_scenes`` is tried and any failure falls
    back to the canned scenes (logged as a warning). Outside mock mode a
    failure of ``generate_scenes`` propagates to the caller.
    """
    if mock and not _should_allow_llm_without_key(cfg):
        return _fallback_scenes(prompt)
    try:
        return generate_scenes(prompt, cfg)
    except Exception as exc:
        if not mock:
            raise
        # Deliberate best effort in mock mode, but do not hide the cause.
        logger.warning(
            "Scene generation failed in mock mode; using fallback scenes: %s", exc
        )
        return _fallback_scenes(prompt)


async def run_pipeline(prompt: str, cfg: AppConfig, mock: bool) -> Path:
    """Run the full prompt-to-video pipeline and return ``render_final``'s result.

    Args:
        prompt: User creative prompt used for scene generation.
        cfg: Application configuration.
        mock: When True, each scene's video is a still-image clip built from
            ``./assets/mock.png`` and ComfyUI is not contacted.

    Raises:
        IndexError: In non-mock mode, when a ComfyUI workflow run yields no
            output files for a scene.
    """
    scenes = _generate_scenes_for_run(prompt, cfg, mock=mock)
    audios = await synthesize_scenes([s.narration for s in scenes], cfg)
    # NOTE(review): zip() below stops at the shorter sequence; this assumes
    # synthesize_scenes returns one audio per narration - confirm.
    segments: list[Segment] = []

    if mock:
        # Mock-only settings and assets are resolved here so that real runs
        # neither read them nor write the placeholder image.
        fps = int(cfg.get("video.mock_fps", 24))
        mock_size = cfg.get("video.mock_size", [1024, 576])
        w, h = int(mock_size[0]), int(mock_size[1])
        mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h))
        for i, (scene, audio) in enumerate(zip(scenes, audios), start=1):
            vpath = Path("./assets/mock_videos") / f"scene_{i:02d}.mp4"
            _make_mock_video(vpath, mock_image, audio.duration_s, fps=fps)
            segments.append(
                Segment(video_path=vpath, audio_path=audio.path, narration=scene.narration)
            )
        return render_final(segments, cfg)

    comfy = ComfyClient(cfg)
    wf = comfy.load_workflow()
    for i, (scene, audio) in enumerate(zip(scenes, audios), start=1):
        # A fresh random seed per scene, within the positive 32-bit int range.
        seed = random.randint(1, 2_147_483_647)
        wf_i = comfy.inject_params(
            wf,
            image_prompt=scene.image_prompt,
            seed=seed,
            motion_prompt=scene.video_motion or None,
        )
        result = await comfy.run_workflow(wf_i)
        # pick first mp4-like output; if none, fall back to first file.
        candidates = [
            p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}
        ]
        try:
            video_path = candidates[0] if candidates else result.output_files[0]
        except IndexError:
            # Same exception type as before (callers may catch it), but with
            # a message that says which scene produced nothing.
            raise IndexError(
                f"ComfyUI workflow for scene {i} produced no output files"
            ) from None
        segments.append(
            Segment(video_path=video_path, audio_path=audio.path, narration=scene.narration)
        )
    return render_final(segments, cfg)


def script_only(prompt: str, cfg: AppConfig, mock: bool) -> int:
    """Generate scenes and print them to stdout without rendering anything.

    Output is ``SCRIPT_BEGIN``, one ``SCENE_JSON`` line per scene (indices
    start at 1), then ``SCRIPT_END``. Always returns 0.
    """
    scenes = _generate_scenes_for_run(prompt, cfg, mock=mock)
    _emit("SCRIPT_BEGIN")
    for idx, s in enumerate(scenes, start=1):
        _emit_scene(idx, s)
    _emit("SCRIPT_END")
    return 0


def main() -> int:
    """Parse command-line arguments and run the requested mode.

    Returns the process exit code: the result of ``script_only`` when
    ``--script-only`` is given, otherwise 0 after printing the value returned
    by ``run_pipeline``.
    """
    parser = argparse.ArgumentParser(description="AIGC auto video generation POC")
    parser.add_argument("--prompt", required=True, help="User creative prompt")
    parser.add_argument("--config", default="./configs/config.yaml", help="Config yaml path")
    parser.add_argument("--mock", action="store_true", help="Mock mode (no ComfyUI needed)")
    parser.add_argument(
        "--script-only",
        action="store_true",
        help="Only generate script/scenes and print to stdout (for Node.js streaming)",
    )
    args = parser.parse_args()

    cfg = AppConfig.load(args.config)
    if args.script_only:
        return script_only(args.prompt, cfg, mock=args.mock)
    out = asyncio.run(run_pipeline(args.prompt, cfg, mock=args.mock))
    print(str(out))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())