162 lines
5.6 KiB
Python
162 lines
5.6 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import random
|
|
from pathlib import Path
|
|
|
|
from fastapi import FastAPI
|
|
from moviepy import ImageClip
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
from engine.audio_gen import synthesize_scenes
|
|
from engine.comfy_client import ComfyClient
|
|
from engine.config import AppConfig
|
|
from engine.script_gen import generate_scenes
|
|
from engine.types import Scene
|
|
from engine.video_editor import Segment, render_final
|
|
|
|
|
|
# FastAPI application object. No routes are registered anywhere in this module;
# NOTE(review): presumably routes are mounted/added elsewhere — confirm before
# assuming this server does anything beyond existing.
app = FastAPI(title="AiVideo POC")
|
|
|
|
|
|
def _ensure_mock_image(path: Path, size: tuple[int, int]) -> Path:
    """Create a placeholder image labelled "MOCK" at *path* if it does not exist.

    Parameters:
        path: destination file; parent directories are created as needed.
        size: (width, height) of the generated image in pixels.

    Returns:
        The same *path*, whether the file already existed or was just written.
    """
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        canvas = Image.new("RGB", size, color=(20, 24, 33))
        painter = ImageDraw.Draw(canvas)
        try:
            label_font = ImageFont.load_default()
        except Exception:
            # Fall back to Pillow's implicit default font handling.
            label_font = None
        # Fixed offsets roughly centre the 4-character label on the canvas.
        origin = (size[0] // 2 - 30, size[1] // 2 - 10)
        painter.text(origin, "MOCK", fill=(240, 240, 240), font=label_font)
        canvas.save(path)
    return path
|
|
|
|
|
|
def _make_mock_video(out_path: Path, image_path: Path, duration_s: float, fps: int) -> Path:
    """Render a still image into a silent H.264 mp4 clip.

    Parameters:
        out_path: destination video file; parent directories are created.
        image_path: source still image to loop for the clip's duration.
        duration_s: requested duration; clamped to a 0.5 s minimum.
        fps: frame rate for both the clip and the encoder.

    Returns:
        *out_path* once the file has been written.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    clip_len = max(0.5, duration_s)
    still = ImageClip(str(image_path)).with_duration(clip_len).with_fps(fps)
    try:
        still.write_videofile(str(out_path), codec="libx264", audio=False, fps=fps, preset="veryfast")
    finally:
        # Release ffmpeg resources even if encoding fails.
        still.close()
    return out_path
|
|
|
|
|
|
def _emit(line: str) -> None:
|
|
print(line, flush=True)
|
|
|
|
|
|
def _emit_scene(scene_idx: int, scene: Scene) -> None:
    """Serialize one scene as a `SCENE_JSON <json>` protocol line on stdout.

    The JSON keeps non-ASCII text (Chinese prompts/narration) unescaped.
    """
    scene_dict = {
        "index": scene_idx,
        "image_prompt": scene.image_prompt,
        "video_motion": scene.video_motion,
        "narration": scene.narration,
    }
    _emit(f"SCENE_JSON {json.dumps(scene_dict, ensure_ascii=False)}")
|
|
|
|
|
|
def _fallback_scenes(prompt: str) -> list[Scene]:
    """Return a canned three-scene storyboard built around *prompt*.

    Used when LLM scene generation is unavailable (mock mode without an
    API key) or when it fails in mock mode.
    """
    presets = [
        ("城市夜景,霓虹灯,电影感", "缓慢推进镜头,轻微摇镜", "夜色温柔落在街灯上"),
        ("咖啡店窗边,暖光,细雨", "侧向平移,人物轻轻抬头", "雨声里藏着一段回忆"),
        ("桥上远景,车流光轨,温暖", "拉远全景,光轨流动", "我们在光里学会告别"),
    ]
    return [
        Scene(image_prompt=f"{prompt},{style}", video_motion=motion, narration=voice)
        for style, motion, voice in presets
    ]
|
|
|
|
|
|
def _should_allow_llm_without_key(cfg: AppConfig) -> bool:
|
|
api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
|
|
return bool(os.environ.get(api_key_env))
|
|
|
|
|
|
def _generate_scenes_for_run(prompt: str, cfg: AppConfig, mock: bool) -> list[Scene]:
    """Produce the scene list for one run, honouring mock-mode fallbacks.

    Behaviour:
      * mock mode without an available API key -> canned fallback scenes;
      * otherwise try the LLM; a failure falls back in mock mode but
        propagates in real mode.
    """
    llm_allowed = (not mock) or _should_allow_llm_without_key(cfg)
    if not llm_allowed:
        return _fallback_scenes(prompt)
    try:
        return generate_scenes(prompt, cfg)
    except Exception:
        # Mock runs degrade gracefully to the canned storyboard;
        # real runs surface the error to the caller.
        if not mock:
            raise
        return _fallback_scenes(prompt)
|
|
|
|
|
|
async def run_pipeline(prompt: str, cfg: AppConfig, mock: bool) -> Path:
    """Run the full generation pipeline and return the final video path.

    Steps: generate scenes, synthesize narration audio, produce one video
    per scene (placeholder clips in mock mode, ComfyUI renders otherwise),
    then concatenate everything with ``render_final``.

    Parameters:
        prompt: the user's creative prompt.
        cfg: application configuration.
        mock: when True, skip ComfyUI and render still-image mock clips.

    Returns:
        Path of the rendered final video.

    Raises:
        RuntimeError: if a ComfyUI workflow run produces no output files.
    """
    scenes = _generate_scenes_for_run(prompt, cfg, mock=mock)
    audios = await synthesize_scenes([s.narration for s in scenes], cfg)

    segments: list[Segment] = []

    if mock:
        # Mock assets are only needed — and now only created — on this branch.
        # (Previously ./assets/mock.png was written even for real ComfyUI runs.)
        fps = int(cfg.get("video.mock_fps", 24))
        mock_size = cfg.get("video.mock_size", [1024, 576])
        w, h = int(mock_size[0]), int(mock_size[1])
        mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h))
        for i, (scene, audio) in enumerate(zip(scenes, audios), start=1):
            vpath = Path("./assets/mock_videos") / f"scene_{i:02d}.mp4"
            # Each mock clip lasts as long as its narration audio.
            _make_mock_video(vpath, mock_image, audio.duration_s, fps=fps)
            segments.append(Segment(video_path=vpath, audio_path=audio.path, narration=scene.narration))
        return render_final(segments, cfg)

    comfy = ComfyClient(cfg)
    wf = comfy.load_workflow()
    for i, (scene, audio) in enumerate(zip(scenes, audios), start=1):
        seed = random.randint(1, 2_147_483_647)
        wf_i = comfy.inject_params(wf, image_prompt=scene.image_prompt, seed=seed, motion_prompt=scene.video_motion or None)
        result = await comfy.run_workflow(wf_i)
        # Fail loudly instead of the bare IndexError the old output_files[0]
        # access produced when a workflow returned nothing.
        if not result.output_files:
            raise RuntimeError(f"ComfyUI workflow produced no output files for scene {i}")
        # Pick first mp4-like output; if none, fall back to first file.
        candidates = [p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}]
        video_path = candidates[0] if candidates else result.output_files[0]
        segments.append(Segment(video_path=video_path, audio_path=audio.path, narration=scene.narration))
    return render_final(segments, cfg)
|
|
|
|
|
|
def script_only(prompt: str, cfg: AppConfig, mock: bool) -> int:
    """Generate scenes only and stream them to stdout between
    SCRIPT_BEGIN/SCRIPT_END markers (for the Node.js consumer).

    Returns 0 as the process exit code.
    """
    generated = _generate_scenes_for_run(prompt, cfg, mock=mock)
    _emit("SCRIPT_BEGIN")
    for position, scene in enumerate(generated, start=1):
        _emit_scene(position, scene)
    _emit("SCRIPT_END")
    return 0
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: parse arguments, load config, then either print
    the script (``--script-only``) or run the full async pipeline.

    Returns the process exit code (0 on success).
    """
    cli = argparse.ArgumentParser(description="AIGC auto video generation POC")
    cli.add_argument("--prompt", required=True, help="User creative prompt")
    cli.add_argument("--config", default="./configs/config.yaml", help="Config yaml path")
    cli.add_argument("--mock", action="store_true", help="Mock mode (no ComfyUI needed)")
    cli.add_argument(
        "--script-only",
        action="store_true",
        help="Only generate script/scenes and print to stdout (for Node.js streaming)",
    )
    ns = cli.parse_args()

    cfg = AppConfig.load(ns.config)
    if ns.script_only:
        return script_only(ns.prompt, cfg, mock=ns.mock)
    final_path = asyncio.run(run_pipeline(ns.prompt, cfg, mock=ns.mock))
    # Last stdout line is the final video path, for the calling process.
    print(str(final_path))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|