"""Render a single shot into an MP4 clip: still image with a slow zoom, plus optional TTS audio."""

from __future__ import annotations

import asyncio
import os
import random
from pathlib import Path
from typing import Any
from urllib.request import urlopen

import numpy as np
from moviepy import AudioFileClip, VideoClip
from PIL import Image

from .audio_gen import synthesize_one
from .comfy_client import generate_image as comfy_generate_image
from .config import AppConfig
from .render_pipeline import render_shot as render_shot_pipeline

ASSETS_DIR = "assets"
DEMO_IMAGE = os.path.join(ASSETS_DIR, "demo.jpg")


def ensure_demo_image() -> None:
    """Make sure the placeholder image exists at ``DEMO_IMAGE``.

    Creates ``ASSETS_DIR`` if needed and, when the file is missing, downloads
    a 1280x720 placeholder. Errors from ``urlopen`` or from writing the file
    propagate to the caller.
    """
    os.makedirs(ASSETS_DIR, exist_ok=True)
    if os.path.exists(DEMO_IMAGE):
        return

    # Simple placeholder image source.
    url = "https://picsum.photos/1280/720"
    with urlopen(url, timeout=30) as resp:
        data = resp.read()

    # Write to a sibling temp file and rename it into place so that a failed
    # write can never leave a truncated DEMO_IMAGE behind (which the
    # existence check above would otherwise accept forever).
    tmp_path = Path(f"{DEMO_IMAGE}.part")
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, DEMO_IMAGE)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)


def generate_image_mock(prompt: str) -> str:
    """Return the path of the demo image, ignoring ``prompt``.

    The parameter exists only to keep the signature compatible with the real
    image generator.
    """
    # Keep interface compatible with the requested interface.
    _ = prompt
    ensure_demo_image()
    return DEMO_IMAGE


def enrich_prompt(prompt_text: str) -> str:
    """Append the fixed style suffix to ``prompt_text``.

    Returns the style string alone when ``prompt_text`` is empty, ``None`` or
    whitespace-only; otherwise ``"<stripped prompt>, <style>"``.
    """
    style = "cinematic, ultra realistic, 4k, detailed lighting"
    pt = (prompt_text or "").strip()
    if not pt:
        return style
    return f"{pt}, {style}"


def _str_or_default(value: Any, default: str) -> str:
    """Return ``str(value)``, or ``default`` when ``value`` is ``None``."""
    return default if value is None else str(value)


def _close_quietly(resource: Any, label: str) -> None:
    """Best-effort ``close()`` that reports, but never raises, failures."""
    if resource is None:
        return
    try:
        resource.close()
    except Exception as e:  # cleanup must not mask the primary error
        print(f"[WARN] Failed to close {label}: {e}")


def _resolve_image_path(
    prompt_text: str,
    shot_id: str,
    out_dir: Path,
    cfg: AppConfig,
    *,
    mock: bool,
) -> str:
    """Produce an image file for the shot and return its path.

    Uses the demo image when ``mock`` is true. Otherwise calls ComfyUI and
    falls back to the demo image if generation raises or the returned path
    does not exist.
    """
    if mock:
        image_path = generate_image_mock(prompt_text)
    else:
        try:
            enriched = enrich_prompt(prompt_text)
            # Store generated images directly under outputs/{task_id}
            # (as required by verification: outputs/{task_id}/*.png).
            image_path = str(
                comfy_generate_image(
                    enriched,
                    out_dir,
                    cfg=cfg,
                    timeout_s=60,
                    retry=2,
                    filename_prefix=f"shot_{shot_id}",
                )
            )
            print(f"[SHOT_RENDER] {shot_id} -> image generated: {image_path}")
        except Exception as e:
            # Deliberate best-effort: any generator failure degrades to the demo image.
            print(f"[WARN] Comfy failed, fallback to demo: {e}")
            image_path = generate_image_mock(prompt_text)

    # Ensure image exists before rendering.
    if not image_path or not os.path.exists(image_path):
        image_path = generate_image_mock(prompt_text)
    return image_path


async def _render_shot_async(
    shot: dict[str, Any],
    output_dir: str | Path,
    cfg: AppConfig,
    *,
    mock: bool = False,
) -> str:
    """Render one shot to ``<output_dir>/clips/shot_<shot_id>.mp4``.

    Args:
        shot: Shot description. Keys read here: ``shot_id``, ``image_prompt``,
            ``prompt`` (falls back to ``image_prompt`` when missing/empty),
            ``tts`` and ``duration`` (seconds, default 3, minimum 1.0).
            A key that is present with value ``None`` is treated as missing.
        output_dir: Root output directory; ``clips/`` and ``audio/`` are
            created beneath it.
        cfg: Configuration queried via ``cfg.get(key, default)`` for
            ``tts.voice``, ``tts.rate``, ``tts.volume``, ``video.mock_size``
            and ``video.mock_fps``.
        mock: When true, skip ComfyUI and use the demo image.

    Returns:
        Path of the written MP4 file, as a string.

    NOTE(review): ``comfy_generate_image`` and ``write_videofile`` are called
    synchronously inside this coroutine; if they block, the event loop is
    stalled for their duration. Consider ``asyncio.to_thread`` if this runs
    alongside other tasks.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    clips_dir = out_dir / "clips"
    audio_dir = out_dir / "audio"
    clips_dir.mkdir(parents=True, exist_ok=True)
    audio_dir.mkdir(parents=True, exist_ok=True)

    # Treat explicit None values like missing keys; str(None) would otherwise
    # leak the literal "None" into file names, prompts and spoken text.
    shot_id = _str_or_default(shot.get("shot_id"), "unknown")
    image_prompt = _str_or_default(shot.get("image_prompt"), "").strip()
    prompt_text = str(shot.get("prompt") or image_prompt).strip()
    tts_text = _str_or_default(shot.get("tts"), "").strip()

    raw_duration = shot.get("duration")
    duration_s = max(1.0, float(3 if raw_duration is None else raw_duration))

    voice = str(cfg.get("tts.voice", "zh-CN-XiaoxiaoNeural"))
    rate = str(cfg.get("tts.rate", "+0%"))
    volume = str(cfg.get("tts.volume", "+0%"))

    audio_asset: Any | None = None
    if tts_text:
        audio_path = audio_dir / f"shot_{shot_id}.mp3"
        audio_asset = await synthesize_one(tts_text, audio_path, voice, rate, volume)

    # Use config-defined output resolution for stable concatenation.
    mock_size = cfg.get("video.mock_size", [1024, 576])
    w, h = int(mock_size[0]), int(mock_size[1])
    fps = int(cfg.get("video.mock_fps", 24))

    # Never cut narration short: stretch the clip to the audio length.
    if audio_asset and audio_asset.duration_s:
        duration_s = max(duration_s, float(audio_asset.duration_s))

    # shot -> image (ComfyUI first; fallback to demo.jpg)
    image_path = _resolve_image_path(prompt_text, shot_id, out_dir, cfg, mock=mock)

    # Close the source file handle promptly; convert() returns an independent
    # in-memory image that stays valid after the file is closed.
    with Image.open(image_path) as src_img:
        base_img = src_img.convert("RGB")

    def make_frame(t: float):
        # Subtle zoom-in from 1.00 to ~1.03 over the clip duration.
        progress = float(t) / max(duration_s, 1e-6)
        progress = max(0.0, min(1.0, progress))
        scale = 1.0 + 0.03 * progress
        new_w = max(w, int(w * scale))
        new_h = max(h, int(h * scale))
        frame = base_img.resize((new_w, new_h), Image.LANCZOS)
        # Centre-crop back to the target size so every frame is exactly w x h.
        left = (new_w - w) // 2
        top = (new_h - h) // 2
        frame = frame.crop((left, top, left + w, top + h))
        return np.array(frame)

    video: VideoClip | None = None
    audio_clip: AudioFileClip | None = None
    try:
        # image -> video
        video = VideoClip(make_frame, duration=duration_s, has_constant_size=True)

        # optional audio -> clip
        if audio_asset and os.path.exists(str(audio_asset.path)):
            audio_clip = AudioFileClip(str(audio_asset.path))
            video = video.with_audio(audio_clip)

        # output
        clip_out = clips_dir / f"shot_{shot_id}.mp4"
        print(f"[SHOT_RENDER] {shot_id} -> {clip_out}")
        video.write_videofile(
            str(clip_out),
            fps=fps,
            codec="libx264",
            audio_codec="aac",
            preset="veryfast",
            threads=2,
        )
    finally:
        # Release everything even when construction or encoding failed.
        _close_quietly(video, "video clip")
        _close_quietly(audio_clip, "audio clip")
        _close_quietly(base_img, "base image")

    return str(clip_out)


def render_shot(
    shot: dict[str, Any],
    output_dir: str | Path,
    cfg: AppConfig | None = None,
    *,
    mock: bool = False,
) -> str:
    """Render a shot by delegating to ``render_pipeline.render_shot``.

    When ``cfg`` is not supplied (or is falsy), the configuration is loaded
    from ``./configs/config.yaml``. Note that the pipeline function takes
    ``cfg`` before ``output_dir``.
    """
    cfg2 = cfg or AppConfig.load("./configs/config.yaml")
    return render_shot_pipeline(shot, cfg2, output_dir, mock=mock)