refactor: 优化架构

This commit is contained in:
Daniel
2026-03-25 19:35:37 +08:00
parent 34786b37c7
commit 508c28ce31
184 changed files with 2199 additions and 241 deletions

View File

@@ -1,42 +1,53 @@
from __future__ import annotations
import asyncio
import os
import random
from pathlib import Path
from typing import Any
from moviepy import AudioFileClip, CompositeVideoClip, TextClip, VideoFileClip, vfx
import numpy as np
from moviepy import AudioFileClip, VideoClip
from PIL import Image
from urllib.request import urlopen
from .audio_gen import synthesize_one
from .comfy_client import ComfyClient
from .comfy_client import generate_image as comfy_generate_image
from .config import AppConfig
from .render_pipeline import render_shot as render_shot_pipeline
def _fit_video_to_audio(video: VideoFileClip, audio: AudioFileClip) -> VideoFileClip:
    """Attach *audio* to *video*, looping or trimming the video to match lengths.

    If either clip reports an unknown duration, the audio is attached as-is
    without any length adjustment.
    """
    audio_len, video_len = audio.duration, video.duration
    if audio_len is None or video_len is None:
        return video.with_audio(audio)
    if audio_len > video_len:
        # Narration outlasts the footage: loop the video up to the audio length.
        video = video.with_effects([vfx.Loop(duration=audio_len)])
    elif video_len > audio_len:
        # Footage outlasts the narration: trim the video tail.
        video = video.subclipped(0, audio_len)
    return video.with_audio(audio)
# Directory for locally cached fallback assets (e.g. the demo placeholder image).
ASSETS_DIR = "assets"
# Placeholder image path used when real image generation is unavailable or fails.
DEMO_IMAGE = os.path.join(ASSETS_DIR, "demo.jpg")
def _subtitle_clip(text: str, size: tuple[int, int], duration: float) -> TextClip:
    """Build a bottom-centered subtitle clip for *text* lasting *duration* seconds.

    The caption wraps at 92% of the frame width taken from *size* (width, height)
    and is rendered slightly translucent.
    """
    wrap_width = int(size[0] * 0.92)
    caption = TextClip(
        text=text,
        font_size=44,
        color="white",
        stroke_color="black",
        stroke_width=2,
        size=(wrap_width, None),
        method="caption",
    )
    caption = caption.with_position(("center", "bottom"))
    caption = caption.with_duration(duration)
    return caption.with_opacity(0.95)
def ensure_demo_image() -> None:
    """Download a placeholder image to DEMO_IMAGE unless it is already cached.

    Creates ASSETS_DIR on demand and skips the network fetch when the file
    already exists on disk.
    """
    os.makedirs(ASSETS_DIR, exist_ok=True)
    if os.path.exists(DEMO_IMAGE):
        return
    # picsum.photos serves a random placeholder at the requested resolution.
    placeholder_url = "https://picsum.photos/1280/720"
    with urlopen(placeholder_url, timeout=30) as resp:
        payload = resp.read()
    with open(DEMO_IMAGE, "wb") as fh:
        fh.write(payload)
def generate_image_mock(prompt: str) -> str:
    """Return the path of the cached demo placeholder image, ignoring *prompt*.

    The parameter is accepted only to stay signature-compatible with the real
    image generator so callers can use this as a drop-in fallback.
    """
    del prompt  # intentionally unused
    ensure_demo_image()
    return DEMO_IMAGE
def enrich_prompt(prompt_text: str) -> str:
    """Append a fixed cinematic style suffix to *prompt_text*.

    An empty or None prompt yields just the bare style string; otherwise the
    stripped prompt is joined with the suffix by a comma.
    """
    style_suffix = "cinematic, ultra realistic, 4k, detailed lighting"
    cleaned = (prompt_text or "").strip()
    return f"{cleaned}, {style_suffix}" if cleaned else style_suffix
async def _render_shot_async(
@@ -55,49 +66,102 @@ async def _render_shot_async(
shot_id = str(shot.get("shot_id", "unknown"))
image_prompt = str(shot.get("image_prompt", "")).strip()
motion = str(shot.get("motion", "")).strip()
prompt_text = str(shot.get("prompt", image_prompt) or image_prompt).strip()
tts_text = str(shot.get("tts", "")).strip()
duration_s = max(1.0, float(shot.get("duration", 3)))
voice = str(cfg.get("tts.voice", "zh-CN-XiaoxiaoNeural"))
rate = str(cfg.get("tts.rate", "+0%"))
volume = str(cfg.get("tts.volume", "+0%"))
audio_path = audio_dir / f"shot_{shot_id}.mp3"
audio_asset = await synthesize_one(tts_text or " ", audio_path, voice, rate, volume)
audio_asset: Any | None = None
if tts_text:
audio_path = audio_dir / f"shot_{shot_id}.mp3"
audio_asset = await synthesize_one(tts_text, audio_path, voice, rate, volume)
# Use config-defined output resolution for stable concatenation.
mock_size = cfg.get("video.mock_size", [1024, 576])
w, h = int(mock_size[0]), int(mock_size[1])
fps = int(cfg.get("video.mock_fps", 24))
if audio_asset and audio_asset.duration_s:
duration_s = max(duration_s, float(audio_asset.duration_s))
# shot -> image (ComfyUI first; fallback to demo.jpg)
image_path: str
if mock:
from engine.main import _ensure_mock_image, _make_mock_video # local import to avoid circular at module import
mock_size = cfg.get("video.mock_size", [1024, 576])
w, h = int(mock_size[0]), int(mock_size[1])
mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h))
fps = int(cfg.get("video.mock_fps", 24))
raw_video_path = out_dir / f"shot_raw_{shot_id}.mp4"
_make_mock_video(raw_video_path, mock_image, max(duration_s, audio_asset.duration_s), fps=fps)
image_path = generate_image_mock(prompt_text)
else:
comfy = ComfyClient(cfg)
wf = comfy.load_workflow()
seed = random.randint(1, 2_147_483_647)
wf_i = comfy.inject_params(wf, image_prompt=image_prompt, seed=seed, motion_prompt=motion or None)
result = await comfy.run_workflow(wf_i)
candidates = [p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}]
raw_video_path = candidates[0] if candidates else result.output_files[0]
clip_out = clips_dir / f"shot_{shot_id}.mp4"
v = VideoFileClip(str(raw_video_path))
a = AudioFileClip(str(audio_asset.path))
try:
v2 = _fit_video_to_audio(v, a)
w2, h2 = v2.size
subtitle = _subtitle_clip(tts_text, (w2, h2), v2.duration or a.duration or duration_s)
comp = CompositeVideoClip([v2, subtitle])
try:
comp.write_videofile(str(clip_out), codec="libx264", audio_codec="aac", fps=v2.fps or 24, preset="veryfast")
finally:
comp.close()
enriched = enrich_prompt(prompt_text)
# Store generated images directly under outputs/{task_id}
# (as required by verification: outputs/{task_id}/*.png).
image_path = str(
comfy_generate_image(
enriched,
out_dir,
cfg=cfg,
timeout_s=60,
retry=2,
filename_prefix=f"shot_{shot_id}",
)
)
print(f"[SHOT_RENDER] {shot_id} -> image generated: {image_path}")
except Exception as e:
print(f"[WARN] Comfy failed, fallback to demo: {e}")
image_path = generate_image_mock(prompt_text)
# Ensure image exists before rendering.
if not image_path or not os.path.exists(image_path):
image_path = generate_image_mock(prompt_text)
base_img = Image.open(image_path).convert("RGB")
def make_frame(t: float):
# Subtle zoom-in from 1.00 to ~1.03 over the clip duration.
progress = float(t) / max(duration_s, 1e-6)
progress = max(0.0, min(1.0, progress))
scale = 1.0 + 0.03 * progress
new_w = max(w, int(w * scale))
new_h = max(h, int(h * scale))
frame = base_img.resize((new_w, new_h), Image.LANCZOS)
left = (new_w - w) // 2
top = (new_h - h) // 2
frame = frame.crop((left, top, left + w, top + h))
return np.array(frame)
# image -> video
video = VideoClip(make_frame, duration=duration_s, has_constant_size=True)
# optional audio -> clip
audio_clip: AudioFileClip | None = None
if audio_asset and os.path.exists(str(audio_asset.path)):
audio_clip = AudioFileClip(str(audio_asset.path))
video = video.with_audio(audio_clip)
# output
clip_out = clips_dir / f"shot_{shot_id}.mp4"
print(f"[SHOT_RENDER] {shot_id} -> {clip_out}")
try:
video.write_videofile(
str(clip_out),
fps=fps,
codec="libx264",
audio_codec="aac",
preset="veryfast",
threads=2,
)
finally:
v.close()
a.close()
try:
video.close()
except Exception:
pass
if audio_clip is not None:
try:
audio_clip.close()
except Exception:
pass
return str(clip_out)
@@ -109,5 +173,5 @@ def render_shot(
mock: bool = False,
) -> str:
cfg2 = cfg or AppConfig.load("./configs/config.yaml")
return asyncio.run(_render_shot_async(shot, output_dir, cfg2, mock=mock))
return render_shot_pipeline(shot, cfg2, output_dir, mock=mock)