refactor: 优化架构

This commit is contained in:
Daniel
2026-03-25 19:35:37 +08:00
parent 34786b37c7
commit 508c28ce31
184 changed files with 2199 additions and 241 deletions

View File

@@ -1,42 +1,53 @@
from __future__ import annotations
import asyncio
import os
import random
from pathlib import Path
from typing import Any
from moviepy import AudioFileClip, CompositeVideoClip, TextClip, VideoFileClip, vfx
import numpy as np
from moviepy import AudioFileClip, VideoClip
from PIL import Image
from urllib.request import urlopen
from .audio_gen import synthesize_one
from .comfy_client import ComfyClient
from .comfy_client import generate_image as comfy_generate_image
from .config import AppConfig
from .render_pipeline import render_shot as render_shot_pipeline
def _fit_video_to_audio(video: VideoFileClip, audio: AudioFileClip) -> VideoFileClip:
    """Attach *audio* to *video*, looping or trimming the video to match lengths.

    If either clip reports an unknown duration, the audio is attached as-is
    without any length adjustment.
    """
    audio_len, video_len = audio.duration, video.duration
    if audio_len is None or video_len is None:
        return video.with_audio(audio)
    if audio_len > video_len:
        # Narration outlasts the footage: loop the video up to the audio length.
        video = video.with_effects([vfx.Loop(duration=audio_len)])
    elif video_len > audio_len:
        # Footage outlasts the narration: trim the video tail.
        video = video.subclipped(0, audio_len)
    return video.with_audio(audio)
# Directory for locally cached fallback assets (e.g. the demo placeholder image).
ASSETS_DIR = "assets"
# Placeholder image path used when real image generation is unavailable or fails.
DEMO_IMAGE = os.path.join(ASSETS_DIR, "demo.jpg")
def _subtitle_clip(text: str, size: tuple[int, int], duration: float) -> TextClip:
    """Build a bottom-centered subtitle clip for *text* lasting *duration* seconds.

    The caption wraps at 92% of the frame width taken from *size* (width, height)
    and is rendered slightly translucent.
    """
    wrap_width = int(size[0] * 0.92)
    caption = TextClip(
        text=text,
        font_size=44,
        color="white",
        stroke_color="black",
        stroke_width=2,
        size=(wrap_width, None),
        method="caption",
    )
    caption = caption.with_position(("center", "bottom"))
    caption = caption.with_duration(duration)
    return caption.with_opacity(0.95)
def ensure_demo_image() -> None:
    """Download a placeholder image to DEMO_IMAGE unless it is already cached.

    Creates ASSETS_DIR on demand and skips the network fetch when the file
    already exists on disk.
    """
    os.makedirs(ASSETS_DIR, exist_ok=True)
    if os.path.exists(DEMO_IMAGE):
        return
    # picsum.photos serves a random placeholder at the requested resolution.
    placeholder_url = "https://picsum.photos/1280/720"
    with urlopen(placeholder_url, timeout=30) as resp:
        payload = resp.read()
    with open(DEMO_IMAGE, "wb") as fh:
        fh.write(payload)
def generate_image_mock(prompt: str) -> str:
    """Return the path of the cached demo placeholder image, ignoring *prompt*.

    The parameter is accepted only to stay signature-compatible with the real
    image generator so callers can use this as a drop-in fallback.
    """
    del prompt  # intentionally unused
    ensure_demo_image()
    return DEMO_IMAGE
def enrich_prompt(prompt_text: str) -> str:
    """Append a fixed cinematic style suffix to *prompt_text*.

    An empty or None prompt yields just the bare style string; otherwise the
    stripped prompt is joined with the suffix by a comma.
    """
    style_suffix = "cinematic, ultra realistic, 4k, detailed lighting"
    cleaned = (prompt_text or "").strip()
    return f"{cleaned}, {style_suffix}" if cleaned else style_suffix
async def _render_shot_async(
@@ -55,49 +66,102 @@ async def _render_shot_async(
shot_id = str(shot.get("shot_id", "unknown"))
image_prompt = str(shot.get("image_prompt", "")).strip()
motion = str(shot.get("motion", "")).strip()
prompt_text = str(shot.get("prompt", image_prompt) or image_prompt).strip()
tts_text = str(shot.get("tts", "")).strip()
duration_s = max(1.0, float(shot.get("duration", 3)))
voice = str(cfg.get("tts.voice", "zh-CN-XiaoxiaoNeural"))
rate = str(cfg.get("tts.rate", "+0%"))
volume = str(cfg.get("tts.volume", "+0%"))
audio_path = audio_dir / f"shot_{shot_id}.mp3"
audio_asset = await synthesize_one(tts_text or " ", audio_path, voice, rate, volume)
audio_asset: Any | None = None
if tts_text:
audio_path = audio_dir / f"shot_{shot_id}.mp3"
audio_asset = await synthesize_one(tts_text, audio_path, voice, rate, volume)
# Use config-defined output resolution for stable concatenation.
mock_size = cfg.get("video.mock_size", [1024, 576])
w, h = int(mock_size[0]), int(mock_size[1])
fps = int(cfg.get("video.mock_fps", 24))
if audio_asset and audio_asset.duration_s:
duration_s = max(duration_s, float(audio_asset.duration_s))
# shot -> image (ComfyUI first; fallback to demo.jpg)
image_path: str
if mock:
from engine.main import _ensure_mock_image, _make_mock_video # local import to avoid circular at module import
mock_size = cfg.get("video.mock_size", [1024, 576])
w, h = int(mock_size[0]), int(mock_size[1])
mock_image = _ensure_mock_image(Path("./assets/mock.png"), (w, h))
fps = int(cfg.get("video.mock_fps", 24))
raw_video_path = out_dir / f"shot_raw_{shot_id}.mp4"
_make_mock_video(raw_video_path, mock_image, max(duration_s, audio_asset.duration_s), fps=fps)
image_path = generate_image_mock(prompt_text)
else:
comfy = ComfyClient(cfg)
wf = comfy.load_workflow()
seed = random.randint(1, 2_147_483_647)
wf_i = comfy.inject_params(wf, image_prompt=image_prompt, seed=seed, motion_prompt=motion or None)
result = await comfy.run_workflow(wf_i)
candidates = [p for p in result.output_files if p.suffix.lower() in {".mp4", ".mov", ".webm"}]
raw_video_path = candidates[0] if candidates else result.output_files[0]
clip_out = clips_dir / f"shot_{shot_id}.mp4"
v = VideoFileClip(str(raw_video_path))
a = AudioFileClip(str(audio_asset.path))
try:
v2 = _fit_video_to_audio(v, a)
w2, h2 = v2.size
subtitle = _subtitle_clip(tts_text, (w2, h2), v2.duration or a.duration or duration_s)
comp = CompositeVideoClip([v2, subtitle])
try:
comp.write_videofile(str(clip_out), codec="libx264", audio_codec="aac", fps=v2.fps or 24, preset="veryfast")
finally:
comp.close()
enriched = enrich_prompt(prompt_text)
# Store generated images directly under outputs/{task_id}
# (as required by verification: outputs/{task_id}/*.png).
image_path = str(
comfy_generate_image(
enriched,
out_dir,
cfg=cfg,
timeout_s=60,
retry=2,
filename_prefix=f"shot_{shot_id}",
)
)
print(f"[SHOT_RENDER] {shot_id} -> image generated: {image_path}")
except Exception as e:
print(f"[WARN] Comfy failed, fallback to demo: {e}")
image_path = generate_image_mock(prompt_text)
# Ensure image exists before rendering.
if not image_path or not os.path.exists(image_path):
image_path = generate_image_mock(prompt_text)
base_img = Image.open(image_path).convert("RGB")
def make_frame(t: float):
# Subtle zoom-in from 1.00 to ~1.03 over the clip duration.
progress = float(t) / max(duration_s, 1e-6)
progress = max(0.0, min(1.0, progress))
scale = 1.0 + 0.03 * progress
new_w = max(w, int(w * scale))
new_h = max(h, int(h * scale))
frame = base_img.resize((new_w, new_h), Image.LANCZOS)
left = (new_w - w) // 2
top = (new_h - h) // 2
frame = frame.crop((left, top, left + w, top + h))
return np.array(frame)
# image -> video
video = VideoClip(make_frame, duration=duration_s, has_constant_size=True)
# optional audio -> clip
audio_clip: AudioFileClip | None = None
if audio_asset and os.path.exists(str(audio_asset.path)):
audio_clip = AudioFileClip(str(audio_asset.path))
video = video.with_audio(audio_clip)
# output
clip_out = clips_dir / f"shot_{shot_id}.mp4"
print(f"[SHOT_RENDER] {shot_id} -> {clip_out}")
try:
video.write_videofile(
str(clip_out),
fps=fps,
codec="libx264",
audio_codec="aac",
preset="veryfast",
threads=2,
)
finally:
v.close()
a.close()
try:
video.close()
except Exception:
pass
if audio_clip is not None:
try:
audio_clip.close()
except Exception:
pass
return str(clip_out)
@@ -109,5 +173,5 @@ def render_shot(
mock: bool = False,
) -> str:
cfg2 = cfg or AppConfig.load("./configs/config.yaml")
return asyncio.run(_render_shot_async(shot, output_dir, cfg2, mock=mock))
return render_shot_pipeline(shot, cfg2, output_dir, mock=mock)