feat: 新增文件

This commit is contained in:
Daniel
2026-03-18 17:36:07 +08:00
commit f99098ec58
702 changed files with 68533 additions and 0 deletions

3
engine/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# Public package API: re-export the Scene dataclass from engine.types.
from .types import Scene
__all__ = ["Scene"]

47
engine/audio_gen.py Normal file
View File

@@ -0,0 +1,47 @@
from __future__ import annotations
import asyncio
from dataclasses import dataclass
from pathlib import Path
import edge_tts
from moviepy.audio.io.AudioFileClip import AudioFileClip
from .config import AppConfig
@dataclass(frozen=True)
class AudioAsset:
    """A synthesized narration audio file together with its measured duration."""

    # Filesystem location of the generated mp3 file.
    path: Path
    # Duration in seconds as measured by MoviePy/ffmpeg (0.0 if unknown).
    duration_s: float
def _audio_duration_seconds(path: Path) -> float:
    """Return the duration of an audio file in seconds (0.0 when unknown)."""
    # MoviePy delegates to ffmpeg, which reports a reliable duration for mp3.
    audio_clip = AudioFileClip(str(path))
    try:
        duration = audio_clip.duration
        return float(duration) if duration else 0.0
    finally:
        audio_clip.close()
async def synthesize_one(text: str, out_path: Path, voice: str, rate: str, volume: str) -> AudioAsset:
    """Synthesize one narration to mp3 via edge-tts and measure its length."""
    # Make sure the destination directory exists before edge-tts writes the file.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tts = edge_tts.Communicate(text=text, voice=voice, rate=rate, volume=volume)
    await tts.save(str(out_path))
    return AudioAsset(path=out_path, duration_s=_audio_duration_seconds(out_path))
async def synthesize_scenes(narrations: list[str], cfg: AppConfig) -> list[AudioAsset]:
    """Synthesize one audio asset per narration concurrently; order follows input."""
    voice = str(cfg.get("tts.voice", "zh-CN-XiaoxiaoNeural"))
    rate = str(cfg.get("tts.rate", "+0%"))
    volume = str(cfg.get("tts.volume", "+0%"))
    out_dir = Path(str(cfg.get("tts.output_dir", "./assets/audio")))
    out_dir.mkdir(parents=True, exist_ok=True)
    # Scene files are numbered from 1: scene_01.mp3, scene_02.mp3, ...
    jobs = [
        synthesize_one(text, out_dir / f"scene_{idx:02d}.mp3", voice, rate, volume)
        for idx, text in enumerate(narrations, start=1)
    ]
    return await asyncio.gather(*jobs)

188
engine/comfy_client.py Normal file
View File

@@ -0,0 +1,188 @@
from __future__ import annotations
import asyncio
import json
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable
import httpx
from .config import AppConfig
@dataclass(frozen=True)
class ComfyResult:
    """Outcome of one ComfyUI workflow run."""

    # Job id assigned by the ComfyUI /prompt endpoint.
    prompt_id: str
    # Output files reported by the job history, mapped onto the local output dir.
    output_files: list[Path]
class ComfyClient:
    """Thin HTTP client for running ComfyUI API-format workflows.

    Submits a workflow via POST /prompt, then polls GET /history until the
    job's reported output files appear in the locally mounted output
    directory.
    """

    def __init__(self, cfg: AppConfig):
        self.cfg = cfg
        self.base_url = str(cfg.get("app.comfy_base_url", "http://127.0.0.1:8188")).rstrip("/")
        self.output_dir = Path(str(cfg.get("app.comfy_output_dir", "./ComfyUI/output")))
        self.workflow_path = Path(str(cfg.get("comfy_workflow.workflow_path", "./workflow_api.json")))
        # Random client id so ComfyUI can associate this session's submissions.
        self._client_id = str(uuid.uuid4())

    def load_workflow(self) -> dict[str, Any]:
        """Read and validate the workflow JSON file configured for this client.

        Raises:
            FileNotFoundError: if the workflow file is missing.
            ValueError: if the JSON root is not an object.
        """
        if not self.workflow_path.exists():
            raise FileNotFoundError(f"workflow file not found: {self.workflow_path}")
        raw = json.loads(self.workflow_path.read_text(encoding="utf-8"))
        if not isinstance(raw, dict):
            raise ValueError(f"workflow_api.json root must be dict, got {type(raw)}")
        return raw

    def _nodes(self, workflow: dict[str, Any]) -> dict[str, Any]:
        # ComfyUI API workflow exports use { node_id: {class_type, inputs, ...}, ... }
        return workflow

    def _find_node_id_by_class_type(self, workflow: dict[str, Any], class_types: Iterable[str]) -> str | None:
        """Return the id of the first node whose class_type is in *class_types*, else None."""
        want = {c.strip() for c in class_types if c and str(c).strip()}
        if not want:
            return None
        for node_id, node in self._nodes(workflow).items():
            if not isinstance(node, dict):
                continue
            ct = node.get("class_type")
            if isinstance(ct, str) and ct in want:
                return str(node_id)
        return None

    def _resolve_node_id(self, workflow: dict[str, Any], configured_id: Any, fallback_class_types_key: str) -> str:
        """Resolve a target node id, preferring an explicitly configured id.

        Falls back to matching by class type using the list configured under
        ``comfy_workflow.<fallback_class_types_key>``.

        Raises:
            KeyError: if the configured id is absent from the workflow, or no
                node matches any fallback class type.
            ValueError: if the fallback config value is not a list.
        """
        if configured_id is not None and str(configured_id).strip():
            node_id = str(configured_id).strip()
            if node_id not in self._nodes(workflow):
                raise KeyError(f"Configured node_id {node_id} not found in workflow")
            return node_id
        class_types = self.cfg.get(f"comfy_workflow.{fallback_class_types_key}", []) or []
        if not isinstance(class_types, list):
            raise ValueError(f"Config comfy_workflow.{fallback_class_types_key} must be list")
        found = self._find_node_id_by_class_type(workflow, [str(x) for x in class_types])
        if not found:
            raise KeyError(f"Cannot resolve node by class types: {class_types}")
        return found

    def inject_params(self, workflow: dict[str, Any], image_prompt: str, seed: int, motion_prompt: str | None = None) -> dict[str, Any]:
        """Return a deep copy of *workflow* with prompt, seed and optional motion injected."""
        wf = json.loads(json.dumps(workflow))  # deep copy via JSON round-trip
        prompt_node_id = self._resolve_node_id(
            wf,
            self.cfg.get("comfy_workflow.prompt_node_id", None),
            "prompt_node_class_types",
        )
        prompt_key = str(self.cfg.get("comfy_workflow.prompt_input_key", "text"))
        self._set_input(wf, prompt_node_id, prompt_key, image_prompt)
        seed_node_id = self._resolve_node_id(
            wf,
            self.cfg.get("comfy_workflow.seed_node_id", None),
            "seed_node_class_types",
        )
        seed_key = str(self.cfg.get("comfy_workflow.seed_input_key", "seed"))
        self._set_input(wf, seed_node_id, seed_key, int(seed))
        # Motion is optional: only injected when both a prompt and a node id are given.
        motion_node_id = self.cfg.get("comfy_workflow.motion_node_id", None)
        if motion_prompt and motion_node_id is not None and str(motion_node_id).strip():
            motion_key = str(self.cfg.get("comfy_workflow.motion_input_key", "text"))
            self._set_input(wf, str(motion_node_id).strip(), motion_key, motion_prompt)
        return wf

    def _set_input(self, workflow: dict[str, Any], node_id: str, key: str, value: Any) -> None:
        """Set ``inputs[key] = value`` on the given node, creating ``inputs`` if absent."""
        node = self._nodes(workflow).get(str(node_id))
        if not isinstance(node, dict):
            raise KeyError(f"Node {node_id} not found")
        inputs = node.get("inputs")
        if inputs is None:
            inputs = {}
            node["inputs"] = inputs
        if not isinstance(inputs, dict):
            raise TypeError(f"Node {node_id} inputs must be dict, got {type(inputs)}")
        inputs[key] = value

    async def _post_prompt(self, client: httpx.AsyncClient, workflow: dict[str, Any]) -> str:
        """Submit the workflow to POST /prompt; return the server-assigned prompt id."""
        url = f"{self.base_url}/prompt"
        payload = {"prompt": workflow, "client_id": self._client_id}
        r = await client.post(url, json=payload)
        r.raise_for_status()
        data = r.json()
        # ComfyUI responds with {"prompt_id": "...", ...}; the former fallback to
        # an uppercase "PROMPT_ID" key matched nothing the server emits and was dropped.
        pid = data.get("prompt_id")
        if not isinstance(pid, str) or not pid:
            raise RuntimeError(f"Unexpected /prompt response: {data}")
        return pid

    async def _get_history(self, client: httpx.AsyncClient, prompt_id: str) -> dict[str, Any] | None:
        """Fetch the history entry for *prompt_id*, or None when not yet available.

        Tries /history/{prompt_id} first, then the full /history listing
        (which is keyed by prompt id).
        """
        for url in (f"{self.base_url}/history/{prompt_id}", f"{self.base_url}/history"):
            try:
                r = await client.get(url)
                if r.status_code == 404:
                    continue
                r.raise_for_status()
                data = r.json()
                if isinstance(data, dict):
                    if prompt_id in data and isinstance(data[prompt_id], dict):
                        return data[prompt_id]
                    if url.endswith(f"/{prompt_id}"):
                        return data
                return None
            except httpx.HTTPStatusError:
                raise
            except Exception:
                # Transient transport/JSON errors: try the next endpoint.
                continue
        return None

    def _extract_output_files(self, history_item: dict[str, Any]) -> list[Path]:
        """Collect output file paths from a history entry, de-duplicated in order.

        Paths honor the ``subfolder`` field ComfyUI reports; it was previously
        ignored, which produced wrong paths for workflows saving into subfolders.
        """
        out: list[Path] = []
        outputs = history_item.get("outputs")
        if not isinstance(outputs, dict):
            return out

        def walk(v: Any) -> None:
            if isinstance(v, dict):
                # ComfyUI stores files like {"filename":"x.mp4","subfolder":"","type":"output"}
                fn = v.get("filename")
                if isinstance(fn, str) and fn.strip():
                    sub = v.get("subfolder")
                    if isinstance(sub, str) and sub.strip():
                        out.append(self.output_dir / sub / fn)
                    else:
                        out.append(self.output_dir / fn)
                for vv in v.values():
                    walk(vv)
            elif isinstance(v, list):
                for vv in v:
                    walk(vv)

        walk(outputs)
        # De-dup while preserving order.
        seen: set[str] = set()
        uniq: list[Path] = []
        for p in out:
            s = str(p)
            if s not in seen:
                seen.add(s)
                uniq.append(p)
        return uniq

    async def run_workflow(self, workflow: dict[str, Any], *, poll_interval_s: float = 1.0, timeout_s: float = 300.0) -> ComfyResult:
        """Submit *workflow* and poll until its outputs appear on disk.

        Raises:
            TimeoutError: if no reported output file exists within ``timeout_s``.
        """
        async with httpx.AsyncClient(timeout=30.0) as client:
            prompt_id = await self._post_prompt(client, workflow)
            # get_event_loop() is deprecated inside coroutines; use the running loop's clock.
            loop = asyncio.get_running_loop()
            deadline = loop.time() + timeout_s
            while True:
                if loop.time() > deadline:
                    raise TimeoutError(f"ComfyUI job timeout: {prompt_id}")
                item = await self._get_history(client, prompt_id)
                if isinstance(item, dict):
                    files = self._extract_output_files(item)
                    # Heuristic: the job is done once any reported file exists on disk.
                    if files and any(p.exists() for p in files):
                        return ComfyResult(prompt_id=prompt_id, output_files=files)
                await asyncio.sleep(poll_interval_s)

28
engine/config.py Normal file
View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import yaml
@dataclass(frozen=True)
class AppConfig:
    """Immutable application configuration backed by a nested mapping."""

    # Raw nested mapping as parsed from the YAML config file.
    data: dict[str, Any]

    @staticmethod
    def load(path: str | Path) -> "AppConfig":
        """Load configuration from a YAML file.

        A missing file or an empty YAML document yields an empty config.
        (Previously an empty file raised ValueError, because yaml.safe_load
        returns None for an empty document.)

        Raises:
            ValueError: if the YAML root is present but not a mapping.
        """
        p = Path(path)
        raw = yaml.safe_load(p.read_text(encoding="utf-8")) if p.exists() else {}
        if raw is None:
            raw = {}
        if not isinstance(raw, dict):
            raise ValueError(f"Config root must be a mapping, got {type(raw)}")
        return AppConfig(raw)

    def get(self, dotted: str, default: Any = None) -> Any:
        """Look up a dotted key path (e.g. ``"tts.voice"``); return *default* on any miss."""
        cur: Any = self.data
        for part in dotted.split("."):
            if not isinstance(cur, dict) or part not in cur:
                return default
            cur = cur[part]
        return cur

80
engine/script_gen.py Normal file
View File

@@ -0,0 +1,80 @@
from __future__ import annotations
import json
import os
from typing import Any
from openai import OpenAI
from .config import AppConfig
from .types import Scene
def _system_prompt(scene_count: int, min_chars: int, max_chars: int) -> str:
    """Build the Chinese system prompt requesting a strict-JSON storyboard.

    The prompt pins the scene count and per-scene narration length and demands
    a consistent protagonist across all scenes; the model must emit JSON only.
    """
    return f"""你是一个专业短视频编剧与分镜师。
请把用户的创意扩展为 {scene_count} 个分镜(Scene) 的 JSON。
硬性约束:
1) 三个分镜的主角描述Character Description必须保持一致姓名/外观/服饰/风格不可前后矛盾。
2) 每个分镜必须包含字段image_prompt, video_motion, narration。
3) narration 为中文旁白,每段严格控制在约 {min_chars}-{max_chars} 字左右(宁可略短,不要超过太多)。
4) 画面描述要具体可视化video_motion 描述镜头运动/人物动作。
5) 只输出 JSON不要输出任何解释、markdown、代码块。
输出 JSON Schema示例结构
{{
"character_description": "...一致的主角设定...",
"scenes": [
{{"image_prompt":"...","video_motion":"...","narration":"..."}},
{{"image_prompt":"...","video_motion":"...","narration":"..."}},
{{"image_prompt":"...","video_motion":"...","narration":"..."}}
]
}}
"""
def generate_scenes(user_prompt: str, cfg: AppConfig) -> list[Scene]:
    """Expand a user idea into a fixed number of storyboard scenes via the OpenAI API.

    Raises:
        RuntimeError: if the API-key environment variable is unset.
        ValueError: if the model's JSON does not match the expected schema.
    """
    scene_count = int(cfg.get("script_gen.scene_count", 3))
    min_chars = int(cfg.get("script_gen.narration_min_chars", 15))
    max_chars = int(cfg.get("script_gen.narration_max_chars", 20))
    api_key_env = str(cfg.get("openai.api_key_env", "OPENAI_API_KEY"))
    base_url_env = str(cfg.get("openai.base_url_env", "OPENAI_BASE_URL"))
    model = str(cfg.get("openai.model", "gpt-4o-mini"))

    api_key = os.environ.get(api_key_env)
    if not api_key:
        raise RuntimeError(f"Missing env var {api_key_env} for OpenAI API key")

    base_url = os.environ.get(base_url_env) or None
    client = OpenAI(api_key=api_key, base_url=base_url)

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": _system_prompt(scene_count, min_chars, max_chars)},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "json_object"},
        temperature=0.6,
    )

    raw_content = completion.choices[0].message.content or "{}"
    parsed: Any = json.loads(raw_content)
    scenes_raw = parsed.get("scenes")
    if not isinstance(scenes_raw, list) or len(scenes_raw) != scene_count:
        raise ValueError(f"Model returned invalid scenes length: {type(scenes_raw)}")

    result: list[Scene] = []
    for i, s in enumerate(scenes_raw):
        if not isinstance(s, dict):
            raise ValueError(f"Scene[{i}] must be object, got {type(s)}")
        fields = {k: str(s.get(k, "")).strip() for k in ("image_prompt", "video_motion", "narration")}
        # image_prompt and narration are mandatory; video_motion may be empty.
        if not fields["image_prompt"] or not fields["narration"]:
            raise ValueError(f"Scene[{i}] missing required fields")
        result.append(Scene(**fields))
    return result

10
engine/types.py Normal file
View File

@@ -0,0 +1,10 @@
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Scene:
    """One storyboard scene produced by the script generator."""

    # Text-to-image prompt describing the frame.
    image_prompt: str
    # Camera movement / subject motion description for the video stage.
    video_motion: str
    # Narration text voiced over this scene.
    narration: str

78
engine/video_editor.py Normal file
View File

@@ -0,0 +1,78 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from moviepy import AudioFileClip, CompositeVideoClip, TextClip, VideoFileClip, concatenate_videoclips, vfx
from .config import AppConfig
@dataclass(frozen=True)
class Segment:
    """One scene's inputs for final assembly: video, narration audio, subtitle text."""

    # Rendered video file for the scene.
    video_path: Path
    # Narration audio file for the scene.
    audio_path: Path
    # Subtitle text rendered over the scene.
    narration: str
def _fit_video_to_audio(video: VideoFileClip, audio: AudioFileClip) -> VideoFileClip:
    """Match the video's length to the audio's, then attach the audio track."""
    a_dur, v_dur = audio.duration, video.duration
    # Without both durations we cannot resize; just attach the audio as-is.
    if a_dur is None or v_dur is None:
        return video.with_audio(audio)
    if a_dur > v_dur:
        # Audio is longer: loop the footage until it covers the narration.
        fitted = video.with_effects([vfx.Loop(duration=a_dur)])
    elif v_dur > a_dur:
        # Video is longer: trim it down to the narration length.
        fitted = video.subclipped(0, a_dur)
    else:
        fitted = video
    return fitted.with_audio(audio)
def _subtitle_clip(text: str, size: tuple[int, int], duration: float) -> TextClip:
    """Build a bottom-centered, slightly transparent caption clip for *text*."""
    # MoviePy 2 uses Pillow for text rendering by default on most setups.
    # Caption width is 92% of the frame; height is computed automatically.
    max_width = int(size[0] * 0.92)
    clip = TextClip(
        text=text,
        font_size=44,
        color="white",
        stroke_color="black",
        stroke_width=2,
        size=(max_width, None),
        method="caption",
    )
    clip = clip.with_position(("center", "bottom"))
    clip = clip.with_duration(duration)
    return clip.with_opacity(0.95)
def render_final(segments: list[Segment], cfg: AppConfig, output_path: str | Path | None = None) -> Path:
    """Concatenate scene segments (with subtitles and fades) into the final video.

    Args:
        segments: Ordered per-scene video/audio/subtitle inputs; must be non-empty.
        cfg: App configuration (reads video.transition_seconds, video.final_output,
            video.mock_fps).
        output_path: Optional explicit destination; defaults to video.final_output.

    Returns:
        Path of the written video file.

    Raises:
        ValueError: if *segments* is empty.
    """
    if not segments:
        # concatenate_videoclips([]) fails with an opaque error; fail fast instead.
        raise ValueError("render_final requires at least one segment")
    transition_s = float(cfg.get("video.transition_seconds", 0.25))
    out = Path(output_path or str(cfg.get("video.final_output", "./final_poc.mp4")))
    out.parent.mkdir(parents=True, exist_ok=True)
    sources = []  # source clips, tracked so their ffmpeg readers get released
    clips = []
    for seg in segments:
        v = VideoFileClip(str(seg.video_path))
        a = AudioFileClip(str(seg.audio_path))
        sources.extend((v, a))
        v2 = _fit_video_to_audio(v, a)
        w, h = v2.size
        sub = _subtitle_clip(seg.narration, (w, h), v2.duration or a.duration or 0)
        comp = CompositeVideoClip([v2, sub])
        if transition_s > 0:
            comp = comp.with_effects([vfx.FadeIn(transition_s), vfx.FadeOut(transition_s)])
        clips.append(comp)
    final = concatenate_videoclips(clips, method="compose")
    try:
        final.write_videofile(
            str(out),
            codec="libx264",
            audio_codec="aac",
            fps=clips[0].fps if clips and clips[0].fps else int(cfg.get("video.mock_fps", 24)),
            threads=4,
            preset="medium",
        )
    finally:
        final.close()
        for c in clips:
            c.close()
        # Also close the source readers explicitly; closing the composites is
        # not guaranteed to cascade to them — TODO confirm against MoviePy 2.
        for s in sources:
            s.close()
    return out