[MAX] Add Wan video generation examples and comparison benchmark

jglee-sqbits · jglee-sqbits · commit eb2be922d280 · 2026-04-01T04:16:07.000Z
## Summary Add a standalone video generation example script for Wan T2V and I2V pipelines. ## Description - `simple_offline_video_generation.py`: end-to-end script for generating videos from text or image prompts - Supports all Wan model variants: 2.2-A14B (MoE), 2.1-14B, T2V and I2V - LoRA turbo support (e.g. Lightning 4-step) - Built-in profiling with component-level timing breakdown - Input images can be local files or URLs (downloaded at runtime) - Outputs MP4 video via `av` (PyAV) ## Validation (H200 140GB, 720p 81 frames) ```bash # T2V base (Wan2.2-A14B MoE, 720p, 40 steps) MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \ ./bazelw run //max/examples/diffusion:simple_offline_video_generation -- \ --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \ --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ --negative-prompt "low quality" \ --height 720 --width 1280 --num-frames 81 \ --num-inference-steps 40 --guidance-scale 4.0 \ --guidance-scale-2 3.0 \ --output t2v_base.mp4 # T2V LoRA turbo (4 steps) MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \ ./bazelw run //max/examples/diffusion:simple_offline_video_generation -- \ --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \ --prompt "A cat playing piano" \ --height 720 --width 1280 --num-frames 81 \ --num-inference-steps 4 --guidance-scale 1.0 \ --lora-repo-id lightx2v/Wan2.2-Lightning \ --lora-subfolder Wan2.2-T2V-A14B-4steps-lora-rank64-Seko-V2.0 \ --output t2v_lora.mp4 # I2V base (720p, 40 steps) MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \ ./bazelw run //max/examples/diffusion:simple_offline_video_generation -- \ --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ --prompt "A cat surfing on a wave" \ --negative-prompt "low quality" \ --height 720 --width 1280 --num-frames 81 \ --num-inference-steps 40 --guidance-scale 4.0 \ --guidance-scale-2 3.0 \ --input-image https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B/resolve/main/examples/i2v_input.JPG \ 
--output i2v_base.mp4 # I2V LoRA turbo (4 steps) MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \ ./bazelw run //max/examples/diffusion:simple_offline_video_generation -- \ --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ --prompt "A cat surfing on a wave" \ --height 720 --width 1280 --num-frames 81 \ --num-inference-steps 4 --guidance-scale 1.0 \ --lora-repo-id lightx2v/Wan2.2-Lightning \ --lora-subfolder Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1 \ --input-image https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B/resolve/main/examples/i2v_input.JPG \ --output i2v_lora.mp4 ``` ## Dependencies Depends on all previous PRs: modular#6298–modular#6303. ## Checklist - [x] PR is small and focused - [x] I ran `./bazelw run format` to format my changes Assisted-by: Claude Code
diff --git a/max/examples/diffusion/BUILD.bazel b/max/examples/diffusion/BUILD.bazel
@@ -45,3 +45,31 @@ modular_py_binary(
         requirement("torch"),  # for test
     ],
 )
+
+# Benchmark driver: launches the simple_offline_video_generation example once
+# per model variant and resolution via `./bazelw run` and summarizes timings.
+# NOTE(review): all_wan_model_speed_metric.py imports only the standard
+# library, so the numpy requirement below looks unnecessary — confirm before
+# removing.
+modular_py_binary(
+    name = "all_wan_model_speed_metric",
+    srcs = ["all_wan_model_speed_metric.py"],
+    tags = ["no-pydeps"],
+    deps = [
+        requirement("numpy"),
+    ],
+)
+
+# Standalone Wan text-to-video / image-to-video example binary.
+# Per the change description, the MP4 output is written through PyAV (`av`).
+modular_py_binary(
+    name = "simple_offline_video_generation",
+    srcs = ["simple_offline_video_generation.py"],
+    mojo_deps = ["//max:MOGGKernelAPI"],
+    tags = ["no-pydeps"],
+    deps = [
+        ":profiler",
+        "//max/python/max/interfaces",
+        "//max/python/max/pipelines",
+        "//max/python/max/pipelines/core",
+        "//max/python/max/pipelines/lib",
+        requirement("av"),
+        requirement("numpy"),
+        requirement("pillow"),
+        requirement("sentencepiece"),
+        requirement("torch"),
+    ],
+)
diff --git a/max/examples/diffusion/all_wan_model_speed_metric.py b/max/examples/diffusion/all_wan_model_speed_metric.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2026, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+# ===----------------------------------------------------------------------=== #
+
+"""Wan video generation full metric: benchmarks all model variants via MAX.
+
+Runs every Wan model variant (2.2/2.1, T2V/I2V) at 720p in both landscape
+(1280x720) and portrait (720x1280) orientations to verify symbolic seq_len
+recompilation behavior.
+
+Usage:
+    MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \
+    ./bazelw run //max/examples/diffusion:all_wan_model_speed_metric
+
+    # Specific model only
+    MODULAR_DEVICE_CONTEXT_MEMORY_MANAGER_CHUNK_PERCENT=100 \
+    ./bazelw run //max/examples/diffusion:all_wan_model_speed_metric -- \
+        --model wan2.2-t2v-a14b
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+import subprocess
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+# Configure logging at import time so every progress line carries an
+# HH:MM:SS timestamp and the level name.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger("full_metric")
+
+# Passed to every run as --num-inference-steps and echoed in the summary.
+NUM_STEPS = 40
+
+# Prompt used for every model whose mode is not "i2v".
+T2V_PROMPT = (
+    "Two anthropomorphic cats in comfy boxing gear and bright gloves "
+    "fight intensely on a spotlighted stage."
+)
+# Prompt and --input-image value used for models whose mode is "i2v".
+I2V_PROMPT = "A cat surfing on a wave"
+I2V_IMAGE_URL = (
+    "https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B"
+    "/resolve/main/examples/i2v_input.JPG"
+)
+# Passed to every run as --negative-prompt.
+NEGATIVE_PROMPT = "low quality"
+
+# Output shapes benchmarked for each model: the same frame size in landscape
+# and portrait orientation. "label" appears in log lines, output file names
+# and the summary table.
+RESOLUTIONS: list[dict[str, str | int]] = [
+    {"height": 720, "width": 1280, "num_frames": 81, "label": "1280x720"},
+    {"height": 1280, "width": 720, "num_frames": 81, "label": "720x1280"},
+]
+
+
+@dataclass
+class ModelConfig:
+    """Benchmark settings for one Wan model variant."""
+
+    name: str  # short identifier used in logs, file names and the summary
+    repo_id: str  # value passed to the example's --model flag
+    mode: str  # "i2v" selects the I2V prompt and input image; else T2V
+    guidance_scale: float  # passed as --guidance-scale
+    guidance_scale_2: float | None  # passed as --guidance-scale-2 if set
+
+
+# Registry of benchmarkable variants. Each entry is keyed by its own ``name``
+# so the lookup key and ModelConfig.name cannot drift apart.
+MODELS: dict[str, ModelConfig] = {
+    cfg.name: cfg
+    for cfg in (
+        ModelConfig(
+            name="wan2.2-t2v-a14b",
+            repo_id="Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+            mode="t2v",
+            guidance_scale=4.0,
+            guidance_scale_2=3.0,
+        ),
+        ModelConfig(
+            name="wan2.2-i2v-a14b",
+            repo_id="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+            mode="i2v",
+            guidance_scale=4.0,
+            guidance_scale_2=3.0,
+        ),
+        ModelConfig(
+            name="wan2.1-t2v-14b",
+            repo_id="Wan-AI/Wan2.1-T2V-14B-Diffusers",
+            mode="t2v",
+            guidance_scale=5.0,
+            guidance_scale_2=None,
+        ),
+        ModelConfig(
+            name="wan2.1-i2v-14b",
+            repo_id="Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+            mode="i2v",
+            guidance_scale=5.0,
+            guidance_scale_2=None,
+        ),
+    )
+}
+
+
+@dataclass
+class TimingResult:
+    """Outcome of one model/resolution benchmark run."""
+
+    model: str  # ModelConfig.name of the benchmarked variant
+    label: str  # resolution label taken from RESOLUTIONS
+    # End-to-end time in seconds; -1.0 when the run failed or no E2E row
+    # was found in the profiling output.
+    e2e_seconds: float
+    # Remaining profiling rows: method name -> total time in seconds.
+    components: dict[str, float] = field(default_factory=dict)
+
+
+def _parse_profiling(output: str) -> dict[str, float]:
+    """Extract per-method timings from a run's profiling report.
+
+    Scanning starts after the first line containing ``Method Timings:`` and
+    stops at the next line containing ``===``. Inside that section a row is
+    accepted when its last three whitespace-separated fields parse as
+    ``<calls:int> <total:float> <avg:float>``; everything before them is the
+    method name, which may contain spaces. Rows that do not match (column
+    headers, separators, blank lines) are skipped.
+
+    Args:
+        output: Text to scan, typically the combined stdout and stderr of
+            one run.
+
+    Returns:
+        Mapping of method name to total time in seconds. The total column
+        is treated as milliseconds. Empty when no timing section is found.
+    """
+    components: dict[str, float] = {}
+    in_methods = False
+    for line in output.splitlines():
+        if "Method Timings:" in line:
+            in_methods = True
+            continue
+        if not in_methods:
+            continue
+        if "===" in line:
+            break
+        parts = line.split()
+        # At least one name token must precede the three numeric columns;
+        # otherwise a bare numeric row would be stored under an empty name.
+        if len(parts) < 4:
+            continue
+        try:
+            total_ms = float(parts[-2])
+            float(parts[-1])  # validate avg
+            int(parts[-3])  # validate calls
+        except ValueError:
+            # Not a timing row (e.g. the column header); skip it.
+            continue
+        components[" ".join(parts[:-3])] = total_ms / 1000.0
+    return components
+
+
+def _build_command(
+    model: ModelConfig, res: dict[str, str | int], video_path: Path
+) -> list[str]:
+    """Assemble the ``./bazelw run`` argv for one model/resolution pair."""
+    is_i2v = model.mode == "i2v"
+    cmd = [
+        "./bazelw",
+        "run",
+        "//max/examples/diffusion:simple_offline_video_generation",
+        "--",
+        "--model",
+        model.repo_id,
+        "--prompt",
+        I2V_PROMPT if is_i2v else T2V_PROMPT,
+        "--negative-prompt",
+        NEGATIVE_PROMPT,
+        "--height",
+        str(res["height"]),
+        "--width",
+        str(res["width"]),
+        "--num-frames",
+        str(res["num_frames"]),
+        "--num-inference-steps",
+        str(NUM_STEPS),
+        "--guidance-scale",
+        str(model.guidance_scale),
+        "--output",
+        str(video_path),
+    ]
+    if model.guidance_scale_2 is not None:
+        cmd += ["--guidance-scale-2", str(model.guidance_scale_2)]
+    if is_i2v:
+        cmd += ["--input-image", I2V_IMAGE_URL]
+    return cmd
+
+
+def run(models: list[ModelConfig], output_dir: Path) -> list[TimingResult]:
+    """Benchmark every model at every entry of ``RESOLUTIONS``.
+
+    Each combination is executed as a separate ``./bazelw run`` subprocess.
+    A failing or timed-out run is recorded with ``e2e_seconds == -1.0`` and
+    the sweep continues, so results already collected are never lost.
+
+    Args:
+        models: Model configurations to benchmark, in execution order.
+        output_dir: Existing directory that receives the generated videos,
+            named ``<model name>_<resolution label>.mp4``.
+
+    Returns:
+        One ``TimingResult`` per model/resolution pair, in execution order.
+    """
+    results: list[TimingResult] = []
+    # Three directories above this file's directory; presumably the
+    # repository root, where ./bazelw lives — confirm if the file moves.
+    repo_root = Path(__file__).resolve().parents[3]
+    total = len(models) * len(RESOLUTIONS)
+    idx = 0
+
+    for model in models:
+        for res in RESOLUTIONS:
+            idx += 1
+            label = str(res["label"])
+            tag = f"{model.name}/{label}"
+            log.info("(%d/%d) %s — starting", idx, total, tag)
+            t0 = time.perf_counter()
+
+            video_path = output_dir / f"{model.name}_{label}.mp4"
+            cmd = _build_command(model, res, video_path)
+
+            try:
+                proc = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    timeout=7200,
+                    env=os.environ.copy(),
+                    cwd=str(repo_root),
+                )
+            except subprocess.TimeoutExpired:
+                # Record the hang as a failure instead of aborting the
+                # whole sweep with an unhandled exception.
+                elapsed = time.perf_counter() - t0
+                log.error("%s TIMED OUT (%.0fs)", tag, elapsed)
+                results.append(TimingResult(model.name, label, -1.0))
+                continue
+
+            elapsed = time.perf_counter() - t0
+            full = proc.stdout + proc.stderr
+            # Echo only the tail so the console stays readable.
+            print(full[-2000:])
+
+            if proc.returncode != 0:
+                log.error("%s FAILED (%.0fs)", tag, elapsed)
+                results.append(TimingResult(model.name, label, -1.0))
+                continue
+
+            components = _parse_profiling(full)
+            # Remove both spellings from the breakdown; "E2E execute" wins
+            # when both are present.
+            e2e_short = components.pop("E2E", -1.0)
+            e2e = components.pop("E2E execute", e2e_short)
+            if e2e < 0:
+                log.warning(
+                    "%s — no E2E timing found in output (total %.0fs)",
+                    tag,
+                    elapsed,
+                )
+            else:
+                log.info("%s — E2E %.1fs (total %.0fs)", tag, e2e, elapsed)
+            results.append(TimingResult(model.name, label, e2e, components))
+    return results
+
+
+def _gpu_name() -> str:
+    """Return the first GPU name reported by ``nvidia-smi``.
+
+    Best effort: any failure (tool missing, non-zero exit, empty output)
+    yields the placeholder ``"unknown GPU"`` instead of raising.
+    """
+    query = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
+    try:
+        listing = subprocess.check_output(query, text=True)
+        # One line per GPU; only the first one is reported.
+        return listing.strip().splitlines()[0]
+    except Exception:
+        return "unknown GPU"
+
+
+def print_summary(all_results: list[TimingResult]) -> None:
+    """Print the E2E table and the per-component breakdown to stdout.
+
+    Models appear in first-seen order and resolutions in ``RESOLUTIONS``
+    order. A missing result, or an E2E time that is not positive, is shown
+    as ``FAIL`` and left out of the per-model average.
+
+    Args:
+        all_results: Results to report; when several share the same
+            (model, label) pair the last one wins.
+    """
+    gpu = _gpu_name()
+    banner = "=" * 60
+    labels = [str(res["label"]) for res in RESOLUTIONS]
+    index = {(r.model, r.label): r for r in all_results}
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    ordered_models = list(dict.fromkeys(r.model for r in all_results))
+
+    print(f"\n{banner}")
+    print(f"  Wan Full Metric — {gpu}, {NUM_STEPS} steps")
+    print(f"{banner}\n")
+
+    header = f"{'Model':<22} {'Resolution':<12} {'E2E (s)':>10}"
+    print(header)
+    print("-" * len(header))
+
+    for model_name in ordered_models:
+        succeeded: list[float] = []
+        for label in labels:
+            entry = index.get((model_name, label))
+            seconds = -1 if entry is None else entry.e2e_seconds
+            if seconds > 0:
+                succeeded.append(seconds)
+                cell = f"{seconds:>10.1f}"
+            else:
+                cell = f"{'FAIL':>10}"
+            print(f"{model_name:<22} {label:<12} {cell}")
+        if succeeded:
+            mean = sum(succeeded) / len(succeeded)
+            print(f"{model_name:<22} {'avg':<12} {mean:>10.1f}")
+        print()
+
+    print(banner)
+    print("  Component Breakdown (seconds)")
+    print(f"{banner}\n")
+
+    for model_name in ordered_models:
+        # Per-resolution rows are printed as they are visited; the samples
+        # are collected in the same pass for the average printed afterwards.
+        samples_by_comp: dict[str, list[float]] = {}
+        for label in labels:
+            entry = index.get((model_name, label))
+            if entry is None or not entry.components:
+                continue
+            print(f"  {model_name} / {label}:")
+            for comp in sorted(entry.components):
+                print(f"    {comp:<30} {entry.components[comp]:>10.3f}s")
+            print()
+            for comp, secs in entry.components.items():
+                samples_by_comp.setdefault(comp, []).append(secs)
+
+        if samples_by_comp:
+            print(f"  {model_name} / avg:")
+            for comp in sorted(samples_by_comp):
+                samples = samples_by_comp[comp]
+                mean = sum(samples) / len(samples)
+                print(f"    {comp:<30} {mean:>10.3f}s")
+            print()
+
+
+def main() -> None:
+    """Parse the command line, run the benchmark and print the summary.
+
+    Unknown ``--model`` names are rejected by argparse with a usage error
+    rather than surfacing as a ``KeyError`` traceback. The process exits
+    with status 1 when any run failed or produced no E2E timing, so the
+    script can gate automation.
+    """
+    parser = argparse.ArgumentParser(
+        description="Wan video generation full metric (MAX)"
+    )
+    parser.add_argument(
+        "--model",
+        nargs="*",
+        default=None,
+        choices=list(MODELS),
+        # Keep usage compact; the help text already lists the choices.
+        metavar="MODEL",
+        help=f"Model(s) to benchmark. Choices: {', '.join(MODELS)}. "
+        "Default: all.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default="/tmp/wan_full_metric",
+        help="Directory for the generated videos (created if missing).",
+    )
+    args = parser.parse_args()
+
+    # A missing flag (None) and a bare `--model` ([]) both select all models.
+    selected = [MODELS[m] for m in (args.model or MODELS)]
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    all_results = run(selected, output_dir)
+    print_summary(all_results)
+
+    # Failed runs are recorded with a negative E2E time.
+    if any(r.e2e_seconds < 0 for r in all_results):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/max/examples/diffusion/simple_offline_video_generation.py b/max/examples/diffusion/simple_offline_video_generation.py