[None][test] Add Stage-1 LPIPS golden accuracy tests for visual generation

chang-l · chang-l · commit 9b4c317d3c12 · 2026-06-29T09:16:05.000-07:00
Add default-setting single-GPU LPIPS golden tests for QwenImage and Cosmos3-Nano, completing single-GPU VisualGen LPIPS CI coverage across all supported models. Refresh all eight golden media entries with the pinned staging main image at TRT-LLM commit 85665f5, which contains the Cosmos3 accuracy fix from #15545. Record the TRT-LLM commit, package versions, container digest, compile-off mode, and deterministic-algorithm mode in every golden JSON while retaining the existing LPIPS thresholds. Explicitly disable pipeline torch compile for every LPIPS generator. Unwaive all eight single-GPU cases, test_wan_t2v_example, and the passing attn2d_2x2_ulysses2 multi-GPU case. Retain the six pre-existing multi-GPU waivers and waive only the new cfg2_ulysses2_attn2d_2x1 case, which measured 0.255208, above the unchanged 0.25 threshold. Validated on B200: all eight single-GPU LPIPS cases passed at 0.000000, test_wan_t2v_example passed, and attn2d_2x2_ulysses2 passed at 0.232348. Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com>
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/cosmos3_nano_t2i_lpips_golden.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/cosmos3_nano_t2i_lpips_golden.json
@@ -0,0 +1,21 @@
+{
+  "image": "cosmos3_nano_t2i_lpips_golden.png",
+  "model": "Cosmos3-Nano",
+  "source": "TensorRT-LLM VisualGen",
+  "prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
+  "height": 720,
+  "width": 1280,
+  "num_frames": 1,
+  "num_inference_steps": 35,
+  "guidance_scale": 6.0,
+  "seed": 42,
+  "attention_backend": "VANILLA",
+  "torch_compile": false,
+  "deterministic_algorithms": true,
+  "lpips_net": "alex",
+  "lpips_threshold": 0.05,
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
+}
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/cosmos3_nano_t2v_lpips_golden_video.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/cosmos3_nano_t2v_lpips_golden_video.json
@@ -0,0 +1,22 @@
+{
+  "video": "cosmos3_nano_t2v_lpips_golden_video.mp4",
+  "model": "Cosmos3-Nano",
+  "source": "TensorRT-LLM VisualGen",
+  "prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
+  "height": 720,
+  "width": 1280,
+  "num_frames": 189,
+  "num_inference_steps": 35,
+  "guidance_scale": 6.0,
+  "seed": 42,
+  "frame_rate": 24.0,
+  "attention_backend": "VANILLA",
+  "torch_compile": false,
+  "deterministic_algorithms": true,
+  "lpips_net": "alex",
+  "lpips_threshold": 0.05,
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
+}
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux1_lpips_golden.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux1_lpips_golden.json
@@ -8,7 +8,12 @@
   "num_inference_steps": 4,
   "guidance_scale": 3.5,
   "seed": 42,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
   "lpips_net": "alex",
   "lpips_threshold": 0.05,
-  "diffusers_version": "0.37.1"
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
 }
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux2_lpips_golden.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux2_lpips_golden.json
@@ -8,7 +8,12 @@
   "num_inference_steps": 4,
   "guidance_scale": 3.5,
   "seed": 42,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
   "lpips_net": "alex",
   "lpips_threshold": 0.05,
-  "diffusers_version": "0.37.1"
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
 }
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/ltx2_lpips_golden_video.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/ltx2_lpips_golden_video.json
@@ -11,6 +11,12 @@
   "guidance_scale": 4.0,
   "seed": 42,
   "frame_rate": 24.0,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
   "lpips_net": "alex",
-  "lpips_threshold": 0.05
+  "lpips_threshold": 0.05,
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
 }
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/qwenimage_lpips_golden.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/qwenimage_lpips_golden.json
@@ -0,0 +1,20 @@
+{
+  "image": "qwenimage_lpips_golden.png",
+  "model": "Qwen-Image",
+  "source": "TensorRT-LLM VisualGen",
+  "prompt": "a tiny astronaut hatching from an egg on the moon",
+  "negative_prompt": "",
+  "height": 1328,
+  "width": 1328,
+  "num_inference_steps": 50,
+  "true_cfg_scale": 4.0,
+  "seed": 42,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
+  "lpips_net": "alex",
+  "lpips_threshold": 0.05,
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
+}
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/visual_gen_lpips_golden_media.zip b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/visual_gen_lpips_golden_media.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5ec590f2d76a2a189575e48416bc8609bcadfac6b2d041a094835846a698937
+size 14374170
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan21_t2v_lpips_golden_video.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan21_t2v_lpips_golden_video.json
@@ -11,7 +11,12 @@
   "guidance_scale": 5.0,
   "seed": 42,
   "frame_rate": 16.0,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
   "lpips_net": "alex",
   "lpips_threshold": 0.05,
-  "diffusers_version": "0.37.1"
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
 }
diff --git a/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan22_t2v_lpips_golden_video.json b/tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan22_t2v_lpips_golden_video.json
@@ -11,7 +11,12 @@
   "guidance_scale": 4.0,
   "seed": 42,
   "frame_rate": 16.0,
+  "torch_compile": false,
+  "deterministic_algorithms": true,
   "lpips_net": "alex",
   "lpips_threshold": 0.05,
-  "diffusers_version": "0.37.1"
+  "diffusers_version": "0.38.0",
+  "tensorrt_llm_version": "1.3.0rc20",
+  "tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
+  "container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
 }
diff --git a/tests/integration/defs/examples/visual_gen/test_visual_gen.py b/tests/integration/defs/examples/visual_gen/test_visual_gen.py
@@ -46,6 +46,8 @@
 WAN_T2V_WIDTH = 832
 WAN_T2V_NUM_FRAMES = 165
 
+# NB: this test file lives at tests/integration/defs/examples/visual_gen/, so the repo
+# root is five levels up (the LPIPS eval script is referenced from <repo>/scripts/).
 REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
 VISUAL_GEN_LPIPS_EVAL_SCRIPT = os.path.join(
     REPO_ROOT, "scripts", "visualgen_eval", "visual_gen_lpips_score_eval.py"
@@ -88,6 +90,36 @@
 WAN22_LPIPS_GUIDANCE_SCALE = 4.0
 WAN22_LPIPS_SEED = 42
 WAN22_LPIPS_FRAME_RATE = 16.0
+
+# QwenImage (text-to-image) — default-setting LPIPS golden.
+# Params mirror the QwenImage 20B reference defaults (pipeline_qwen_image.py).
+# NOTE: QwenImage's forward CFG knob is ``true_cfg_scale`` (not ``guidance_scale``); real-CFG
+# only engages when a negative prompt is supplied — presumably why "" is passed explicitly.
+QWENIMAGE_MODEL_SUBPATH = "qwen-image"
+QWENIMAGE_LPIPS_PROMPT = "a tiny astronaut hatching from an egg on the moon"
+QWENIMAGE_LPIPS_NEGATIVE_PROMPT = ""
+QWENIMAGE_LPIPS_HEIGHT = 1328
+QWENIMAGE_LPIPS_WIDTH = 1328
+QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS = 50
+QWENIMAGE_LPIPS_TRUE_CFG_SCALE = 4.0
+QWENIMAGE_LPIPS_SEED = 42
+QWENIMAGE_LPIPS_THRESHOLD = 0.05
+
+# Cosmos3-Nano (text-to-video + text-to-image) — default-setting LPIPS golden.
+# Params are the Cosmos3 720P defaults (cosmos3/defaults.py:COSMOS3_720P_PARAMS).
+# Cosmos3 requires VANILLA attention and guardrails disabled in CI.
+COSMOS3_NANO_MODEL_SUBPATH = "Cosmos3-Nano"
+COSMOS3_LPIPS_PROMPT = "A serene mountain landscape with snow-capped peaks and a flowing river"
+COSMOS3_LPIPS_HEIGHT = 720
+COSMOS3_LPIPS_WIDTH = 1280
+COSMOS3_LPIPS_T2V_NUM_FRAMES = 189
+COSMOS3_LPIPS_T2I_NUM_FRAMES = 1
+COSMOS3_LPIPS_NUM_INFERENCE_STEPS = 35
+COSMOS3_LPIPS_GUIDANCE_SCALE = 6.0
+COSMOS3_LPIPS_SEED = 42
+COSMOS3_LPIPS_FRAME_RATE = 24.0
+COSMOS3_LPIPS_THRESHOLD = 0.05
+
 # LTX-2 configuration
 LTX2_MODEL_CHECKPOINT_PATH = "LTX-2/ltx-2-19b-dev.safetensors"
 LTX2_TEXT_ENCODER_SUBPATH = "gemma-3-12b-it"
@@ -488,12 +520,15 @@ def _assert_lpips_below_threshold(score, threshold):
 def _generate_flux_lpips_image(model_path, output_path):
     from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
     from tensorrt_llm.media.encoding import save_image
-    from tensorrt_llm.visual_gen.args import VisualGenArgs
+    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs
 
     _skip_if_missing(model_path, "FLUX checkpoint", is_dir=True)
     _disable_inductor_compile_worker_quiesce()
     with _lpips_deterministic_algorithms():
-        args = VisualGenArgs(model=model_path)
+        args = VisualGenArgs(
+            model=model_path,
+            torch_compile_config=TorchCompileConfig(enable=False),
+        )
         pipeline = PipelineLoader(args).load(skip_warmup=True)
         try:
             result = pipeline.forward(
@@ -572,13 +607,14 @@ def _run_wan_lpips_pipeline(
     parallel=None,
 ):
     from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
-    from tensorrt_llm.visual_gen.args import AttentionConfig, VisualGenArgs
+    from tensorrt_llm.visual_gen.args import AttentionConfig, TorchCompileConfig, VisualGenArgs
 
     _skip_if_missing(model_path, "Wan checkpoint", is_dir=True)
     _disable_inductor_compile_worker_quiesce()
     args_kwargs = dict(
         model=model_path,
         attention_config=AttentionConfig(backend=attention_backend),
+        torch_compile_config=TorchCompileConfig(enable=False),
     )
     if parallel is not None:
         args_kwargs["parallel_config"] = parallel
@@ -677,6 +713,111 @@ def wan22_bf16_video_path(_visual_gen_deps, llm_venv):
     return output_path
 
 
+def _generate_qwenimage_lpips_image(model_path, output_path):
+    """Render the default-setting, compile-off QwenImage LPIPS image to ``output_path``."""
+    from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
+    from tensorrt_llm.media.encoding import save_image
+    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs
+
+    _skip_if_missing(model_path, "QwenImage checkpoint", is_dir=True)
+    _disable_inductor_compile_worker_quiesce()
+    args = VisualGenArgs(
+        model=model_path,
+        torch_compile_config=TorchCompileConfig(enable=False),
+    )
+    pipeline = PipelineLoader(args).load(skip_warmup=True)
+    try:
+        with torch.no_grad():
+            result = pipeline.forward(
+                prompt=QWENIMAGE_LPIPS_PROMPT,
+                negative_prompt=QWENIMAGE_LPIPS_NEGATIVE_PROMPT,
+                height=QWENIMAGE_LPIPS_HEIGHT,
+                width=QWENIMAGE_LPIPS_WIDTH,
+                num_inference_steps=QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS,
+                true_cfg_scale=QWENIMAGE_LPIPS_TRUE_CFG_SCALE,
+                seed=QWENIMAGE_LPIPS_SEED,
+            )
+        generated_image = result.image[0].detach().cpu()
+    finally:
+        del pipeline
+        _cleanup_cuda()
+
+    save_image(generated_image, output_path)
+
+
+def _run_cosmos3_lpips_pipeline(num_frames):
+    """Run the Cosmos3-Nano pipeline (default setting, VANILLA attn, compile-off).
+
+    Returns the generated video tensor ``(B, T, H, W, C)`` (T == ``num_frames``)
+    moved to CPU, or ``None`` if generation produced no video.  ``num_frames=1``
+    yields the single-frame text-to-image path.  Guardrails are disabled for the run.
+    """
+    # Cosmos3 re-reads the guardrail flag in __init__: set it before loading, restore it after.
+    guardrails_env_key = "TRTLLM_DISABLE_COSMOS3_GUARDRAILS"
+    previous_guardrails_env = os.environ.get(guardrails_env_key)
+    os.environ[guardrails_env_key] = "1"
+    try:
+        from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
+        from tensorrt_llm.visual_gen.args import (
+            AttentionConfig,
+            CompilationConfig,
+            TorchCompileConfig,
+            VisualGenArgs,
+        )
+
+        model_path = _lpips_model_path(COSMOS3_NANO_MODEL_SUBPATH)
+        _skip_if_missing(model_path, "Cosmos3-Nano checkpoint", is_dir=True)
+        _disable_inductor_compile_worker_quiesce()
+        args = VisualGenArgs(
+            model=model_path,
+            compilation_config=CompilationConfig(skip_warmup=True),
+            torch_compile_config=TorchCompileConfig(enable=False),
+            attention_config=AttentionConfig(backend="VANILLA"),
+        )
+        pipeline = PipelineLoader(args).load(skip_warmup=True)
+        try:
+            with torch.no_grad():
+                result = pipeline.forward(
+                    prompt=COSMOS3_LPIPS_PROMPT,
+                    seed=COSMOS3_LPIPS_SEED,
+                    height=COSMOS3_LPIPS_HEIGHT,
+                    width=COSMOS3_LPIPS_WIDTH,
+                    num_frames=num_frames,
+                    num_inference_steps=COSMOS3_LPIPS_NUM_INFERENCE_STEPS,
+                    guidance_scale=COSMOS3_LPIPS_GUIDANCE_SCALE,
+                    frame_rate=COSMOS3_LPIPS_FRAME_RATE,
+                    use_guardrails=False,
+                )
+            if result is None or result.video is None:
+                return None
+            return result.video.detach().cpu()
+        finally:
+            del pipeline
+            _cleanup_cuda()
+    finally:
+        if previous_guardrails_env is None:
+            os.environ.pop(guardrails_env_key, None)
+        else:
+            os.environ[guardrails_env_key] = previous_guardrails_env
+
+
+def _generate_cosmos3_lpips_video(output_path):
+    """Generate the Cosmos3-Nano text-to-video LPIPS sample and write it to ``output_path``."""
+    video = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2V_NUM_FRAMES)
+    assert video is not None, "Cosmos3-Nano T2V LPIPS run produced no video"
+    _save_lpips_video_mp4(video, output_path, frame_rate=COSMOS3_LPIPS_FRAME_RATE)
+
+
+def _generate_cosmos3_lpips_image(output_path):
+    """Save the single-frame Cosmos3-Nano text-to-image LPIPS sample to ``output_path``."""
+    from tensorrt_llm.media.encoding import save_image
+
+    video = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2I_NUM_FRAMES)
+    assert video is not None, "Cosmos3-Nano T2I LPIPS run produced no frame"
+    # video is (B, T, H, W, C); take the single frame -> (H, W, C) for save_image.
+    save_image(video[0, 0], output_path)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_flux1_lpips_against_golden(tmp_path):
     generated_path = tmp_path / "flux1_generated.png"
@@ -779,6 +920,62 @@ def test_wan22_t2v_lpips_against_golden(tmp_path, wan22_bf16_video_path):
     _assert_lpips_below_threshold(score, WAN_LPIPS_THRESHOLD)
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_qwenimage_lpips_against_golden(tmp_path):
+    generated_path = tmp_path / "qwenimage_generated.png"
+    golden_path = _golden_media_path(
+        tmp_path, "qwenimage_lpips_golden.png", "QwenImage LPIPS golden image"
+    )
+    _generate_qwenimage_lpips_image(_lpips_model_path(QWENIMAGE_MODEL_SUBPATH), generated_path)
+    score = _run_lpips_eval(
+        tmp_path,
+        "qwenimage",
+        "image",
+        QWENIMAGE_LPIPS_PROMPT,
+        golden_path,
+        generated_path,
+    )
+    _assert_lpips_below_threshold(score, QWENIMAGE_LPIPS_THRESHOLD)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_cosmos3_nano_t2v_lpips_against_golden(tmp_path):
+    generated_path = tmp_path / "cosmos3_nano_t2v_generated.mp4"
+    golden_path = _golden_media_path(
+        tmp_path,
+        "cosmos3_nano_t2v_lpips_golden_video.mp4",
+        "Cosmos3-Nano T2V LPIPS golden video",
+    )
+    _generate_cosmos3_lpips_video(generated_path)
+    score = _run_lpips_eval(
+        tmp_path,
+        "cosmos3_nano_t2v",
+        "video",
+        COSMOS3_LPIPS_PROMPT,
+        golden_path,
+        generated_path,
+    )
+    _assert_lpips_below_threshold(score, COSMOS3_LPIPS_THRESHOLD)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_cosmos3_nano_t2i_lpips_against_golden(tmp_path):
+    generated_path = tmp_path / "cosmos3_nano_t2i_generated.png"
+    golden_path = _golden_media_path(
+        tmp_path, "cosmos3_nano_t2i_lpips_golden.png", "Cosmos3-Nano T2I LPIPS golden image"
+    )
+    _generate_cosmos3_lpips_image(generated_path)
+    score = _run_lpips_eval(
+        tmp_path,
+        "cosmos3_nano_t2i",
+        "image",
+        COSMOS3_LPIPS_PROMPT,
+        golden_path,
+        generated_path,
+    )
+    _assert_lpips_below_threshold(score, COSMOS3_LPIPS_THRESHOLD)
+
+
 def _generate_wan_video(llm_venv, model_subpath, output_subdir):
     """Generate a WAN video for a given model checkpoint.
 
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:f5ec590f2d76a2a189575e48416bc8609bcadfac6b2d041a094835846a698937`
	`3`	`+size 14374170`