Skip to content

Commit 9b4c317

Browse files
committed
[None][test] Add Stage-1 LPIPS golden accuracy tests for visual generation
Add default-setting single-GPU LPIPS golden tests for QwenImage and Cosmos3-Nano, completing single-GPU VisualGen LPIPS CI coverage across all supported models. Refresh all eight golden media entries with the pinned staging main image at TRT-LLM commit 85665f5, which contains the Cosmos3 accuracy fix from #15545. Record the TRT-LLM commit, package versions, container digest, compile-off mode, and deterministic-algorithm mode in every golden JSON while retaining the existing LPIPS thresholds. Explicitly disable pipeline torch compile for every LPIPS generator. Unwaive all eight single-GPU cases, test_wan_t2v_example, and the passing attn2d_2x2_ulysses2 multi-GPU case. Retain the six pre-existing multi-GPU waivers and waive only the new cfg2_ulysses2_attn2d_2x1 case that measured 0.255208 above the unchanged 0.25 threshold. Validated on B200: all eight single-GPU LPIPS cases passed at 0.000000, test_wan_t2v_example passed, and attn2d_2x2_ulysses2 passed at 0.232348. Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com>
1 parent c9b6518 commit 9b4c317

12 files changed

Lines changed: 303 additions & 14 deletions
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"image": "cosmos3_nano_t2i_lpips_golden.png",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 1,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"attention_backend": "VANILLA",
13+
"torch_compile": false,
14+
"deterministic_algorithms": true,
15+
"lpips_net": "alex",
16+
"lpips_threshold": 0.05,
17+
"diffusers_version": "0.38.0",
18+
"tensorrt_llm_version": "1.3.0rc20",
19+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
20+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
21+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"video": "cosmos3_nano_t2v_lpips_golden_video.mp4",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 189,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"frame_rate": 24.0,
13+
"attention_backend": "VANILLA",
14+
"torch_compile": false,
15+
"deterministic_algorithms": true,
16+
"lpips_net": "alex",
17+
"lpips_threshold": 0.05,
18+
"diffusers_version": "0.38.0",
19+
"tensorrt_llm_version": "1.3.0rc20",
20+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
21+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
22+
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux1_lpips_golden.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@
88
"num_inference_steps": 4,
99
"guidance_scale": 3.5,
1010
"seed": 42,
11+
"torch_compile": false,
12+
"deterministic_algorithms": true,
1113
"lpips_net": "alex",
1214
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
15+
"diffusers_version": "0.38.0",
16+
"tensorrt_llm_version": "1.3.0rc20",
17+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
18+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1419
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux2_lpips_golden.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@
88
"num_inference_steps": 4,
99
"guidance_scale": 3.5,
1010
"seed": 42,
11+
"torch_compile": false,
12+
"deterministic_algorithms": true,
1113
"lpips_net": "alex",
1214
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
15+
"diffusers_version": "0.38.0",
16+
"tensorrt_llm_version": "1.3.0rc20",
17+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
18+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1419
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/ltx2_lpips_golden_video.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
"guidance_scale": 4.0,
1212
"seed": 42,
1313
"frame_rate": 24.0,
14+
"torch_compile": false,
15+
"deterministic_algorithms": true,
1416
"lpips_net": "alex",
15-
"lpips_threshold": 0.05
17+
"lpips_threshold": 0.05,
18+
"diffusers_version": "0.38.0",
19+
"tensorrt_llm_version": "1.3.0rc20",
20+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
21+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1622
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"image": "qwenimage_lpips_golden.png",
3+
"model": "Qwen-Image",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "a tiny astronaut hatching from an egg on the moon",
6+
"negative_prompt": "",
7+
"height": 1328,
8+
"width": 1328,
9+
"num_inference_steps": 50,
10+
"true_cfg_scale": 4.0,
11+
"seed": 42,
12+
"torch_compile": false,
13+
"deterministic_algorithms": true,
14+
"lpips_net": "alex",
15+
"lpips_threshold": 0.05,
16+
"diffusers_version": "0.38.0",
17+
"tensorrt_llm_version": "1.3.0rc20",
18+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
19+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
20+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f5ec590f2d76a2a189575e48416bc8609bcadfac6b2d041a094835846a698937
3+
size 14374170

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan21_t2v_lpips_golden_video.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,12 @@
1111
"guidance_scale": 5.0,
1212
"seed": 42,
1313
"frame_rate": 16.0,
14+
"torch_compile": false,
15+
"deterministic_algorithms": true,
1416
"lpips_net": "alex",
1517
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
18+
"diffusers_version": "0.38.0",
19+
"tensorrt_llm_version": "1.3.0rc20",
20+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
21+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1722
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan22_t2v_lpips_golden_video.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,12 @@
1111
"guidance_scale": 4.0,
1212
"seed": 42,
1313
"frame_rate": 16.0,
14+
"torch_compile": false,
15+
"deterministic_algorithms": true,
1416
"lpips_net": "alex",
1517
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
18+
"diffusers_version": "0.38.0",
19+
"tensorrt_llm_version": "1.3.0rc20",
20+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
21+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1722
}

tests/integration/defs/examples/visual_gen/test_visual_gen.py

Lines changed: 200 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
WAN_T2V_WIDTH = 832
4747
WAN_T2V_NUM_FRAMES = 165
4848

49+
# NB: this test file lives at tests/integration/defs/examples/visual_gen/, so the repo
50+
# root is five levels up (the LPIPS eval script is referenced from <repo>/scripts/).
4951
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
5052
VISUAL_GEN_LPIPS_EVAL_SCRIPT = os.path.join(
5153
REPO_ROOT, "scripts", "visualgen_eval", "visual_gen_lpips_score_eval.py"
@@ -88,6 +90,36 @@
8890
WAN22_LPIPS_GUIDANCE_SCALE = 4.0
8991
WAN22_LPIPS_SEED = 42
9092
WAN22_LPIPS_FRAME_RATE = 16.0
93+
94+
# QwenImage (text-to-image) — default-setting LPIPS golden.
95+
# Params mirror the QwenImage 20B reference defaults (pipeline_qwen_image.py).
96+
# NOTE: QwenImage's forward CFG knob is ``true_cfg_scale`` (not ``guidance_scale``),
97+
# and real-CFG only engages when a negative prompt is supplied.
98+
QWENIMAGE_MODEL_SUBPATH = "qwen-image"
99+
QWENIMAGE_LPIPS_PROMPT = "a tiny astronaut hatching from an egg on the moon"
100+
QWENIMAGE_LPIPS_NEGATIVE_PROMPT = ""
101+
QWENIMAGE_LPIPS_HEIGHT = 1328
102+
QWENIMAGE_LPIPS_WIDTH = 1328
103+
QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS = 50
104+
QWENIMAGE_LPIPS_TRUE_CFG_SCALE = 4.0
105+
QWENIMAGE_LPIPS_SEED = 42
106+
QWENIMAGE_LPIPS_THRESHOLD = 0.05
107+
108+
# Cosmos3-Nano (text-to-video + text-to-image) — default-setting LPIPS golden.
109+
# Params are the Cosmos3 720P defaults (cosmos3/defaults.py:COSMOS3_720P_PARAMS).
110+
# Cosmos3 requires VANILLA attention and guardrails disabled in CI.
111+
COSMOS3_NANO_MODEL_SUBPATH = "Cosmos3-Nano"
112+
COSMOS3_LPIPS_PROMPT = "A serene mountain landscape with snow-capped peaks and a flowing river"
113+
COSMOS3_LPIPS_HEIGHT = 720
114+
COSMOS3_LPIPS_WIDTH = 1280
115+
COSMOS3_LPIPS_T2V_NUM_FRAMES = 189
116+
COSMOS3_LPIPS_T2I_NUM_FRAMES = 1
117+
COSMOS3_LPIPS_NUM_INFERENCE_STEPS = 35
118+
COSMOS3_LPIPS_GUIDANCE_SCALE = 6.0
119+
COSMOS3_LPIPS_SEED = 42
120+
COSMOS3_LPIPS_FRAME_RATE = 24.0
121+
COSMOS3_LPIPS_THRESHOLD = 0.05
122+
91123
# LTX-2 configuration
92124
LTX2_MODEL_CHECKPOINT_PATH = "LTX-2/ltx-2-19b-dev.safetensors"
93125
LTX2_TEXT_ENCODER_SUBPATH = "gemma-3-12b-it"
@@ -488,12 +520,15 @@ def _assert_lpips_below_threshold(score, threshold):
488520
def _generate_flux_lpips_image(model_path, output_path):
489521
from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
490522
from tensorrt_llm.media.encoding import save_image
491-
from tensorrt_llm.visual_gen.args import VisualGenArgs
523+
from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs
492524

493525
_skip_if_missing(model_path, "FLUX checkpoint", is_dir=True)
494526
_disable_inductor_compile_worker_quiesce()
495527
with _lpips_deterministic_algorithms():
496-
args = VisualGenArgs(model=model_path)
528+
args = VisualGenArgs(
529+
model=model_path,
530+
torch_compile_config=TorchCompileConfig(enable=False),
531+
)
497532
pipeline = PipelineLoader(args).load(skip_warmup=True)
498533
try:
499534
result = pipeline.forward(
@@ -572,13 +607,14 @@ def _run_wan_lpips_pipeline(
572607
parallel=None,
573608
):
574609
from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
575-
from tensorrt_llm.visual_gen.args import AttentionConfig, VisualGenArgs
610+
from tensorrt_llm.visual_gen.args import AttentionConfig, TorchCompileConfig, VisualGenArgs
576611

577612
_skip_if_missing(model_path, "Wan checkpoint", is_dir=True)
578613
_disable_inductor_compile_worker_quiesce()
579614
args_kwargs = dict(
580615
model=model_path,
581616
attention_config=AttentionConfig(backend=attention_backend),
617+
torch_compile_config=TorchCompileConfig(enable=False),
582618
)
583619
if parallel is not None:
584620
args_kwargs["parallel_config"] = parallel
@@ -677,6 +713,111 @@ def wan22_bf16_video_path(_visual_gen_deps, llm_venv):
677713
return output_path
678714

679715

716+
def _generate_qwenimage_lpips_image(model_path, output_path):
    """Render the QwenImage LPIPS sample and write it to ``output_path``.

    The pipeline is loaded from ``model_path`` with torch compile disabled and
    warmup skipped, called once with the ``QWENIMAGE_LPIPS_*`` settings, and
    the first returned image is saved after being moved to CPU.

    Args:
        model_path: Directory holding the QwenImage checkpoint; it is passed to
            ``_skip_if_missing`` before anything is loaded.
        output_path: Destination handed to ``save_image``.
    """
    from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
    from tensorrt_llm.media.encoding import save_image
    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs

    _skip_if_missing(model_path, "QwenImage checkpoint", is_dir=True)
    _disable_inductor_compile_worker_quiesce()

    # NOTE(review): the FLUX generator in this file runs inside
    # ``_lpips_deterministic_algorithms()`` and the QwenImage golden metadata
    # records ``deterministic_algorithms: true``; this function does not enter
    # that context -- confirm whether that is intentional.
    compile_off = TorchCompileConfig(enable=False)
    loader = PipelineLoader(
        VisualGenArgs(model=model_path, torch_compile_config=compile_off)
    )
    pipeline = loader.load(skip_warmup=True)
    try:
        with torch.no_grad():
            outputs = pipeline.forward(
                prompt=QWENIMAGE_LPIPS_PROMPT,
                negative_prompt=QWENIMAGE_LPIPS_NEGATIVE_PROMPT,
                height=QWENIMAGE_LPIPS_HEIGHT,
                width=QWENIMAGE_LPIPS_WIDTH,
                num_inference_steps=QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS,
                true_cfg_scale=QWENIMAGE_LPIPS_TRUE_CFG_SCALE,
                seed=QWENIMAGE_LPIPS_SEED,
            )
        first_image = outputs.image[0].detach().cpu()
    finally:
        # Drop the pipeline reference before the CUDA cleanup helper runs.
        del pipeline
        _cleanup_cuda()

    save_image(first_image, output_path)
746+
747+
748+
def _run_cosmos3_lpips_pipeline(num_frames):
    """Execute one Cosmos3-Nano generation for the LPIPS comparison.

    The pipeline is built with VANILLA attention, torch compile disabled and
    warmup skipped, then called once with the ``COSMOS3_LPIPS_*`` settings.
    ``TRTLLM_DISABLE_COSMOS3_GUARDRAILS`` is forced to ``"1"`` for the duration
    of the call and restored to its previous state afterwards.

    Args:
        num_frames: Number of frames to request; ``1`` yields the single-frame
            text-to-image path.

    Returns:
        The generated video tensor on CPU, ``(B, T, H, W, C)`` with
        ``T == num_frames``, or ``None`` if generation produced no video.
    """
    env_key = "TRTLLM_DISABLE_COSMOS3_GUARDRAILS"
    saved_value = os.environ.get(env_key)
    # Cosmos3 re-reads the guardrail flag in __init__, so set it before loading.
    os.environ[env_key] = "1"
    try:
        from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
        from tensorrt_llm.visual_gen.args import (
            AttentionConfig,
            CompilationConfig,
            TorchCompileConfig,
            VisualGenArgs,
        )

        checkpoint_dir = _lpips_model_path(COSMOS3_NANO_MODEL_SUBPATH)
        _skip_if_missing(checkpoint_dir, "Cosmos3-Nano checkpoint", is_dir=True)
        _disable_inductor_compile_worker_quiesce()

        # NOTE(review): the Cosmos3 golden metadata records
        # ``deterministic_algorithms: true`` but this function does not enter
        # ``_lpips_deterministic_algorithms()`` the way the FLUX generator in
        # this file does -- confirm whether that is intentional.
        pipeline = PipelineLoader(
            VisualGenArgs(
                model=checkpoint_dir,
                compilation_config=CompilationConfig(skip_warmup=True),
                torch_compile_config=TorchCompileConfig(enable=False),
                attention_config=AttentionConfig(backend="VANILLA"),
            )
        ).load(skip_warmup=True)
        try:
            with torch.no_grad():
                outputs = pipeline.forward(
                    prompt=COSMOS3_LPIPS_PROMPT,
                    seed=COSMOS3_LPIPS_SEED,
                    height=COSMOS3_LPIPS_HEIGHT,
                    width=COSMOS3_LPIPS_WIDTH,
                    num_frames=num_frames,
                    num_inference_steps=COSMOS3_LPIPS_NUM_INFERENCE_STEPS,
                    guidance_scale=COSMOS3_LPIPS_GUIDANCE_SCALE,
                    frame_rate=COSMOS3_LPIPS_FRAME_RATE,
                    use_guardrails=False,
                )
            video = None if outputs is None else outputs.video
            return None if video is None else video.detach().cpu()
        finally:
            # Drop the pipeline reference before the CUDA cleanup helper runs.
            del pipeline
            _cleanup_cuda()
    finally:
        # Put the environment back exactly as it was found.
        if saved_value is not None:
            os.environ[env_key] = saved_value
        else:
            os.environ.pop(env_key, None)
802+
803+
804+
def _generate_cosmos3_lpips_video(output_path):
    """Produce the Cosmos3-Nano text-to-video LPIPS sample as an MP4.

    Runs the Cosmos3 LPIPS pipeline for ``COSMOS3_LPIPS_T2V_NUM_FRAMES`` frames
    and writes the result to ``output_path`` at ``COSMOS3_LPIPS_FRAME_RATE``.
    Fails with an assertion if the pipeline returned no video.
    """
    frames = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2V_NUM_FRAMES)
    assert frames is not None, "Cosmos3-Nano T2V LPIPS run produced no video"
    _save_lpips_video_mp4(
        frames,
        output_path,
        frame_rate=COSMOS3_LPIPS_FRAME_RATE,
    )
809+
810+
811+
def _generate_cosmos3_lpips_image(output_path):
    """Produce the Cosmos3-Nano text-to-image LPIPS sample (one frame).

    Runs the Cosmos3 LPIPS pipeline for ``COSMOS3_LPIPS_T2I_NUM_FRAMES`` frames
    and saves the first frame of the first batch entry to ``output_path``.
    Fails with an assertion if the pipeline returned nothing.
    """
    from tensorrt_llm.media.encoding import save_image

    clip = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2I_NUM_FRAMES)
    assert clip is not None, "Cosmos3-Nano T2I LPIPS run produced no frame"
    # The pipeline output is (B, T, H, W, C); index batch 0, frame 0 to get
    # the (H, W, C) image that save_image receives.
    only_frame = clip[0, 0]
    save_image(only_frame, output_path)
819+
820+
680821
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
681822
def test_flux1_lpips_against_golden(tmp_path):
682823
generated_path = tmp_path / "flux1_generated.png"
@@ -779,6 +920,62 @@ def test_wan22_t2v_lpips_against_golden(tmp_path, wan22_bf16_video_path):
779920
_assert_lpips_below_threshold(score, WAN_LPIPS_THRESHOLD)
780921

781922

923+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_qwenimage_lpips_against_golden(tmp_path):
    """Compare a freshly generated QwenImage sample with its golden image via LPIPS."""
    # Resolve the golden reference before running the generation step.
    reference_png = _golden_media_path(
        tmp_path,
        "qwenimage_lpips_golden.png",
        "QwenImage LPIPS golden image",
    )
    candidate_png = tmp_path / "qwenimage_generated.png"
    checkpoint_dir = _lpips_model_path(QWENIMAGE_MODEL_SUBPATH)
    _generate_qwenimage_lpips_image(checkpoint_dir, candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path, "qwenimage", "image", QWENIMAGE_LPIPS_PROMPT, reference_png, candidate_png
    )
    _assert_lpips_below_threshold(lpips_score, QWENIMAGE_LPIPS_THRESHOLD)
939+
940+
941+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2v_lpips_against_golden(tmp_path):
    """Compare a freshly generated Cosmos3-Nano video with its golden video via LPIPS."""
    # Resolve the golden reference before running the generation step.
    reference_mp4 = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2v_lpips_golden_video.mp4",
        "Cosmos3-Nano T2V LPIPS golden video",
    )
    candidate_mp4 = tmp_path / "cosmos3_nano_t2v_generated.mp4"
    _generate_cosmos3_lpips_video(candidate_mp4)

    lpips_score = _run_lpips_eval(
        tmp_path, "cosmos3_nano_t2v", "video", COSMOS3_LPIPS_PROMPT, reference_mp4, candidate_mp4
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
959+
960+
961+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2i_lpips_against_golden(tmp_path):
    """Compare a freshly generated Cosmos3-Nano image with its golden image via LPIPS."""
    # Resolve the golden reference before running the generation step.
    reference_png = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2i_lpips_golden.png",
        "Cosmos3-Nano T2I LPIPS golden image",
    )
    candidate_png = tmp_path / "cosmos3_nano_t2i_generated.png"
    _generate_cosmos3_lpips_image(candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path, "cosmos3_nano_t2i", "image", COSMOS3_LPIPS_PROMPT, reference_png, candidate_png
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
977+
978+
782979
def _generate_wan_video(llm_venv, model_subpath, output_subdir):
783980
"""Generate a WAN video for a given model checkpoint.
784981

0 commit comments

Comments
 (0)