Skip to content

Commit 32010aa

Browse files
committed
[None][test] Add Stage-1 LPIPS golden accuracy tests for QwenImage and Cosmos3-Nano
Add default-setting single-GPU LPIPS golden tests for QwenImage and Cosmos3-Nano, and complete the VisualGen LPIPS CI protection introduced by the preceding changes. Refresh all eight golden media entries with the pinned staging main image at TRT-LLM commit 85665f5, which contains the Cosmos3 accuracy fix from #15545. Record the TRT-LLM commit, package versions, and container digest in every golden JSON while retaining the original LPIPS thresholds. Remove every remaining LPIPS waiver and keep all eight single-GPU and eight multi-GPU cases registered in the B200 test lists. Restore the Cosmos3 guardrail environment after generation and disable TRT-LLM MPI bootstrap before the torch-distributed multi-GPU harness imports its helpers. Validated on B200: all eight single-GPU and eight multi-GPU LPIPS cases passed with the original thresholds. Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com>
1 parent b6d186a commit 32010aa

13 files changed

Lines changed: 288 additions & 16 deletions
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"image": "cosmos3_nano_t2i_lpips_golden.png",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 1,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"attention_backend": "VANILLA",
13+
"torch_compile": false,
14+
"lpips_net": "alex",
15+
"lpips_threshold": 0.05,
16+
"diffusers_version": "0.38.0",
17+
"tensorrt_llm_version": "1.3.0rc20",
18+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
19+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
20+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"video": "cosmos3_nano_t2v_lpips_golden_video.mp4",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 189,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"frame_rate": 24.0,
13+
"attention_backend": "VANILLA",
14+
"torch_compile": false,
15+
"lpips_net": "alex",
16+
"lpips_threshold": 0.05,
17+
"diffusers_version": "0.38.0",
18+
"tensorrt_llm_version": "1.3.0rc20",
19+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
20+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
21+
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux1_lpips_golden.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,8 @@
1010
"seed": 42,
1111
"lpips_net": "alex",
1212
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
13+
"diffusers_version": "0.38.0",
14+
"tensorrt_llm_version": "1.3.0rc20",
15+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
16+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1417
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux2_lpips_golden.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,8 @@
1010
"seed": 42,
1111
"lpips_net": "alex",
1212
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
13+
"diffusers_version": "0.38.0",
14+
"tensorrt_llm_version": "1.3.0rc20",
15+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
16+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1417
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/ltx2_lpips_golden_video.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,9 @@
1212
"seed": 42,
1313
"frame_rate": 24.0,
1414
"lpips_net": "alex",
15-
"lpips_threshold": 0.05
15+
"lpips_threshold": 0.05,
16+
"diffusers_version": "0.38.0",
17+
"tensorrt_llm_version": "1.3.0rc20",
18+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
19+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1620
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"image": "qwenimage_lpips_golden.png",
3+
"model": "Qwen-Image",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "a tiny astronaut hatching from an egg on the moon",
6+
"negative_prompt": "",
7+
"height": 1328,
8+
"width": 1328,
9+
"num_inference_steps": 50,
10+
"true_cfg_scale": 4.0,
11+
"seed": 42,
12+
"torch_compile": false,
13+
"lpips_net": "alex",
14+
"lpips_threshold": 0.05,
15+
"diffusers_version": "0.38.0",
16+
"tensorrt_llm_version": "1.3.0rc20",
17+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
18+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
19+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:2d6938a0d47b090dd99cf9736985f699097c1ebd96ff93fe931b959a26655033
3+
size 14369870

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan21_t2v_lpips_golden_video.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,8 @@
1313
"frame_rate": 16.0,
1414
"lpips_net": "alex",
1515
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
16+
"diffusers_version": "0.38.0",
17+
"tensorrt_llm_version": "1.3.0rc20",
18+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
19+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1720
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan22_t2v_lpips_golden_video.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,8 @@
1313
"frame_rate": 16.0,
1414
"lpips_net": "alex",
1515
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
16+
"diffusers_version": "0.38.0",
17+
"tensorrt_llm_version": "1.3.0rc20",
18+
"tensorrt_llm_commit": "85665f5fd331d0154a78172954846d843085e83f",
19+
"container_image": "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release@sha256:3308a2dc0192a8329ea02eca7b5c44f290f5e894cd8c5921099308d84c3e5691"
1720
}

tests/integration/defs/examples/visual_gen/test_visual_gen.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
WAN_T2V_WIDTH = 832
4747
WAN_T2V_NUM_FRAMES = 165
4848

49+
# NB: this test file lives at tests/integration/defs/examples/visual_gen/, so the repo
50+
# root is five levels up (the LPIPS eval script is referenced from <repo>/scripts/).
4951
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
5052
VISUAL_GEN_LPIPS_EVAL_SCRIPT = os.path.join(
5153
REPO_ROOT, "scripts", "visualgen_eval", "visual_gen_lpips_score_eval.py"
@@ -87,6 +89,36 @@
8789
WAN22_LPIPS_GUIDANCE_SCALE = 4.0
8890
WAN22_LPIPS_SEED = 42
8991
WAN22_LPIPS_FRAME_RATE = 16.0
92+
93+
# QwenImage (text-to-image) — default-setting LPIPS golden.
94+
# Params mirror the QwenImage 20B reference defaults (pipeline_qwen_image.py).
95+
# NOTE: QwenImage's forward CFG knob is ``true_cfg_scale`` (not ``guidance_scale``),
96+
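# and real-CFG only engages when a negative prompt is supplied — presumably why an
# explicit (empty-string) negative prompt is defined below.
# NOTE(review): confirm the pipeline treats "" as "supplied"; otherwise true_cfg_scale has no effect.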
97+
QWENIMAGE_MODEL_SUBPATH = "qwen-image"
98+
QWENIMAGE_LPIPS_PROMPT = "a tiny astronaut hatching from an egg on the moon"
99+
QWENIMAGE_LPIPS_NEGATIVE_PROMPT = ""
100+
QWENIMAGE_LPIPS_HEIGHT = 1328
101+
QWENIMAGE_LPIPS_WIDTH = 1328
102+
QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS = 50
103+
QWENIMAGE_LPIPS_TRUE_CFG_SCALE = 4.0
104+
QWENIMAGE_LPIPS_SEED = 42
105+
QWENIMAGE_LPIPS_THRESHOLD = 0.05
106+
107+
# Cosmos3-Nano (text-to-video + text-to-image) — default-setting LPIPS golden.
108+
# Params are the Cosmos3 720P defaults (cosmos3/defaults.py:COSMOS3_720P_PARAMS).
109+
# Cosmos3 requires VANILLA attention and guardrails disabled in CI.
110+
COSMOS3_NANO_MODEL_SUBPATH = "Cosmos3-Nano"
111+
COSMOS3_LPIPS_PROMPT = "A serene mountain landscape with snow-capped peaks and a flowing river"
112+
COSMOS3_LPIPS_HEIGHT = 720
113+
COSMOS3_LPIPS_WIDTH = 1280
114+
COSMOS3_LPIPS_T2V_NUM_FRAMES = 189
115+
COSMOS3_LPIPS_T2I_NUM_FRAMES = 1
116+
COSMOS3_LPIPS_NUM_INFERENCE_STEPS = 35
117+
COSMOS3_LPIPS_GUIDANCE_SCALE = 6.0
118+
COSMOS3_LPIPS_SEED = 42
119+
COSMOS3_LPIPS_FRAME_RATE = 24.0
120+
COSMOS3_LPIPS_THRESHOLD = 0.05
121+
90122
# LTX-2 configuration
91123
LTX2_MODEL_CHECKPOINT_PATH = "LTX-2/ltx-2-19b-dev.safetensors"
92124
LTX2_TEXT_ENCODER_SUBPATH = "gemma-3-12b-it"
@@ -674,6 +706,111 @@ def wan22_bf16_video_path(_visual_gen_deps, llm_venv):
674706
return output_path
675707

676708

709+
def _generate_qwenimage_lpips_image(model_path, output_path):
    """Render the QwenImage text-to-image LPIPS sample and save it to ``output_path``.

    The pipeline is loaded from ``model_path`` with torch.compile disabled and
    warmup skipped, then run once with the module-level ``QWENIMAGE_LPIPS_*``
    settings. ``_skip_if_missing`` is called on the checkpoint directory before
    anything is loaded.
    """
    from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
    from tensorrt_llm.media.encoding import save_image
    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs

    _skip_if_missing(model_path, "QwenImage checkpoint", is_dir=True)
    _disable_inductor_compile_worker_quiesce()

    visual_gen_args = VisualGenArgs(
        model=model_path,
        torch_compile_config=TorchCompileConfig(enable=False),
    )
    pipeline = PipelineLoader(visual_gen_args).load(skip_warmup=True)
    try:
        # Collect the generation settings once so the forward call stays compact.
        forward_kwargs = {
            "prompt": QWENIMAGE_LPIPS_PROMPT,
            "negative_prompt": QWENIMAGE_LPIPS_NEGATIVE_PROMPT,
            "height": QWENIMAGE_LPIPS_HEIGHT,
            "width": QWENIMAGE_LPIPS_WIDTH,
            "num_inference_steps": QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS,
            "true_cfg_scale": QWENIMAGE_LPIPS_TRUE_CFG_SCALE,
            "seed": QWENIMAGE_LPIPS_SEED,
        }
        with torch.no_grad():
            output = pipeline.forward(**forward_kwargs)
        # Move the first image to host memory before the pipeline is torn down.
        first_image = output.image[0].detach().cpu()
    finally:
        # Drop the pipeline reference and run CUDA cleanup even if generation failed.
        del pipeline
        _cleanup_cuda()

    save_image(first_image, output_path)
739+
740+
741+
def _run_cosmos3_lpips_pipeline(num_frames):
    """Execute one Cosmos3-Nano generation (VANILLA attention, torch.compile off).

    ``num_frames`` selects the output length; passing ``1`` exercises the
    single-frame text-to-image path.

    Returns the generated video tensor ``(B, T, H, W, C)`` with
    ``T == num_frames`` moved to CPU, or ``None`` when the pipeline returned no
    result or no video.

    The ``TRTLLM_DISABLE_COSMOS3_GUARDRAILS`` environment variable is forced to
    ``"1"`` for the duration of the call and put back to its prior state
    (including "unset") afterwards.
    """
    env_key = "TRTLLM_DISABLE_COSMOS3_GUARDRAILS"
    saved_value = os.environ.get(env_key)
    # Cosmos3 re-reads the guardrail flag in __init__, so it has to be in place
    # before the pipeline is constructed.
    os.environ[env_key] = "1"
    try:
        from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
        from tensorrt_llm.visual_gen.args import (
            AttentionConfig,
            CompilationConfig,
            TorchCompileConfig,
            VisualGenArgs,
        )

        checkpoint_dir = _lpips_model_path(COSMOS3_NANO_MODEL_SUBPATH)
        _skip_if_missing(checkpoint_dir, "Cosmos3-Nano checkpoint", is_dir=True)
        _disable_inductor_compile_worker_quiesce()

        visual_gen_args = VisualGenArgs(
            model=checkpoint_dir,
            compilation_config=CompilationConfig(skip_warmup=True),
            torch_compile_config=TorchCompileConfig(enable=False),
            attention_config=AttentionConfig(backend="VANILLA"),
        )
        pipeline = PipelineLoader(visual_gen_args).load(skip_warmup=True)
        try:
            forward_kwargs = {
                "prompt": COSMOS3_LPIPS_PROMPT,
                "seed": COSMOS3_LPIPS_SEED,
                "height": COSMOS3_LPIPS_HEIGHT,
                "width": COSMOS3_LPIPS_WIDTH,
                "num_frames": num_frames,
                "num_inference_steps": COSMOS3_LPIPS_NUM_INFERENCE_STEPS,
                "guidance_scale": COSMOS3_LPIPS_GUIDANCE_SCALE,
                "frame_rate": COSMOS3_LPIPS_FRAME_RATE,
                "use_guardrails": False,
            }
            with torch.no_grad():
                output = pipeline.forward(**forward_kwargs)
            if output is not None and output.video is not None:
                return output.video.detach().cpu()
            return None
        finally:
            # Release the pipeline and run CUDA cleanup on success and failure alike.
            del pipeline
            _cleanup_cuda()
    finally:
        # Restore the caller's environment exactly as it was found.
        if saved_value is not None:
            os.environ[env_key] = saved_value
        else:
            os.environ.pop(env_key, None)
795+
796+
797+
def _generate_cosmos3_lpips_video(output_path):
    """Produce the Cosmos3-Nano text-to-video LPIPS sample and write it to ``output_path``.

    Runs the shared Cosmos3 LPIPS pipeline with ``COSMOS3_LPIPS_T2V_NUM_FRAMES``
    frames and saves the result through ``_save_lpips_video_mp4`` at
    ``COSMOS3_LPIPS_FRAME_RATE``. Fails the test if no video was produced.
    """
    frames = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2V_NUM_FRAMES)
    assert frames is not None, "Cosmos3-Nano T2V LPIPS run produced no video"
    _save_lpips_video_mp4(frames, output_path, frame_rate=COSMOS3_LPIPS_FRAME_RATE)
802+
803+
804+
def _generate_cosmos3_lpips_image(output_path):
    """Produce the Cosmos3-Nano text-to-image LPIPS sample and write it to ``output_path``.

    The image is obtained by running the shared Cosmos3 LPIPS pipeline with
    ``COSMOS3_LPIPS_T2I_NUM_FRAMES`` frames and saving the first frame of the
    first batch entry. Fails the test if the pipeline produced nothing.
    """
    from tensorrt_llm.media.encoding import save_image

    clip = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2I_NUM_FRAMES)
    assert clip is not None, "Cosmos3-Nano T2I LPIPS run produced no frame"
    # clip is laid out as (B, T, H, W, C); batch 0 / frame 0 gives the (H, W, C)
    # tensor that save_image expects.
    single_frame = clip[0, 0]
    save_image(single_frame, output_path)
812+
813+
677814
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
678815
def test_flux1_lpips_against_golden(tmp_path):
679816
generated_path = tmp_path / "flux1_generated.png"
@@ -758,6 +895,62 @@ def test_wan22_t2v_lpips_against_golden(tmp_path, wan22_bf16_video_path):
758895
_assert_lpips_below_threshold(score, WAN_LPIPS_THRESHOLD)
759896

760897

898+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_qwenimage_lpips_against_golden(tmp_path):
    """Score a freshly generated QwenImage sample against its golden image with LPIPS.

    Skipped when CUDA is unavailable. The score returned by ``_run_lpips_eval``
    is checked against ``QWENIMAGE_LPIPS_THRESHOLD``.
    """
    reference_image = _golden_media_path(
        tmp_path, "qwenimage_lpips_golden.png", "QwenImage LPIPS golden image"
    )
    candidate_image = tmp_path / "qwenimage_generated.png"

    checkpoint_dir = _lpips_model_path(QWENIMAGE_MODEL_SUBPATH)
    _generate_qwenimage_lpips_image(checkpoint_dir, candidate_image)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "qwenimage",
        "image",
        QWENIMAGE_LPIPS_PROMPT,
        reference_image,
        candidate_image,
    )
    _assert_lpips_below_threshold(lpips_score, QWENIMAGE_LPIPS_THRESHOLD)
914+
915+
916+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2v_lpips_against_golden(tmp_path):
    """Score a freshly generated Cosmos3-Nano video against its golden video with LPIPS.

    Skipped when CUDA is unavailable. The score returned by ``_run_lpips_eval``
    is checked against ``COSMOS3_LPIPS_THRESHOLD``.
    """
    reference_video = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2v_lpips_golden_video.mp4",
        "Cosmos3-Nano T2V LPIPS golden video",
    )
    candidate_video = tmp_path / "cosmos3_nano_t2v_generated.mp4"

    _generate_cosmos3_lpips_video(candidate_video)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "cosmos3_nano_t2v",
        "video",
        COSMOS3_LPIPS_PROMPT,
        reference_video,
        candidate_video,
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
934+
935+
936+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2i_lpips_against_golden(tmp_path):
    """Score a freshly generated Cosmos3-Nano image against its golden image with LPIPS.

    Skipped when CUDA is unavailable. The score returned by ``_run_lpips_eval``
    is checked against ``COSMOS3_LPIPS_THRESHOLD``.
    """
    reference_image = _golden_media_path(
        tmp_path, "cosmos3_nano_t2i_lpips_golden.png", "Cosmos3-Nano T2I LPIPS golden image"
    )
    candidate_image = tmp_path / "cosmos3_nano_t2i_generated.png"

    _generate_cosmos3_lpips_image(candidate_image)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "cosmos3_nano_t2i",
        "image",
        COSMOS3_LPIPS_PROMPT,
        reference_image,
        candidate_image,
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
952+
953+
761954
def _generate_wan_video(llm_venv, model_subpath, output_subdir):
762955
"""Generate a WAN video for a given model checkpoint.
763956

0 commit comments

Comments
 (0)