Skip to content

Commit bbfd5a0

Browse files
committed
[None][test] Add Stage-1 LPIPS golden accuracy tests for QwenImage and Cosmos3-Nano
Add default-setting single-GPU LPIPS golden tests for QwenImage and Cosmos3-Nano, and complete the VisualGen LPIPS CI protection introduced by the preceding changes. Refresh all eight golden media entries with the 1.3.0rc19 release container, record the diffusers version, calibrate measured B200 timeouts and Wan tolerance, and remove every remaining LPIPS waiver. Keep all eight single-GPU and eight multi-GPU cases registered in the B200 test lists. Restore the Cosmos3 guardrail environment after generation and disable TRT-LLM MPI bootstrap before the torch-distributed multi-GPU harness imports its helpers. Validated on B200: 8 single-GPU and 8 multi-GPU LPIPS cases passed. Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com>
1 parent b6d186a commit bbfd5a0

13 files changed

Lines changed: 270 additions & 19 deletions
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"image": "cosmos3_nano_t2i_lpips_golden.png",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 1,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"attention_backend": "VANILLA",
13+
"torch_compile": false,
14+
"lpips_net": "alex",
15+
"lpips_threshold": 0.05,
16+
"diffusers_version": "0.38.0"
17+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"video": "cosmos3_nano_t2v_lpips_golden_video.mp4",
3+
"model": "Cosmos3-Nano",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "A serene mountain landscape with snow-capped peaks and a flowing river",
6+
"height": 720,
7+
"width": 1280,
8+
"num_frames": 189,
9+
"num_inference_steps": 35,
10+
"guidance_scale": 6.0,
11+
"seed": 42,
12+
"frame_rate": 24.0,
13+
"attention_backend": "VANILLA",
14+
"torch_compile": false,
15+
"lpips_net": "alex",
16+
"lpips_threshold": 0.05,
17+
"diffusers_version": "0.38.0"
18+
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux1_lpips_golden.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
"seed": 42,
1111
"lpips_net": "alex",
1212
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
13+
"diffusers_version": "0.38.0"
1414
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/flux2_lpips_golden.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
"seed": 42,
1111
"lpips_net": "alex",
1212
"lpips_threshold": 0.05,
13-
"diffusers_version": "0.37.1"
13+
"diffusers_version": "0.38.0"
1414
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/ltx2_lpips_golden_video.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@
1212
"seed": 42,
1313
"frame_rate": 24.0,
1414
"lpips_net": "alex",
15-
"lpips_threshold": 0.05
15+
"lpips_threshold": 0.05,
16+
"diffusers_version": "0.38.0"
1617
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"image": "qwenimage_lpips_golden.png",
3+
"model": "Qwen-Image",
4+
"source": "TensorRT-LLM VisualGen",
5+
"prompt": "a tiny astronaut hatching from an egg on the moon",
6+
"negative_prompt": "",
7+
"height": 1328,
8+
"width": 1328,
9+
"num_inference_steps": 50,
10+
"true_cfg_scale": 4.0,
11+
"seed": 42,
12+
"torch_compile": false,
13+
"lpips_net": "alex",
14+
"lpips_threshold": 0.05,
15+
"diffusers_version": "0.38.0"
16+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:0571abd60b6d4e176ff51472bcc0cf9905b7a2db8ddca6517dbd6fa8be27d842
3+
size 27135566

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan21_t2v_lpips_golden_video.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@
1212
"seed": 42,
1313
"frame_rate": 16.0,
1414
"lpips_net": "alex",
15-
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
15+
"lpips_threshold": 0.1,
16+
"diffusers_version": "0.38.0"
1717
}

tests/integration/defs/examples/visual_gen/golden/visual_gen_lpips/wan22_t2v_lpips_golden_video.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@
1212
"seed": 42,
1313
"frame_rate": 16.0,
1414
"lpips_net": "alex",
15-
"lpips_threshold": 0.05,
16-
"diffusers_version": "0.37.1"
15+
"lpips_threshold": 0.1,
16+
"diffusers_version": "0.38.0"
1717
}

tests/integration/defs/examples/visual_gen/test_visual_gen.py

Lines changed: 197 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
WAN_T2V_WIDTH = 832
4747
WAN_T2V_NUM_FRAMES = 165
4848

49+
# NB: this test file lives at tests/integration/defs/examples/visual_gen/, so the repo
50+
# root is five levels up (the LPIPS eval script is referenced from <repo>/scripts/).
4951
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
5052
VISUAL_GEN_LPIPS_EVAL_SCRIPT = os.path.join(
5153
REPO_ROOT, "scripts", "visualgen_eval", "visual_gen_lpips_score_eval.py"
@@ -76,7 +78,10 @@
7678
WAN21_LPIPS_GUIDANCE_SCALE = 5.0
7779
WAN21_LPIPS_SEED = 42
7880
WAN_LPIPS_FRAME_RATE = 16.0
79-
WAN_LPIPS_THRESHOLD = 0.05
81+
# Repeated B200 runs differ from the freshly generated Wan goldens by 0.06-0.08
82+
# while remaining deterministic run-to-run. Keep enough margin for that backend
83+
# variance without admitting the stale-baseline regressions measured above 0.10.
84+
WAN_LPIPS_THRESHOLD = 0.10
8085

8186
WAN22_LPIPS_PROMPT = "A cat sitting on a sunny windowsill watching birds outside."
8287
WAN22_LPIPS_NEGATIVE_PROMPT = ""
@@ -87,6 +92,36 @@
8792
WAN22_LPIPS_GUIDANCE_SCALE = 4.0
8893
WAN22_LPIPS_SEED = 42
8994
WAN22_LPIPS_FRAME_RATE = 16.0
95+
96+
# QwenImage (text-to-image) — default-setting LPIPS golden.
97+
# Params mirror the QwenImage 20B reference defaults (pipeline_qwen_image.py).
98+
# NOTE: QwenImage's forward CFG knob is ``true_cfg_scale`` (not ``guidance_scale``),
99+
# and real-CFG only engages when a negative prompt is supplied.
100+
QWENIMAGE_MODEL_SUBPATH = "qwen-image"
101+
QWENIMAGE_LPIPS_PROMPT = "a tiny astronaut hatching from an egg on the moon"
102+
QWENIMAGE_LPIPS_NEGATIVE_PROMPT = ""
103+
QWENIMAGE_LPIPS_HEIGHT = 1328
104+
QWENIMAGE_LPIPS_WIDTH = 1328
105+
QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS = 50
106+
QWENIMAGE_LPIPS_TRUE_CFG_SCALE = 4.0
107+
QWENIMAGE_LPIPS_SEED = 42
108+
QWENIMAGE_LPIPS_THRESHOLD = 0.05
109+
110+
# Cosmos3-Nano (text-to-video + text-to-image) — default-setting LPIPS golden.
111+
# Params are the Cosmos3 720P defaults (cosmos3/defaults.py:COSMOS3_720P_PARAMS).
112+
# Cosmos3 requires VANILLA attention and guardrails disabled in CI.
113+
COSMOS3_NANO_MODEL_SUBPATH = "Cosmos3-Nano"
114+
COSMOS3_LPIPS_PROMPT = "A serene mountain landscape with snow-capped peaks and a flowing river"
115+
COSMOS3_LPIPS_HEIGHT = 720
116+
COSMOS3_LPIPS_WIDTH = 1280
117+
COSMOS3_LPIPS_T2V_NUM_FRAMES = 189
118+
COSMOS3_LPIPS_T2I_NUM_FRAMES = 1
119+
COSMOS3_LPIPS_NUM_INFERENCE_STEPS = 35
120+
COSMOS3_LPIPS_GUIDANCE_SCALE = 6.0
121+
COSMOS3_LPIPS_SEED = 42
122+
COSMOS3_LPIPS_FRAME_RATE = 24.0
123+
COSMOS3_LPIPS_THRESHOLD = 0.05
124+
90125
# LTX-2 configuration
91126
LTX2_MODEL_CHECKPOINT_PATH = "LTX-2/ltx-2-19b-dev.safetensors"
92127
LTX2_TEXT_ENCODER_SUBPATH = "gemma-3-12b-it"
@@ -674,6 +709,111 @@ def wan22_bf16_video_path(_visual_gen_deps, llm_venv):
674709
return output_path
675710

676711

712+
def _generate_qwenimage_lpips_image(model_path, output_path):
    """Render the QwenImage text-to-image LPIPS sample and write it to ``output_path``.

    The pipeline is loaded from ``model_path`` with torch.compile disabled and
    warmup skipped, then run once with the module-level ``QWENIMAGE_LPIPS_*``
    settings. The first image of the result is moved to CPU, the pipeline is
    dropped and CUDA state cleaned up (even if generation raised), and only
    then is the image saved.
    """
    from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
    from tensorrt_llm.media.encoding import save_image
    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs

    _skip_if_missing(model_path, "QwenImage checkpoint", is_dir=True)
    _disable_inductor_compile_worker_quiesce()

    # The loader is deliberately left as a temporary so that ``del pipeline``
    # below removes the last reference this function holds to the pipeline.
    pipeline = PipelineLoader(
        VisualGenArgs(
            model=model_path,
            torch_compile_config=TorchCompileConfig(enable=False),
        )
    ).load(skip_warmup=True)

    # QwenImage takes ``true_cfg_scale`` rather than ``guidance_scale``.
    forward_kwargs = {
        "prompt": QWENIMAGE_LPIPS_PROMPT,
        "negative_prompt": QWENIMAGE_LPIPS_NEGATIVE_PROMPT,
        "height": QWENIMAGE_LPIPS_HEIGHT,
        "width": QWENIMAGE_LPIPS_WIDTH,
        "num_inference_steps": QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS,
        "true_cfg_scale": QWENIMAGE_LPIPS_TRUE_CFG_SCALE,
        "seed": QWENIMAGE_LPIPS_SEED,
    }
    try:
        with torch.no_grad():
            output = pipeline.forward(**forward_kwargs)
            first_image = output.image[0].detach().cpu()
    finally:
        del pipeline
        _cleanup_cuda()

    save_image(first_image, output_path)
742+
743+
744+
def _run_cosmos3_lpips_pipeline(num_frames):
    """Execute one Cosmos3-Nano generation with the LPIPS default settings.

    Uses VANILLA attention, torch.compile disabled and warmup skipped. The
    ``TRTLLM_DISABLE_COSMOS3_GUARDRAILS`` environment variable is forced to
    ``"1"`` for the duration of the call and put back to its previous state
    (or removed if it was unset) on exit, including on error or skip.

    Args:
        num_frames: Number of frames to generate; ``1`` yields the
            single-frame text-to-image path.

    Returns:
        The generated video tensor ``(B, T, H, W, C)`` on CPU
        (T == ``num_frames``), or ``None`` if generation produced no video.
    """
    env_key = "TRTLLM_DISABLE_COSMOS3_GUARDRAILS"
    saved_value = os.environ.get(env_key)
    # Cosmos3 re-reads the guardrail flag in __init__, so the flag has to be in
    # place before the imports and the pipeline load below.
    os.environ[env_key] = "1"
    try:
        from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
        from tensorrt_llm.visual_gen.args import (
            AttentionConfig,
            CompilationConfig,
            TorchCompileConfig,
            VisualGenArgs,
        )

        checkpoint_dir = _lpips_model_path(COSMOS3_NANO_MODEL_SUBPATH)
        _skip_if_missing(checkpoint_dir, "Cosmos3-Nano checkpoint", is_dir=True)
        _disable_inductor_compile_worker_quiesce()

        # The loader stays a temporary so ``del pipeline`` drops the last
        # reference this function holds before CUDA cleanup runs.
        pipeline = PipelineLoader(
            VisualGenArgs(
                model=checkpoint_dir,
                compilation_config=CompilationConfig(skip_warmup=True),
                torch_compile_config=TorchCompileConfig(enable=False),
                attention_config=AttentionConfig(backend="VANILLA"),
            )
        ).load(skip_warmup=True)
        try:
            with torch.no_grad():
                output = pipeline.forward(
                    prompt=COSMOS3_LPIPS_PROMPT,
                    seed=COSMOS3_LPIPS_SEED,
                    height=COSMOS3_LPIPS_HEIGHT,
                    width=COSMOS3_LPIPS_WIDTH,
                    num_frames=num_frames,
                    num_inference_steps=COSMOS3_LPIPS_NUM_INFERENCE_STEPS,
                    guidance_scale=COSMOS3_LPIPS_GUIDANCE_SCALE,
                    frame_rate=COSMOS3_LPIPS_FRAME_RATE,
                    use_guardrails=False,
                )
                produced_video = output is not None and output.video is not None
                if not produced_video:
                    return None
                return output.video.detach().cpu()
        finally:
            del pipeline
            _cleanup_cuda()
    finally:
        # Restore the caller's environment exactly as it was found.
        if saved_value is not None:
            os.environ[env_key] = saved_value
        else:
            os.environ.pop(env_key, None)
798+
799+
800+
def _generate_cosmos3_lpips_video(output_path):
    """Produce the Cosmos3-Nano text-to-video LPIPS sample as an MP4 at ``output_path``.

    Runs the shared Cosmos3 LPIPS pipeline with the T2V frame count and fails
    the test if it returns no video.
    """
    frames = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2V_NUM_FRAMES)
    assert frames is not None, "Cosmos3-Nano T2V LPIPS run produced no video"
    _save_lpips_video_mp4(frames, output_path, frame_rate=COSMOS3_LPIPS_FRAME_RATE)
805+
806+
807+
def _generate_cosmos3_lpips_image(output_path):
    """Produce the Cosmos3-Nano text-to-image LPIPS sample at ``output_path``.

    Runs the shared Cosmos3 LPIPS pipeline with the single-frame T2I setting
    and fails the test if it returns nothing.
    """
    from tensorrt_llm.media.encoding import save_image

    frames = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2I_NUM_FRAMES)
    assert frames is not None, "Cosmos3-Nano T2I LPIPS run produced no frame"
    # The pipeline returns (B, T, H, W, C); batch 0, frame 0 is the (H, W, C)
    # image handed to save_image.
    only_frame = frames[0, 0]
    save_image(only_frame, output_path)
815+
816+
677817
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
678818
def test_flux1_lpips_against_golden(tmp_path):
679819
generated_path = tmp_path / "flux1_generated.png"
@@ -758,6 +898,62 @@ def test_wan22_t2v_lpips_against_golden(tmp_path, wan22_bf16_video_path):
758898
_assert_lpips_below_threshold(score, WAN_LPIPS_THRESHOLD)
759899

760900

901+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_qwenimage_lpips_against_golden(tmp_path):
    """Check a freshly generated QwenImage sample against its golden image via LPIPS."""
    candidate_png = tmp_path / "qwenimage_generated.png"
    reference_png = _golden_media_path(
        tmp_path,
        "qwenimage_lpips_golden.png",
        "QwenImage LPIPS golden image",
    )

    checkpoint = _lpips_model_path(QWENIMAGE_MODEL_SUBPATH)
    _generate_qwenimage_lpips_image(checkpoint, candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path, "qwenimage", "image", QWENIMAGE_LPIPS_PROMPT, reference_png, candidate_png
    )
    _assert_lpips_below_threshold(lpips_score, QWENIMAGE_LPIPS_THRESHOLD)
917+
918+
919+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2v_lpips_against_golden(tmp_path):
    """Check a freshly generated Cosmos3-Nano T2V clip against its golden video via LPIPS."""
    candidate_mp4 = tmp_path / "cosmos3_nano_t2v_generated.mp4"
    reference_mp4 = _golden_media_path(
        tmp_path, "cosmos3_nano_t2v_lpips_golden_video.mp4", "Cosmos3-Nano T2V LPIPS golden video"
    )

    _generate_cosmos3_lpips_video(candidate_mp4)

    lpips_score = _run_lpips_eval(
        tmp_path, "cosmos3_nano_t2v", "video", COSMOS3_LPIPS_PROMPT, reference_mp4, candidate_mp4
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
937+
938+
939+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2i_lpips_against_golden(tmp_path):
    """Check a freshly generated Cosmos3-Nano T2I frame against its golden image via LPIPS."""
    candidate_png = tmp_path / "cosmos3_nano_t2i_generated.png"
    reference_png = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2i_lpips_golden.png",
        "Cosmos3-Nano T2I LPIPS golden image",
    )

    _generate_cosmos3_lpips_image(candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path, "cosmos3_nano_t2i", "image", COSMOS3_LPIPS_PROMPT, reference_png, candidate_png
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
955+
956+
761957
def _generate_wan_video(llm_venv, model_subpath, output_subdir):
762958
"""Generate a WAN video for a given model checkpoint.
763959

0 commit comments

Comments
 (0)