|
46 | 46 | WAN_T2V_WIDTH = 832 |
47 | 47 | WAN_T2V_NUM_FRAMES = 165 |
48 | 48 |
|
| 49 | +# NB: this test file lives at tests/integration/defs/examples/visual_gen/, so the repo |
| 50 | +# root is five levels up (the LPIPS eval script is referenced from <repo>/scripts/). |
49 | 51 | REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) |
50 | 52 | VISUAL_GEN_LPIPS_EVAL_SCRIPT = os.path.join( |
51 | 53 | REPO_ROOT, "scripts", "visualgen_eval", "visual_gen_lpips_score_eval.py" |
|
88 | 90 | WAN22_LPIPS_GUIDANCE_SCALE = 4.0 |
89 | 91 | WAN22_LPIPS_SEED = 42 |
90 | 92 | WAN22_LPIPS_FRAME_RATE = 16.0 |
| 93 | + |
# QwenImage (text-to-image) — default-setting LPIPS golden.
# Params mirror the QwenImage 20B reference defaults (pipeline_qwen_image.py).
# NOTE: QwenImage's forward CFG knob is ``true_cfg_scale`` (not ``guidance_scale``),
# and real-CFG only engages when a negative prompt is supplied.
# NOTE(review): the negative prompt below is the empty string — confirm the pipeline
# counts "" as "supplied"; otherwise ``true_cfg_scale`` has no effect on this golden.
QWENIMAGE_MODEL_SUBPATH = "qwen-image"  # resolved through _lpips_model_path() by the test
QWENIMAGE_LPIPS_PROMPT = "a tiny astronaut hatching from an egg on the moon"
QWENIMAGE_LPIPS_NEGATIVE_PROMPT = ""  # forwarded as ``negative_prompt``
QWENIMAGE_LPIPS_HEIGHT = 1328
QWENIMAGE_LPIPS_WIDTH = 1328
QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS = 50
QWENIMAGE_LPIPS_TRUE_CFG_SCALE = 4.0  # forwarded as ``true_cfg_scale``
QWENIMAGE_LPIPS_SEED = 42
QWENIMAGE_LPIPS_THRESHOLD = 0.05  # bound handed to _assert_lpips_below_threshold()
| 107 | + |
# Cosmos3-Nano (text-to-video + text-to-image) — default-setting LPIPS golden.
# Params are the Cosmos3 720P defaults (cosmos3/defaults.py:COSMOS3_720P_PARAMS).
# Cosmos3 requires VANILLA attention and guardrails disabled in CI.
# The T2V and T2I checks share every value below except the frame count.
COSMOS3_NANO_MODEL_SUBPATH = "Cosmos3-Nano"  # resolved through _lpips_model_path()
COSMOS3_LPIPS_PROMPT = "A serene mountain landscape with snow-capped peaks and a flowing river"
COSMOS3_LPIPS_HEIGHT = 720
COSMOS3_LPIPS_WIDTH = 1280
COSMOS3_LPIPS_T2V_NUM_FRAMES = 189  # frame count for the text-to-video sample
COSMOS3_LPIPS_T2I_NUM_FRAMES = 1  # a single frame selects the text-to-image path
COSMOS3_LPIPS_NUM_INFERENCE_STEPS = 35
COSMOS3_LPIPS_GUIDANCE_SCALE = 6.0
COSMOS3_LPIPS_SEED = 42
COSMOS3_LPIPS_FRAME_RATE = 24.0  # used for generation and when encoding the MP4
COSMOS3_LPIPS_THRESHOLD = 0.05  # bound handed to _assert_lpips_below_threshold()
| 122 | + |
91 | 123 | # LTX-2 configuration |
92 | 124 | LTX2_MODEL_CHECKPOINT_PATH = "LTX-2/ltx-2-19b-dev.safetensors" |
93 | 125 | LTX2_TEXT_ENCODER_SUBPATH = "gemma-3-12b-it" |
@@ -488,12 +520,15 @@ def _assert_lpips_below_threshold(score, threshold): |
488 | 520 | def _generate_flux_lpips_image(model_path, output_path): |
489 | 521 | from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader |
490 | 522 | from tensorrt_llm.media.encoding import save_image |
491 | | - from tensorrt_llm.visual_gen.args import VisualGenArgs |
| 523 | + from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs |
492 | 524 |
|
493 | 525 | _skip_if_missing(model_path, "FLUX checkpoint", is_dir=True) |
494 | 526 | _disable_inductor_compile_worker_quiesce() |
495 | 527 | with _lpips_deterministic_algorithms(): |
496 | | - args = VisualGenArgs(model=model_path) |
| 528 | + args = VisualGenArgs( |
| 529 | + model=model_path, |
| 530 | + torch_compile_config=TorchCompileConfig(enable=False), |
| 531 | + ) |
497 | 532 | pipeline = PipelineLoader(args).load(skip_warmup=True) |
498 | 533 | try: |
499 | 534 | result = pipeline.forward( |
@@ -572,13 +607,14 @@ def _run_wan_lpips_pipeline( |
572 | 607 | parallel=None, |
573 | 608 | ): |
574 | 609 | from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader |
575 | | - from tensorrt_llm.visual_gen.args import AttentionConfig, VisualGenArgs |
| 610 | + from tensorrt_llm.visual_gen.args import AttentionConfig, TorchCompileConfig, VisualGenArgs |
576 | 611 |
|
577 | 612 | _skip_if_missing(model_path, "Wan checkpoint", is_dir=True) |
578 | 613 | _disable_inductor_compile_worker_quiesce() |
579 | 614 | args_kwargs = dict( |
580 | 615 | model=model_path, |
581 | 616 | attention_config=AttentionConfig(backend=attention_backend), |
| 617 | + torch_compile_config=TorchCompileConfig(enable=False), |
582 | 618 | ) |
583 | 619 | if parallel is not None: |
584 | 620 | args_kwargs["parallel_config"] = parallel |
@@ -677,6 +713,111 @@ def wan22_bf16_video_path(_visual_gen_deps, llm_venv): |
677 | 713 | return output_path |
678 | 714 |
|
679 | 715 |
|
def _generate_qwenimage_lpips_image(model_path, output_path):
    """Render the QwenImage LPIPS sample and write it to ``output_path``.

    The pipeline is loaded with torch.compile disabled and warmup skipped, run
    once under ``torch.no_grad()`` with the ``QWENIMAGE_LPIPS_*`` settings, and
    torn down (followed by ``_cleanup_cuda()``) before the image is saved.

    Args:
        model_path: QwenImage checkpoint directory; the check is handed to
            ``_skip_if_missing`` before anything is loaded.
        output_path: Destination passed to ``save_image``.
    """
    from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
    from tensorrt_llm.media.encoding import save_image
    from tensorrt_llm.visual_gen.args import TorchCompileConfig, VisualGenArgs

    _skip_if_missing(model_path, "QwenImage checkpoint", is_dir=True)
    _disable_inductor_compile_worker_quiesce()

    loader_args = VisualGenArgs(
        model=model_path,
        torch_compile_config=TorchCompileConfig(enable=False),
    )
    pipeline = PipelineLoader(loader_args).load(skip_warmup=True)

    # Collected up front so the forward call below stays a one-liner.
    forward_kwargs = {
        "prompt": QWENIMAGE_LPIPS_PROMPT,
        "negative_prompt": QWENIMAGE_LPIPS_NEGATIVE_PROMPT,
        "height": QWENIMAGE_LPIPS_HEIGHT,
        "width": QWENIMAGE_LPIPS_WIDTH,
        "num_inference_steps": QWENIMAGE_LPIPS_NUM_INFERENCE_STEPS,
        "true_cfg_scale": QWENIMAGE_LPIPS_TRUE_CFG_SCALE,
        "seed": QWENIMAGE_LPIPS_SEED,
    }
    try:
        with torch.no_grad():
            output = pipeline.forward(**forward_kwargs)
            # Copy the first image to the CPU before the pipeline is released.
            image_on_cpu = output.image[0].detach().cpu()
    finally:
        del pipeline
        _cleanup_cuda()

    save_image(image_on_cpu, output_path)
| 746 | + |
| 747 | + |
def _run_cosmos3_lpips_pipeline(num_frames):
    """Generate Cosmos3-Nano output for the LPIPS checks and return it on the CPU.

    The pipeline is built with torch.compile disabled, the ``VANILLA`` attention
    backend and warmup skipped. ``TRTLLM_DISABLE_COSMOS3_GUARDRAILS`` is forced
    to ``"1"`` for the duration of the call and then restored to its previous
    value (or removed again if it was unset), whether or not generation succeeds.

    Args:
        num_frames: Frame count forwarded to ``pipeline.forward``; ``1`` yields
            the single-frame text-to-image path.

    Returns:
        A detached CPU copy of ``result.video`` — laid out as ``(B, T, H, W, C)``
        with ``T == num_frames`` per the original author's note — or ``None``
        when the pipeline returns no result or no video.
    """
    env_key = "TRTLLM_DISABLE_COSMOS3_GUARDRAILS"
    saved_env_value = os.environ.get(env_key)
    # Cosmos3 re-reads the guardrail flag in __init__, so it must be in place
    # before the pipeline is loaded.
    os.environ[env_key] = "1"
    try:
        from tensorrt_llm._torch.visual_gen.pipeline_loader import PipelineLoader
        from tensorrt_llm.visual_gen.args import (
            AttentionConfig,
            CompilationConfig,
            TorchCompileConfig,
            VisualGenArgs,
        )

        checkpoint_dir = _lpips_model_path(COSMOS3_NANO_MODEL_SUBPATH)
        _skip_if_missing(checkpoint_dir, "Cosmos3-Nano checkpoint", is_dir=True)
        _disable_inductor_compile_worker_quiesce()

        loader_args = VisualGenArgs(
            model=checkpoint_dir,
            compilation_config=CompilationConfig(skip_warmup=True),
            torch_compile_config=TorchCompileConfig(enable=False),
            attention_config=AttentionConfig(backend="VANILLA"),
        )
        pipeline = PipelineLoader(loader_args).load(skip_warmup=True)

        forward_kwargs = {
            "prompt": COSMOS3_LPIPS_PROMPT,
            "seed": COSMOS3_LPIPS_SEED,
            "height": COSMOS3_LPIPS_HEIGHT,
            "width": COSMOS3_LPIPS_WIDTH,
            "num_frames": num_frames,
            "num_inference_steps": COSMOS3_LPIPS_NUM_INFERENCE_STEPS,
            "guidance_scale": COSMOS3_LPIPS_GUIDANCE_SCALE,
            "frame_rate": COSMOS3_LPIPS_FRAME_RATE,
            "use_guardrails": False,
        }
        try:
            with torch.no_grad():
                result = pipeline.forward(**forward_kwargs)
            if result is not None and result.video is not None:
                return result.video.detach().cpu()
            return None
        finally:
            del pipeline
            _cleanup_cuda()
    finally:
        # Leave the process environment exactly as it was found.
        if saved_env_value is not None:
            os.environ[env_key] = saved_env_value
        else:
            os.environ.pop(env_key, None)
| 802 | + |
| 803 | + |
def _generate_cosmos3_lpips_video(output_path):
    """Produce the Cosmos3-Nano text-to-video LPIPS clip and encode it as MP4.

    Args:
        output_path: Destination handed to ``_save_lpips_video_mp4``.
    """
    clip = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2V_NUM_FRAMES)
    # The runner returns None when the pipeline yields nothing; fail loudly here.
    assert clip is not None, "Cosmos3-Nano T2V LPIPS run produced no video"
    _save_lpips_video_mp4(
        clip,
        output_path,
        frame_rate=COSMOS3_LPIPS_FRAME_RATE,
    )
| 809 | + |
| 810 | + |
def _generate_cosmos3_lpips_image(output_path):
    """Produce the Cosmos3-Nano text-to-image LPIPS sample (one frame) as an image.

    Args:
        output_path: Destination handed to ``save_image``.
    """
    from tensorrt_llm.media.encoding import save_image

    clip = _run_cosmos3_lpips_pipeline(COSMOS3_LPIPS_T2I_NUM_FRAMES)
    assert clip is not None, "Cosmos3-Nano T2I LPIPS run produced no frame"
    # The clip is (B, T, H, W, C); batch 0 / frame 0 is the (H, W, C) image.
    only_frame = clip[0, 0]
    save_image(only_frame, output_path)
| 819 | + |
| 820 | + |
680 | 821 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") |
681 | 822 | def test_flux1_lpips_against_golden(tmp_path): |
682 | 823 | generated_path = tmp_path / "flux1_generated.png" |
@@ -779,6 +920,62 @@ def test_wan22_t2v_lpips_against_golden(tmp_path, wan22_bf16_video_path): |
779 | 920 | _assert_lpips_below_threshold(score, WAN_LPIPS_THRESHOLD) |
780 | 921 |
|
781 | 922 |
|
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_qwenimage_lpips_against_golden(tmp_path):
    """QwenImage output must score below the LPIPS threshold against its golden image."""
    candidate_png = tmp_path / "qwenimage_generated.png"
    # Resolve the golden first, before the expensive generation step.
    reference_png = _golden_media_path(
        tmp_path,
        "qwenimage_lpips_golden.png",
        "QwenImage LPIPS golden image",
    )

    checkpoint_dir = _lpips_model_path(QWENIMAGE_MODEL_SUBPATH)
    _generate_qwenimage_lpips_image(checkpoint_dir, candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "qwenimage",
        "image",
        QWENIMAGE_LPIPS_PROMPT,
        reference_png,
        candidate_png,
    )
    _assert_lpips_below_threshold(lpips_score, QWENIMAGE_LPIPS_THRESHOLD)
| 939 | + |
| 940 | + |
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2v_lpips_against_golden(tmp_path):
    """Cosmos3-Nano T2V output must score below the LPIPS threshold against its golden video."""
    candidate_mp4 = tmp_path / "cosmos3_nano_t2v_generated.mp4"
    # Resolve the golden first, before the expensive generation step.
    reference_mp4 = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2v_lpips_golden_video.mp4",
        "Cosmos3-Nano T2V LPIPS golden video",
    )

    _generate_cosmos3_lpips_video(candidate_mp4)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "cosmos3_nano_t2v",
        "video",
        COSMOS3_LPIPS_PROMPT,
        reference_mp4,
        candidate_mp4,
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
| 959 | + |
| 960 | + |
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cosmos3_nano_t2i_lpips_against_golden(tmp_path):
    """Cosmos3-Nano T2I output must score below the LPIPS threshold against its golden image."""
    candidate_png = tmp_path / "cosmos3_nano_t2i_generated.png"
    # Resolve the golden first, before the expensive generation step.
    reference_png = _golden_media_path(
        tmp_path,
        "cosmos3_nano_t2i_lpips_golden.png",
        "Cosmos3-Nano T2I LPIPS golden image",
    )

    _generate_cosmos3_lpips_image(candidate_png)

    lpips_score = _run_lpips_eval(
        tmp_path,
        "cosmos3_nano_t2i",
        "image",
        COSMOS3_LPIPS_PROMPT,
        reference_png,
        candidate_png,
    )
    _assert_lpips_below_threshold(lpips_score, COSMOS3_LPIPS_THRESHOLD)
| 977 | + |
| 978 | + |
782 | 979 | def _generate_wan_video(llm_venv, model_subpath, output_subdir): |
783 | 980 | """Generate a WAN video for a given model checkpoint. |
784 | 981 |
|
|
0 commit comments