# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 16 | +"""End-to-end tests for skip-softmax sparse attention on a tiny Wan 2.2 pipeline. |
| 17 | +
|
| 18 | +Uses ``create_tiny_wan22_pipeline_dir`` (matches diffusers' own tiny Wan 2.2 |
| 19 | +test config): dual 2-layer transformer, tiny VAE, tiny T5 text encoder. |
| 20 | +The full ``WanPipeline`` is loaded, sparsified with ``mtsa.sparsify``, and |
| 21 | +run end-to-end with a 2-step denoising loop — asserting no NaN/Inf in the |
| 22 | +pipeline output. |
| 23 | +""" |

import pytest
import torch

pytestmark = [
    pytest.mark.filterwarnings("ignore::UserWarning"),
    pytest.mark.filterwarnings("ignore::RuntimeWarning"),
    pytest.mark.filterwarnings("ignore::DeprecationWarning"),
]

diffusers = pytest.importorskip("diffusers")

from modelopt.torch.kernels import IS_AVAILABLE as TRITON_KERNEL_AVAILABLE

if TRITON_KERNEL_AVAILABLE:
    import modelopt.torch.sparsity.attention_sparsity as mtsa
    from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule


# ---------------------------------------------------------------------------
# Tiny Wan 2.2 pipeline fixture — shared across tests (pipeline load is costly)
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def tiny_wan22_path(tmp_path_factory):
    """Create and save a tiny Wan 2.2 pipeline to disk once per module."""
    from _test_utils.torch.diffusers_models import create_tiny_wan22_pipeline_dir

    return str(create_tiny_wan22_pipeline_dir(tmp_path_factory.mktemp("tiny_wan22")))


@pytest.fixture
def tiny_wan22_pipe(tiny_wan22_path):
    """Load a fresh copy of the tiny Wan 2.2 pipeline on CUDA (per test)."""
    from diffusers import WanPipeline

    pipe = WanPipeline.from_pretrained(tiny_wan22_path, torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    return pipe


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

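# Smallest settings that still exercise the full text-to-video path: a 5-frame
# 16x16 clip, 2 denoising steps, and guidance_scale=1.0 (which diffusers
# treats as classifier-free guidance disabled).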
_TINY_PIPE_KWARGS = {
    "prompt": "test",
    "negative_prompt": "",
    "num_frames": 5,
    "height": 16,
    "width": 16,
    "num_inference_steps": 2,
    "guidance_scale": 1.0,
}


def _skip_softmax_cfg(raw_threshold=-5.0):
    """Sparse config targeting Wan 2.2 self-attention (attn1) only."""
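    # Pattern keys are matched against module names: "*attn1*" selects the
    # self-attention blocks, while "default" leaves everything else dense.
    # More negative raw thresholds skip fewer tiles; the tests below treat
    # -50 as effectively dense and -2 as permissive (more tiles skipped).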
    return {
        "sparse_cfg": {
            "*attn1*": {
                "method": "triton_skip_softmax",
                "backend": "triton",
                "skip_softmax_raw_threshold": raw_threshold,
                "enable": True,
            },
            "default": {"enable": False},
        },
    }


def _sparsify_both_transformers(pipe, cfg):
    """Apply sparsify to both transformer and transformer_2 (14B-style dual)."""
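    # Wan 2.2's 14B-class models split denoising across two expert transformers
    # (high-noise and low-noise stages), so both must be sparsified to cover
    # every denoising step.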
    mtsa.sparsify(pipe.transformer, cfg)
    mtsa.sparsify(pipe.transformer_2, cfg)


def _run_pipe(pipe, seed=0):
    """Run the pipeline with a fixed seed; return the pipeline output."""
    generator = torch.Generator(device="cuda").manual_seed(seed)
    with torch.no_grad():
        output = pipe(generator=generator, **_TINY_PIPE_KWARGS)
    # output.frames[0] is a list of PIL images; the assertions only need the
    # output to be well-formed and non-empty.
    return output


def _count_sparse_modules(module):
    return sum(isinstance(m, SparseAttentionModule) for m in module.modules())


# ---------------------------------------------------------------------------
# E2E tests
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
class TestWan22PipelineE2E:
    """End-to-end skip-softmax flow on a tiny Wan 2.2 pipeline."""

    def test_baseline_pipeline_runs(self, tiny_wan22_pipe):
        """Dense baseline: the unsparsified pipeline produces non-empty frames."""
        output = _run_pipe(tiny_wan22_pipe)
        assert output.frames is not None
        assert len(output.frames) == 1
        # Frames are PIL images; ensure the list isn't empty
        assert len(output.frames[0]) > 0

    def test_sparsify_inserts_modules_in_both_transformers(self, tiny_wan22_pipe):
        """Both transformer and transformer_2 get SparseAttentionModule instances."""
        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())
        assert _count_sparse_modules(tiny_wan22_pipe.transformer) > 0
        assert _count_sparse_modules(tiny_wan22_pipe.transformer_2) > 0

    def test_skip_softmax_pipeline_runs_e2e(self, tiny_wan22_pipe):
        """The sparsified pipeline runs end-to-end and produces non-empty frames."""
        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-5.0))
        output = _run_pipe(tiny_wan22_pipe)
        assert output.frames is not None
        assert len(output.frames[0]) > 0

    def test_tight_threshold_matches_dense_within_tolerance(self, tiny_wan22_pipe, tiny_wan22_path):
        """raw_threshold=-50 (effectively dense) → output close to the unsparsified run."""
        from diffusers import WanPipeline

        # Dense run: fresh pipe, no sparsification
        dense_pipe = WanPipeline.from_pretrained(tiny_wan22_path, torch_dtype=torch.bfloat16)
        dense_pipe.to("cuda")
        dense_frame0 = _run_pipe(dense_pipe).frames[0][0]

        # Sparse run: same seed, raw_threshold=-50 (≈ no tiles skipped)
        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-50.0))
        sparse_frame0 = _run_pipe(tiny_wan22_pipe).frames[0][0]

        # Both frames are PIL images; convert to float arrays and compare
        import numpy as np

        d = np.asarray(dense_frame0, dtype=np.float32)
        s = np.asarray(sparse_frame0, dtype=np.float32)
        # Pixel-wise MAE should be small for a tight threshold (but not bit-exact,
        # since the sparse kernel takes a different code path through the online
        # softmax accumulation).
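        # (PIL frames are uint8, so an MAE of 20 is roughly 8% of the 0-255 range.)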
        mae = np.abs(d - s).mean()
        assert mae < 20.0, f"MAE between dense and tight-sparse frames was {mae:.2f}"

    def test_measure_sparsity_counts_accumulate(self, tiny_wan22_pipe):
        """measure_sparsity=True + a permissive threshold → nonzero sparsity counters."""
        from modelopt.torch.sparsity.attention_sparsity.methods.triton_skip_softmax import (
            TritonSkipSoftmaxMethod,
        )

        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-2.0))

        # Enable measurement + reset counters on every sparse module
        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
            for m in module.modules():
                if isinstance(m, SparseAttentionModule):
                    method = m._sparse_method_instance
                    if isinstance(method, TritonSkipSoftmaxMethod):
                        method.enable_measure_sparsity(True)
                        method.reset_sparsity_counters()

        _run_pipe(tiny_wan22_pipe)

        # Sum counters across all sparse modules
        total_sum = 0
        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
            for m in module.modules():
                if isinstance(m, SparseAttentionModule):
                    method = m._sparse_method_instance
                    if isinstance(method, TritonSkipSoftmaxMethod):
                        total, skipped = method.get_sparsity_counters()
                        assert skipped <= total
                        total_sum += total

        assert total_sum > 0, "Expected nonzero sparsity counters after pipeline run"

    def test_save_restore_roundtrip(self, tiny_wan22_pipe):
        """Sparsified transformer saves & restores via modelopt_state."""
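        # modelopt_state captures the applied transforms rather than weights, so
        # restoring onto a fresh transformer should recreate the same
        # SparseAttentionModule wrapping.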
        from _test_utils.torch.diffusers_models import get_tiny_wan22_transformer

        import modelopt.torch.opt as mto

        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())
        state = mto.modelopt_state(tiny_wan22_pipe.transformer)

        # Restore into a fresh transformer of the same shape
        torch.manual_seed(0)
        restored = get_tiny_wan22_transformer().to("cuda", dtype=torch.bfloat16).eval()
        mto.restore_from_modelopt_state(restored, state)

        # The number of sparse modules should match the original
        restored_count = _count_sparse_modules(restored)
        orig_count = _count_sparse_modules(tiny_wan22_pipe.transformer)
        assert restored_count == orig_count > 0


@pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
class TestWan22Calibration:
    """Multi-threshold calibration path on a tiny Wan 2.2 transformer."""

    def test_calibration_collects_stats_per_module(self, tiny_wan22_pipe):
        """A forward pass under calibration_mode populates per-module _last_stats."""
        from modelopt.torch.sparsity.attention_sparsity.methods.triton_skip_softmax import (
            TritonSkipSoftmaxMethod,
        )

        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())

        threshold_trials = [1e-3, 1e-2, 1e-1]
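        # Put every sparse module into calibration mode so the next forward pass
        # records sparsity stats for each trial threshold into _last_stats.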
        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
            for m in module.modules():
                if isinstance(m, SparseAttentionModule):
                    method = m._sparse_method_instance
                    if isinstance(method, TritonSkipSoftmaxMethod):
                        method._calibration_mode = True
                        method._threshold_trials = threshold_trials

        _run_pipe(tiny_wan22_pipe)

        # At least one sparse module should report stats of the correct shape
        found_stats = False
        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
            for m in module.modules():
                if isinstance(m, SparseAttentionModule) and m._last_stats is not None:
                    stats = m._last_stats
                    assert len(stats["sparsity"]) == len(threshold_trials)
                    found_stats = True
                    break
            if found_stats:
                break
        assert found_stats, "No sparse module reported calibration stats"