Commit dff9b33

Update more test cases
Signed-off-by: Jingyu Xin <jingyux@nvidia.com>
1 parent ba09900 commit dff9b33

3 files changed: 259 additions & 4 deletions

tests/gpu/torch/sparsity/attention_sparsity/test_diffusers_triton_attention.py

Lines changed: 4 additions & 4 deletions
@@ -34,8 +34,8 @@


 @pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
-class TestDiffusersTritonAttentionGPU:
-    """Exercise _diffusers_triton_attention on GPU."""
+class TestDiffusersTritonAttention:
+    """Exercise _diffusers_triton_attention on a real device."""

     @pytest.fixture(autouse=True)
     def _reset_thread_local(self):

@@ -100,8 +100,8 @@ def test_cross_attention_different_seq_lengths(self):


 @pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
-class TestLTXTritonAttentionGPU:
-    """Exercise _ltx_triton_attention on GPU (LTX layout [B, T, H*D])."""
+class TestLTXTritonAttention:
+    """Exercise _ltx_triton_attention (LTX layout [B, T, H*D])."""

     @pytest.fixture(autouse=True)
     def _reset_thread_local(self):
Lines changed: 255 additions & 0 deletions

@@ -0,0 +1,255 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end tests for skip-softmax sparse attention on a tiny Wan 2.2 pipeline.
+
+Uses ``create_tiny_wan22_pipeline_dir`` (matches diffusers' own tiny Wan 2.2
+test config): dual 2-layer transformer, tiny VAE, tiny T5 text encoder.
+The full ``WanPipeline`` is loaded, sparsified with ``mtsa.sparsify``, and
+run end-to-end with a 2-step denoising loop — asserting no NaN/Inf in the
+pipeline output.
+"""
+
+import pytest
+import torch
+
+pytestmark = [
+    pytest.mark.filterwarnings("ignore::UserWarning"),
+    pytest.mark.filterwarnings("ignore::RuntimeWarning"),
+    pytest.mark.filterwarnings("ignore::DeprecationWarning"),
+]
+
+diffusers = pytest.importorskip("diffusers")
+
+from modelopt.torch.kernels import IS_AVAILABLE as TRITON_KERNEL_AVAILABLE
+
+if TRITON_KERNEL_AVAILABLE:
+    import modelopt.torch.sparsity.attention_sparsity as mtsa
+    from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule
+
+
+# ---------------------------------------------------------------------------
+# Tiny Wan 2.2 pipeline fixture — shared across tests (pipeline load is costly)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def tiny_wan22_path(tmp_path_factory):
+    """Create and save a tiny Wan 2.2 pipeline to disk once per module."""
+    from _test_utils.torch.diffusers_models import create_tiny_wan22_pipeline_dir
+
+    return str(create_tiny_wan22_pipeline_dir(tmp_path_factory.mktemp("tiny_wan22")))
+
+
+@pytest.fixture
+def tiny_wan22_pipe(tiny_wan22_path):
+    """Load a fresh copy of the tiny Wan 2.2 pipeline on CUDA (per test)."""
+    from diffusers import WanPipeline
+
+    pipe = WanPipeline.from_pretrained(tiny_wan22_path, torch_dtype=torch.bfloat16)
+    pipe.to("cuda")
+    return pipe
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+_TINY_PIPE_KWARGS = {
+    "prompt": "test",
+    "negative_prompt": "",
+    "num_frames": 5,
+    "height": 16,
+    "width": 16,
+    "num_inference_steps": 2,
+    "guidance_scale": 1.0,
+}
+
+
+def _skip_softmax_cfg(raw_threshold=-5.0):
+    """Sparse config targeting Wan 2.2 self-attention (attn1) only."""
+    return {
+        "sparse_cfg": {
+            "*attn1*": {
+                "method": "triton_skip_softmax",
+                "backend": "triton",
+                "skip_softmax_raw_threshold": raw_threshold,
+                "enable": True,
+            },
+            "default": {"enable": False},
+        },
+    }
+
+
+def _sparsify_both_transformers(pipe, cfg):
+    """Apply sparsify to both transformer and transformer_2 (14B-style dual)."""
+    mtsa.sparsify(pipe.transformer, cfg)
+    mtsa.sparsify(pipe.transformer_2, cfg)
+
+
+def _run_pipe(pipe, seed=0):
+    """Run the pipeline with a fixed seed; return the pipeline output."""
+    generator = torch.Generator(device="cuda").manual_seed(seed)
+    with torch.no_grad():
+        output = pipe(generator=generator, **_TINY_PIPE_KWARGS)
+    # output.frames[0] is a list of PIL images; for assertion we just need shape+health
+    return output
+
+
+def _count_sparse_modules(module):
+    return sum(isinstance(m, SparseAttentionModule) for m in module.modules())
+
+
+# ---------------------------------------------------------------------------
+# E2E tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
+class TestWan22PipelineE2E:
+    """End-to-end skip-softmax flow on a tiny Wan 2.2 pipeline."""
+
+    def test_baseline_pipeline_runs(self, tiny_wan22_pipe):
+        """Dense baseline: pipeline produces finite frames without sparsification."""
+        output = _run_pipe(tiny_wan22_pipe)
+        assert output.frames is not None
+        assert len(output.frames) == 1
+        # Frames are PIL Images; ensure list isn't empty
+        assert len(output.frames[0]) > 0
+
+    def test_sparsify_inserts_modules_in_both_transformers(self, tiny_wan22_pipe):
+        """Both transformer and transformer_2 get SparseAttentionModule instances."""
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())
+        assert _count_sparse_modules(tiny_wan22_pipe.transformer) > 0
+        assert _count_sparse_modules(tiny_wan22_pipe.transformer_2) > 0
+
+    def test_skip_softmax_pipeline_runs_e2e(self, tiny_wan22_pipe):
+        """Sparsified pipeline runs end-to-end producing finite frames."""
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-5.0))
+        output = _run_pipe(tiny_wan22_pipe)
+        assert output.frames is not None
+        assert len(output.frames[0]) > 0
+
+    def test_tight_threshold_matches_dense_within_tolerance(self, tiny_wan22_pipe, tiny_wan22_path):
+        """raw_threshold=-50 (effectively dense) → output close to unsparsified run."""
+        from diffusers import WanPipeline
+
+        # Dense run: fresh pipe, no sparsification
+        dense_pipe = WanPipeline.from_pretrained(tiny_wan22_path, torch_dtype=torch.bfloat16)
+        dense_pipe.to("cuda")
+        dense_frame0 = _run_pipe(dense_pipe).frames[0][0]
+
+        # Sparse run: same seed, raw_threshold=-50 (≈ no tiles skipped)
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-50.0))
+        sparse_frame0 = _run_pipe(tiny_wan22_pipe).frames[0][0]
+
+        # Both are PIL images — convert to tensor and compare
+        import numpy as np
+
+        d = np.asarray(dense_frame0, dtype=np.float32)
+        s = np.asarray(sparse_frame0, dtype=np.float32)
+        # Pixel-wise MAE should be small for a tight threshold (but not bit-exact due to
+        # different code paths in the online softmax accumulation).
+        mae = np.abs(d - s).mean()
+        assert mae < 20.0, f"MAE between dense and tight-sparse frames was {mae:.2f}"
+
+    def test_measure_sparsity_counts_accumulate(self, tiny_wan22_pipe):
+        """measure_sparsity=True + a permissive threshold → nonzero sparsity counters."""
+        from modelopt.torch.sparsity.attention_sparsity.methods.triton_skip_softmax import (
+            TritonSkipSoftmaxMethod,
+        )
+
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg(raw_threshold=-2.0))
+
+        # Enable measurement + reset counters on every sparse module
+        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
+            for m in module.modules():
+                if isinstance(m, SparseAttentionModule):
+                    method = m._sparse_method_instance
+                    if isinstance(method, TritonSkipSoftmaxMethod):
+                        method.enable_measure_sparsity(True)
+                        method.reset_sparsity_counters()
+
+        _run_pipe(tiny_wan22_pipe)
+
+        # Sum counters across all sparse modules
+        total_sum = 0
+        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
+            for m in module.modules():
+                if isinstance(m, SparseAttentionModule):
+                    method = m._sparse_method_instance
+                    if isinstance(method, TritonSkipSoftmaxMethod):
+                        total, skipped = method.get_sparsity_counters()
+                        assert skipped <= total
+                        total_sum += total
+
+        assert total_sum > 0, "Expected nonzero sparsity counters after pipeline run"
+
+    def test_save_restore_roundtrip(self, tiny_wan22_pipe):
+        """Sparsified transformer saves & restores via modelopt_state."""
+        from _test_utils.torch.diffusers_models import get_tiny_wan22_transformer
+
+        import modelopt.torch.opt as mto
+
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())
+        state = mto.modelopt_state(tiny_wan22_pipe.transformer)
+
+        # Restore into a fresh transformer of the same shape
+        torch.manual_seed(0)
+        restored = get_tiny_wan22_transformer().to("cuda", dtype=torch.bfloat16).eval()
+        mto.restore_from_modelopt_state(restored, state)
+
+        # The number of sparse modules should match the original
+        restored_count = _count_sparse_modules(restored)
+        orig_count = _count_sparse_modules(tiny_wan22_pipe.transformer)
+        assert restored_count == orig_count > 0
+
+
+@pytest.mark.skipif(not TRITON_KERNEL_AVAILABLE, reason="Need CUDA + triton")
+class TestWan22Calibration:
+    """Multi-threshold calibration path on a tiny Wan 2.2 transformer."""
+
+    def test_calibration_collects_stats_per_module(self, tiny_wan22_pipe):
+        """A forward pass under calibration_mode populates per-module _last_stats."""
+        from modelopt.torch.sparsity.attention_sparsity.methods.triton_skip_softmax import (
+            TritonSkipSoftmaxMethod,
+        )
+
+        _sparsify_both_transformers(tiny_wan22_pipe, _skip_softmax_cfg())
+
+        threshold_trials = [1e-3, 1e-2, 1e-1]
+        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
+            for m in module.modules():
+                if isinstance(m, SparseAttentionModule):
+                    method = m._sparse_method_instance
+                    if isinstance(method, TritonSkipSoftmaxMethod):
+                        method._calibration_mode = True
+                        method._threshold_trials = threshold_trials
+
+        _run_pipe(tiny_wan22_pipe)
+
+        # At least one sparse module should report stats of the correct shape
+        found_stats = False
+        for module in (tiny_wan22_pipe.transformer, tiny_wan22_pipe.transformer_2):
+            for m in module.modules():
+                if isinstance(m, SparseAttentionModule) and m._last_stats is not None:
+                    stats = m._last_stats
+                    assert len(stats["sparsity"]) == len(threshold_trials)
+                    found_stats = True
+                    break
+            if found_stats:
+                break
+        assert found_stats, "No sparse module reported calibration stats"
File renamed without changes.
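
For reviewers who want to try the new skip-softmax path outside pytest, below is a minimal standalone sketch of the same flow the new test file drives. It reuses only names that appear in the diff above (mtsa.sparsify, the sparse_cfg dict, SparseAttentionModule, TritonSkipSoftmaxMethod and its counter methods); the pipeline directory "./tiny_wan22" is a placeholder assumption, not something this commit creates.

# Standalone sketch of the flow exercised by the new tests.
# "./tiny_wan22" is a placeholder path; everything else mirrors the test file above.
import torch
from diffusers import WanPipeline

import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule
from modelopt.torch.sparsity.attention_sparsity.methods.triton_skip_softmax import (
    TritonSkipSoftmaxMethod,
)

cfg = {
    "sparse_cfg": {
        # Wan 2.2 self-attention (attn1) only; everything else stays dense.
        "*attn1*": {
            "method": "triton_skip_softmax",
            "backend": "triton",
            "skip_softmax_raw_threshold": -5.0,
            "enable": True,
        },
        "default": {"enable": False},
    },
}

pipe = WanPipeline.from_pretrained("./tiny_wan22", torch_dtype=torch.bfloat16)
pipe.to("cuda")
mtsa.sparsify(pipe.transformer, cfg)    # Wan 2.2 is a dual-transformer pipeline,
mtsa.sparsify(pipe.transformer_2, cfg)  # so both halves are sparsified.

# Enable measurement before the run so the kernel accumulates tile counters.
for m in pipe.transformer.modules():
    if isinstance(m, SparseAttentionModule):
        method = m._sparse_method_instance
        if isinstance(method, TritonSkipSoftmaxMethod):
            method.enable_measure_sparsity(True)
            method.reset_sparsity_counters()

with torch.no_grad():
    out = pipe(
        prompt="test",
        negative_prompt="",
        num_frames=5,
        height=16,
        width=16,
        num_inference_steps=2,
        guidance_scale=1.0,
        generator=torch.Generator(device="cuda").manual_seed(0),
    )

# Read back how many score tiles the kernel skipped per sparse module.
for name, m in pipe.transformer.named_modules():
    if isinstance(m, SparseAttentionModule):
        method = m._sparse_method_instance
        if isinstance(method, TritonSkipSoftmaxMethod):
            total, skipped = method.get_sparsity_counters()
            print(f"{name}: skipped {skipped}/{total} tiles")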
