Skip to content

Commit de61ae5

Browse files
TimDettmers and claude
committed
Fix test: init weights with normal dist (torch.empty is zeroed on fresh GPU)
torch.empty() on a freshly initialized GPU returns zeroed memory, causing all-zero weights -> all-zero GEMM output. Add _make_moe_layer() helper that does nn.init.normal_(std=0.02) after construction. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6e66780 commit de61ae5

File tree

1 file changed

+23
-20
lines changed

1 file changed

+23
-20
lines changed

tests/test_moe_sm100_pipeline.py

Lines changed: 23 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,19 @@ def _make_expert_offsets(tokens_per_expert):
5151
return torch.tensor(offsets, dtype=torch.int32, device="cuda")
5252

5353

54+
def _make_moe_layer(num_experts, input_features, output_features, bias=False):
55+
"""Create a LinearNVFP4MoE layer with random weight initialization.
56+
57+
torch.empty() on a fresh GPU returns zeroed memory, so we must
58+
explicitly initialize weights to non-zero values for meaningful tests.
59+
"""
60+
from bitsandbytes.nn.modules import LinearNVFP4MoE
61+
62+
layer = LinearNVFP4MoE(num_experts, input_features, output_features, bias=bias)
63+
torch.nn.init.normal_(layer.weight.data, std=0.02)
64+
return layer.cuda()
65+
66+
5467
class TestBuildVerification:
5568
"""Verify that SM_100 CUTLASS kernels are compiled and loadable."""
5669

@@ -250,16 +263,13 @@ class TestFullPipeline:
250263

251264
def test_pipeline_output_shape(self, small_moe_config):
252265
"""Full pipeline should produce correct output shape."""
253-
from bitsandbytes.nn.modules import LinearNVFP4MoE
254-
255266
K = small_moe_config["input_features"]
256267
N = small_moe_config["output_features"]
257268
num_experts = small_moe_config["num_experts"]
258269
tpe = small_moe_config["tokens_per_expert"]
259270
total_tokens = sum(tpe)
260271

261-
layer = LinearNVFP4MoE(num_experts, K, N, bias=False)
262-
layer = layer.cuda()
272+
layer = _make_moe_layer(num_experts, K, N, bias=False)
263273

264274
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
265275
expert_offsets = _make_expert_offsets(tpe)
@@ -272,16 +282,13 @@ def test_pipeline_output_shape(self, small_moe_config):
272282

273283
def test_pipeline_with_bias(self, small_moe_config):
274284
"""Full pipeline with bias should produce correct output shape."""
275-
from bitsandbytes.nn.modules import LinearNVFP4MoE
276-
277285
K = small_moe_config["input_features"]
278286
N = small_moe_config["output_features"]
279287
num_experts = small_moe_config["num_experts"]
280288
tpe = small_moe_config["tokens_per_expert"]
281289
total_tokens = sum(tpe)
282290

283-
layer = LinearNVFP4MoE(num_experts, K, N, bias=True)
284-
layer = layer.cuda()
291+
layer = _make_moe_layer(num_experts, K, N, bias=True)
285292

286293
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
287294
expert_offsets = _make_expert_offsets(tpe)
@@ -297,16 +304,14 @@ def test_pipeline_nan_diagnosis(self, small_moe_config):
297304
quantize_nvfp4_raw, moe_scatter_nvfp4, scale_to_blocked_batched,
298305
gemm_nvfp4_moe, moe_gather_bf16,
299306
)
300-
from bitsandbytes.nn.modules import LinearNVFP4MoE
301307

302308
K = small_moe_config["input_features"]
303309
N = small_moe_config["output_features"]
304310
num_experts = small_moe_config["num_experts"]
305311
tpe = small_moe_config["tokens_per_expert"]
306312
total_tokens = sum(tpe)
307313

308-
layer = LinearNVFP4MoE(num_experts, K, N, bias=False)
309-
layer = layer.cuda()
314+
layer = _make_moe_layer(num_experts, K, N, bias=False)
310315

311316
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
312317
expert_offsets = _make_expert_offsets(tpe)
@@ -383,18 +388,19 @@ def test_pipeline_nan_diagnosis(self, small_moe_config):
383388
assert nan_D == 0, \
384389
f"GEMM output has {nan_D}/{D.numel()} NaN elements"
385390

391+
# Verify output is non-zero (weights should be non-zero after init)
392+
assert D.abs().max().item() > 0, \
393+
f"GEMM output is all zeros despite non-zero weights"
394+
386395
def test_pipeline_deterministic(self, small_moe_config):
387396
"""Same input should produce approximately same output."""
388-
from bitsandbytes.nn.modules import LinearNVFP4MoE
389-
390397
K = small_moe_config["input_features"]
391398
N = small_moe_config["output_features"]
392399
num_experts = small_moe_config["num_experts"]
393400
tpe = small_moe_config["tokens_per_expert"]
394401
total_tokens = sum(tpe)
395402

396-
layer = LinearNVFP4MoE(num_experts, K, N, bias=False)
397-
layer = layer.cuda()
403+
layer = _make_moe_layer(num_experts, K, N, bias=False)
398404

399405
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
400406
expert_offsets = _make_expert_offsets(tpe)
@@ -420,7 +426,6 @@ def test_pipeline_deterministic(self, small_moe_config):
420426
def test_pipeline_larger_config(self, moe_config):
421427
"""Test with a larger, more realistic MoE configuration."""
422428
import ctypes as ct
423-
from bitsandbytes.nn.modules import LinearNVFP4MoE
424429
from bitsandbytes.cextension import lib
425430

426431
K = moe_config["input_features"]
@@ -441,8 +446,7 @@ def test_pipeline_larger_config(self, moe_config):
441446
print(f"\n SFB sizes: batched={sfb_batched}, concat={sfb_concat}, "
442447
f"per_expert={sfb_per_expert}, match={sfb_batched == sfb_concat}")
443448

444-
layer = LinearNVFP4MoE(num_experts, K, N, bias=False)
445-
layer = layer.cuda()
449+
layer = _make_moe_layer(num_experts, K, N, bias=False)
446450

447451
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
448452
expert_offsets = _make_expert_offsets(tpe)
@@ -522,8 +526,7 @@ def test_no_item_in_compute_path(self, small_moe_config):
522526
tpe = small_moe_config["tokens_per_expert"]
523527
total_tokens = sum(tpe)
524528

525-
layer = LinearNVFP4MoE(num_experts, K, N, bias=False)
526-
layer = layer.cuda()
529+
layer = _make_moe_layer(num_experts, K, N, bias=False)
527530

528531
x = torch.randn(total_tokens, K, dtype=torch.bfloat16, device="cuda")
529532
expert_offsets = _make_expert_offsets(tpe)

0 commit comments

Comments (0)