@@ -51,6 +51,19 @@ def _make_expert_offsets(tokens_per_expert):
5151 return torch .tensor (offsets , dtype = torch .int32 , device = "cuda" )
5252
5353
def _make_moe_layer(num_experts, input_features, output_features, bias=False):
    """Build a CUDA-resident LinearNVFP4MoE layer with non-trivially initialized weights.

    Memory handed back by torch.empty() on a freshly started GPU is frequently
    all zeros, which would make downstream GEMM tests vacuous; the weights are
    therefore explicitly drawn from a normal distribution (std=0.02) before the
    layer is moved to the device.
    """
    from bitsandbytes.nn.modules import LinearNVFP4MoE

    moe = LinearNVFP4MoE(num_experts, input_features, output_features, bias=bias)
    torch.nn.init.normal_(moe.weight.data, std=0.02)
    return moe.cuda()
65+
66+
5467class TestBuildVerification :
5568 """Verify that SM_100 CUTLASS kernels are compiled and loadable."""
5669
@@ -250,16 +263,13 @@ class TestFullPipeline:
250263
251264 def test_pipeline_output_shape (self , small_moe_config ):
252265 """Full pipeline should produce correct output shape."""
253- from bitsandbytes .nn .modules import LinearNVFP4MoE
254-
255266 K = small_moe_config ["input_features" ]
256267 N = small_moe_config ["output_features" ]
257268 num_experts = small_moe_config ["num_experts" ]
258269 tpe = small_moe_config ["tokens_per_expert" ]
259270 total_tokens = sum (tpe )
260271
261- layer = LinearNVFP4MoE (num_experts , K , N , bias = False )
262- layer = layer .cuda ()
272+ layer = _make_moe_layer (num_experts , K , N , bias = False )
263273
264274 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
265275 expert_offsets = _make_expert_offsets (tpe )
@@ -272,16 +282,13 @@ def test_pipeline_output_shape(self, small_moe_config):
272282
273283 def test_pipeline_with_bias (self , small_moe_config ):
274284 """Full pipeline with bias should produce correct output shape."""
275- from bitsandbytes .nn .modules import LinearNVFP4MoE
276-
277285 K = small_moe_config ["input_features" ]
278286 N = small_moe_config ["output_features" ]
279287 num_experts = small_moe_config ["num_experts" ]
280288 tpe = small_moe_config ["tokens_per_expert" ]
281289 total_tokens = sum (tpe )
282290
283- layer = LinearNVFP4MoE (num_experts , K , N , bias = True )
284- layer = layer .cuda ()
291+ layer = _make_moe_layer (num_experts , K , N , bias = True )
285292
286293 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
287294 expert_offsets = _make_expert_offsets (tpe )
@@ -297,16 +304,14 @@ def test_pipeline_nan_diagnosis(self, small_moe_config):
297304 quantize_nvfp4_raw , moe_scatter_nvfp4 , scale_to_blocked_batched ,
298305 gemm_nvfp4_moe , moe_gather_bf16 ,
299306 )
300- from bitsandbytes .nn .modules import LinearNVFP4MoE
301307
302308 K = small_moe_config ["input_features" ]
303309 N = small_moe_config ["output_features" ]
304310 num_experts = small_moe_config ["num_experts" ]
305311 tpe = small_moe_config ["tokens_per_expert" ]
306312 total_tokens = sum (tpe )
307313
308- layer = LinearNVFP4MoE (num_experts , K , N , bias = False )
309- layer = layer .cuda ()
314+ layer = _make_moe_layer (num_experts , K , N , bias = False )
310315
311316 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
312317 expert_offsets = _make_expert_offsets (tpe )
@@ -383,18 +388,19 @@ def test_pipeline_nan_diagnosis(self, small_moe_config):
383388 assert nan_D == 0 , \
384389 f"GEMM output has { nan_D } /{ D .numel ()} NaN elements"
385390
391+ # Verify output is non-zero (weights should be non-zero after init)
392+ assert D .abs ().max ().item () > 0 , \
393+ f"GEMM output is all zeros despite non-zero weights"
394+
386395 def test_pipeline_deterministic (self , small_moe_config ):
387396 """Same input should produce approximately same output."""
388- from bitsandbytes .nn .modules import LinearNVFP4MoE
389-
390397 K = small_moe_config ["input_features" ]
391398 N = small_moe_config ["output_features" ]
392399 num_experts = small_moe_config ["num_experts" ]
393400 tpe = small_moe_config ["tokens_per_expert" ]
394401 total_tokens = sum (tpe )
395402
396- layer = LinearNVFP4MoE (num_experts , K , N , bias = False )
397- layer = layer .cuda ()
403+ layer = _make_moe_layer (num_experts , K , N , bias = False )
398404
399405 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
400406 expert_offsets = _make_expert_offsets (tpe )
@@ -420,7 +426,6 @@ def test_pipeline_deterministic(self, small_moe_config):
420426 def test_pipeline_larger_config (self , moe_config ):
421427 """Test with a larger, more realistic MoE configuration."""
422428 import ctypes as ct
423- from bitsandbytes .nn .modules import LinearNVFP4MoE
424429 from bitsandbytes .cextension import lib
425430
426431 K = moe_config ["input_features" ]
@@ -441,8 +446,7 @@ def test_pipeline_larger_config(self, moe_config):
441446 print (f"\n SFB sizes: batched={ sfb_batched } , concat={ sfb_concat } , "
442447 f"per_expert={ sfb_per_expert } , match={ sfb_batched == sfb_concat } " )
443448
444- layer = LinearNVFP4MoE (num_experts , K , N , bias = False )
445- layer = layer .cuda ()
449+ layer = _make_moe_layer (num_experts , K , N , bias = False )
446450
447451 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
448452 expert_offsets = _make_expert_offsets (tpe )
@@ -522,8 +526,7 @@ def test_no_item_in_compute_path(self, small_moe_config):
522526 tpe = small_moe_config ["tokens_per_expert" ]
523527 total_tokens = sum (tpe )
524528
525- layer = LinearNVFP4MoE (num_experts , K , N , bias = False )
526- layer = layer .cuda ()
529+ layer = _make_moe_layer (num_experts , K , N , bias = False )
527530
528531 x = torch .randn (total_tokens , K , dtype = torch .bfloat16 , device = "cuda" )
529532 expert_offsets = _make_expert_offsets (tpe )
0 commit comments