@@ -300,6 +300,94 @@ def test_export_creates_per_expert_submodules(self):
             if QuantModuleRegistry.get(expert_type) is not None:
                 QuantModuleRegistry.unregister(expert_type)
 
+    def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch):
+        """gate_proj and up_proj must share weight_scale_2 even when an expert
+        was never routed during calibration.
+
+        Regression for the bug where ``_export_fused_experts``'s per-projection
+        fallback computed amax independently from the gate and up halves of the
+        fused tensor — producing mismatched ``weight_scale_2`` values for any
+        uncalibrated expert. vLLM fuses W1 (gate) and W3 (up) at load time and
+        asserts a single shared scale; mismatched scales corrupted MoE output.
+        The fix derives the fallback amax once from the fused ``gate_up[idx]``
+        tensor before the deepcopies, so gate's clone and up's clone start with
+        the same amax.
+        """
+        from modelopt.torch.export.moe_utils import _export_fused_experts
+
+        # Build experts where gate and up have very different magnitudes —
+        # any per-half fallback would clearly produce different amaxes.
+        experts = _SyntheticFusedExperts()
+        gate = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02
+        up = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.20
+        with torch.no_grad():
+            experts.gate_up_proj.copy_(torch.cat([gate, up], dim=1))
+
+        expert_type = type(experts)
+        if QuantModuleRegistry.get(expert_type) is None:
+            QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})(
+                _QuantFusedExperts
+            )
+        try:
+            converted = QuantModuleRegistry.convert(experts)
+
+            # Leave every expert weight quantizer uncalibrated (no _amax).
+            # Mark them enabled to exercise the export-time fallback path.
+            for q in converted.gate_up_proj_weight_quantizers:
+                q._disabled = False
+            for q in converted.down_proj_weight_quantizers:
+                q._disabled = False
+
+            # Capture the amax each per-projection wrapper carries into the
+            # FP4 quantization step. Patching here avoids needing CUDA / FP4.
+            seen = {}  # (expert_idx, proj_name) -> amax tensor
+
+            def _spy_export(wrapper, dtype):
+                # Identify which expert/projection this wrapper belongs to by
+                # matching the weight tensor against the fused parameters.
+                w = wrapper.weight.data
+                # gate_up_proj is (N, 2*INTER, HIDDEN); split halves are
+                # contiguous .data views or .contiguous() copies — we can match
+                # by shape and value identity for this synthetic case.
+                amax = wrapper.weight_quantizer._amax.detach().clone()
+                # Identify by matching against gate vs. up slices of each expert.
+                for idx in range(NUM_EXPERTS):
+                    g_slice = converted.gate_up_proj.data[idx, :INTERMEDIATE_DIM, :]
+                    u_slice = converted.gate_up_proj.data[idx, INTERMEDIATE_DIM:, :]
+                    d_slice = converted.down_proj.data[idx]
+                    if w.shape == g_slice.shape and torch.equal(w, g_slice):
+                        seen[(idx, "gate_proj")] = amax
+                        return
+                    if w.shape == u_slice.shape and torch.equal(w, u_slice):
+                        seen[(idx, "up_proj")] = amax
+                        return
+                    if w.shape == d_slice.shape and torch.equal(w, d_slice):
+                        seen[(idx, "down_proj")] = amax
+                        return
+
+            monkeypatch.setattr(
+                "modelopt.torch.export.unified_export_hf._export_quantized_weight",
+                _spy_export,
+            )
+
+            _export_fused_experts(converted, torch.float16)
+
+            # Assert: for every expert, gate's amax matches up's amax.
+            for idx in range(NUM_EXPERTS):
+                g_amax = seen.get((idx, "gate_proj"))
+                u_amax = seen.get((idx, "up_proj"))
+                assert g_amax is not None and u_amax is not None, (
+                    f"Expert {idx}: missing recorded amax (gate={g_amax}, up={u_amax})"
+                )
+                assert torch.allclose(g_amax, u_amax), (
+                    f"Expert {idx}: gate amax {g_amax.item()} != up amax {u_amax.item()}. "
+                    f"Uncalibrated fused experts must share gate/up amax so that "
+                    f"weight_scale_2 stays consistent across the fusion."
+                )
+        finally:
+            if QuantModuleRegistry.get(expert_type) is not None:
+                QuantModuleRegistry.unregister(expert_type)
+
 
 # ---------------------------------------------------------------------------
 # Tests for force_eager_experts_impl_on_the_fly
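For readers tracing the fix that the test's docstring describes, the sketch below illustrates the shared-amax fallback in isolation. It is an assumption-laden illustration, not the modelopt implementation: _compute_weight_amax, _fallback_quantizers_for_expert, and _StubQuantizer are hypothetical names, and the real _export_fused_experts wires the amax into per-projection wrappers rather than returning quantizer clones. Only the central point is taken from the test above: the fallback amax is derived once from the fused gate_up[idx] slice before the deepcopies, so the gate and up halves export the same weight_scale_2.

    import copy

    import torch


    def _compute_weight_amax(weight: torch.Tensor) -> torch.Tensor:
        # Hypothetical fallback for an uncalibrated weight quantizer: max |w|.
        return weight.abs().amax()


    def _fallback_quantizers_for_expert(quantizer, gate_up: torch.Tensor, idx: int, inter: int):
        """Return (gate_q, up_q) clones that share one fallback amax (sketch)."""
        if getattr(quantizer, "_amax", None) is None:
            # Derive the fallback amax once from the *fused* gate_up[idx] slice,
            # before cloning, so both halves inherit the same value and export
            # the same weight_scale_2.
            quantizer._amax = _compute_weight_amax(gate_up[idx])
        gate_q = copy.deepcopy(quantizer)  # would wrap gate_up[idx, :inter, :]
        up_q = copy.deepcopy(quantizer)    # would wrap gate_up[idx, inter:, :]
        # The regression guarded against above: recomputing amax per half, e.g.
        #   gate_q._amax = _compute_weight_amax(gate_up[idx, :inter, :])
        #   up_q._amax   = _compute_weight_amax(gate_up[idx, inter:, :])
        # which yields mismatched scales for experts never seen during calibration.
        return gate_q, up_q


    class _StubQuantizer:
        # Hypothetical stand-in for a weight quantizer with no calibration data.
        _amax = None


    fused = torch.cat([torch.randn(4, 8, 16) * 0.02, torch.randn(4, 8, 16) * 0.20], dim=1)
    g_q, u_q = _fallback_quantizers_for_expert(_StubQuantizer(), fused, idx=0, inter=8)
    assert torch.equal(g_q._amax, u_q._amax)  # gate and up share the fallback amax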