Merge pull request #3875 from AI-Hypercomputer:nicogrande/improve-gemma4-vllm-perf

Google-ML-Automation · Google-ML-Automation · commit 4d9f39035c84 · 2026-05-12T11:56:04.000-07:00
PiperOrigin-RevId: 914403579
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -726,6 +726,11 @@ class MoEGeneral(BaseModel):
       description="Whether to pre-fuse MoE weights (w0 and w1) during initialization. "
       "This is useful for inference performance in vllm_rpa mode.",
   )
+  fuse_expert_scales: bool = Field(
+      False,
+      description="Whether to fuse the per-expert scaling factors into the MoE output projection (wo) "
+      "weights at initialization. Only takes effect when model_call_mode is 'inference'.",
+  )
 
 
 class MoEKernels(BaseModel):
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
@@ -539,6 +539,14 @@ def __init__(
     else:
       self.per_expert_scale = None
 
+    # Scale the output projection ahead of time during inference for higher generation throughput.
+    if (
+        self.per_expert_scale is not None
+        and self.config.model_call_mode == "inference"
+        and self.config.fuse_expert_scales
+    ):
+      self.wo.value = self.wo.value * self.per_expert_scale.value[:, None, None]
+
   def _maybe_shard_with_logical(self, inputs, logical_name):
     return maybe_shard_with_logical(
         inputs,
@@ -2242,7 +2250,8 @@ def __call__(
       w0_kernel = jnp.asarray(self.wi_0[...], self.dtype)
       w1_kernel = jnp.asarray(self.wi_1[...], self.dtype)
 
-    if self.per_expert_scale is not None:
+    # Apply per-expert scales here unless they were already fused into wo at init (inference + fuse_expert_scales).
+    if self.per_expert_scale is not None and not (cfg.model_call_mode == "inference" and cfg.fuse_expert_scales):
       wo_kernel = wo_kernel * jnp.asarray(self.per_expert_scale[...], self.dtype)[:, None, None]
 
     if self.wi_0_sparsity_module is not None: