Skip to content

Commit 17800bf

Browse files
committed
Explicitly pass qwix config for deepseek batch split
1 parent 00ef5de commit 17800bf

3 files changed

Lines changed: 149 additions & 19 deletions

File tree

src/maxtext/configs/types.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2525,6 +2525,15 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25252525
self.use_grpo = True
25262526
else:
25272527
self.use_grpo = False
2528+
2529+
if self.use_batch_split_schedule:
2530+
if not (self.decoder_block == DecoderBlockType.DEEPSEEK and self.sparse_matmul and self.use_tokamax_gmm):
2531+
raise ValueError("Batch split only supports deepseek, with `sparse_matmul=True` and `use_tokamax_gmm=True`")
2532+
if self.quantization and not (self.use_qwix_quantization and self.quantization=="fp8_full"):
2533+
raise ValueError(
2534+
"Batch split quantization only supports `use_qwix_quantization=True` and `quantization=fp8_full`"
2535+
)
2536+
25282537
if self.opt_type == "muon" and self.decoder_block not in [
25292538
DecoderBlockType.DEEPSEEK,
25302539
DecoderBlockType.QWEN3,
@@ -2533,7 +2542,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25332542
]:
25342543
raise ValueError(
25352544
"Muon dimension numbers haven't been tested for this model. Run this command first: "
2536-
f"`python3 -m MaxText.muon_utils {self.model_name} True`"
2545+
f"`python3 -m maxtext.utils.muon_utils {self.model_name} True`"
25372546
)
25382547
if self.force_q_layout and not self.use_jax_splash:
25392548
raise ValueError("`force_q_layout` can only be true if `use_jax_splash` is also true.")

src/maxtext/layers/quantizations.py

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import functools
1818
import json
1919
import re
20-
from typing import Tuple, Sequence
20+
from typing import Tuple, Sequence, Callable
2121
from dataclasses import dataclass
2222

2323
from aqt.jax.v2 import config as aqt_config
@@ -27,6 +27,7 @@
2727
from aqt.jax.v2 import calibration
2828

2929
import qwix
30+
from qwix._src.core import dot_general_qt
3031

3132
import jax
3233
import jax.numpy as jnp
@@ -113,7 +114,7 @@ def _rhs_axis_metadata_wrapper(
113114

114115

115116
@dataclass
116-
class AqtQuantization:
117+
class AqtQuantization(Quantization):
117118
"""Configures AQT quantization github.com/google/aqt."""
118119

119120
quant_dg: aqt_config.DotGeneral
@@ -194,6 +195,83 @@ def einsum(self, mesh_axes: Tuple[str, ...] = ()):
194195
return aqt_einsum
195196

196197

198+
@dataclass
class QwixQuantization(Quantization):
  """Qwix-based quantization (github.com/google/qwix), for training only.

  Supplies drop-in `dot_general` / `einsum` replacements that route matmuls
  through Qwix's `dot_general_qt` with an fp8 ("fp8_full") configuration.
  """

  # Kept for API parity with AqtQuantization; external callers read this field.
  quant_mode: aqt_flax.QuantMode = aqt_flax.QuantMode.TRAIN
  act_calibration_method: str = "absmax"     # forward activations (lhs)
  weight_calibration_method: str = "absmax"  # forward weights (rhs)
  bwd_calibration_method: str = "absmax"     # gradients on both operands

  def _get_fp8_full_qwix_config(self) -> dot_general_qt.DotGeneralQtConfig:
    """Centralized factory for the shared Qwix dot_general config."""
    fwd_qtype = jnp.float8_e4m3fn  # forward pass: activation and weight
    bwd_qtype = jnp.float8_e5m2    # backward pass: activation and weight gradients
    return dot_general_qt.DotGeneralQtConfig(
        lhs_qtype=fwd_qtype,
        rhs_qtype=fwd_qtype,
        dlhs_grad_qtype=bwd_qtype,
        drhs_grad_qtype=bwd_qtype,
        lhs_calibration_method=self.act_calibration_method,
        rhs_calibration_method=self.weight_calibration_method,
        dlhs_grad_calibration_method=self.bwd_calibration_method,
        drhs_grad_calibration_method=self.bwd_calibration_method,
        tile_size=None,
    )

  def dot_general_cls(self, mesh_axes: Tuple[str, ...] = ()):
    """Return a QwixDotGeneral constructor pre-bound to the fp8 config."""
    return functools.partial(QwixDotGeneral, config=self._get_fp8_full_qwix_config())

  def einsum(self, mesh_axes: Tuple[str, ...] = ()):
    """Return a QwixEinsum module bound to the fp8 config."""
    return QwixEinsum(config=self._get_fp8_full_qwix_config())
228+
229+
230+
class QwixDotGeneral(nn.Module):
  """Flax module wrapping Qwix's quantized dot_general.

  Callable with the `jax.lax.dot_general` signature so it can stand in for
  the stock dot_general inside layers.
  """

  # Qwix dot_general configuration applied to every contraction.
  config: dot_general_qt.DotGeneralQtConfig

  @nn.compact
  def __call__(
      self,
      lhs: jax.Array,
      rhs: jax.Array,
      dimension_numbers: jax.lax.DotDimensionNumbers,
      precision: jax.lax.PrecisionLike = None,
      preferred_element_type: jax.typing.DTypeLike | None = None,
      *,
      out_sharding=None,
  ) -> jax.Array:
    # precision / preferred_element_type / out_sharding are accepted only for
    # signature compatibility and are not forwarded; the Qwix config governs
    # the quantized contraction.
    quantized = dot_general_qt.dot_general_qt(lhs, rhs, dimension_numbers, self.config)
    return quantized
247+
248+
249+
class QwixEinsum(nn.Module):
  """A callable class for Qwix einsum.

  Drop-in replacement for `jnp.einsum` that routes every contraction through
  Qwix's quantized `dot_general_qt` using the bound `config`.
  """

  # Qwix dot_general configuration applied to every contraction.
  config: dot_general_qt.DotGeneralQtConfig

  @nn.compact
  def __call__(
      self,
      einsum_str: str,
      *operands: jax.Array,
      precision: jax.lax.PrecisionLike = None,
      preferred_element_type: jax.typing.DTypeLike | None = None,
      # Accepted only for jnp.einsum signature compatibility: any caller-supplied
      # dot_general is intentionally ignored in favor of the Qwix one built below.
      _dot_general: Callable[..., jax.Array] | None = None,
      out_sharding=None,
  ) -> jax.Array:
    # Forward only the first three positional args (lhs, rhs, dimension_numbers);
    # the precision/dtype kwargs jnp.einsum passes to its dot_general are dropped,
    # since the Qwix config controls those choices.
    custom_dot_general = lambda *args, **kwargs: dot_general_qt.dot_general_qt(*args[:3], self.config)
    # NOTE(review): disabling jit around the einsum looks like a workaround for
    # Qwix tracing under jit — confirm it is still required; it is a performance
    # hazard if this path is ever hit outside an outer jit.
    with jax.disable_jit():
      return jnp.einsum(
          einsum_str,
          *operands,
          precision=precision,
          preferred_element_type=preferred_element_type,
          _dot_general=custom_dot_general,
          out_sharding=out_sharding,
      )
273+
274+
197275
@dataclass
198276
class Fp8Quantization(Quantization):
199277
"""Configures Fp8 quantization for NVIDIA GPUs"""
@@ -546,6 +624,15 @@ def get_quant_mode(quant_mode_str: str = "train"):
546624

547625
def configure_quantization(config: Config, quant_mode_str: str = "train"):
548626
"""Configure quantization based on user config and quant mode."""
627+
if config.use_batch_split_schedule and config.quantization:
628+
if not (config.use_qwix_quantization and config.quantization == "fp8_full"):
629+
raise ValueError("Batch split quantization only supports `use_qwix_quantization=True` and `quantization=fp8_full`")
630+
return QwixQuantization(
631+
weight_calibration_method=config.weight_quantization_calibration_method,
632+
act_calibration_method=config.act_quantization_calibration_method,
633+
bwd_calibration_method=config.bwd_quantization_calibration_method,
634+
)
635+
549636
if config.use_qwix_quantization:
550637
return None
551638
quant_cfg = _get_quant_config(config)
@@ -726,7 +813,8 @@ def get_qt_provider(config):
726813

727814
def maybe_quantize_model(model, config):
728815
"""Quantize the model if quantization is enabled."""
729-
if config.use_qwix_quantization:
816+
# Batch split is not using Qwix's interception feature but manual plumbing
817+
if config.use_qwix_quantization and not config.use_batch_split_schedule:
730818
quantization_provider = get_qt_provider(config)
731819
if quantization_provider:
732820
model = qwix.quantize_model(model, quantization_provider)

0 commit comments

Comments
 (0)