feat/fix: QLoRA support and NNX Decoder Sharding Fixes

RexBearIU · RexBearIU · commit c37faefb0817 · 2026-05-28T08:18:58.000Z
diff --git a/src/maxtext/configs/post_train/sft.yml b/src/maxtext/configs/post_train/sft.yml
@@ -27,6 +27,9 @@ lora:
   lora_rank: 0
   lora_alpha: 0.0
   lora_module_path: ""
+  # For QLoRA, set lora_weight_qtype (e.g., "nf4") and optionally lora_tile_size.
+  lora_weight_qtype: null
+  lora_tile_size: null
   # Optional path to LoRA weights to load before training. Ignored if the current run is resumed.
   lora_restore_path: ""
 
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1280,9 +1280,19 @@ class LoRA(BaseModel):
   lora_module_path: str = Field(
       "",
       description=(
-          "Regex identifying target modules for LoRA, e.g." " '.*q_einsum|.*kv_einsum|.*gate_proj|.*down_proj|.*up_proj'."
+          "Regex identifying target NNX modules for LoRA. "
+          "Example for standard models: 'decoder/layers/.*(self_attention/(query|out)|mlp/(wi_0|wo))'. "
+          "Example for MoE: 'decoder/scanned_blocks/layers.*/.*(MoeBlock_0|shared_experts)/(wi_0|wo)'."
       ),
   )
+  lora_weight_qtype: str | None = Field(
+      None,
+      description=("Optional quantization type for QLoRA (e.g., 'nf4'). If set, QLoRA is applied."),
+  )
+  lora_tile_size: NonNegativeInt | None = Field(
+      None,
+      description=("Tile size for block-wise quantization. Typically 32 or 64."),
+  )
   lora_restore_path: PathStr = Field(
       "",
       description=("Optional path to LoRA weights to load before training. Ignored if the current run is resumed."),
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -64,8 +64,7 @@
     simple_layer,
 )
 from maxtext.multimodal import utils as mm_utils
-from maxtext.utils import max_logging, max_utils, maxtext_utils, sharding
-from maxtext.utils.maxtext_utils_nnx import nnx_ensure_scan_leading_axis
+from maxtext.utils import max_logging, max_utils, maxtext_utils, maxtext_utils_nnx, sharding
 from maxtext.utils.sharding import create_sharding
 
 # ------------------------------------------------------------------------------
@@ -601,6 +600,8 @@ def _extract_matching_state(template, full):
     use_kv = kv_caches_stacked is not None
 
     def layer_fn(carry, scanned_vars):
+      # Ensure metadata rank matches the sliced values
+      scanned_vars = maxtext_utils_nnx.nnx_remove_scan_axis(scanned_vars, "layers")
 
       # Unpack the sliced variables for THIS layer
       if use_kv:
@@ -670,8 +671,8 @@ def layer_fn(carry, scanned_vars):
       # inference with vLLM, parameters do not change and we don't need intermediates.
       return current_carry, layers, None
     else:
-      params = nnx_ensure_scan_leading_axis(params, length)
-      state = nnx_ensure_scan_leading_axis(state, length)
+      params = maxtext_utils_nnx.nnx_ensure_scan_leading_axis(params, length)
+      state = maxtext_utils_nnx.nnx_ensure_scan_leading_axis(state, length)
 
       # Linen FP8 ops keep amax_history in mutable Linen scope; jax.lax.scan
       # leaks the tracer and hits UnexpectedTracerError. Use a Python for-loop
@@ -691,10 +692,15 @@ def layer_fn(carry, scanned_vars):
         final_carry, scanned_state = jax.lax.scan(layer_fn_wrapped, x_in, (params, state))
       returned_kv_stacked = None
 
-    if scan_axis != 0:
-      new_params, new_rest = scanned_state.split(nnx.Param, ...)
-      new_params = jax.tree.map(lambda x: jnp.moveaxis(x, scan_axis, 0), new_params)
-      scanned_state = nnx.merge_state(new_params, new_rest)
+      # Ensure metadata rank matches the stacked values
+      scanned_state = maxtext_utils_nnx.nnx_add_scan_axis(scanned_state, "layers", 0)
+
+      if scan_axis != 0:
+        new_params, new_rest = scanned_state.split(nnx.Param, ...)
+        new_params = maxtext_utils_nnx.nnx_sync_moveaxis(new_params, 0, scan_axis)
+        scanned_state = nnx.merge_state(new_params, new_rest)
+
+      returned_kv_stacked = None
 
     if dynamic_graph_init:
       # If graph changed, we need to merge with the new graphdef.
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -21,7 +21,7 @@
 import re
 from typing import Any, Optional
 
-from flax import nnx
+from flax import nnx, linen as nn
 from flax.linen import partitioning as nn_partitioning
 from flax.training import train_state
 import jax
@@ -35,7 +35,6 @@
 from maxtext.utils import max_logging
 from maxtext.utils import max_utils
 from maxtext.utils import maxtext_utils
-from maxtext.utils import sharding
 from maxtext.utils.globals import MAXTEXT_CONFIGS_DIR
 
 
@@ -416,11 +415,18 @@ def _build_lora_provider(mt_config: pyconfig.HyperParameters) -> qwix.LoraProvid
       "rank": mt_config.lora.lora_rank,
       "alpha": mt_config.lora.lora_alpha,
       "dropout": 0.0,
+      "weight_qtype": mt_config.lora.lora_weight_qtype,
+      "tile_size": mt_config.lora.lora_tile_size,
   }
+  # Distinguish between standard LoRA and QLoRA in logs
+  lora_type = "QLoRA" if mt_config.lora.lora_weight_qtype else "LoRA"
+
   max_logging.log(
-      f"LoRA configured: module_path={lora_module_path} "
-      f"rank={mt_config.lora.lora_rank} alpha={mt_config.lora.lora_alpha}"
+      f"{lora_type} configured: rank={mt_config.lora.lora_rank} alpha={mt_config.lora.lora_alpha} "
+      f"qtype={mt_config.lora.lora_weight_qtype} tile_size={mt_config.lora.lora_tile_size}"
   )
+
+  max_logging.log(f"Using lora_module_path: {lora_module_path}")
   return qwix.LoraProvider(**lora_kwargs)
 
 
@@ -518,13 +524,22 @@ def apply_lora_to_model(
 
       # Use logical_to_mesh_sharding to correctly map logical axes like 'embed'
       # to physical mesh axes.
-      dst_shardings = sharding.logical_to_mesh_sharding(
-          nnx.get_partition_spec(state), mesh, rules=mt_config.logical_axis_rules
-      )
-
-      from tunix.rl import reshard  # pylint: disable=import-outside-toplevel
+      dst_shardings = nn.logical_to_mesh_sharding(nnx.get_partition_spec(state), mesh, mt_config.logical_axis_rules)
+
+      def _safe_reshard(var, sharding_spec):
+        if not isinstance(var, nnx.Variable) or not isinstance(sharding_spec, jax.sharding.Sharding):
+          return var  # Not a Variable, or no concrete target sharding: leave the leaf untouched.
+        val = var.get_value()
+        if not isinstance(val, jax.Array):
+          return var  # Only jax.Array payloads are resharded.
+        # make_array_from_callback natively constructs a globally sharded array
+        # from the local host arrays, bypassing backend-specific device_put issues
+        # on both Pathways and McJAX.
+        resharded_val = jax.make_array_from_callback(val.shape, sharding_spec, lambda idx: val[idx])
+        return var.replace(value=resharded_val)
+
+      state = jax.tree_util.tree_map(_safe_reshard, state, dst_shardings, is_leaf=lambda x: isinstance(x, nnx.Variable))
 
-      state = reshard.reshard_pytree(state, dst_shardings)
       lora_model = nnx.merge(graph_def, state)
 
   _verify_lora_parameters(lora_model, mt_config)
diff --git a/src/maxtext/utils/maxtext_utils_nnx.py b/src/maxtext/utils/maxtext_utils_nnx.py
@@ -18,7 +18,8 @@
 
 from flax import nnx
 import jax
-from jax.sharding import Mesh, NamedSharding
+import jax.numpy as jnp
+from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
 
 from maxtext.utils import max_logging
 from maxtext.configs import pyconfig
@@ -187,3 +188,90 @@ def _op(x):
     return x
 
   return jax.tree.map(_op, tree, is_leaf=lambda x: isinstance(x, nnx.Variable))
+
+
+# ------------------------------------------------------------------------------
+# Metadata Synchronization Helpers for NNX Variables
+# ------------------------------------------------------------------------------
+
+
+def nnx_update_sharding_meta(variable, transform_fn):
+  """Applies `transform_fn` (list -> list) to the sharding, out_sharding and sharding_names metadata of a variable."""
+  if not (hasattr(variable, "get_metadata") and hasattr(variable, "replace")):
+    return variable  # Duck-typed guard: objects without the metadata API pass through unchanged.
+
+  meta = variable.get_metadata()
+  updates = {}
+
+  for key in ["sharding", "out_sharding", "sharding_names"]:
+    if (val := meta.get(key)) and isinstance(val, (P, tuple, list)):  # Missing or falsy entries are skipped.
+      new_list = list(val)
+      transformed = transform_fn(new_list)
+      updates[key] = P(*transformed) if isinstance(val, P) else tuple(transformed)  # Lists come back as tuples.
+
+  if updates:
+    return variable.replace(**updates)
+  return variable
+
+
+def nnx_sync_moveaxis(tree, from_axis, to_axis):
+  """Moves an axis in both values and sharding metadata of nnx.Variables; negative axes follow jnp.moveaxis."""
+  if from_axis == to_axis:
+    return tree
+
+  def _op(x):
+    is_var = isinstance(x, nnx.Variable)
+    val = x.get_value() if is_var else x
+    if not hasattr(val, "shape"):
+      return x  # Leaves without a shape pass through untouched.
+
+    new_val = jnp.moveaxis(val, from_axis, to_axis)
+    if not is_var:
+      return new_val
+
+    def move_fn(l):
+      if len(l) > max(from_axis % new_val.ndim, to_axis % new_val.ndim):  # Shorter specs are left as-is.
+        l.insert(to_axis % new_val.ndim, l.pop(from_axis % new_val.ndim))  # Normalized: insert(-1, v) != append.
+      return l
+
+    return nnx_update_sharding_meta(x.replace(value=new_val), move_fn)
+
+  return jax.tree.map(_op, tree, is_leaf=lambda x: isinstance(x, nnx.Variable) or hasattr(x, "shape"))
+
+
+def nnx_remove_scan_axis(tree, name="layers"):
+  """Drops `name` from each Variable's sharding metadata, then trims leading entries down to the value's rank."""
+
+  def _op(x):
+    if not isinstance(x, nnx.Variable):
+      return x
+
+    def remove_fn(l):
+      if name in l:
+        l.remove(name)  # list.remove drops only the first occurrence.
+      while len(l) > x.get_value().ndim:
+        l.pop(0)  # Still longer than the value's rank: discard entries from the front.
+      return l
+
+    return nnx_update_sharding_meta(x, remove_fn)
+
+  return jax.tree.map(_op, tree, is_leaf=lambda x: isinstance(x, nnx.Variable))
+
+
+def nnx_add_scan_axis(tree, name="layers", pos=0):
+  """Adds scan axis `name` to each Variable's sharding metadata at `pos`, padding with trailing None up to rank."""
+
+  def _op(x):
+    if not isinstance(x, nnx.Variable):
+      return x
+
+    def add_fn(l):
+      if name not in l:
+        l.insert(pos, name)
+      # Pad at the END: a short spec leaves trailing dims unsharded; inserting None at `pos` would displace `name`.
+      l.extend([None] * (x.get_value().ndim - len(l)))
+      return l
+
+    return nnx_update_sharding_meta(x, add_fn)
+
+  return jax.tree.map(_op, tree, is_leaf=lambda x: isinstance(x, nnx.Variable))
diff --git a/tests/post_training/unit/lora_utils_test.py b/tests/post_training/unit/lora_utils_test.py
@@ -29,6 +29,8 @@
 from maxtext.utils import lora_utils
 from maxtext.utils import model_creation_utils
 from maxtext.configs import pyconfig
+from maxtext.utils import maxtext_utils
+from jax.sharding import Mesh
 from tests.utils.test_helpers import get_test_config_path
 
 # ---------------------------------------------------------------------------
@@ -104,10 +106,14 @@ def test_build_lora_provider(self):
     mock_config.lora.lora_module_path = "custom/path"
     mock_config.lora.lora_rank = 8
     mock_config.lora.lora_alpha = 16.0
+    mock_config.lora.lora_weight_qtype = "int8"
+    mock_config.lora.lora_tile_size = 32
 
     with mock.patch("qwix.LoraProvider") as mock_provider:
       lora_utils._build_lora_provider(mock_config)
-      mock_provider.assert_called_once_with(module_path="custom/path", rank=8, alpha=16.0, dropout=0.0)
+      mock_provider.assert_called_once_with(
+          module_path="custom/path", rank=8, alpha=16.0, dropout=0.0, weight_qtype="int8", tile_size=32
+      )
 
   def test_prepare_dummy_inputs(self):
     """Test preparation of dummy inputs for LoRA verification."""
@@ -158,27 +164,36 @@ def test_apply_lora_to_model_adapters_loaded(self):
     # If we skip Qwix, it should stay False.
     self.assertFalse(lora_utils.is_lora_enabled(result))
 
-  def _run_apply_lora_test(self, scan_layers: bool):
-    """Helper to run LoRA application test with/without scanned layers."""
+  def _run_apply_lora_test(self, scan_layers: bool, weight_qtype=None, tile_size=None, mock_multihost: bool = False):
+    """Helper to run LoRA application test with/without scanned layers and optional QLoRA."""
     # Passing nested dict as 'lora' kwarg to _make_config
     cfg = _make_config(
         lora={
             "enable_lora": True,
             "lora_rank": 4,
             "lora_alpha": 8.0,
             "lora_module_path": ".*mlp/wi_.*",
+            "lora_weight_qtype": weight_qtype,
+            "lora_tile_size": tile_size,
         },
         scan_layers=scan_layers,
     )
 
     # Create a real small model using standard creation utils
-    model, _ = model_creation_utils.from_pretrained(cfg, mesh=None, model_mode=model_creation_utils.MODEL_MODE_TRAIN)
+    model, mesh = model_creation_utils.from_pretrained(cfg, mesh=None, model_mode=model_creation_utils.MODEL_MODE_TRAIN)
 
     # Verify model is NOT lora enabled initially
     self.assertFalse(lora_utils.is_lora_enabled(model))
 
-    # Apply LoRA
-    lora_model = lora_utils.apply_lora_to_model(model, model.mesh, cfg)
+    if mock_multihost:
+      devices_array = maxtext_utils.create_device_mesh(cfg)
+      dummy_mesh = Mesh(devices_array, cfg.mesh_axes)
+
+      # Just verify that apply_lora_to_model runs successfully with the dummy mesh
+      lora_model = lora_utils.apply_lora_to_model(model, dummy_mesh, cfg)
+    else:
+      # Apply LoRA
+      lora_model = lora_utils.apply_lora_to_model(model, mesh, cfg)
 
     # Verify we can find LoRAParam in the state
     _, state = nnx.split(lora_model)
@@ -200,13 +215,27 @@ def _run_apply_lora_test(self, scan_layers: bool):
     self.assertGreater(len(jax.tree_util.tree_leaves(opt_state)), 0)
 
   def test_apply_lora_to_model_scan_layers_false(self):
-    """Test applying LoRA to model with scan_layers=False."""
+    """Test applying standard LoRA to model with scan_layers=False."""
     self._run_apply_lora_test(scan_layers=False)
 
   def test_apply_lora_to_model_scan_layers_true(self):
-    """Test applying LoRA to model with scan_layers=True."""
+    """Test applying standard LoRA to model with scan_layers=True."""
     self._run_apply_lora_test(scan_layers=True)
 
+  @unittest.skip("Awaiting qwix fix for QLoRA params materialization")
+  def test_apply_qlora_to_model_scan_layers_false(self):
+    """Test applying QLoRA (int8 weight qtype, tile size 32) to a model with scan_layers=False."""
+    self._run_apply_lora_test(scan_layers=False, weight_qtype="int8", tile_size=32)
+
+  @unittest.skip("Awaiting qwix fix for QLoRA params materialization")
+  def test_apply_qlora_to_model_scan_layers_true(self):
+    """Test applying QLoRA (int8 weight qtype, tile size 32) to a model with scan_layers=True."""
+    self._run_apply_lora_test(scan_layers=True, weight_qtype="int8", tile_size=32)
+
+  def test_apply_lora_multihost_mock(self):
+    """Test applying LoRA (scan_layers=False) via a separately built dummy mesh, meant to hit the reshard callback."""
+    self._run_apply_lora_test(scan_layers=False, mock_multihost=True)
+
   def test_restore_lora_from_path(self):
     """Test restoration of LoRA parameters from a path."""
     cfg = _make_config(
diff --git a/tests/utils/test_maxtext_utils_nnx.py b/tests/utils/test_maxtext_utils_nnx.py