feat: support QLoRA with NNX and Qwix

RexBearIU · RexBearIU · commit 324eb0a6f224 · 2026-05-06T11:06:04.000Z
diff --git a/src/maxtext/configs/post_train/sft.yml b/src/maxtext/configs/post_train/sft.yml
@@ -27,6 +27,9 @@ lora:
   lora_rank: 0
   lora_alpha: 0.0
   lora_module_path: ""
+  # For QLoRA, set lora_weight_qtype (e.g., "nf4") and optionally lora_tile_size.
+  lora_weight_qtype: null
+  lora_tile_size: null
   # Optional path to LoRA weights to load before training. Ignored if the current run is resumed.
   lora_restore_path: ""
 
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1206,6 +1206,14 @@ class LoRA(BaseModel):
           "Regex identifying target modules for LoRA, e.g." " '.*q_einsum|.*kv_einsum|.*gate_proj|.*down_proj|.*up_proj'."
       ),
   )
+  lora_weight_qtype: str | None = Field(
+      None,
+      description=("Optional quantization type for QLoRA (e.g., 'nf4'). If set, QLoRA is applied."),
+  )
+  lora_tile_size: NonNegativeInt | None = Field(
+      None,
+      description="Optional tile size for QLoRA (e.g., 128 or 256).",
+  )
   lora_restore_path: PathStr = Field(
       "",
       description=("Optional path to LoRA weights to load before training. Ignored if the current run is resumed."),
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -62,7 +62,12 @@
 )
 from maxtext.multimodal import utils as mm_utils
 from maxtext.utils import max_logging, max_utils, maxtext_utils, sharding
-from maxtext.utils.sharding import create_sharding
+from maxtext.utils.sharding import (
+    create_sharding,
+    nnx_ensure_scan_leading_axis,
+    nnx_reconcile_sharding,
+    nnx_sync_moveaxis,
+)
 
 # ------------------------------------------------------------------------------
 # The network: Decoder Definitions
@@ -453,7 +458,7 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, kv_caches
     graphdef, params, state = nnx.split(layers, nnx.Param, ...)
 
     if scan_axis != 0:
-      params = jax.tree.map(lambda x: jnp.moveaxis(x, scan_axis, 0), params)
+      params = nnx_sync_moveaxis(params, scan_axis, 0)
 
     sig = inspect.signature(layers.__class__.__call__)
     valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
@@ -463,7 +468,24 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, kv_caches
 
     use_kv = kv_caches_stacked is not None
 
+    def stash_origin_metadata(x):
+      """Copies a variable's current shape and sharding metadata under `origin_*` keys.
+
+      Objects that do not expose both `get_metadata` and `replace` are returned unchanged.
+      """
+      is_var = hasattr(x, "get_metadata") and hasattr(x, "replace")
+      if is_var:
+        metadata = x.get_metadata()
+        # The value shape is always recorded; sharding entries are copied only when present.
+        updates = {"origin_shape": x.value.shape}
+        for k in ["sharding", "out_sharding", "sharding_names"]:
+          if k in metadata:
+            updates[f"origin_{k}"] = metadata[k]
+        return x.replace(**updates)
+      return x
+
+    params = jax.tree.map(stash_origin_metadata, params)
+    state = jax.tree.map(stash_origin_metadata, state)
+
     def layer_fn(carry, scanned_vars):
+      # Ensure metadata rank matches the sliced values
+      scanned_vars = nnx_reconcile_sharding(scanned_vars, "layers")
+
       if use_kv:
         current_params, current_state, kv_cache_layer = scanned_vars
       else:
@@ -527,21 +549,57 @@ def layer_fn(carry, scanned_vars):
     else:
       layer_fn_wrapped = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
 
-      def _ensure_scan_leading_axis(x):
-        if not hasattr(x, "shape") or len(x.shape) == 0:
-          return jnp.broadcast_to(x, (length,))
-        return x
-
-      params = jax.tree.map(_ensure_scan_leading_axis, params)
-      state = jax.tree.map(_ensure_scan_leading_axis, state)
+      params = nnx_ensure_scan_leading_axis(params, length)
+      state = nnx_ensure_scan_leading_axis(state, length)
 
       final_carry, (scanned_params, scanned_other) = jax.lax.scan(layer_fn_wrapped, x_in, (params, state))
 
+      # Ensure metadata rank matches the stacked values
+      scanned_params = nnx_reconcile_sharding(scanned_params, "layers")
+      scanned_other = nnx_reconcile_sharding(scanned_other, "layers")
+
       if scan_axis != 0:
-        scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
+        scanned_params = nnx_sync_moveaxis(scanned_params, 0, scan_axis)
+
+      def restore_origin_metadata(x):
+        """Restores sharding metadata from `origin_*` keys, or re-inserts the "layers" axis.
+
+        For each of `sharding`, `out_sharding` and `sharding_names`:
+          * if the matching `origin_<key>` entry exists in the metadata, its value is written back;
+          * otherwise, if the current entry is a PartitionSpec/tuple/list without "layers", "layers"
+            is inserted at `self.config.param_scan_axis` (clamped to the spec length).
+
+        Objects that do not expose both `get_metadata` and `replace` are returned unchanged, as are
+        variables for which no update was computed. The `origin_*` entries themselves are not removed.
+        """
+        is_var = hasattr(x, "get_metadata") and hasattr(x, "replace")
+        if is_var:
+          metadata = x.get_metadata()
+          updates = {}
+          for k in ["sharding", "out_sharding", "sharding_names"]:
+            origin_key = f"origin_{k}"
+            if origin_key in metadata:
+              updates[k] = metadata[origin_key]
+            else:
+              axes = metadata.get(k)
+              if isinstance(axes, (jax.sharding.PartitionSpec, tuple, list)):
+                spec_list = list(axes)
+                if "layers" not in spec_list:
+                  # Clamp so a spec shorter than the scan axis gets "layers" appended at the end.
+                  pos = min(self.config.param_scan_axis, len(spec_list))
+                  spec_list.insert(pos, "layers")
+                  new_spec = jax.sharding.PartitionSpec(*spec_list)
+                  # Hand back the same container kind that was stored (tuple for tuple/list input).
+                  updates[k] = tuple(new_spec) if isinstance(axes, (tuple, list)) else new_spec
+          if updates:
+            return x.replace(**updates)
+        return x
+
+      def is_leaf_with_metadata(x):
+        """Returns True for objects exposing both `get_metadata` and `replace`."""
+        return hasattr(x, "get_metadata") and hasattr(x, "replace")
+
+      scanned_params = jax.tree.map(restore_origin_metadata, scanned_params, is_leaf=is_leaf_with_metadata)
+      scanned_other = jax.tree.map(restore_origin_metadata, scanned_other, is_leaf=is_leaf_with_metadata)
 
       if dynamic_graph_init:
         out_layers = nnx.merge(updated_graphdef[0], scanned_params, scanned_other)
+
+        for attr_name, attr_val in self.__dict__.items():
+          if attr_val is layers:
+            setattr(self, attr_name, out_layers)
+            break
+
+        g, s = nnx.split(self)
+        new_self = nnx.merge(g, s)
+        nnx.update(self, nnx.state(new_self))
       else:
         nnx.update(layers, nnx.State.merge(scanned_params, scanned_other))
         out_layers = layers
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-""" Common LoRA utils needed to support LoRA adapters."""
+"""Common LoRA utils needed to support LoRA adapters."""
 from functools import partial
 import json
 import os
@@ -385,14 +385,20 @@ def _get_lora_module_path(mt_config: pyconfig.HyperParameters) -> str:
   model_name = mt_config.model_name.lower()
 
   # Find the first matching architecture prefix or use 'default'
-  matched_key = next((k for k in lora_configs if k != "default" and model_name.startswith(k)), "default")
+  matched_key = next(
+      (k for k in lora_configs if k != "default" and model_name.startswith(k)),
+      "default",
+  )
 
   if matched_key == "default":
     max_logging.log(f"Warning: Model '{model_name}' is unverified; falling back to default LoRA path.")
   else:
     max_logging.log(f"Auto-detected lora_module_path for model '{model_name}' (matched: '{matched_key}')")
 
-  raw_path = lora_configs.get(matched_key, "decoder/layers/.*(self_attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo))")
+  raw_path = lora_configs.get(
+      matched_key,
+      "decoder/layers/.*(self_attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo))",
+  )
 
   # This regex makes the layer index optional, matching both scanned and unscanned layer paths
   # (e.g. 'layers/0/mlp/...' vs 'layers/mlp/...').
@@ -412,10 +418,15 @@ def _build_lora_provider(mt_config: pyconfig.HyperParameters) -> qwix.LoraProvid
       "alpha": mt_config.lora.lora_alpha,
       "dropout": 0.0,
   }
-  max_logging.log(
-      f"LoRA configured: module_path={lora_module_path} "
-      f"rank={mt_config.lora.lora_rank} alpha={mt_config.lora.lora_alpha}"
-  )
+  if mt_config.lora.lora_tile_size is not None:
+    lora_kwargs["tile_size"] = mt_config.lora.lora_tile_size
+  if mt_config.lora.lora_weight_qtype is not None:
+    lora_kwargs["weight_qtype"] = mt_config.lora.lora_weight_qtype
+
+  lora_type = "QLoRA" if mt_config.lora.lora_weight_qtype else "LoRA"
+  args_str = " ".join(f"{k}={v}" for k, v in lora_kwargs.items() if k != "dropout")
+  max_logging.log(f"{lora_type} configured: {args_str}")
+
   return qwix.LoraProvider(**lora_kwargs)
 
 
@@ -448,7 +459,7 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   matched_module_paths = []
   sample_module_paths = []
 
-  for path, _ in nnx.iter_modules(lora_model):
+  for path, _ in nnx.iter_graph(lora_model):
     module_path = "/".join(str(p) for p in path)
     if len(sample_module_paths) < 100:
       sample_module_paths.append(module_path)
@@ -469,6 +480,81 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   )
 
 
+def _patch_qwix_for_maxtext(mesh, mt_config):
+  """Installs MaxText-specific monkey patches on qwix internals.
+
+  Two process-wide patches are applied, each at most once no matter how often this is called:
+    1. `ptq.PtqProvider.get_intercept_map` is wrapped so the returned mapping also intercepts
+       `jax.numpy.asarray`. The interceptor unwraps an `nnx.State` holding an `"array"` entry,
+       rebuilds a `QArray` from a nested `qvalue`/`scale` state, and returns `WithAux`/`QArray`
+       objects untouched instead of converting them.
+    2. `flax_util.find_param` is replaced by a lookup that resolves the attribute name of the
+       parameter backing `x` on the current `nnx.Module`. For any other module type the call is
+       delegated to qwix's original `find_param`.
+
+  Args:
+    mesh: Currently unused.
+    mt_config: Currently unused.
+  """
+  # pylint: disable=protected-access,import-outside-toplevel,redefined-outer-name,reimported,missing-function-docstring,consider-using-from-import
+  import qwix._src.flax_util as flax_util
+  import qwix._src.providers.ptq as ptq
+  import jax.numpy as jnp
+  from flax import nnx
+
+  # 1. PTQ patch. Guarded so repeated calls do not stack wrappers around get_intercept_map.
+  if not getattr(ptq.PtqProvider.get_intercept_map, "_maxtext_patched", False):
+    original_get_intercept_map = ptq.PtqProvider.get_intercept_map
+
+    def patched_get_intercept_map(self):
+      mapping = original_get_intercept_map(self)
+
+      def intercept_asarray(a, dtype=None, order=None, **kwargs):
+        if isinstance(a, nnx.State) and "array" in a:
+          a = a["array"]
+          if isinstance(a, nnx.State) and "qvalue" in a and "scale" in a:
+            a = ptq.QArray(qvalue=a["qvalue"].value, scale=a["scale"].value)
+
+        # Matched by class name, so no import of the qwix container types is needed here.
+        if type(a).__name__ in ("WithAux", "QArray"):
+          return a
+        return jnp.asarray(a, dtype=dtype, order=order, **kwargs)
+
+      mapping["jax.numpy.asarray"] = intercept_asarray
+      return mapping
+
+    patched_get_intercept_map._maxtext_patched = True
+    ptq.PtqProvider.get_intercept_map = patched_get_intercept_map
+
+  # 2. find_param patch
+  if not hasattr(flax_util, "_maxtext_find_param_patched"):
+    # Keep a handle on qwix's implementation: once the module attribute is rebound below,
+    # `flax_util.find_param` refers to the patched function itself.
+    original_find_param = flax_util.find_param
+
+    def patched_find_param(x, ptq_array_type=None):
+      module = flax_util.get_current_module()
+      if module is None:
+        return None
+      if not isinstance(module, nnx.Module):
+        return original_find_param(x, ptq_array_type)
+
+      array_types = (nnx.Param | ptq_array_type) if ptq_array_type else nnx.Param
+      candidates = {}
+      for name, node in module.__dict__.items():
+        if isinstance(node, array_types):
+          candidates[name] = node.value if isinstance(node, nnx.Param) else node
+
+      # Index candidates by object identity, including the array wrapped inside a WithAux.
+      candidates_by_id = {id(c): n for n, c in candidates.items()}
+      for n, c in candidates.items():
+        if type(c).__name__ == "WithAux" and hasattr(c, "array"):
+          candidates_by_id[id(c.array)] = n
+
+      if id(x) in candidates_by_id:
+        return candidates_by_id[id(x)]
+
+      # For tracers, follow single-input parents (or the tracer's constant) back to a candidate.
+      if isinstance(x, jax.core.Tracer) and hasattr(x, "parent"):
+        curr_x = x
+        while True:
+          if id(curr_x) in candidates_by_id:
+            return candidates_by_id[id(curr_x)]
+          if curr_x.parent and len(curr_x.parent.in_tracers) == 1:
+            curr_x = curr_x.parent.in_tracers[0]
+          elif hasattr(curr_x, "get_const") and id(const := curr_x.get_const()) in candidates_by_id:
+            return candidates_by_id[id(const)]
+          else:
+            break
+
+      # Last resort: accept a candidate only when exactly one has the same shape as x.
+      filtered = {n: c for n, c in candidates.items() if hasattr(c, "shape") and c.shape == getattr(x, "shape", None)}
+      if len(filtered) == 1:
+        return list(filtered.keys())[0]
+      return None
+
+    flax_util.find_param = patched_find_param
+    flax_util._maxtext_find_param_patched = True
+
+
 def apply_lora_to_model(
     model: nnx.Module,
     mesh: Optional[jax.sharding.Mesh],
@@ -485,6 +571,8 @@ def apply_lora_to_model(
 
   # Dynamically detect and set LoRA rank before model creation if restoring
 
+  _patch_qwix_for_maxtext(mesh, mt_config)
+
   lora_provider = _build_lora_provider(mt_config)
 
   model_rngs = getattr(model.decoder, "rngs", None)
diff --git a/src/maxtext/utils/sharding.py b/src/maxtext/utils/sharding.py
@@ -20,6 +20,7 @@
 from collections.abc import Iterable
 
 import jax
+import jax.numpy as jnp
 from jax.core import Tracer
 from jax.sharding import PartitionSpec as P, NamedSharding, reshard
 
@@ -670,3 +671,88 @@ def all_gather_over_fsdp(variables, sharding_info, mesh, logical_axis_rules, sha
   # Apply the constraint to the model's current variables. This tells JAX to
   # gather the weights into this layout.
   return maybe_shard_with_name(variables, physical_constraint_no_fsdp, shard_mode=shard_mode)
+
+
+# ------------------------------------------------------------------------------
+# Metadata Synchronization Helpers for NNX Variables
+# ------------------------------------------------------------------------------
+
+
+def nnx_update_sharding_meta(variable, transform_fn):
+  """Applies `transform_fn` to every sharding-related metadata entry of a variable.
+
+  Args:
+    variable: Object to update. Anything lacking `get_metadata` or `replace` is returned as is.
+    transform_fn: Callable that receives each spec as a fresh `list` and returns the new sequence
+      of axis entries.
+
+  Returns:
+    `variable.replace(...)` with the transformed `sharding`, `out_sharding` and `sharding_names`
+    entries, or `variable` itself when none of them qualified. Entries that are missing, falsy, or
+    not a PartitionSpec/tuple/list are skipped. PartitionSpec entries stay PartitionSpecs; tuples
+    and lists come back as tuples.
+  """
+  if not hasattr(variable, "get_metadata") or not hasattr(variable, "replace"):
+    return variable
+
+  metadata = variable.get_metadata()
+  rewritten = {}
+
+  for field in ("sharding", "out_sharding", "sharding_names"):
+    current = metadata.get(field)
+    if not current or not isinstance(current, (P, tuple, list)):
+      continue
+    axes = transform_fn(list(current))
+    if isinstance(current, P):
+      rewritten[field] = P(*axes)
+    else:
+      rewritten[field] = tuple(axes)
+
+  if not rewritten:
+    return variable
+  return variable.replace(**rewritten)
+
+
+def nnx_sync_moveaxis(tree, from_axis, to_axis):
+  """Moves an axis in both values and sharding metadata of nnx.Variables.
+
+  Args:
+    tree: Pytree whose leaves are arrays or variable-like objects (anything exposing `value` and
+      `get_metadata`).
+    from_axis: Source axis as accepted by `jnp.moveaxis`; may be negative.
+    to_axis: Destination axis as accepted by `jnp.moveaxis`; may be negative.
+
+  Returns:
+    `tree` itself when `from_axis == to_axis`. Otherwise a new tree in which every array leaf has
+    the axis moved and every variable-like leaf has both its value and its sharding specs moved
+    the same way. Leaves whose value has no `shape` are returned unchanged.
+  """
+  if from_axis == to_axis:
+    return tree
+
+  def _op(x):
+    is_var = hasattr(x, "value") and hasattr(x, "get_metadata")
+    val = x.value if is_var else x
+    if not hasattr(val, "shape"):
+      return x
+
+    new_val = jnp.moveaxis(val, from_axis, to_axis)
+    if not is_var:
+      return new_val
+
+    # Normalize against the value rank so negative axes address the same dimension in the spec as
+    # in the array; list.insert/pop treat negative indices differently from jnp.moveaxis.
+    ndim = len(val.shape)
+    src, dst = from_axis % ndim, to_axis % ndim
+
+    def move_fn(l):
+      # A spec may be shorter than the value rank (trailing dims unsharded). Pad it with None so
+      # the move is always applied and the metadata cannot drift out of step with the value.
+      l.extend([None] * (ndim - len(l)))
+      l.insert(dst, l.pop(src))
+      return l
+
+    return nnx_update_sharding_meta(x.replace(value=new_val), move_fn)
+
+  return jax.tree.map(_op, tree, is_leaf=lambda x: hasattr(x, "value") or hasattr(x, "shape"))
+
+
+def nnx_reconcile_sharding(tree, name="layers"):
+  """Reconciles sharding metadata rank with value rank by adding/removing an axis.
+
+  For every leaf exposing both `value` and `get_metadata`, each spec processed by
+  `nnx_update_sharding_meta` is adjusted until its length equals `value.ndim`:
+    * spec longer than the value rank: the first occurrence of `name` is removed if present, then
+      leading entries are dropped until the lengths match;
+    * spec shorter than the value rank: `name` is inserted at index 0, then `None` entries are
+      prepended until the lengths match;
+    * equal lengths: the spec is left untouched.
+
+  Args:
+    tree: Pytree to process; objects with `get_metadata` are treated as leaves.
+    name: Axis name to remove or insert. Defaults to "layers".
+
+  Returns:
+    A tree of the same structure with reconciled metadata; other leaves are returned unchanged.
+  """
+
+  def _op(x):
+    if not (hasattr(x, "value") and hasattr(x, "get_metadata")):
+      return x
+
+    def reconcile_fn(l):
+      # `l` is a fresh list supplied by nnx_update_sharding_meta, so mutating it in place is safe.
+      if len(l) > x.value.ndim:  # Sliced: rank decreased
+        if name in l:
+          l.remove(name)
+        while len(l) > x.value.ndim:
+          l.pop(0)
+      elif len(l) < x.value.ndim:  # Stacked: rank increased
+        l.insert(0, name)  # Assume axis 0 for scan
+        while len(l) < x.value.ndim:
+          l.insert(0, None)
+      return l
+
+    return nnx_update_sharding_meta(x, reconcile_fn)
+
+  return jax.tree.map(_op, tree, is_leaf=lambda x: hasattr(x, "get_metadata"))
+
+
+def nnx_ensure_scan_leading_axis(tree, length):
+  """Broadcasts rank-0 leaves to shape `(length,)` so they can be scanned over.
+
+  Args:
+    tree: Pytree of arrays and/or variable-like objects (anything exposing `value` and
+      `get_metadata`).
+    length: Size of the leading axis given to rank-0 values.
+
+  Returns:
+    A tree of the same structure. Rank-0 arrays are replaced by their broadcast, rank-0 variables
+    by `replace(value=...)` of themselves, and every other leaf is returned unchanged.
+  """
+
+  def _is_leaf(node):
+    return hasattr(node, "value") or hasattr(node, "shape")
+
+  def _broadcast_scalar(node):
+    wraps_value = hasattr(node, "value") and hasattr(node, "get_metadata")
+    payload = node.value if wraps_value else node
+    if not hasattr(payload, "shape") or len(payload.shape) != 0:
+      return node
+    expanded = jnp.broadcast_to(payload, (length,))
+    if wraps_value:
+      return node.replace(value=expanded)
+    return expanded
+
+  return jax.tree.map(_broadcast_scalar, tree, is_leaf=_is_leaf)
diff --git a/tests/post_training/unit/lora_utils_test.py b/tests/post_training/unit/lora_utils_test.py
@@ -106,6 +106,8 @@ def test_build_lora_provider(self):
     mock_config.lora.lora_module_path = "custom/path"
     mock_config.lora.lora_rank = 8
     mock_config.lora.lora_alpha = 16.0
+    mock_config.lora.lora_tile_size = None
+    mock_config.lora.lora_weight_qtype = None
 
     with mock.patch("qwix.LoraProvider") as mock_provider:
       lora_utils._build_lora_provider(mock_config)