feat: support QLoRA with NNX and Qwix

RexBearIU · RexBearIU · commit 2d237f8b6f68 · 2026-05-04T08:15:49.000Z
diff --git a/src/maxtext/configs/post_train/sft.yml b/src/maxtext/configs/post_train/sft.yml
@@ -27,6 +27,9 @@ lora:
   lora_rank: 0
   lora_alpha: 0.0
   lora_module_path: ""
+  # For QLoRA, set lora_weight_qtype (e.g., "nf4") and optionally lora_tile_size.
+  lora_weight_qtype: null
+  lora_tile_size: null
   # Optional path to LoRA weights to load before training. Ignored if the current run is resumed.
   lora_restore_path: ""
 
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1206,6 +1206,14 @@ class LoRA(BaseModel):
           "Regex identifying target modules for LoRA, e.g." " '.*q_einsum|.*kv_einsum|.*gate_proj|.*down_proj|.*up_proj'."
       ),
   )
+  lora_weight_qtype: str | None = Field(
+      None,
+      description=("Optional quantization type for QLoRA (e.g., 'nf4'). If set, QLoRA is applied."),
+  )
+  lora_tile_size: NonNegativeInt | None = Field(
+      None,
+      description="Optional tile size for QLoRA (e.g., 128 or 256).",
+  )
   lora_restore_path: PathStr = Field(
       "",
       description=("Optional path to LoRA weights to load before training. Ignored if the current run is resumed."),
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -463,13 +463,61 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, kv_caches
 
     use_kv = kv_caches_stacked is not None
 
+    def stash_origin_metadata(x):
+      """Stashes an NNX variable's value shape and sharding metadata under `origin_*` keys.
+
+      Leaves that lack `get_metadata` or `replace` are returned unchanged.
+      """
+      if not (hasattr(x, "get_metadata") and hasattr(x, "replace")):
+        return x
+      meta = x.get_metadata()
+      stashed = {f"origin_{k}": meta[k] for k in ("sharding", "out_sharding", "sharding_names") if k in meta}
+      return x.replace(origin_shape=x.value.shape, **stashed)
+
+    params = jax.tree.map(stash_origin_metadata, params)
+    state = jax.tree.map(stash_origin_metadata, state)
+
     def layer_fn(carry, scanned_vars):
       if use_kv:
         current_params, current_state, kv_cache_layer = scanned_vars
       else:
         current_params, current_state = scanned_vars
         kv_cache_layer = None
 
+      def rank_consistent_spec(spec, shape):
+        """Returns `spec` trimmed or left-padded with None so it has exactly one entry per dimension of `shape`."""
+        if spec is None:
+          return None
+        rank, axes = len(shape), list(spec)
+        # While the spec is too long, drop the "layers" and then the "stage" entry before anything else.
+        for named_axis in ("layers", "stage"):
+          if len(axes) <= rank:
+            break
+          if named_axis in axes:
+            axes.remove(named_axis)
+        surplus = len(axes) - rank
+        # A positive surplus discards leading entries; a negative one prepends None placeholders.
+        axes = axes[surplus:] if surplus > 0 else [None] * -surplus + axes
+        return jax.sharding.PartitionSpec(*axes)
+
+      def fix_node_rank(x):
+        """Re-ranks sharding-like metadata to match the rank of the variable's value; `origin_*` entries are kept."""
+        if hasattr(x, "get_metadata") and hasattr(x, "replace") and hasattr(x, "value"):
+          updates = {}
+          for k, axes in x.get_metadata().items():
+            # origin_* entries are stashed originals (origin_shape even holds ints); they must stay untouched.
+            if not str(k).startswith("origin_") and isinstance(axes, (jax.sharding.PartitionSpec, tuple, list)):
+              spec_obj = jax.sharding.PartitionSpec(*axes) if isinstance(axes, (tuple, list)) else axes
+              if len(spec_obj) != x.value.ndim:
+                new_spec = rank_consistent_spec(spec_obj, x.value.shape)
+                updates[k] = tuple(new_spec) if isinstance(axes, (tuple, list)) else new_spec
+          return x.replace(**updates) if updates else x
+        return x
+
+      is_nnx_var = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+      current_params = jax.tree.map(fix_node_rank, current_params, is_leaf=is_nnx_var)
+      current_state = jax.tree.map(fix_node_rank, current_state, is_leaf=is_nnx_var)
+
       if self.config.parameter_memory_host_offload:
         current_params = jax.tree.map(lambda x: jax.device_put(x, max_utils.device_space()), current_params)
 
@@ -540,8 +588,43 @@ def _ensure_scan_leading_axis(x):
       if scan_axis != 0:
         scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
 
+      def restore_origin_metadata(x):
+        """Restores the sharding metadata entries of an NNX variable.
+
+        A stashed `origin_*` value wins; otherwise "layers" is inserted at `param_scan_axis` when it is missing.
+        """
+        if not (hasattr(x, "get_metadata") and hasattr(x, "replace")):
+          return x
+        current = x.get_metadata()
+        restored = {}
+        for key in ("sharding", "out_sharding", "sharding_names"):
+          if f"origin_{key}" in current:
+            restored[key] = current[f"origin_{key}"]
+            continue
+          axes = current.get(key)
+          if not isinstance(axes, (jax.sharding.PartitionSpec, tuple, list)) or "layers" in list(axes):
+            continue
+          names = list(axes)
+          names.insert(min(self.config.param_scan_axis, len(names)), "layers")
+          as_spec = jax.sharding.PartitionSpec(*names)
+          restored[key] = tuple(as_spec) if isinstance(axes, (tuple, list)) else as_spec
+        return x.replace(**restored) if restored else x
+
+      is_leaf_with_metadata = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+      scanned_params = jax.tree.map(restore_origin_metadata, scanned_params, is_leaf=is_leaf_with_metadata)
+      scanned_other = jax.tree.map(restore_origin_metadata, scanned_other, is_leaf=is_leaf_with_metadata)
+
       if dynamic_graph_init:
         out_layers = nnx.merge(updated_graphdef[0], scanned_params, scanned_other)
+        
+        for attr_name, attr_val in self.__dict__.items():
+          if attr_val is layers:
+            setattr(self, attr_name, out_layers)
+            break
+        
+        g, s = nnx.split(self)
+        new_self = nnx.merge(g, s)
+        nnx.update(self, nnx.state(new_self))
       else:
         nnx.update(layers, nnx.State.merge(scanned_params, scanned_other))
         out_layers = layers
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -412,10 +412,15 @@ def _build_lora_provider(mt_config: pyconfig.HyperParameters) -> qwix.LoraProvid
       "alpha": mt_config.lora.lora_alpha,
       "dropout": 0.0,
   }
-  max_logging.log(
-      f"LoRA configured: module_path={lora_module_path} "
-      f"rank={mt_config.lora.lora_rank} alpha={mt_config.lora.lora_alpha}"
-  )
+  if mt_config.lora.lora_tile_size is not None:
+    lora_kwargs["tile_size"] = mt_config.lora.lora_tile_size
+  if mt_config.lora.lora_weight_qtype is not None:
+    lora_kwargs["weight_qtype"] = mt_config.lora.lora_weight_qtype
+
+  lora_type = "QLoRA" if mt_config.lora.lora_weight_qtype else "LoRA"
+  args_str = " ".join(f"{k}={v}" for k, v in lora_kwargs.items() if k != "dropout")
+  max_logging.log(f"{lora_type} configured: {args_str}")
+
   return qwix.LoraProvider(**lora_kwargs)
 
 
@@ -448,7 +453,7 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   matched_module_paths = []
   sample_module_paths = []
 
-  for path, _ in nnx.iter_modules(lora_model):
+  for path, _ in nnx.iter_graph(lora_model):
     module_path = "/".join(str(p) for p in path)
     if len(sample_module_paths) < 100:
       sample_module_paths.append(module_path)
@@ -469,6 +474,34 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   )
 
 
+def _patch_qwix_for_maxtext(mesh, mt_config):
+  """Wraps qwix `PtqProvider.get_intercept_map` once to intercept `jnp.asarray`; `mesh`/`mt_config` are unused."""
+  import qwix._src.providers.ptq as ptq
+  import jax.numpy as jnp
+  from flax import nnx
+
+  original_get_intercept_map = ptq.PtqProvider.get_intercept_map
+  if getattr(original_get_intercept_map, "_maxtext_patched", False):  # Already patched by an earlier call.
+    return
+
+  def patched_get_intercept_map(self):
+    mapping = original_get_intercept_map(self)
+    def intercept_asarray(a, dtype=None, order=None, **kwargs):
+      if isinstance(a, nnx.State) and "array" in a:
+        a = a["array"]
+        if isinstance(a, nnx.State) and "qvalue" in a and "scale" in a:
+          a = ptq.QArray(qvalue=a["qvalue"].value, scale=a["scale"].value)
+      if type(a).__name__ in ("WithAux", "QArray"):
+        return a  # Returned untouched rather than converted by jnp.asarray.
+      return jnp.asarray(a, dtype=dtype, order=order, **kwargs)
+
+    mapping["jax.numpy.asarray"] = intercept_asarray
+    return mapping
+
+  patched_get_intercept_map._maxtext_patched = True  # Marker read by the guard above on later calls.
+  ptq.PtqProvider.get_intercept_map = patched_get_intercept_map
+
+
 def apply_lora_to_model(
     model: nnx.Module,
     mesh: Optional[jax.sharding.Mesh],
@@ -484,6 +517,8 @@ def apply_lora_to_model(
     return model
 
   # Dynamically detect and set LoRA rank before model creation if restoring
+  
+  _patch_qwix_for_maxtext(mesh, mt_config)
 
   lora_provider = _build_lora_provider(mt_config)
 
diff --git a/tests/post_training/unit/lora_utils_test.py b/tests/post_training/unit/lora_utils_test.py
@@ -106,6 +106,8 @@ def test_build_lora_provider(self):
     mock_config.lora.lora_module_path = "custom/path"
     mock_config.lora.lora_rank = 8
     mock_config.lora.lora_alpha = 16.0
+    mock_config.lora.lora_tile_size = None
+    mock_config.lora.lora_weight_qtype = None
 
     with mock.patch("qwix.LoraProvider") as mock_provider:
       lora_utils._build_lora_provider(mock_config)