
Commit fc27813

feat: enhance LoRA path flexibility and patch qwix
1 parent 4aa6ceb commit fc27813

2 files changed: 223 additions & 14 deletions

src/maxtext/layers/nnx_decoders.py

Lines changed: 183 additions & 8 deletions
@@ -449,13 +449,57 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
     def layer_fn(carry, scanned_vars):
       current_params, current_state = scanned_vars

+      def rank_consistent_spec(spec, shape):
+        if spec is None: return None
+        spec_list = list(spec)
+
+        # 1. Remove scanning axes if rank reduction is needed
+        if len(spec_list) > len(shape):
+          for axis_name in ["layers", "stage"]:
+            if axis_name in spec_list:
+              spec_list.remove(axis_name)
+              if len(spec_list) == len(shape): break
+
+        # 2. If still mismatched, strip from the left (standard JAX rank reduction)
+        while len(spec_list) > len(shape):
+          spec_list.pop(0)
+
+        # 3. If rank is too small, pad with None
+        while len(spec_list) < len(shape):
+          spec_list.insert(0, None)
+
+        return jax.sharding.PartitionSpec(*spec_list)
+
+      def fix_node_rank(x):
+        if hasattr(x, "get_metadata") and hasattr(x, "replace") and hasattr(x, "value"):
+          metadata = x.get_metadata()
+          updates = {}
+          for k, axes in metadata.items():
+            if isinstance(axes, (jax.sharding.PartitionSpec, tuple, list)):
+              # Convert tuple/list to spec for check
+              spec_obj = jax.sharding.PartitionSpec(*axes) if isinstance(axes, (tuple, list)) else axes
+              if len(spec_obj) != x.value.ndim:
+                new_spec = rank_consistent_spec(spec_obj, x.value.shape)
+                # Keep original type (tuple vs spec)
+                updates[k] = tuple(new_spec) if isinstance(axes, (tuple, list)) else new_spec
+                # print(f"[DEBUG] Normalizing metadata key '{k}' from rank {len(spec_obj)} to {len(new_spec)}")
+          if updates:
+            return x.replace(**updates)
+        return x
+
+      is_nnx_var = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+      current_params = jax.tree.map(fix_node_rank, current_params, is_leaf=is_nnx_var)
+      current_state = jax.tree.map(fix_node_rank, current_state, is_leaf=is_nnx_var)
+
       if self.config.parameter_memory_host_offload:
         current_params = jax.tree.map(lambda x: jax.device_put(x, max_utils.device_space()), current_params)

       layer = nnx.merge(graphdef, current_params, current_state)
+
       layer_out = layer(carry, *args, **valid_kwargs)
       new_carry = layer_out[0] if isinstance(layer_out, tuple) else layer_out

+      # Extract EVERYTHING to capture new parameters
       new_graphdef, updated_params, updated_state = nnx.split(layer, nnx.Param, ...)

       if dynamic_graph_init:
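
For reference, a standalone sketch of the rank-normalization rule that the new rank_consistent_spec helper applies to sharding metadata (reimplemented here outside MaxText; the axis names and shapes below are illustrative, not taken from the repo): a spec that still carries the scan axes 'layers'/'stage' is reduced to the rank of the per-layer value by dropping those names first, then trimming from the left, and a spec that is too short is left-padded with None.

import jax

P = jax.sharding.PartitionSpec

def normalize(spec, shape, scan_axes=("layers", "stage")):
  # Match a PartitionSpec's rank to an array shape (sketch of rank_consistent_spec).
  names = list(spec)
  for axis in scan_axes:          # drop scan axes first when the spec is too long
    if len(names) > len(shape) and axis in names:
      names.remove(axis)
  while len(names) > len(shape):  # then trim from the left
    names.pop(0)
  while len(names) < len(shape):  # or left-pad with None when too short
    names.insert(0, None)
  return P(*names)

# A stacked-weight spec applied to a single per-layer slice loses the 'layers' axis:
print(normalize(P("layers", "fsdp", "tensor"), (1024, 4096)))  # PartitionSpec('fsdp', 'tensor')
# A spec that is one axis short for a 3-D value is left-padded with None:
print(normalize(P("fsdp", "tensor"), (8, 1024, 4096)))         # PartitionSpec(None, 'fsdp', 'tensor')
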
@@ -466,23 +510,154 @@ def layer_fn(carry, scanned_vars):

       return new_carry, (returned_params, updated_state)

-    layer_fn_wrapped = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
+    if dynamic_graph_init:
+      print(f"[DEBUG] Starting Dynamic Graph Init Loop (length={length})")
+      curr_carry = x_in
+      out_params_list = []
+      out_other_list = []
+
+      def _slice_and_unpromote(x, i):
+        # Resolve physical value and shape
+        is_var = hasattr(x, "get_metadata") and hasattr(x, "replace")
+        val = x.value if is_var else x
+
+        if not hasattr(val, "shape") or len(val.shape) == 0 or val.shape[0] != length:
+          return x
+
+        # 1. Slice value
+        sliced_val = val[i]
+
+        # 2. Slice logical metadata if it's an NNX variable
+        if is_var:
+          metadata = x.get_metadata()
+          updates = {}
+          for sharding_key in ["sharding", "out_sharding", "sharding_names"]:
+            axes = metadata.get(sharding_key)
+            if isinstance(axes, jax.sharding.PartitionSpec):
+              spec_list = list(axes)
+
+              # Aggressively reduce rank to match sliced_val.ndim
+              for axis_to_remove in ["layers", "stage"]:
+                if axis_to_remove in spec_list and len(spec_list) > sliced_val.ndim:
+                  spec_list.remove(axis_to_remove)
+
+              while len(spec_list) > sliced_val.ndim:
+                spec_list.pop(0)
+
+              while len(spec_list) < sliced_val.ndim:
+                spec_list.insert(0, None)
+
+              new_spec = jax.sharding.PartitionSpec(*spec_list)
+              updates[sharding_key] = new_spec
+
+          return x.replace(value=sliced_val, **updates)
+
+        return sliced_val
+
+      def _promote_to_scanned(x):
+        """Adds 'layers' axis back to newly created parameters if scanning is enabled."""
+        if not self.config.scan_layers:
+          return x
+
+        is_nnx_leaf = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+        if is_nnx_leaf(x):
+          metadata = x.get_metadata()
+          updates = {}
+          # Determine which axis to insert 'layers' into based on config
+          scan_axis = self.config.param_scan_axis
+
+          for sharding_key in ["sharding", "out_sharding", "sharding_names"]:
+            axes = metadata.get(sharding_key)
+            if isinstance(axes, jax.sharding.PartitionSpec):
+              spec_list = list(axes)
+              if "layers" not in spec_list:
+                # Insert 'layers' at the correct scan axis position
+                # Cap at current length to avoid index out of bounds
+                insert_pos = min(scan_axis, len(spec_list))
+                spec_list.insert(insert_pos, "layers")
+              updates[sharding_key] = jax.sharding.PartitionSpec(*spec_list)
+
+          if updates:
+            return x.replace(**updates)
+        return x
+
+      for i in range(length):
+        # Slice both values AND logical metadata!
+        is_nnx_leaf = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+        curr_params = jax.tree.map(lambda x: _slice_and_unpromote(x, i), params, is_leaf=is_nnx_leaf)
+        curr_state = jax.tree.map(lambda x: _slice_and_unpromote(x, i), state, is_leaf=is_nnx_leaf)
+
+        curr_carry, (out_p, out_o) = layer_fn(curr_carry, (curr_params, curr_state))
+
+        # Promote ALL parameters back to rank-3 metadata immediately
+        # This ensures they are ready to be stacked correctly.
+        out_p = jax.tree.map(_promote_to_scanned, out_p, is_leaf=is_nnx_leaf)
+        out_o = jax.tree.map(_promote_to_scanned, out_o, is_leaf=is_nnx_leaf)
+
+        out_params_list.append(out_p)
+        out_other_list.append(out_o)
+
+      final_carry = curr_carry
+      scanned_params = jax.tree.map(lambda *args: jnp.stack(args), *out_params_list)
+      scanned_other = jax.tree.map(lambda *args: jnp.stack(args), *out_other_list)
+
+
+    else:
+      layer_fn_wrapped = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)

-    def _ensure_scan_leading_axis(x):
-      if not hasattr(x, "shape") or len(x.shape) == 0:
-        return jnp.broadcast_to(x, (length,))
-      return x
+      def _ensure_scan_leading_axis(x):
+        if not hasattr(x, "shape") or len(x.shape) == 0:
+          return jnp.broadcast_to(x, (length,))
+        return x

-    params = jax.tree.map(_ensure_scan_leading_axis, params)
-    state = jax.tree.map(_ensure_scan_leading_axis, state)
+      params = jax.tree.map(_ensure_scan_leading_axis, params)
+      state = jax.tree.map(_ensure_scan_leading_axis, state)

-    final_carry, (scanned_params, scanned_other) = jax.lax.scan(layer_fn_wrapped, x_in, (params, state))
+      final_carry, (scanned_params, scanned_other) = jax.lax.scan(layer_fn_wrapped, x_in, (params, state))

     if scan_axis != 0:
       scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)

+    scan_axis = self.config.param_scan_axis
+
+    def _force_promote(x):
+      is_nnx_leaf = hasattr(x, "get_metadata") and hasattr(x, "replace")
+      if is_nnx_leaf:
+        metadata = x.get_metadata()
+        updates = {}
+        val_ndim = x.value.ndim
+        for sharding_key in ["sharding", "out_sharding", "sharding_names"]:
+          axes = metadata.get(sharding_key)
+          if isinstance(axes, (jax.sharding.PartitionSpec, tuple, list)):
+            l = list(axes)
+            if len(l) < val_ndim and "layers" not in l:
+              pos = min(scan_axis, len(l))
+              l.insert(pos, "layers")
+              updates[sharding_key] = jax.sharding.PartitionSpec(*l) if isinstance(axes, jax.sharding.PartitionSpec) else tuple(l)
+        if updates:
+          return x.replace(**updates)
+      return x
+
+    is_leaf_with_metadata = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
+    scanned_params = jax.tree.map(_force_promote, scanned_params, is_leaf=is_leaf_with_metadata)
+    scanned_other = jax.tree.map(_force_promote, scanned_other, is_leaf=is_leaf_with_metadata)
+
     if dynamic_graph_init:
+      # Perform a structural update: merge the new structure with the stacked arrays
       out_layers = nnx.merge(updated_graphdef[0], scanned_params, scanned_other)
+
+      # We must update the PARENT (self) to point to the new structure.
+      for attr_name, attr_val in self.__dict__.items():
+        if attr_val is layers:
+          setattr(self, attr_name, out_layers)
+          print(f"[DEBUG] Materialization complete: updated self.{attr_name}")
+          break
+
+      # FORCE NNX to recognize new structural changes by splitting/merging the PARENT
+      # This updates the underlying GraphDef for the entire Decoder.
+      g, s = nnx.split(self)
+      new_self = nnx.merge(g, s)
+      nnx.update(self, nnx.state(new_self))
     else:
       nnx.update(layers, nnx.State.merge(scanned_params, scanned_other))
       out_layers = layers
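
The dynamic-init branch above swaps jax.lax.scan for a Python loop over layers and then rebuilds the stacked layout by stacking the per-layer outputs leaf-wise, as the `jax.tree.map(lambda *args: jnp.stack(args), ...)` lines do. A minimal sketch of that stacking step, with a made-up pytree and shapes:

import jax
import jax.numpy as jnp

# Stand-ins for the per-layer parameter pytrees collected by the loop (4 layers).
out_params_list = [{"mlp": {"wi_0": jnp.full((16, 64), i)}} for i in range(4)]

# Leaf-wise stack: every leaf gains a leading axis of size num_layers,
# mirroring the layout jax.lax.scan would have produced for scanned layers.
scanned_params = jax.tree.map(lambda *leaves: jnp.stack(leaves), *out_params_list)
print(scanned_params["mlp"]["wi_0"].shape)  # (4, 16, 64)
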

src/maxtext/utils/lora_utils.py

Lines changed: 40 additions & 6 deletions
@@ -28,7 +28,6 @@
 from orbax import checkpoint as ocp
 import qwix

-from maxtext.common import checkpointing
 from maxtext.configs import pyconfig
 from maxtext.utils import gcs_utils
 from maxtext.utils import max_logging
@@ -391,19 +390,23 @@ def _get_lora_module_path(mt_config: pyconfig.HyperParameters) -> str:

   for key, module_path in lora_configs.items():
     if key != "default" and model_name.startswith(key):
-      max_logging.log(f"Auto-detected lora_module_path for model '{model_name}': {module_path}")
-      return str(module_path)
+      # Make the layer index optional to support both scanned and non-scanned paths
+      # e.g., 'decoder/layers/0/mlp' vs 'decoder/layers/mlp'
+      flexible_path = str(module_path).replace("layers/", "layers/(?:[0-9]+/)?")
+      max_logging.log(f"Auto-detected lora_module_path for model '{model_name}': {flexible_path}")
+      return flexible_path

   default_path = lora_configs.get(
       "default",
       "decoder/layers/.*(self_attention/(query|key|value|out)|mlp/(wi_0|wi_1|wo))",
   )
+  flexible_default = str(default_path).replace("layers/", "layers/(?:[0-9]+/)?")
   max_logging.log(
       f"Warning: Model '{model_name}' is not in the list of verified LoRA models. "
       "Auto-detection might not work. Please provide an explicit `lora_module_path` in your config if training fails."
   )
-  max_logging.log(f"Falling back to default lora_module_path: {default_path}")
-  return str(default_path)
+  max_logging.log(f"Falling back to flexible default lora_module_path: {flexible_default}")
+  return flexible_default


 def _build_lora_provider(mt_config: pyconfig.HyperParameters) -> qwix.LoraProvider:
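
The string rewrite above makes a numeric layer index optional inside the LoRA module-path regex, so a single pattern covers both scanned paths (no per-layer index) and unrolled paths (decoder/layers/<n>/...). A quick sketch of how such a pattern behaves; the module path below is illustrative, and re.fullmatch is used only for demonstration, not necessarily how qwix applies the pattern:

import re

module_path = "decoder/layers/mlp/(wi_0|wi_1|wo)"          # hypothetical configured path
flexible_path = module_path.replace("layers/", "layers/(?:[0-9]+/)?")
print(flexible_path)  # decoder/layers/(?:[0-9]+/)?mlp/(wi_0|wi_1|wo)

for candidate in ("decoder/layers/mlp/wi_0",     # scanned: no per-layer index
                  "decoder/layers/0/mlp/wi_0",   # unrolled: explicit layer index
                  "decoder/layers/12/mlp/wo"):
  print(candidate, bool(re.fullmatch(flexible_path, candidate)))  # all True
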
@@ -433,6 +436,7 @@ def _build_lora_provider(mt_config: pyconfig.HyperParameters) -> qwix.LoraProvid
       f"rank={lora_cfg.lora_rank} alpha={lora_cfg.lora_alpha} "
       f"tile_size={lora_cfg.lora_tile_size}"
   )
+
   return qwix.LoraProvider(**lora_kwargs)

@@ -465,7 +469,7 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   matched_module_paths = []
   sample_module_paths = []

-  for path, _ in nnx.iter_modules(lora_model):
+  for path, _ in nnx.iter_graph(lora_model):
     module_path = "/".join(str(p) for p in path)
     if len(sample_module_paths) < 100:
       sample_module_paths.append(module_path)
@@ -486,6 +490,34 @@ def _verify_lora_parameters(lora_model: nnx.Module, mt_config: pyconfig.HyperPar
   )


+def _patch_qwix_for_maxtext(mesh, mt_config):
+  import qwix._src.flax_util as flax_util
+  import qwix._src.providers.ptq as ptq
+  import jax.numpy as jnp
+  from flax import nnx
+
+  # 1. PTQ patch
+  original_get_intercept_map = ptq.PtqProvider.get_intercept_map
+
+  def patched_get_intercept_map(self):
+    mapping = original_get_intercept_map(self)
+
+    def intercept_asarray(a, dtype=None, order=None, **kwargs):
+      if isinstance(a, nnx.State) and 'array' in a:
+        a = a['array']
+      if isinstance(a, nnx.State) and 'qvalue' in a and 'scale' in a:
+        a = ptq.QArray(qvalue=a['qvalue'].value, scale=a['scale'].value)
+
+      if type(a).__name__ in ("WithAux", "QArray"):
+        return a
+      return jnp.asarray(a, dtype=dtype, order=order, **kwargs)
+
+    mapping["jax.numpy.asarray"] = intercept_asarray
+    return mapping
+
+  ptq.PtqProvider.get_intercept_map = patched_get_intercept_map
+
+
 def apply_lora_to_model(
     model: nnx.Module,
     mesh: Optional[jax.sharding.Mesh],
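
_patch_qwix_for_maxtext uses a plain monkey-patch: capture the original method off the class, wrap it, and reinstall the wrapper so every later call goes through the override. A generic sketch of that pattern on a toy class (ToyProvider and its method are stand-ins, not qwix's real API):

class ToyProvider:
  def get_intercept_map(self):
    return {"toy.fn": lambda x: x}

_original_get_intercept_map = ToyProvider.get_intercept_map

def _patched_get_intercept_map(self):
  mapping = _original_get_intercept_map(self)   # start from the original mapping
  mapping["toy.fn"] = lambda x: x + 1           # override a single interception entry
  return mapping

ToyProvider.get_intercept_map = _patched_get_intercept_map
print(ToyProvider().get_intercept_map()["toy.fn"](41))  # 42
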
@@ -501,6 +533,8 @@ def apply_lora_to_model(
   if not getattr(lora_cfg, "enable_lora", False):
     return model

+  _patch_qwix_for_maxtext(mesh, mt_config)
+
   lora_provider = _build_lora_provider(mt_config)

   model_rngs = getattr(model.decoder, "rngs", None)