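feat: simplify PartitionSpec handling in update_sharding patch

qwix.patch (qwix/_src/flax_util.py):
- update_sharding no longer copies the spec into a separate `spec_list`;
  it keeps the upstream list/tuple logic and only re-wraps the result in
  jax.sharding.PartitionSpec when the input was a PartitionSpec.
- update_boxed calls update_sharding on a single line and once again
  always calls set_metadata (the "only when sharding changed" guard is
  dropped).

qwix.patch (qwix/_src/providers/lora.py):
- Drop the dedicated 2-D `...a,ab->...b` LoRA branch from the patch.

src/maxtext/utils/lora_utils.py:
- Use jax.tree_util.tree_map instead of jax.tree.map, remove inline
  comments, and flatten the trailing `else` in
  _apply_layers_sequentially_with_qwix.
- NOTE: _ensure_scan_leading_axis now also broadcasts leaves that have no
  `.shape` attribute; previously such leaves were returned unchanged.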

RexBearIU · RexBearIU · commit 6798fcd36de0 · 2026-03-30T09:27:15.000Z
diff --git a/qwix.patch b/qwix.patch
@@ -1,5 +1,5 @@
 diff --git a/qwix/_src/flax_util.py b/qwix/_src/flax_util.py
-index 4ea5d80..7ae95d3 100644
+index 4ea5d80..17bb94b 100644
 --- a/qwix/_src/flax_util.py
 +++ b/qwix/_src/flax_util.py
 @@ -308,13 +308,13 @@ def unbox(maybe_boxed: Any) -> Any:
@@ -18,72 +18,54 @@ index 4ea5d80..7ae95d3 100644
    """Derives the partition spec from an existing spec.
  
    Args:
-@@ -330,21 +330,30 @@ def update_sharding(
+@@ -330,6 +330,8 @@ def update_sharding(
      The updated partition spec.
    """
    assert bool(split) + bool(merge) + bool(transpose) <= 1
 +  is_pspec = isinstance(spec, jax.sharding.PartitionSpec)
-+  spec_list = list(spec)
 +
    if split:
--    spec = [(a, None) if i in split else (a,) for i, a in enumerate(spec)]
--    spec = sum(spec, ())  # flatten the list of tuples.
-+    spec_list = [
-+        (a, None) if i in split else (a,) for i, a in enumerate(spec_list)
-+    ]
-+    spec_list = list(sum(spec_list, ()))  # flatten the list of tuples.
-   elif merge:
-     for i in merge:
--      spec = spec[: i + 1] + spec[i + 2 :]  # pytype: disable=unsupported-operands
-+      spec_list = spec_list[: i + 1] + spec_list[i + 2 :]
-   elif transpose:
--    spec = tuple(spec[i] if i is not None else None for i in transpose)
-+    spec_list = [
-+        spec_list[i] if i is not None else None for i in transpose
-+    ]
- 
-   if shape:
--    assert len(shape) == len(spec), f'{shape=} {spec=}'
-+    assert len(shape) == len(spec_list), f'{shape=} {spec_list=}'
+     spec = [(a, None) if i in split else (a,) for i, a in enumerate(spec)]
+     spec = sum(spec, ())  # flatten the list of tuples.
+@@ -344,6 +346,9 @@ def update_sharding(
      # For scales: remove sharding for dimensions of size 1.
--    spec = tuple(None if d == 1 else a for a, d in zip(spec, shape))
-+    spec_list = [None if d == 1 else a for a, d in zip(spec_list, shape)]
+     spec = tuple(None if d == 1 else a for a, d in zip(spec, shape))
  
--  return spec
 +  if is_pspec:
-+    return jax.sharding.PartitionSpec(*spec_list)
-+  return tuple(spec_list)
++    return jax.sharding.PartitionSpec(*spec)
++  
+   return spec
  
  
- def update_boxed(
-@@ -380,7 +389,7 @@ def update_boxed(
+@@ -380,10 +385,8 @@ def update_boxed(
      shape = boxed.unbox().shape
      for possible_field in ('names', 'mesh_axes', 'axes_types'):
        axes = getattr(boxed, possible_field, None)
 -      if isinstance(axes, (list, tuple)):
+-        axes = update_sharding(
+-            axes, shape=shape, split=split, merge=merge, transpose=transpose
+-        )
 +      if isinstance(axes, (list, tuple, jax.sharding.PartitionSpec)):
-         axes = update_sharding(
-             axes, shape=shape, split=split, merge=merge, transpose=transpose
-         )
-@@ -396,11 +405,13 @@ def update_boxed(
++        axes = update_sharding(axes, shape=shape, split=split, merge=merge, transpose=transpose)
+         boxed = dataclasses.replace(boxed, **{possible_field: axes})
+   elif isinstance(boxed, nnx.Variable):
+     if value is not None:
+@@ -396,10 +399,9 @@ def update_boxed(
      else:
        sharding_key = 'sharding_names'
      axes = metadata.get(sharding_key, None)
 -    if isinstance(axes, (list, tuple)):
 -      axes = update_sharding(
+-          axes, shape=shape, split=split, merge=merge, transpose=transpose
+-      )
++    
 +    if isinstance(axes, (list, tuple, jax.sharding.PartitionSpec)):
-+      updated_axes = update_sharding(
-           axes, shape=shape, split=split, merge=merge, transpose=transpose
-       )
--      boxed.set_metadata(sharding_key, axes)
-+      # Avoid mutating metadata unless sharding actually changed.
-+      if axes != updated_axes:
-+        boxed.set_metadata(sharding_key, updated_axes)
++      axes = update_sharding(axes, shape=shape, split=split, merge=merge, transpose=transpose)
+       boxed.set_metadata(sharding_key, axes)
    elif isinstance(boxed, jax.Array):  # not boxed.
      if value is not None:
-       boxed = value
 diff --git a/qwix/_src/providers/lora.py b/qwix/_src/providers/lora.py
-index e98c833..07be623 100644
+index e98c833..39ce3ef 100644
 --- a/qwix/_src/providers/lora.py
 +++ b/qwix/_src/providers/lora.py
 @@ -13,6 +13,7 @@
@@ -94,7 +76,7 @@ index e98c833..07be623 100644
  import string
  from typing import Any, Callable, Collection, Sequence
  import warnings
-@@ -189,29 +190,87 @@ class LoraProvider(ptq.PtqProvider):
+@@ -189,29 +190,68 @@ class LoraProvider(ptq.PtqProvider):
      if weight_name is None:  # rhs is not a weight.
        return res
  
@@ -107,25 +89,6 @@ index e98c833..07be623 100644
 +    (contract_lhs, contract_rhs) = dimension_numbers[0]
 +    (batch_lhs, batch_rhs) = dimension_numbers[1]
 +
-+    if len(rhs.shape) == 2 and not batch_rhs:
-+      # Standard LoRA path for ...a,ab->...b
-+      lora_a, lora_b = _get_or_create_lora_params(
-+          name=weight_name,
-+          rule=rule,
-+          a_shape=(rhs.shape[0], rule.rank),
-+          b_shape=(rule.rank, rhs.shape[1]),
-+          a_sharding_transpose=(0, None),
-+          b_sharding_transpose=(None, 1),
-+      )
-+
-+      if rule.dropout > 0:
-+        lhs = nnx.Dropout(rule.dropout, deterministic=False)(
-+            lhs, rngs=flax_util.make_rng('dropout')
-+        )
-+
-+      return res + lhs @ lora_a @ lora_b * (rule.alpha / rule.rank)
-+
-+    # General LoRA path for N-D kernels and batch dimensions.
 +    # Identify contracting, batch, and out axes for rhs.
 +    contract_rhs = tuple(contract_rhs)
 +    batch_rhs = tuple(batch_rhs)
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -418,77 +418,62 @@ def _patch_nnx_decoder_apply_layers_sequentially(model: nnx.Module) -> None:
   """Patches the NNX decoder's _apply_layers_sequentially to include Qwix specific logic."""
 
   def _apply_layers_sequentially_with_qwix(self, layers, x_in, *args, length: int, **kwargs):
-    """Runs the layer stack using nnx.scan with Qwix specific graph init and VJP downcasting."""
+    """Runs the layer stack using nnx.scan with Qwix specific graph init."""
     policy = self.get_remat_policy()
     prevent_cse = maxtext_utils.should_prevent_cse_in_remat(self.config)
-    graphdef, params, state = nnx.split(
-        layers, nnx.Param, ...
-    )  # state: the mutable state we carry (KV cache, RNGs, etc.)
+    graphdef, params, state = nnx.split(layers, nnx.Param, ...)
 
     scan_axis = self.config.param_scan_axis
     if scan_axis != 0:
-      # Move scan_axis to 0 so scan can iterate over it
-      params = jax.tree.map(lambda x: jnp.moveaxis(x, scan_axis, 0), params)
+      params = jax.tree_util.tree_map(lambda x: jnp.moveaxis(x, scan_axis, 0), params)
 
-    layer_cls = layers.__class__
-    sig = inspect.signature(layer_cls.__call__)
+    sig = inspect.signature(layers.__class__.__call__)
     valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
-    # During Qwix init (disable_quant_stats_update=True), params may be lazily
-    # created and the layer graphdef can grow. Allow graphdef refresh in that
-    # phase only. Keep scanned training path static for remat purity.
+    
     dynamic_graph_init = bool(getattr(self, "disable_quant_stats_update", False))
     updated_graphdef = [graphdef]
 
     def layer_fn(carry, scanned_vars):
-      # Unpack the sliced variables for THIS layer
       current_params, current_state = scanned_vars
 
       if self.config.parameter_memory_host_offload:
-        current_params = jax.tree.map(lambda x: jax.device_put(x, max_utils.device_space()), current_params)
+        current_params = jax.tree_util.tree_map(lambda x: jax.device_put(x, max_utils.device_space()), current_params)
 
-      # Merge using the SLICED state
       layer = nnx.merge(graphdef, current_params, current_state)
-
-      # Run the layer (Filter kwargs if using the solution from previous turn)
       layer_out = layer(carry, *args, **valid_kwargs)
-
       new_carry = layer_out[0] if isinstance(layer_out, tuple) else layer_out
 
-      # Qwix init: return updated params so graphdef can grow.
-      # In normal training, keep params unchanged to avoid extra memory use.
       new_graphdef, updated_params, updated_state = nnx.split(layer, nnx.Param, ...)
+      
       if dynamic_graph_init:
         updated_graphdef[0] = new_graphdef
         returned_params = updated_params
       else:
         returned_params = current_params
+        
       return new_carry, (returned_params, updated_state)
 
     layer_fn_wrapped = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
 
     def _ensure_scan_leading_axis(x):
-      # Promote scalars for scan axis compatibility.
-      if not hasattr(x, "shape"):
-        return x
-      if len(x.shape) == 0:
+      if not hasattr(x, "shape") or len(x.shape) == 0:
         return jnp.broadcast_to(x, (length,))
       return x
 
-    params = jax.tree.map(_ensure_scan_leading_axis, params)
-    state = jax.tree.map(_ensure_scan_leading_axis, state)
+    params = jax.tree_util.tree_map(_ensure_scan_leading_axis, params)
+    state = jax.tree_util.tree_map(_ensure_scan_leading_axis, state)
 
     final_carry, (scanned_params, scanned_other) = jax.lax.scan(layer_fn_wrapped, x_in, (params, state))
 
     if scan_axis != 0:
-      scanned_params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
+      scanned_params = jax.tree_util.tree_map(lambda x: jnp.moveaxis(x, 0, scan_axis), scanned_params)
 
     if dynamic_graph_init:
       return final_carry, nnx.merge(updated_graphdef[0], scanned_params, scanned_other)
-    else:
-      nnx.update(layers, nnx.State.merge(scanned_params, scanned_other))
-      return final_carry, layers
+      
+    nnx.update(layers, nnx.State.merge(scanned_params, scanned_other))
+    return final_carry, layers
 
-  # IMPORTANT: Patch the class so nnx.merge doesn't lose the patch
   model.decoder.__class__._apply_layers_sequentially = _apply_layers_sequentially_with_qwix  # pylint: disable=protected-access