fix: remove __class__ swap in lora_utils to fix training loop crash; delete redundant sharding code in favor of tunix reshard_pytree and drop debug prints

RexBearIU · RexBearIU · commit a2739ab486b7 · 2026-04-21T09:23:08.000Z
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -628,7 +628,6 @@ def apply_lora_to_model(
   model_rngs = getattr(model.decoder, "rngs", None)
   decoder_input_tokens, decoder_positions = _prepare_dummy_inputs(mt_config)
 
-  print(f"[DEBUG] Starting Qwix materialization on model type {type(model)}")
   # Trigger materialization with Python loop fallback
   model.decoder.disable_quant_stats_update = True
   try:
@@ -639,66 +638,23 @@ def apply_lora_to_model(
         decoder_positions=decoder_positions,
         rngs=model_rngs,
     )
-    print(f"[DEBUG] Qwix call complete. Returned model type: {type(lora_model)}")
   finally:
     model.decoder.disable_quant_stats_update = False
 
-  # Important: Qwix dynamically swaps the __class__ of the model, which breaks nnx.iter_graph
-  # We must restore the original unquantized class type for Tunix to recognize the module correctly.
-  if hasattr(lora_model, "_unquantized_type"):
-      lora_model.__class__ = getattr(lora_model, "_unquantized_type")
-
   model = lora_model
 
-  def rank_consistent_spec(spec, shape):
-    if spec is None: return None
-    spec_list = list(spec)
-    if len(shape) < len(spec_list):
-      for axis_name in ["layers", "stage"]:
-        while axis_name in spec_list and len(spec_list) > len(shape):
-          spec_list.remove(axis_name)
-      if len(spec_list) > len(shape):
-        spec_list = spec_list[-len(shape):]
-    elif len(shape) > len(spec_list):
-      spec_list = [None] * (len(shape) - len(spec_list)) + spec_list
-    return jax.sharding.PartitionSpec(*spec_list)
-
   if mesh is not None:
     with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):
       graph_def, state = nnx.split(model)
 
-      def fix_metadata(x):
-        if hasattr(x, "get_metadata") and hasattr(x, "replace"):
-          metadata = x.get_metadata()
-          sharding_spec = metadata.get("sharding") or metadata.get("out_sharding")
-          if sharding_spec:
-            new_spec = rank_consistent_spec(sharding_spec, x.value.shape)
-            x = x.replace(sharding=new_spec, out_sharding=new_spec)
-            try:
-              from maxtext.utils import sharding as mt_sharding
-              physical_sharding = mt_sharding.create_sharding(mesh, new_spec)
-              x.value = jax.device_put(x.value, physical_sharding)
-            except Exception: pass
-        return x
-
-      state = jax.tree.map(fix_metadata, state)
-      
-      def force_sharding_on_device(x):
-        if hasattr(x, "get_metadata") and hasattr(x, "value"):
-          metadata = x.get_metadata()
-          spec = metadata.get("sharding") or metadata.get("out_sharding")
-          if spec:
-            try:
-                from maxtext.utils import sharding as mt_sharding
-                # Force rank-consistent physical sharding
-                physical_sharding = mt_sharding.create_sharding(mesh, spec)
-                x.value = jax.device_put(x.value, physical_sharding)
-            except Exception: pass
-        return x
-
-      is_nnx_leaf = lambda x: hasattr(x, "get_metadata") and hasattr(x, "replace")
-      state = jax.tree.map(force_sharding_on_device, state, is_leaf=is_nnx_leaf)
+      default_memory_kind = jax.devices()[0].default_memory().kind
+      dst_shardings = jax.tree.map(
+          lambda x: jax.sharding.NamedSharding(mesh, x, memory_kind=default_memory_kind) if x is not None else None,
+          nnx.get_partition_spec(state),
+      )
       
+      from tunix.rl import reshard  # pylint: disable=import-outside-toplevel
+      state = reshard.reshard_pytree(state, dst_shardings)
       model = nnx.merge(graph_def, state)
 
   _verify_lora_parameters(model, mt_config)