fix(inference): restore NemotronH mixer.D + e_score_correction_bias after vLLM reload

hallerite · claude · hallerite · commit fe05ce3b29aa · 2026-06-04T19:02:46.000+05:30
vLLM 0.22's layerwise reload mis-loads exactly two NemotronH per-layer param families through the online-reload path -- mixer.D (Mamba SSD skip) and the MoE router's gate.e_score_correction_bias -- while loading all other weights correctly. mixer.D becomes non-deterministic garbage/inf (NaN logits) and the gate bias gets a wrong value (broken routing), so generations go to NaN after a weight update. Restore both from the received broadcast (correct by definition) via each param's own weight_loader. Also drop monkey_patch_vllm_layerwise_reload_alias_buffers: it crashes on vLLM 0.22 (AttributeError on the delattr'd conv_weights) and conv_weights is handled correctly by vLLM's native reload finalize. Supersedes #2701. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/src/prime_rl/inference/patches.py b/src/prime_rl/inference/patches.py
@@ -18,7 +18,6 @@ def transformers_v5_compat():
     _patch_qwen35_lora()
     _patch_lora_key_prefix()
     monkey_patch_deep_gemm_silu_mul_quant_int64()
-    monkey_patch_vllm_layerwise_reload_alias_buffers()
     monkey_patch_vllm_padded_input_scrub()
     monkey_patch_return_routed_experts_with_nixl_connector()
 
@@ -67,38 +66,6 @@ def _post_init(config: VllmConfig):
     logger.warning("Enabled vLLM routed-experts capture with NIXL connector patch.")
 
 
-def monkey_patch_vllm_layerwise_reload_alias_buffers():
-    # vLLM's layerwise reload materializes each buffer as an independent tensor
-    # and then copies it back into the original kernel storage. When a buffer
-    # aliases a parameter (e.g. NemotronH Mamba's mixer.conv_weights, a view of
-    # mixer.conv1d.weight), the buffer copy stamps garbage into the parameter's
-    # storage *after* the parameter has been correctly reloaded. Skip the copy
-    # for any buffer that shares storage with a parameter; _place_kernel_tensors
-    # re-registers the original view, which trivially reflects the parameter.
-    # Remove this patch once https://github.com/vllm-project/vllm/pull/42481 is
-    # included in the vLLM release we pin/use.
-    from vllm.logger import init_logger
-    from vllm.model_executor.model_loader.reload import layerwise as reload_layerwise
-
-    logger = init_logger(__name__)
-
-    def _copy_and_restore_kernel_tensors(layer: torch.nn.Module, info: reload_layerwise.LayerReloadingInfo):
-        assert info.kernel_tensors is not None
-        parameters, buffers = info.kernel_tensors
-        param_storage_ptrs = {p.untyped_storage().data_ptr() for p in layer.parameters(recurse=True)}
-        for name, param in parameters.items():
-            param.data.copy_(getattr(layer, name))
-        for name, buffer in buffers.items():
-            if buffer.untyped_storage().data_ptr() in param_storage_ptrs:
-                continue
-            buffer.data.copy_(getattr(layer, name))
-
-        reload_layerwise._place_kernel_tensors(layer, info)
-
-    reload_layerwise._copy_and_restore_kernel_tensors = _copy_and_restore_kernel_tensors
-    logger.warning("Enabled vLLM layerwise reload alias-buffer patch.")
-
-
 @triton.jit
 def _silu_mul_per_token_group_quant_fp8_colmajor_int64_kernel(
     y_ptr,
diff --git a/src/prime_rl/inference/vllm/worker/nccl.py b/src/prime_rl/inference/vllm/worker/nccl.py
@@ -24,6 +24,45 @@
 
 logger = init_logger("vllm.inference.vllm.worker_nccl")
 
+# NemotronH param-name suffixes vLLM 0.22's online layerwise reload mis-loads (a tuple, for str.endswith).
+_RELOAD_CORRUPTED_SUFFIXES = (".mixer.D", ".e_score_correction_bias")
+
+
+def _restore_reload_corrupted_params(model: Module, received: dict[str, torch.Tensor]) -> None:
+    """Work around a vLLM 0.22 layerwise-reload bug for NemotronH by re-loading the affected params.
+
+    The online reload mis-loads ``mixer.D`` (Mamba SSD skip; becomes garbage/inf) and the MoE router's
+    ``gate.e_score_correction_bias`` (wrong routing), so generations go to NaN after a weight update.
+    The received broadcast is correct, so re-load both from it. Remove once upstream fixes the reload.
+
+    Args:
+        model: Reloaded model; params whose name ends in a corrupted suffix are overwritten in place.
+        received: Broadcast tensors keyed by checkpoint name (matched from ``layers.`` onward).
+    """
+
+    def _layer_key(name: str) -> str:
+        return name[max(name.find("layers."), 0) :]  # find() == -1 -> keep the whole name
+
+    received_by_key = {_layer_key(name): tensor for name, tensor in received.items()}
+    restored, skipped = 0, []
+    for name, param in model.named_parameters():
+        if not name.endswith(_RELOAD_CORRUPTED_SUFFIXES):
+            continue
+        tensor = received_by_key.get(_layer_key(name))
+        weight_loader = getattr(param, "weight_loader", None)
+        if tensor is None or (weight_loader is None and tensor.shape != param.shape):
+            skipped.append(name)  # left as the reload wrote it; reported in the warning below
+            continue
+        tensor = tensor.to(device=param.device)
+        if weight_loader is not None:
+            weight_loader(param, tensor)  # the param's own loader applies the right sharding
+        else:
+            param.data.copy_(tensor.to(param.dtype))
+        restored += 1
+    logger.debug("Restored %d NemotronH params (mixer.D, e_score_correction_bias) after reload", restored)
+    if skipped:
+        logger.warning("Could not restore %d reload-corrupted NemotronH params: %s", len(skipped), skipped)
+
 
 def receive_integer(communicator: PyNcclCommunicator) -> int:
     """Receive an integer from the trainer master rank using NCCL communicator."""
@@ -148,9 +187,20 @@ def update_weights_from_path(self, weight_dir: str) -> None:
             update_mla_absorbed_weights(model)
             return
 
+        # vLLM 0.22's layerwise reload mis-loads NemotronH mixer.D and MoE gate.e_score_correction_bias
+        # (see _restore_reload_corrupted_params). Capture the correct received values to restore after.
+        received_reload_fix: dict[str, torch.Tensor] = {}
+
+        def _capture_reload_fix(weights):
+            for name, tensor in weights:
+                if name.endswith(_RELOAD_CORRUPTED_SUFFIXES):
+                    received_reload_fix[name] = tensor.detach().to("cpu", copy=True)
+                yield name, tensor
+
         load_weights_checkpoint_layerwise(
             model,
-            state_iter,
+            _capture_reload_fix(state_iter),
             self.model_runner.model_config,
             self.vllm_config,
         )
+        _restore_reload_corrupted_params(model, received_reload_fix)