[TRTLLM-13248][feat] Wave 4 add MX staged receiver cutover

chienchunhung · chienchunhung · commit 431b816f123d · 2026-06-18T21:30:27.000-04:00
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py b/tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Any
 
@@ -69,6 +72,17 @@ def is_weights_preloaded(self) -> bool:
         """Whether the last load wrote weights directly into the model."""
         return False
 
+    def is_post_transform_weights_preloaded(self) -> bool:
+        """Whether the last direct preload delivered post-transform weights.
+
+        Narrower than :meth:`is_weights_preloaded`: a loader may write bytes
+        directly into the model while they are still in the raw checkpoint
+        layout. Overrides return ``True`` only when the source identity was
+        verified and the bytes can safely skip module ``transform_weights()``
+        hooks. This base implementation always returns ``False``.
+        """
+        return False
+
     def post_load_apply(self,
                         model: nn.Module,
                         *,
diff --git a/tensorrt_llm/_torch/models/checkpoints/mx/checkpoint_loader.py b/tensorrt_llm/_torch/models/checkpoints/mx/checkpoint_loader.py
@@ -129,6 +129,8 @@ def __init__(
         self._model_name = str(model_name) if model_name is not None else None
         self._query_timeout_s = query_timeout_s
         self._p2p_succeeded = False
+        self._post_transform_weights_preloaded = False
+        self._source_identity_compatible_for_last_load = False
         # Receiver's local SourceIdentity, supplied per load_weights() call by
         # ModelLoader; the authority for the pre-transfer compatibility gate.
         self._local_source_identity: Optional[SourceIdentity] = None
@@ -184,6 +186,21 @@ def is_weights_preloaded(self) -> bool:
         """
         return self._p2p_succeeded
 
+    def is_post_transform_weights_preloaded(self) -> bool:
+        """Whether the last successful MX preload delivered transformed bytes.
+
+        True only when the P2P transfer succeeded, the source was flagged as
+        post-transform, and the source identity gate passed for that load: no
+        identity match, no transform skip. Wave 4 wires the receiver-side staged
+        hook path, but the MX publisher still emits raw pre-transform bytes, so
+        this stays false until Wave 5 wires explicit post-transform metadata.
+        """
+        return (
+            self._p2p_succeeded
+            and self._post_transform_weights_preloaded
+            and self._source_identity_compatible_for_last_load
+        )
+
     def load_weights(self, checkpoint_dir: str, mapping: Mapping, **kwargs) -> dict[str, Any]:
         """Load weights, preferring MX P2P transfer when available.
 
@@ -207,6 +224,8 @@ def load_weights(self, checkpoint_dir: str, mapping: Mapping, **kwargs) -> dict[
         # Popped here so it never leaks into the disk-fallback signature.
         self._local_source_identity = kwargs.pop("source_identity", None)
         self._p2p_succeeded = False
+        self._post_transform_weights_preloaded = False
+        self._source_identity_compatible_for_last_load = False
 
         if self._mx_server_url is None or model is None:
             return self._fallback_to_disk(
@@ -237,14 +256,21 @@ def load_weights(self, checkpoint_dir: str, mapping: Mapping, **kwargs) -> dict[
 
         # Pre-transfer compatibility gate: on mismatch, skip the transfer
         # before any RDMA work starts and fall back to disk.
-        if not self._source_identity_compatible(checkpoint_dir, MxClient, _build_trtllm_identity):
+        self._source_identity_compatible_for_last_load = self._source_identity_compatible(
+            checkpoint_dir, MxClient, _build_trtllm_identity
+        )
+        if not self._source_identity_compatible_for_last_load:
             return self._fallback_to_disk(
                 checkpoint_dir,
                 mapping,
                 reason="source SourceIdentity incompatible with receiver",
                 **kwargs,
             )
 
+        self._post_transform_weights_preloaded = self._source_metadata_is_post_transform(
+            checkpoint_dir, MxClient, _build_trtllm_identity
+        )
+
         timeout_override = self._resolve_query_timeout_override(
             checkpoint_dir,
             MxClient,
@@ -271,6 +297,24 @@ def load_weights(self, checkpoint_dir: str, mapping: Mapping, **kwargs) -> dict[
             fallback_bytes = sum(
                 tensor.numel() * tensor.element_size() for tensor in fallback_weights.values()
             )
+            if self._post_transform_weights_preloaded:
+                self._post_transform_weights_preloaded = False
+                self._source_identity_compatible_for_last_load = False
+                logger.warning(
+                    "MX P2P returned %d fallback weights (%.2f MiB, size mismatch) "
+                    "from a post-transform source at %s. Falling back to a full "
+                    "disk load to avoid mixing transformed P2P tensors with raw "
+                    "fallback tensors before the full post-load transform path.",
+                    len(fallback_weights),
+                    fallback_bytes / (1 << 20),
+                    self._mx_server_url,
+                )
+                return self._fallback_to_disk(
+                    checkpoint_dir,
+                    mapping,
+                    reason="post-transform source returned partial fallback weights",
+                    **kwargs,
+                )
             # Mixed-success case: MX delivered matched tensors into model
             # params via P2P and returned only size-mismatched tensors for
             # the standard disk path to apply. Keep the P2P transfer and
@@ -285,6 +329,7 @@ def load_weights(self, checkpoint_dir: str, mapping: Mapping, **kwargs) -> dict[
                 self._mx_server_url,
             )
             self._p2p_succeeded = True
+            self._post_transform_weights_preloaded = False
             return fallback_weights
 
         self._p2p_succeeded = True
@@ -386,6 +431,18 @@ def _fetch_source_identity(
         # exposes a field for it. This is the single seam the gate depends on.
         return None
 
+    def _source_metadata_is_post_transform(
+        self, _checkpoint_dir: str, _mx_client_type: Type[Any], _build_identity: Callable[..., Any]
+    ) -> bool:
+        """Whether the selected MX source publishes post-transform bytes.
+
+        Wave 4 keeps production behavior dormant: Modelexpress metadata does not yet expose
+        raw-vs-transformed layout state, and the TRT-LLM MX publisher still publishes before
+        module transforms, so this ignores its arguments and always returns ``False``. Wave 5
+        should wire this seam to explicit source metadata when flipping the publisher.
+        """
+        return False
+
     def _resolve_publish_name(self, checkpoint_dir: Optional[str]) -> str:
         return _resolve_mx_model_name(self._model_name, checkpoint_dir)
 
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import inspect
 import os
@@ -264,6 +267,8 @@ class ModelLoader:
     Handles the loading, configuration, and weight initialization of a PyTorch model.
     This class isolates model loading logic from the main execution engine.
     """
+    _MX_STAGED_RECEIVER_TRANSFORM_PROTOCOL_VERSION = 1
+    _MX_STAGED_RECEIVER_ALLOWLIST = frozenset()
 
     def __init__(self,
                  llm_args: TorchLlmArgs,
@@ -782,12 +787,20 @@ def init_meta_tensor_in_pool(t: torch.Tensor):
             if not gms_post_load_handled:
                 checkpoint_loader.post_load_apply(
                     model, weights_preloaded=weights_preloaded)
+                mx_staged_receiver_path = self._should_run_mx_staged_receiver_path(
+                    checkpoint_loader,
+                    model,
+                    weights_preloaded=weights_preloaded)
+                if mx_staged_receiver_path:
+                    self._setup_aliases(model)
+                    self._mark_weights_transformed(model)
+                    self._walk_cache_state(model)
                 checkpoint_loader.post_load_publish(
                     model,
                     checkpoint_dir=checkpoint_dir,
                     weights_preloaded=weights_preloaded)
-
-                self._walk_full_post_load(model)
+                if not mx_staged_receiver_path:
+                    self._walk_full_post_load(model)
 
             # TODO(GMS-MOE-LB): when the (MoE, GMS) combination is enabled,
             # `register_weight_slots_after_to_cuda` and `finalize_model`
@@ -830,6 +843,67 @@ def _check_gms_source_identity(self, gms_backend) -> None:
             IdentityCheckPolicy.STRICT,
         )
 
+    @classmethod
+    def _should_run_mx_staged_receiver_path(
+            cls, checkpoint_loader: BaseCheckpointLoader,
+            model: DecoderModelForCausalLM, *, weights_preloaded: bool) -> bool:
+        """Whether an MX receiver can skip one-shot weight transforms.
+
+        True only for an ``MX`` loader with preloaded weights that reports
+        post-transform bytes and whose (model type, protocol version) key is
+        allow-listed. Dormant in Wave 4 production (empty allow-list, raw MX
+        bytes); tests can opt in by patching the allow-list and loader signal.
+        """
+        if checkpoint_loader.checkpoint_format != "MX" or not weights_preloaded:
+            return False
+
+        method = getattr(type(checkpoint_loader),
+                         'is_post_transform_weights_preloaded', None)
+        if method is None or not checkpoint_loader.is_post_transform_weights_preloaded(
+        ):
+            return False
+
+        allowlist_key = (
+            type(model),
+            cls._MX_STAGED_RECEIVER_TRANSFORM_PROTOCOL_VERSION,
+        )
+        if allowlist_key in cls._MX_STAGED_RECEIVER_ALLOWLIST:
+            logger.info(
+                "MX receiver using staged post-load path for %s "
+                "(transform protocol v%d).",
+                type(model).__name__,
+                cls._MX_STAGED_RECEIVER_TRANSFORM_PROTOCOL_VERSION,
+            )
+            return True
+
+        # WAVE 5 NOTE: once MX can publish real post-transform bytes, this
+        # fallthrough must not run the full post_load_weights() path on those
+        # bytes for a non-allow-listed model. Wave 5 should either fail/fallback
+        # before accepting the transfer or allow-list the model after validation.
+        logger.info(
+            "MX receiver got post-transform weights for %s, but the model is "
+            "not allow-listed for staged post-load transform protocol v%d. "
+            "Running the full post-load path.",
+            type(model).__name__,
+            cls._MX_STAGED_RECEIVER_TRANSFORM_PROTOCOL_VERSION,
+        )
+        return False
+
+    @staticmethod
+    def _mark_weights_transformed(model: DecoderModelForCausalLM) -> None:
+        """Mark modules with transform guards as already transformed.
+
+        Post-transform sharing paths skip ``transform_weights()`` because the
+        incoming bytes already use the final runtime layout. Record that state
+        on each module that defines ``_weights_transformed`` so later code does
+        not treat it as raw checkpoint bytes; modules with a truthy
+        ``_weights_removed`` flag are left untouched.
+        """
+        for module in model.modules():
+            if hasattr(module, '_weights_transformed') and not getattr(
+                    module, '_weights_removed', False):
+                module._weights_transformed = True
+
     @staticmethod
     def _setup_aliases(model: DecoderModelForCausalLM) -> None:
         """Run structural alias setup on eligible modules.
diff --git a/tests/unittest/_torch/models/checkpoints/mx/test_mx_checkpoint_loader.py b/tests/unittest/_torch/models/checkpoints/mx/test_mx_checkpoint_loader.py
@@ -44,11 +44,13 @@ def test_no_args_constructs(self):
         loader = MXCheckpointLoader()
         assert loader.mx_server_url is None
         assert loader.is_weights_preloaded() is False
+        assert loader.is_post_transform_weights_preloaded() is False
 
     def test_mx_server_url_stored(self):
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
         assert loader.mx_server_url == "http://mx:8001"
         assert loader.is_weights_preloaded() is False
+        assert loader.is_post_transform_weights_preloaded() is False
 
     def test_query_timeout_stored(self):
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001", query_timeout_s=900)
@@ -75,6 +77,17 @@ def test_checkpoint_format_backing_attr(self):
     def test_is_weights_preloaded_initial(self):
         loader = MXCheckpointLoader()
         assert loader.is_weights_preloaded() is False
+        assert loader.is_post_transform_weights_preloaded() is False
+
+    def test_post_transform_signal_requires_p2p_and_identity_match(self):
+        loader = MXCheckpointLoader()
+        loader._p2p_succeeded = True
+        loader._post_transform_weights_preloaded = True
+        loader._source_identity_compatible_for_last_load = False  # identity gate not passed
+        assert loader.is_post_transform_weights_preloaded() is False
+
+        loader._source_identity_compatible_for_last_load = True  # all three flags now set
+        assert loader.is_post_transform_weights_preloaded() is True
 
 
 # ---------------------------------------------------------------------------
@@ -145,9 +158,11 @@ def _modelexpress_unavailable(stack):
 
     @staticmethod
     def _upstream_raises(stack):
+        loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(load_weights_side_effect=RuntimeError("boom"))
         stack.enter_context(_install_fake_modelexpress(fake_mx))
-        return (MXCheckpointLoader(mx_server_url="http://mx:8001"), {"model": MagicMock()})
+        return (loader, {"model": MagicMock()})
 
     @pytest.mark.parametrize(
         "trigger_id, setup",
@@ -196,6 +211,7 @@ def test_p2p_full_success_returns_empty_dict(self):
         # ``is_weights_preloaded()`` signal as "skip the standard
         # weight-mapping pipeline".
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(load_weights_return={})
         mapping = MagicMock(name="mapping")
         model = MagicMock(name="model")
@@ -205,6 +221,7 @@ def test_p2p_full_success_returns_empty_dict(self):
 
         assert result == {}
         assert loader.is_weights_preloaded() is True
+        assert loader.is_post_transform_weights_preloaded() is False
 
         # Verify the integration contract with the upstream library:
         # 1. Constructed MxLiveWeightLoader with our mx_server_url.
@@ -222,6 +239,7 @@ def test_mixed_success_returns_fallback_weights(self):
         # tensors), keep the P2P transfer and let ModelLoader merge these
         # tensors through the standard disk pipeline.
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fallback = {"some.weight": MagicMock()}
         fake_mx = _build_fake_modelexpress(load_weights_return=fallback)
 
@@ -232,9 +250,36 @@ def test_mixed_success_returns_fallback_weights(self):
             result = loader.load_weights("/nonexistent", mapping=MagicMock(), model=MagicMock())
 
         assert loader.is_weights_preloaded() is True
+        assert loader.is_post_transform_weights_preloaded() is False
         assert result is fallback
         mock_super_load.assert_not_called()
 
+    def test_post_transform_mixed_success_falls_back_to_full_disk_load(self):
+        # Wave 5 will let MX advertise post-transform sources. If such a
+        # source only partially succeeds, merging raw fallback tensors would
+        # force ModelLoader onto the full post-load path and double-transform
+        # the P2P subset. Lock the safer behavior now: abandon the partial
+        # post-transform transfer and return a full disk load instead.
+        loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
+        loader._source_metadata_is_post_transform = MagicMock(return_value=True)
+        fallback = {"some.weight": MagicMock(numel=lambda: 1, element_size=lambda: 4)}
+        disk_weights = {"disk.weight": MagicMock()}
+        fake_mx = _build_fake_modelexpress(load_weights_return=fallback)
+
+        with (
+            _install_fake_modelexpress(fake_mx),
+            patch.object(
+                HfCheckpointLoader, "load_weights", return_value=disk_weights
+            ) as mock_super_load,
+        ):
+            result = loader.load_weights("/nonexistent", mapping=MagicMock(), model=MagicMock())
+
+        assert result is disk_weights  # caller receives the disk-loaded weights
+        assert loader.is_weights_preloaded() is False  # partial P2P is not reported as preloaded
+        assert loader.is_post_transform_weights_preloaded() is False
+        mock_super_load.assert_called_once()  # HfCheckpointLoader.load_weights ran exactly once
+
 
 # ---------------------------------------------------------------------------
 # publish_as_source — env-var dance and graceful no-op
@@ -428,6 +473,7 @@ def _assert_timeout(*args, **kwargs):
             return {}
 
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(load_weights_side_effect=_assert_timeout)
         with _install_fake_modelexpress(fake_mx):
             loader.load_weights("/nonexistent", mapping=MagicMock(), model=MagicMock())
@@ -439,6 +485,7 @@ def _assert_no_timeout(*args, **kwargs):
             return {}
 
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(
             load_weights_side_effect=_assert_no_timeout,
             source_instances=[MagicMock()],
@@ -457,6 +504,7 @@ def _assert_env_timeout(*args, **kwargs):
             return {}
 
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001")
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(load_weights_side_effect=_assert_env_timeout)
         with _install_fake_modelexpress(fake_mx):
             loader.load_weights("/nonexistent", mapping=MagicMock(), model=MagicMock())
@@ -468,6 +516,7 @@ def _assert_config_timeout(*args, **kwargs):
             return {}
 
         loader = MXCheckpointLoader(mx_server_url="http://mx:8001", query_timeout_s=900)
+        loader._source_identity_compatible = MagicMock(return_value=True)
         fake_mx = _build_fake_modelexpress(load_weights_side_effect=_assert_config_timeout)
         with _install_fake_modelexpress(fake_mx):
             loader.load_weights("/nonexistent", mapping=MagicMock(), model=MagicMock())
diff --git a/tests/unittest/_torch/pyexecutor/test_model_loader_mx.py b/tests/unittest/_torch/pyexecutor/test_model_loader_mx.py