[TRTLLM-13248][feat] Wave 3 migrate MoE staged hooks

chienchunhung · chienchunhung · commit b2646d380fbe · 2026-06-17T20:36:08.000-04:00
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Dense Sparse Attention (DSA) backend for TRT-LLM with indexer-based TopK selection."""
 import math
 import threading
@@ -1504,13 +1518,16 @@ def __init__(self,
             # attribute queries do not end up frozen into a captured graph.
             warmup_heuristic_topk_decode(top_k=self.index_topk)
 
-    def post_load_weights(self):
+    def cache_derived_state(self):
         """Fuse wk + weights_proj into single FP32 weight for F.linear GEMM under allow_tf32 (TF32 tensor cores on Ampere+)."""
         # wk: [head_dim, hidden_size] + weights_proj: [n_heads, hidden_size]
         # → fused: [head_dim + n_heads, hidden_size]
         self._fused_wk_wp_weight = torch.cat(
             [self.wk.weight.data, self.weights_proj.weight.data], dim=0)
 
+    def post_load_weights(self):
+        self.cache_derived_state()
+
     @staticmethod
     def prepare_one_prefill_chunk(
         metadata: DSAtrtllmAttentionMetadata,
@@ -2404,7 +2421,7 @@ def pre_indexer_proj(
         split in MLA.forward_dsa_proj sees a stable signature.
         """
         assert self._fused_wk_wp_weight is not None, \
-            "post_load_weights() must be called before forward()"
+            "cache_derived_state() must be called before forward()"
         hidden_float = _to_float(hidden_states)
         with _tf32_matmul_enabled():
             # F.linear computes input @ weight.T internally; no explicit .t() needed.
diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from collections.abc import Callable
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -308,7 +323,7 @@ def __init__(self,
 
     # After loading both gate_up_proj and down_proj, we need to set the scales needed by the special kernels and by
     # the trtllm-gen gemm+swiglu kernel.
-    def post_load_weights(self):
+    def cache_derived_state(self):
         if self.gate_up_proj.has_fp8_qdq:
             # For the special gemm+swiglu kernel, we need to set the inverse of the output scale, which is the inverse
             # of down_proj's combined input scale.
@@ -317,6 +332,9 @@ def post_load_weights(self):
             # combined input scale times inv_output_scale.
             self.gate_up_proj.trtllm_gen_global_scale = self.gate_up_proj.combined_scale * self.gate_up_proj.inv_output_scale
 
+    def post_load_weights(self):
+        self.cache_derived_state()
+
     def forward(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
@@ -566,7 +584,7 @@ def __init__(
             dtype=model_config.pretrained_config.torch_dtype,
             quant_config=None)
 
-    def post_load_weights(self):
+    def cache_derived_state(self):
         # Set min-latency quant scales for routed experts if we plan to use min-latency MoE kernels.
         # This is because the routed experts' input scale is after the score multiplication, so we must use the
         # pre-score scaling input scale, which happens to be shared expert's input scale.
@@ -582,6 +600,9 @@ def post_load_weights(self):
                 fc1_input_dequant=pre_score_scaling_input_scale,
             )
 
+    def post_load_weights(self):
+        self.cache_derived_state()
+
     def compute_routed_output(
             self,
             hidden_states,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -652,15 +652,36 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)
         )
         return self.backend.load_weights(weights, allow_partial_loading)
 
-    def post_load_weights(self):
+    def transform_weights(self):
+        """
+        Transform weights - delegated to backend
+
+        """
+        if getattr(self, "_weights_transformed", False):
+            return
+        assert hasattr(self.backend, "transform_weights"), (
+            f"Backend {self.backend.__class__.__name__} must implement transform_weights()"
+        )
+        self.backend.transform_weights()
+        self._weights_transformed = True
+
+    def cache_derived_state(self):
         """
-        Post load weights processing - delegated to backend
+        Cache derived state - delegated to backend
 
         """
-        assert hasattr(self.backend, "post_load_weights"), (
-            f"Backend {self.backend.__class__.__name__} must implement post_load_weights()"
+        assert hasattr(self.backend, "cache_derived_state"), (
+            f"Backend {self.backend.__class__.__name__} must implement cache_derived_state()"
         )
-        return self.backend.post_load_weights()
+        return self.backend.cache_derived_state()
+
+    def post_load_weights(self):
+        """
+        Backward-compatible staged post-load processing - delegated to backend
+
+        """
+        self.transform_weights()
+        self.cache_derived_state()
 
     def process_weights_after_loading(self):
         """
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl_b12x.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl_b12x.py
@@ -68,7 +68,7 @@ class CuteDslB12xFusedMoE(CuteDslFusedMoE):
     ``_get_quant_method``). The inherited CUTLASS NVFP4 layout is finalised
     by the base class, and the b12x-shaped tensors (un-normalised FP8 SF,
     ``convert_sf_to_mma_layout`` reshape, ``B12xMoEWrapper`` instance) are
-    materialised on top by the quant method's ``post_load_weights``. Both
+    materialised on top by the quant method's ``transform_weights``. Both
     layouts coexist in memory and the dispatcher picks per call based on
     ``x.shape[0]``.
 
@@ -173,7 +173,7 @@ def _route_to_cutlass(self, x) -> bool:
         return isinstance(x, torch.Tensor) and x.shape[0] >= self._PREFILL_VIA_CUTLASS_THRESHOLD
 
     # ``post_load_weights`` is inherited from ``CutlassFusedMoE`` and
-    # dispatches to ``self.quant_method.post_load_weights(self)`` — for this
+    # dispatches to ``self.quant_method.transform_weights(self)`` — for this
     # backend ``self.quant_method`` is ``NVFP4CuteDslB12xFusedMoEMethod``
     # (see ``_get_quant_method`` override), which performs the SF un-normalization,
     # ``convert_sf_to_mma_layout`` reshape, ``B12xMoEWrapper`` instantiation,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 import os
 from functools import cached_property
@@ -1577,4 +1592,5 @@ def load_weights(self,
                                        **kargs)
 
     def post_load_weights(self):
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_densegemm.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_densegemm.py
@@ -291,7 +291,8 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)
                 )
 
     def post_load_weights(self):
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
 
     def _transform_w2_weight_scale_for_min_latency(self):
         """Transform w2_weight_scale for minimum latency path optimization."""
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
@@ -1368,7 +1368,7 @@ def _maybe_remove_padding(gemm_output, expected_size):
 
         return gemm2_output
 
-    def post_load_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module):
         if 'w3_w1_weight' in module._parameters:
             w31_scale = shuffle_weight_for_activation_kernel(
                 module.fc31_dequant.data)
@@ -1382,7 +1382,7 @@ def post_load_weights(self, module: torch.nn.Module):
                 module.fc31_input_dequant = None
                 module.fc2_input_dequant = None
 
-        super().post_load_weights(module)
+        super().transform_weights(module)
 
 
 class TritonFusedMoE(MoE):
@@ -1586,4 +1586,5 @@ def load_weights(self,
         self.quant_method.load_weights(self, weights, self.weight_loading_mode)
 
     def post_load_weights(self):
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
@@ -525,7 +525,8 @@ def load_weights(self,
                                        **kargs)
 
     def post_load_weights(self):
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
 
     def quantize_input(self, x, post_quant_comm: bool = True):
         """Quantize inputs prior to post-communication (alltoall/allgather) or before MoE computation.
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 import os
 from typing import Dict, List, Optional, Tuple, Union
@@ -950,7 +965,8 @@ def load_weights(self,
                                        **kargs)
 
     def post_load_weights(self):
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
 
     def forward_fake(
         self,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py
@@ -827,8 +827,18 @@ def load_weights(self,
         """
         raise NotImplementedError
 
+    def transform_weights(self):
+        if getattr(self, "_weights_transformed", False):
+            return
+        self.quant_method.transform_weights(self)
+        self._weights_transformed = True
+
+    def cache_derived_state(self):
+        self.quant_method.cache_derived_state(self)
+
     def post_load_weights(self):
-        pass
+        self.transform_weights()
+        self.cache_derived_state()
 
     def process_weights_after_loading(self):
         """
diff --git a/tensorrt_llm/_torch/modules/fused_moe/mega_moe/mega_moe_deepgemm.py b/tensorrt_llm/_torch/modules/fused_moe/mega_moe/mega_moe_deepgemm.py
@@ -518,7 +518,8 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)
     def post_load_weights(self) -> None:
         if self.quant_method is None:
             self.create_weights()
-        self.quant_method.post_load_weights(self)
+        self.transform_weights()
+        self.cache_derived_state()
 
     # ------------------------------------------------------------------
     # MoE-contract methods
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
diff --git a/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py b/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py
diff --git a/tests/unittest/_torch/modules/mamba/test_mamba2_mixer.py b/tests/unittest/_torch/modules/mamba/test_mamba2_mixer.py
diff --git a/tests/unittest/_torch/modules/moe/test_moe_backend.py b/tests/unittest/_torch/modules/moe/test_moe_backend.py

Original file line number	Diff line number	Diff line change
`@@ -291,7 +291,8 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)`
`291`	`291`	`)`
`292`	`292`
`293`	`293`	`def post_load_weights(self):`
`294`		`- self.quant_method.post_load_weights(self)`
	`294`	`+ self.transform_weights()`
	`295`	`+ self.cache_derived_state()`
`295`	`296`
`296`	`297`	`def _transform_w2_weight_scale_for_min_latency(self):`
`297`	`298`	`"""Transform w2_weight_scale for minimum latency path optimization."""`