[TRTLLM-13250][fix] Address Wave 5 review findings

chienchunhung · chienchunhung · commit 2ca7dbeb64b3 · 2026-06-23T09:43:18.000-07:00
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -1518,14 +1518,14 @@ def __init__(self,
             # attribute queries do not end up frozen into a captured graph.
             warmup_heuristic_topk_decode(top_k=self.index_topk)
 
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         """Fuse wk + weights_proj into single FP32 weight for F.linear GEMM under allow_tf32 (TF32 tensor cores on Ampere+)."""
         # wk: [head_dim, hidden_size] + weights_proj: [n_heads, hidden_size]
         # → fused: [head_dim + n_heads, hidden_size]
         self._fused_wk_wp_weight = torch.cat(
             [self.wk.weight.data, self.weights_proj.weight.data], dim=0)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.cache_derived_state()
 
     @staticmethod
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -1921,7 +1921,7 @@ def load_weights(self, weights: ConsumableWeightsDict):
         weight_loader = DeepseekV3WeightLoader(self)
         weight_loader.load_weights(weights)
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/models/modeling_exaone_moe.py b/tensorrt_llm/_torch/models/modeling_exaone_moe.py
@@ -725,7 +725,7 @@ def load_weights(
             allow_partial_loading=allow_partial_loading,
         )
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         # For the cross-layer residual+LN fusion.
         for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/models/modeling_glm.py b/tensorrt_llm/_torch/models/modeling_glm.py
@@ -1074,7 +1074,7 @@ def load_weights(self, weights: ConsumableWeightsDict, allow_partial_loading: bo
         weight_loader = Glm4WeightLoader(self)
         weight_loader.load_weights(weights, allow_partial_loading=allow_partial_loading)
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
                 layer.next_layer_layernorm = self.model.norm
diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py
@@ -631,7 +631,7 @@ def load_weights(self, weights: Dict):
         else:
             self.load_hf_weights(weights)
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.block[:self.config.num_hidden_layers]):
             if idx == 0:
diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -1140,7 +1140,7 @@ def __init__(
     ):
         super().__init__(LlamaModel(model_config), model_config)
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
@@ -1564,7 +1564,7 @@ def load_weights(self, weights: Dict, weight_mapper: BaseWeightMapper):
             if had_mm_encoder:
                 self.mm_encoder = saved_mm_encoder
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py
@@ -323,7 +323,7 @@ def __init__(self,
 
     # After loading both gate_up_proj and down_proj, we need to set the scales needed by the special kernels and by
     # the trtllm-gen gemm+swiglu kernel.
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         if self.gate_up_proj.has_fp8_qdq:
             # For the special gemm+swiglu kernel, we need to set the inverse of the output scale, which is the inverse
             # of down_proj's combined input scale.
@@ -332,7 +332,7 @@ def cache_derived_state(self):
             # combined input scale times inv_output_scale.
             self.gate_up_proj.trtllm_gen_global_scale = self.gate_up_proj.combined_scale * self.gate_up_proj.inv_output_scale
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.cache_derived_state()
 
     def forward(
@@ -584,7 +584,7 @@ def __init__(
             dtype=model_config.pretrained_config.torch_dtype,
             quant_config=None)
 
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         # Set min-latency quant scales for routed experts if we plan to use min-latency MoE kernels.
         # This is because the routed experts' input scale is after the score multiplication, so we must use the
         # pre-score scaling input scale, which happens to be shared expert's input scale.
@@ -600,7 +600,7 @@ def cache_derived_state(self):
                 fc1_input_dequant=pre_score_scaling_input_scale,
             )
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.cache_derived_state()
 
     def compute_routed_output(
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -417,7 +417,7 @@ def __init__(
         )
         self.preload_weight_modules = self.model.preload_weight_modules
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py
@@ -980,7 +980,7 @@ def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
         new_weights = weight_mapper.preprocess_weights(weights)
         super().load_weights(new_weights, weight_mapper)
 
-    def setup_aliases(self):
+    def setup_aliases(self) -> None:
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -650,9 +650,10 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)
         assert hasattr(self.backend, "load_weights"), (
             f"Backend {self.backend.__class__.__name__} must implement load_weights()"
         )
+        self._weights_transformed = False
         return self.backend.load_weights(weights, allow_partial_loading)
 
-    def transform_weights(self):
+    def transform_weights(self) -> None:
         """
         Transform weights - delegated to backend
 
@@ -665,17 +666,17 @@ def transform_weights(self):
         self.backend.transform_weights()
         self._weights_transformed = True
 
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         """
         Cache derived state - delegated to backend
 
         """
         assert hasattr(self.backend, "cache_derived_state"), (
             f"Backend {self.backend.__class__.__name__} must implement cache_derived_state()"
         )
-        return self.backend.cache_derived_state()
+        self.backend.cache_derived_state()
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         """
         Backward-compatible staged post-load processing - delegated to backend
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -1591,6 +1591,6 @@ def load_weights(self,
         self.quant_method.load_weights(self, weights, self.weight_loading_mode,
                                        **kargs)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_densegemm.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_densegemm.py
@@ -290,7 +290,7 @@ def load_weights(self, weights: List[Dict], allow_partial_loading: bool = False)
                     f"got {self.quant_config.quant_mode}."
                 )
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
@@ -1368,7 +1368,7 @@ def _maybe_remove_padding(gemm_output, expected_size):
 
         return gemm2_output
 
-    def transform_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         if 'w3_w1_weight' in module._parameters:
             w31_scale = shuffle_weight_for_activation_kernel(
                 module.fc31_dequant.data)
@@ -1585,6 +1585,6 @@ def load_weights(self,
 
         self.quant_method.load_weights(self, weights, self.weight_loading_mode)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
@@ -524,7 +524,7 @@ def load_weights(self,
         self.quant_method.load_weights(self, weights, self.weight_loading_mode,
                                        **kargs)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -964,7 +964,7 @@ def load_weights(self,
         self.quant_method.load_weights(self, weights, self.weight_loading_mode,
                                        **kargs)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py
@@ -827,16 +827,16 @@ def load_weights(self,
         """
         raise NotImplementedError
 
-    def transform_weights(self):
+    def transform_weights(self) -> None:
         if getattr(self, "_weights_transformed", False):
             return
         self.quant_method.transform_weights(self)
         self._weights_transformed = True
 
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         self.quant_method.cache_derived_state(self)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.transform_weights()
         self.cache_derived_state()
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -559,22 +559,22 @@ def _finalize_shared_weights(self, module: torch.nn.Module):
         module.register_all_parameter_slot_and_to_fix_weight_fns(weight_fns)
         module.layer_load_balancer.host_tensor_sharer.finalize_layer_weights()
 
-    def transform_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         # Safety net for deferred-finalization callers (e.g. RLHF reload, which
         # passes allow_partial_loading=True and so skips the eager per-layer
         # finalization in load_weights).  Idempotent when finalization already
         # ran eagerly.
         self._finalize_shared_weights(module)
 
-    def cache_derived_state(self, module: torch.nn.Module):
+    def cache_derived_state(self, module: torch.nn.Module) -> None:
         if hasattr(module,
                    "layer_load_balancer") and module.layer_load_balancer:
             module.layer_load_balancer.set_initial_weight_assignments(
                 module.initial_global_assignments)
         # Re-setup quant scales after loading weights as the tensors may have been modified.
         self.setup_quant_scales(module)
 
-    def post_load_weights(self, module: torch.nn.Module):
+    def post_load_weights(self, module: torch.nn.Module) -> None:
         self.transform_weights(module)
         self.cache_derived_state(module)
 
@@ -781,7 +781,7 @@ def process_weights_after_loading(self, module: torch.nn.Module):
                                             module.rebuild_tensor_metadata)
         module._trtllm_gen_layout_transform_pending = False
 
-    def transform_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         if getattr(module, "_trtllm_gen_layout_transform_pending", False):
             self.process_weights_after_loading(module)
         super().transform_weights(module)
@@ -1012,7 +1012,7 @@ def process_weights_after_loading(self, module: torch.nn.Module):
         delattr(module, 'tmp_fc31_input_scale')
         delattr(module, 'tmp_fc2_input_scale')
 
-    def transform_weights(self, module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         super().transform_weights(module)
 
         # Padding weights to meet FP8 GEMM alignment requirements.
@@ -1277,7 +1277,7 @@ def _prepare_shared_weights_for_finalization(self, module: torch.nn.Module):
                             transformed_shared_w2_scale.cpu())
         super()._prepare_shared_weights_for_finalization(module)
 
-    def transform_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         super().transform_weights(module)
 
         if self._needs_e8m0_resmooth():
@@ -3103,7 +3103,7 @@ class NVFP4CuteDslB12xFusedMoEMethod(NVFP4CutlassFusedMoEMethod):
         ActivationType.Swiglu: "silu",
     }
 
-    def transform_weights(self, module: torch.nn.Module):
+    def transform_weights(self, module: torch.nn.Module) -> None:
         # Base class handles shared-weight finalization. The cache stage
         # handles load-balancer assignments and setup_quant_scales.
         # Leaves the standard CUTLASS NVFP4 weight + SF layout in place
@@ -5348,7 +5348,7 @@ def create_weights(self, module: torch.nn.Module):
     def setup_quant_scales(self, module: torch.nn.Module):
         module.quant_scales = tuple()
 
-    def cache_derived_state(self, module: torch.nn.Module):
+    def cache_derived_state(self, module: torch.nn.Module) -> None:
         super().cache_derived_state(module)
         # Create a proxy weight of unpadded size; dtype does not matter
         w1_weight = torch.empty([module.intermediate_size, module.hidden_size])
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
@@ -381,7 +381,7 @@ def load_weights(self,
             self.process_weights_after_loading(module)
 
     def transform_weights(self, module: Linear) -> None:
-        pass
+        return None
 
     def post_load_weights(self, module: Linear) -> None:
         self.transform_weights(module)
diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
@@ -253,7 +253,7 @@ def __init__(
         self.aux_steram = torch.cuda.Stream()
         self.events = [torch.cuda.Event(), torch.cuda.Event()]
 
-    def cache_derived_state(self):
+    def cache_derived_state(self) -> None:
         """Recompute state derived from loaded weights."""
         if (self.norm.is_nvfp4 and fused_gated_rmsnorm_quant_shape_ok(
                 self.norm.hidden_size, self.norm.group_size)
@@ -270,7 +270,7 @@ def cache_derived_state(self):
                                         p=self.head_dim)
         self._D_expanded = repeat(self.D, "h -> h p", p=self.head_dim)
 
-    def post_load_weights(self):
+    def post_load_weights(self) -> None:
         self.cache_derived_state()
 
     def _try_attach_nvfp4_scale(self):
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
diff --git a/tests/unittest/_torch/models/checkpoints/mx/test_mx_checkpoint_loader.py b/tests/unittest/_torch/models/checkpoints/mx/test_mx_checkpoint_loader.py
diff --git a/tests/unittest/_torch/modules/moe/test_moe_backend.py b/tests/unittest/_torch/modules/moe/test_moe_backend.py
diff --git a/tests/unittest/_torch/pyexecutor/test_model_loader_mx.py b/tests/unittest/_torch/pyexecutor/test_model_loader_mx.py

Original file line number	Diff line number	Diff line change
`@@ -725,7 +725,7 @@ def load_weights(`
`725`	`725`	`allow_partial_loading=allow_partial_loading,`
`726`	`726`	`)`
`727`	`727`
`728`		`- def setup_aliases(self):`
	`728`	`+ def setup_aliases(self) -> None:`
`729`	`729`	`# For the cross-layer residual+LN fusion.`
`730`	`730`	`for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]):`
`731`	`731`	`if idx == self.config.num_hidden_layers - 1:`
Original file line number	Diff line number	Diff line change
`@@ -417,7 +417,7 @@ def __init__(`
`417`	`417`	`)`
`418`	`418`	`self.preload_weight_modules = self.model.preload_weight_modules`
`419`	`419`
`420`		`- def setup_aliases(self):`
	`420`	`+ def setup_aliases(self) -> None:`
`421`	`421`	`for idx, layer in enumerate(`
`422`	`422`	`self.model.layers[:self.config.num_hidden_layers]):`
`423`	`423`	`if idx == self.config.num_hidden_layers - 1:`