nv-auto-deploy
diff --git a/examples/auto_deploy/model_registry/configs/minimax_m2.yaml
Lines changed: 8 additions & 0 deletions b/examples/auto_deploy/model_registry/configs/minimax_m2.yaml
Lines changed: 8 additions & 0 deletions
diff --git a/examples/auto_deploy/model_registry/models.yaml
Lines changed: 1 addition & 1 deletion b/examples/auto_deploy/model_registry/models.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py
Lines changed: 2 additions & 0 deletions b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py
Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,12 @@
 # MiniMax-M2 - override model dtype and attention backend for AutoDeploy
 attn_backend: flashinfer
+# Disable fuse_finegrained_fp8_moe: the trtllm fused MoE kernel fails with an
+# NVRTC compilation error for MiniMax-M2's MoE config (256 experts, block-wise FP8).
+# Use the torch-simple compile backend because the Triton MoE fallback is not
+# CUDA-graph-capturable.
+compile_backend: torch-simple
+transforms:
+  fuse_finegrained_fp8_moe:
+    enabled: false
 model_kwargs:
   torch_dtype: bfloat16
@@ -229,7 +229,7 @@ models:
   yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml']
 # --- MiniMax-M2.5 (Feb 2026) ---
 - name: MiniMaxAI/MiniMax-M2.5
-  yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml']
+  yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'minimax_m2.yaml']
 # --- MiMo-V2-Flash (Feb 2026) ---
 - name: XiaomiMiMo/MiMo-V2-Flash
   yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml']
 
@@ -15,6 +15,7 @@
 from .modeling_internlm3 import InternLM3ForCausalLM
 from .modeling_kimi_k2 import KimiK2ForCausalLM, KimiK25ForConditionalGeneration
 from .modeling_llama3 import Llama3ForCausalLM
+from .modeling_minimax_m2 import MiniMaxM2ForCausalLM
 from .modeling_mistral import MistralForCausalLM
 from .modeling_mistral3 import Mistral3ForConditionalGeneration, Mistral3TextForCausalLM
 from .modeling_nemotron_flash import NemotronFlashForCausalLM, NemotronFlashPreTrainedTokenizerFast
@@ -55,6 +56,7 @@
     "KimiK2ForCausalLM",
     "KimiK25ForConditionalGeneration",
     "Llama3ForCausalLM",
+    "MiniMaxM2ForCausalLM",
     "MistralForCausalLM",
     "Mistral3ForConditionalGeneration",
     "Mistral3TextForCausalLM",