NVIDIA
diff --git a/‎docs/source/models/supported-models.md‎
Lines changed: 4 additions & 0 deletions b/‎docs/source/models/supported-models.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎tensorrt_llm/_torch/attention_backend/sparse/__init__.py‎
Lines changed: 20 additions & 0 deletions b/‎tensorrt_llm/_torch/attention_backend/sparse/__init__.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎tensorrt_llm/_torch/attention_backend/sparse/minimax_m3/__init__.py‎
Lines changed: 76 additions & 0 deletions b/‎tensorrt_llm/_torch/attention_backend/sparse/minimax_m3/__init__.py‎
Lines changed: 76 additions & 0 deletions
@@ -32,6 +32,7 @@ The following is a table of supported models for the PyTorch backend:
 | `LlamaForCausalLM`                   | Llama 3.1, Llama 3, Llama 2, LLaMA | `meta-llama/Meta-Llama-3.1-70B`              |
 | `Llama4ForConditionalGeneration`     | Llama 4                            | `meta-llama/Llama-4-Scout-17B-16E-Instruct`  |
 | `MiniMaxM2ForCausalLM` [^5]          | MiniMax M2/M2.1/M2.7              | `MiniMaxAI/MiniMax-M2.7`                    |
+| `MiniMaxM3SparseForConditionalGeneration` [^11] | MiniMax-M3                       | `MiniMaxAI/MiniMax-M3`                      |
 | `MistralForCausalLM`                 | Mistral                            | `mistralai/Mistral-7B-v0.1`                  |
 | `MixtralForCausalLM`                 | Mixtral                            | `mistralai/Mixtral-8x7B-v0.1`                |
 | `MllamaForConditionalGeneration`     | Llama 3.2                          | `meta-llama/Llama-3.2-11B-Vision`            |
@@ -72,6 +73,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `NemotronHForCausalLM`           | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes | No               | No                | No     | Yes           | Yes              | Yes            | N/A                      | Untested              | Untested        |
 | `Gemma4ForConditionalGeneration` | Untested          | Yes        | Untested                   | No                    | Yes             | No  | No               | No                | No     | Yes           | Untested         | No             | Yes                      | Untested              | Untested        |
 | `Step3p7ForConditionalGeneration`| Yes               | Yes        | Yes                        | Untested              | Untested        | Yes | No               | No                | No     | Yes           | Untested         | Untested       | Yes                      | Untested              | Untested        |
+| `MiniMaxM3SparseForConditionalGeneration` [^11] | Yes               | Yes        | Yes                        | Untested              | Untested        | No  | No               | No                | No     | Yes           | Untested         | No             | N/A                      | Untested              | Untested        |
 
 [^1]: Chunked Prefill for MLA can only be enabled on SM100/SM103.
 [^2]: KV cache reuse for MLA can only be enabled on SM90/SM100/SM103 and in BF16/FP8 KV cache dtype.
@@ -82,6 +84,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 [^8]: Supports text and image inputs. The vision tower runs in BF16 even when the text decoder is quantized (FP8 block-scale or NVFP4). The text decoder is also usable standalone (text-only) via the `Step3p5ForCausalLM` architecture.
 [^9]: Audio modality only supported on E2B/E4B variants.
 [^10]: Audio requires a checkpoint with a `sound_config` and is supported only on the full (non-disaggregated) model path, not the EPD disaggregated path.
+[^11]: Supports text, image, and video inputs over the block-sparse attention path. The published MXFP8 checkpoint is dequantized on load so the runtime sees an effectively BF16 model. The text decoder is also usable standalone (text-only) via the `MiniMaxM3SparseForCausalLM` architecture. KV cache reuse and MTP are not supported on the sparse-attention path in this release.
 
 # Multimodal Feature Support Matrix (PyTorch Backend)
 
@@ -102,6 +105,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen3VLForConditionalGeneration`    | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLMoeForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Step3p7ForConditionalGeneration`    | Yes               | Yes        | Untested        | Yes           | Untested         | Untested       | Untested              | Untested                  | L + I     |
+| `MiniMaxM3SparseForConditionalGeneration` [^11] | Yes               | Yes        | Untested        | Yes           | Untested         | No             | Untested              | Untested                  | L + I + V |
 
 Note:
 - L: Language
 
@@ -1,3 +1,13 @@
+# yapf: disable
+from .minimax_m3 import (MiniMaxM3SparseAttention,
+                         MiniMaxM3SparseAttentionMetadata,
+                         MiniMaxM3SparseConfig, MiniMaxM3SparseIndexCache,
+                         allocate_minimax_m3_static_buffers,
+                         build_runtime_metadata_from_kv_manager,
+                         get_minimax_m3_attention_backend_cls,
+                         get_minimax_m3_kv_cache_manager_cls,
+                         minimax_m3_sparse_decode, minimax_m3_sparse_prefill)
+# yapf: enable
 from .utils import (get_flashinfer_sparse_attn_attention_backend,
                     get_sparse_attn_kv_cache_manager,
                     get_trtllm_sparse_attn_attention_backend,
@@ -8,4 +18,14 @@
     "get_vanilla_sparse_attn_attention_backend",
     "get_trtllm_sparse_attn_attention_backend",
     "get_flashinfer_sparse_attn_attention_backend",
+    "MiniMaxM3SparseAttention",
+    "MiniMaxM3SparseAttentionMetadata",
+    "MiniMaxM3SparseConfig",
+    "MiniMaxM3SparseIndexCache",
+    "allocate_minimax_m3_static_buffers",
+    "build_runtime_metadata_from_kv_manager",
+    "get_minimax_m3_attention_backend_cls",
+    "get_minimax_m3_kv_cache_manager_cls",
+    "minimax_m3_sparse_decode",
+    "minimax_m3_sparse_prefill",
 ]
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""MiniMax-M3 sparse attention package.
+
+Layered as:
+
+  * :mod:`.kernels`       -- OpenAI Triton kernels (per-block max
+                              score, masked softmax for sparse GQA).
+  * :mod:`.metadata`      -- ``MiniMaxM3SparseConfig`` /
+                              ``MiniMaxM3SparseAttentionMetadata``
+                              dataclasses, CUDA-graph-stable buffer
+                              allocator + builder, and the
+                              :class:`AttentionMetadata` subclass
+                              factory.
+  * :mod:`.cache_manager` -- standalone side index cache used by tests
+                              and the :class:`KVCacheManagerV2`
+                              subclass factory.
+  * :mod:`.backend`       -- the algorithm itself (vectorized
+                              paged-cache helpers, prefill / decode
+                              entry points, the thin
+                              :class:`MiniMaxM3SparseAttention`
+                              orchestrator) and the
+                              :class:`AttentionBackend` subclass
+                              factory.
+
+This package's public surface re-exports the names callers
+historically imported from ``...sparse.minimax_m3`` so external
+importers (the model code, ``sparse.utils``, focused tests) keep
+working unchanged.
+"""
+
+# Besides the public names, re-export the private helpers that focused
+# unit tests import directly, preserving the surface of the former
+# monolithic module. The underscore-prefixed names stay out of ``__all__``
+# but remain importable: ``from ...minimax_m3 import _write_main_kv_slots``.
+from .backend import (  # noqa: F401
+    MiniMaxM3SparseAttention,
+    _compute_index_attn_chunk_q,
+    _compute_sparse_gqa_chunk_q,
+    _gather_paged_batched,
+    _index_attention_and_select,
+    _write_main_kv_slots,
+    _write_main_kv_slots_to_pool,
+    get_minimax_m3_attention_backend_cls,
+    minimax_m3_sparse_decode,
+    minimax_m3_sparse_prefill,
+)
+from .cache_manager import (
+    MiniMaxM3KVCacheManagerV2,
+    MiniMaxM3SparseIndexCache,
+    get_minimax_m3_kv_cache_manager_cls,
+)
+from .metadata import (
+    MiniMaxM3SparseAttentionMetadata,
+    MiniMaxM3SparseConfig,
+    allocate_minimax_m3_static_buffers,
+    build_runtime_metadata_from_kv_manager,
+    get_minimax_m3_attention_metadata_cls,
+    replace_metadata,
+)
+
+__all__ = [
+    "MiniMaxM3KVCacheManagerV2",
+    "MiniMaxM3SparseAttention",
+    "MiniMaxM3SparseAttentionMetadata",
+    "MiniMaxM3SparseConfig",
+    "MiniMaxM3SparseIndexCache",
+    "allocate_minimax_m3_static_buffers",
+    "build_runtime_metadata_from_kv_manager",
+    "get_minimax_m3_attention_backend_cls",
+    "get_minimax_m3_attention_metadata_cls",
+    "get_minimax_m3_kv_cache_manager_cls",
+    "minimax_m3_sparse_decode",
+    "minimax_m3_sparse_prefill",
+    "replace_metadata",
+]