From 3af7ccb4f01661673e59342943400d015aeb68d1 Mon Sep 17 00:00:00 2001 From: ganyi Date: Wed, 13 May 2026 01:45:37 +0000 Subject: [PATCH 01/15] mtp 1 acc right Signed-off-by: ganyi --- atom/model_loader/loader.py | 5 +- atom/models/qwen3_next.py | 33 ++++++++++- atom/models/qwen3_next_mtp.py | 37 ++++++++++-- atom/plugin/attention.py | 36 +++++++++++- atom/plugin/attention_mha.py | 58 +++++++++++++------ .../vllm/attention_backend/attention_gdn.py | 9 ++- atom/plugin/vllm/model_wrapper.py | 30 +++++----- atom/plugin/vllm/register.py | 1 + 8 files changed, 165 insertions(+), 44 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 0763179f5..08261e271 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,9 +388,12 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): + n_routed = getattr( + hf_config, "n_routed_experts", None + ) or getattr(hf_config, "num_experts", None) name = name.replace( maybe_matching_name, - f"mlp.experts.{hf_config.n_routed_experts}.", + f"mlp.experts.{n_routed}.", ) for k in packed_modules_mapping: # We handle the experts below in expert_params_mapping diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index f8abcb867..c42dd3b9e 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -279,6 +279,7 @@ def __init__( atom_config, quant_config=None, prefix: str = "", + layer_num: int | None = None, ) -> None: super().__init__() if hasattr(atom_config.hf_config, "text_config"): @@ -380,6 +381,15 @@ def __init__( k_norm=self.k_norm, ) + # For MTP, the prefix is e.g. "mtp.layers.0.self_attn" so + # extract_layer_index(prefix) returns 0, which would collide with the + # target model's layer 0 KV cache slot. Allow callers (e.g. + # Qwen3NextDecoderLayer) to pass an explicit `layer_num` so MTP can + # use absolute indices (mtp_start_layer_idx + idx) and get its own + # KV cache slot. + attn_layer_num = ( + layer_num if layer_num is not None else extract_layer_index(prefix) + ) self.attn = Attention( self.num_heads, self.head_dim, @@ -388,7 +398,7 @@ def __init__( kv_cache_dtype=atom_config.kv_cache_dtype, quant_config=quant_config, use_mla=False, - layer_num=extract_layer_index(prefix), + layer_num=attn_layer_num, config=atom_config, prefix=f"{prefix}", **fusion_kwargs, @@ -486,6 +496,19 @@ def __init__( self.config = config self.quant_config = quant_config self.speculative_config = speculative_config + # When running as a vLLM plugin, Qwen3NextDecoderLayer instantiates + # this module without forwarding speculative_config. That left + # self.num_spec=0 even with MTP enabled, so get_state_shape() (the + # instance method vLLM's MambaBase.get_kv_cache_spec uses to size each + # layer's KV cache) allocated conv_state with only `kernel_size-1` + # token rows. During spec decode, causal_conv1d_update writes + # `kernel_size-1 + num_spec` rows per slot and the extra row spilled + # into the page-adjacent ssm_state, corrupting layer 0's recurrent + # state. Pull the spec config from the vLLM config as a fallback. 
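+        # Illustrative arithmetic (hypothetical values): with a conv kernel
+        # size of 4 and num_spec=1, each slot needs 4 conv_state rows
+        # (kernel_size - 1 + num_spec); sizing for num_spec=0 reserves only
+        # 3, and that fourth row is the one that spilled into ssm_state.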
+ if is_vllm() and self.speculative_config is None: + vllm_spec_config = get_current_vllm_config().speculative_config + if vllm_spec_config is not None: + self.speculative_config = vllm_spec_config self.num_spec = ( self.speculative_config.num_speculative_tokens if self.speculative_config @@ -779,6 +802,7 @@ def __init__( atom_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + layer_num=layer_num, ) else: raise ValueError(f"Invalid layer_type {self.layer_type}") @@ -863,7 +887,6 @@ def forward( residual: torch.Tensor | None, ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention - if self.input_layernorm.use_fused_quant: if residual is None: residual = hidden_states @@ -1059,6 +1082,11 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight + # Expose embed_tokens at this level for vLLM MTP embedding sharing. + # vLLM's proposer accesses target_wrapper.model.embed_tokens, where + # target_wrapper.model = this class (Qwen3NextForCausalLM). + self.embed_tokens = self.model.embed_tokens + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors ) @@ -1132,6 +1160,7 @@ def get_mamba_state_shape_from_config( if vllm_config.speculative_config else 0 ) + return MambaStateShapeCalculator.gated_delta_net_state_shape( tp_size, hf_config.linear_num_key_heads, diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index 2a5f0737e..3f49a80df 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -27,6 +27,13 @@ def __init__(self, atom_config: Config, prefix: str = ""): config: Qwen3NextConfig = atom_config.hf_config self.config = config + # Qwen3NextDecoderLayer's MoE block needs these attributes, which + # Qwen3NextModel.__init__ sets but which are absent from the raw + # HF config. Set them here so the MTP predictor works standalone. + if not hasattr(config, "n_shared_experts"): + config.n_shared_experts = 1 + if not hasattr(config, "n_routed_experts"): + config.n_routed_experts = config.num_experts self.vocab_size = config.vocab_size @@ -38,6 +45,10 @@ def __init__(self, atom_config: Config, prefix: str = ""): config.hidden_size, ) + # Pass the layer's HF-style prefix so the quant_config exclude list + # (which contains "mtp.fc" in Qwen3-Next FP8 checkpoints) is honored; + # without it the lookup uses "" and falls back to the global FP8 spec, + # which makes fc FP8 even though the source weight is BF16. self.fc = ColumnParallelLinear( self.config.hidden_size * 2, self.config.hidden_size, @@ -46,16 +57,18 @@ def __init__(self, atom_config: Config, prefix: str = ""): prefix=f"{prefix}.fc", ) + # Use 0-indexed prefix (matches checkpoint's mtp.layers.0.* weight + # names and vLLM's reference impl), but keep layer_num as the + # absolute index so the attention layer gets a KV cache slot that + # doesn't collide with the target model's layers. 
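+        # Example (hypothetical layer counts): with a 48-layer target model
+        # and a single MTP layer, idx=0 keeps the checkpoint-facing prefix
+        # "mtp.layers.0" while layer_num becomes mtp_start_layer_idx + 0,
+        # i.e. 48, a KV cache slot past all of the target model's layers.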
self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( atom_config, layer_type="full_attention", prefix=f"{prefix}.layers.{idx}", - layer_num=idx, - ) - for idx in range( - self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers + layer_num=self.mtp_start_layer_idx + idx, ) + for idx in range(self.num_mtp_layers) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -131,6 +144,10 @@ def remap_mtp_weight_name(self, name: str) -> str | None: def __init__(self, atom_config: Config, prefix: str = ""): super().__init__() config = atom_config.hf_config + if not hasattr(config, "n_shared_experts"): + config.n_shared_experts = 1 + if not hasattr(config, "n_routed_experts"): + config.n_routed_experts = config.num_experts if atom_config.enable_prefix_caching: raise ValueError("Qwen3NextMTP currently does not support prefix caching") self.config = config @@ -171,9 +188,19 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) + # Mirror target's get_expert_mapping: when shared-expert fusion is on, + # the loader rewrites `mlp.shared_expert.*` to `mlp.experts.{N}.*` + # (where N == n_routed_experts), so the expert_mapping must include + # an extra slot for that fused shared-expert. Without this, MTP's + # shared_expert weights get silently dropped during loading. + from atom.model_ops.topK import is_rocm_aiter_fusion_shared_expert_enabled + + n_routed = getattr(self.config, "n_routed_experts", self.config.num_experts) + n_shared = getattr(self.config, "n_shared_experts", 0) return FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, + num_experts=n_routed + + (n_shared if is_rocm_aiter_fusion_shared_expert_enabled() else 0), ) diff --git a/atom/plugin/attention.py b/atom/plugin/attention.py index 9c674c1cd..81ded7cfc 100644 --- a/atom/plugin/attention.py +++ b/atom/plugin/attention.py @@ -283,6 +283,28 @@ def init_method_under_plugin_mode( i64_kwargs = {"dtype": torch.int64, "device": device} self.positions = CpuGpuBuffer(max_num_batched_tokens, **i64_kwargs) + # Bump reorder_batch_threshold so multi-token spec-decode requests + # (MTP / EAGLE) are routed through the decode path. Mirrors vLLM's + # AttentionMetadataBuilder._init_reorder_batch_threshold(supports_spec_as_decode=True). 
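+        # Worked example: num_speculative_tokens=1 without parallel drafting
+        # gives max_num_queries_for_spec = 1 + 1 * 1 = 2, so the threshold
+        # rises from 1 to 2; with parallel drafting and 2 speculative tokens
+        # it would be 1 + 2 * 2 = 5.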
+ speculative_config = getattr(config, "speculative_config", None) + if ( + getattr(self, "reorder_batch_threshold", None) is not None + and speculative_config is not None + and getattr(speculative_config, "num_speculative_tokens", None) is not None + ): + parallel_drafting = getattr(speculative_config, "parallel_drafting", False) + max_num_queries_for_spec = 1 + (2 if parallel_drafting else 1) * ( + speculative_config.num_speculative_tokens + ) + self.reorder_batch_threshold = max( + self.reorder_batch_threshold, max_num_queries_for_spec + ) + logger.info( + "Spec decode: bumped reorder_batch_threshold to %d (num_spec_tokens=%d)", + self.reorder_batch_threshold, + speculative_config.num_speculative_tokens, + ) + return init_method_under_plugin_mode @@ -300,7 +322,7 @@ def setup_attn_metadata_builder_base_class_and_attributes(class_dict: dict): needs_generic = True # align with vllm rocm aiter fa - class_dict["_cudagraph_support"] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + class_dict["_cudagraph_support"] = AttentionCGSupport.UNIFORM_BATCH class_dict["reorder_batch_threshold"] = 1 return base_class, generic_base, needs_generic, class_dict @@ -324,9 +346,12 @@ def build( from vllm.v1.attention.backends.utils import split_decodes_prefills_and_extends - # here assume the decode num token is 1 per request + # decode_threshold tracks reorder_batch_threshold so MTP/EAGLE + # multi-token verification (query_len > 1) routes through decode. + decode_threshold = getattr(self, "reorder_batch_threshold", 1) or 1 split_ret = split_decodes_prefills_and_extends( - common_attn_metadata=common_attn_metadata, decode_threshold=1 + common_attn_metadata=common_attn_metadata, + decode_threshold=decode_threshold, ) ( @@ -351,6 +376,11 @@ def build( query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] num_computed_tokens_cpu = common_attn_metadata._num_computed_tokens_cpu + # In async spec-decode mode (auto-enabled for MTP/EAGLE), vLLM sets + # _num_computed_tokens_cpu to None because the GPU seq_lens is the + # authoritative source. Reconstruct from CPU tensors we already have. + if num_computed_tokens_cpu is None: + num_computed_tokens_cpu = seq_lens - query_lens_cpu prefill_max_query_len = decode_max_query_len = ( common_attn_metadata.max_query_len diff --git a/atom/plugin/attention_mha.py b/atom/plugin/attention_mha.py index 16c88949d..100c492ab 100644 --- a/atom/plugin/attention_mha.py +++ b/atom/plugin/attention_mha.py @@ -234,15 +234,27 @@ def paged_attention_triton_plugin_mode( v_cache: torch.Tensor, k_scale: torch.Tensor, v_scale: torch.Tensor, + num_decodes: int, out: torch.Tensor, attn_metadata: "AttentionMetaData", ps: bool = True, ): - o = out - num_seqs, num_q_heads_total, head_size = q.shape + # q.shape[0] == num_decodes * max_query_len for MTP (one row per decode + # token, query_len > 1). For non-MTP it equals num_decodes (query_len = 1). + # pa_decode_gluon handles multi-token causal masking internally when + # `query_length > 1` is passed; intermediate buffers must be sized + # `num_decodes` (not q.shape[0]) and `query_group_size` must include + # the max_qlen multiplier — mirroring server-mode `paged_attention_triton`. 
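+        # Illustrative shapes (hypothetical sizes): 8 decode requests with one
+        # speculative token each give max_qlen=2 and q.shape[0]=16, while the
+        # intermediate buffers below are allocated with num_decodes=8 rows and
+        # query_group_size = 2 * (num_q_heads_total // num_kv_heads).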
+ _, num_q_heads_total, head_size = q.shape num_blocks, num_kv_heads, _, block_size, _ = k_cache.shape - query_group_size = num_q_heads_total // num_kv_heads + decode_metadata = attn_metadata.plugin_metadata.decode_metadata + max_qlen = decode_metadata.max_query_len if decode_metadata is not None else 1 assert num_q_heads_total % num_kv_heads == 0 + + seq_lens = attn_metadata.plugin_metadata.seq_lens[:num_decodes] + block_tables = attn_metadata.plugin_metadata.block_table[:num_decodes] + + query_group_size = max_qlen * (num_q_heads_total // num_kv_heads) context_partition_size = 256 # use_ps = self.adopt_persistent_kernel( @@ -250,7 +262,9 @@ def paged_attention_triton_plugin_mode( # ) use_ps = True if use_ps: - max_context_partition_num = get_recommended_splits(num_seqs, num_kv_heads) + max_context_partition_num = get_recommended_splits( + num_decodes, num_kv_heads + ) else: max_context_partition_num = _NO_PS_FIXED_SPLITS @@ -258,9 +272,8 @@ def paged_attention_triton_plugin_mode( max_context_partition_num = 1 context_partition_size = 128 - # Output buffers (same as Triton) intermediate_shape = ( - num_seqs, + num_decodes, num_kv_heads, max_context_partition_num, query_group_size, @@ -283,21 +296,19 @@ def paged_attention_triton_plugin_mode( k_scale = k_scale.unsqueeze(-1) v_scale = v_scale.unsqueeze(-1) - num_decode_seqs = q.shape[0] - seq_lens_decode = attn_metadata.plugin_metadata.seq_lens[:num_decode_seqs] - block_tables_decode = attn_metadata.plugin_metadata.block_table[ - :num_decode_seqs - ] - + # Kernel takes natural q layout [batch * query_length, num_q_heads, head_size]. + # Internally it derives batch_size = q.shape[0] // query_length and reshapes + # to [batch, query_length, num_kv_heads, group, head_size]. See + # aiter/aiter/ops/triton/gluon/pa_decode_gluon.py:5371-5377 and 5542-5544. 
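+        # Continuing the illustration above: q arrives as
+        # [16, num_q_heads_total, head_size]; with query_length=2 the kernel
+        # derives batch_size = 16 // 2 = 8 and views it as
+        # [8, 2, num_kv_heads, group, head_size].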
torch.ops.aiter.pa_decode_gluon( - o, + out, q, k_cache, v_cache, - seq_lens_decode, - block_tables_decode, + seq_lens, + block_tables, self.scale, - 1, # query_lenth + max_qlen, # query_length — handles multi-token causal mask internally max_context_partition_num, context_partition_size, compute_type, @@ -312,8 +323,7 @@ def paged_attention_triton_plugin_mode( sliding_window=self.sliding_window, ps=use_ps, ) - - return o + return out def paged_attention_asm_plugin_mode( self, @@ -327,6 +337,11 @@ def paged_attention_asm_plugin_mode( attn_metadata: "AttentionMetaData", out: torch.Tensor, ): + decode_metadata = attn_metadata.plugin_metadata.decode_metadata + max_qlen = decode_metadata.max_query_len if decode_metadata is not None else 1 + qo_indptr = ( + decode_metadata.query_start_loc if decode_metadata is not None else None + ) aiter.pa_fwd_asm( Q=q, K=k_cache, @@ -336,9 +351,11 @@ def paged_attention_asm_plugin_mode( block_tables_stride0=attn_metadata.plugin_metadata.block_table[ :num_decodes ].stride(0), + max_qlen=max_qlen, K_QScale=k_scale, V_QScale=v_scale, out_=out[:num_decode_tokens], + qo_indptr=qo_indptr, high_precision=0, ) @@ -706,12 +723,13 @@ def forward_impl_plugin_mode( extend_tokens_slice = slice( num_decode_tokens, num_decode_tokens + num_extend_tokens ) + extend_reqs_slice = slice(num_decodes, num_decodes + num_extends) extend_querys = query[extend_tokens_slice] extend_keys = key[extend_tokens_slice] extend_values = value[extend_tokens_slice] extend_outputs = output[extend_tokens_slice] extend_block_table = attn_metadata.plugin_metadata.block_table[ - extend_tokens_slice + extend_reqs_slice ] extend_slot_mapping = attn_metadata.plugin_metadata.slot_mapping[ extend_tokens_slice @@ -745,6 +763,7 @@ def forward_impl_plugin_mode( v_cache=new_value_cache, k_scale=k_scale, v_scale=v_scale, + num_decodes=num_decodes, out=output_actual_tokens[:num_decode_tokens], attn_metadata=attn_metadata, ) @@ -757,6 +776,7 @@ def forward_impl_plugin_mode( v_cache=new_value_cache, k_scale=k_scale, v_scale=v_scale, + num_decodes=num_decodes, out=output_actual_tokens[:num_decode_tokens], attn_metadata=attn_metadata, ) diff --git a/atom/plugin/vllm/attention_backend/attention_gdn.py b/atom/plugin/vllm/attention_backend/attention_gdn.py index b6158a086..87a2f2f9f 100644 --- a/atom/plugin/vllm/attention_backend/attention_gdn.py +++ b/atom/plugin/vllm/attention_backend/attention_gdn.py @@ -22,6 +22,7 @@ from atom.model_ops.fla_ops.fused_sigmoid_gating import ( fused_sigmoid_gating_delta_rule_update, ) + from atom.utils import envs from torch import nn @@ -385,7 +386,13 @@ def forward( ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to( ssm_state.dtype ) - core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0) + # Only write directly when there are no spec tokens. With spec + # decode active, mixed_qkv was index_select'd by non_spec_token_indx + # so core_attn_out_non_spec has fewer rows than num_actual_tokens. + # The merge below (index_copy_) handles the scatter back to the + # correct slot positions. 
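+            # E.g. (hypothetical counts): 14 actual tokens of which 4 are
+            # speculative leaves core_attn_out_non_spec with 10 rows; copying
+            # them into core_attn_out[:14] would land rows in the wrong slots,
+            # so the spec-decode case defers to the index_copy_ merge below.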
+ if spec_sequence_masks is None: + core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0) elif attn_metadata.num_decodes > 0: o = core_attn_out[: attn_metadata.num_decode_tokens] if USE_FLYDSL_GDR: diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index c2b990c18..31d546805 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -35,7 +35,9 @@ logger = logging.getLogger("atom") - +_MTP_MASK_INPUT_ARCH: set[str] = { + "DeepSeekMTPModel", +} _ATOM_MODEL_CLASSES: dict[str, str] = { "LlamaForCausalLM": "atom.models.llama:LlamaForCausalLM", "Qwen3ForCausalLM": "atom.models.qwen3:Qwen3ForCausalLM", @@ -47,6 +49,7 @@ "GlmMoeDsaForCausalLM": "atom.models.deepseek_v2:GlmMoeDsaForCausalLM", "DeepSeekMTPModel": "atom.models.deepseek_mtp:DeepSeekMTP", "Qwen3NextForCausalLM": "atom.models.qwen3_next:Qwen3NextForCausalLM", + "Qwen3NextMTP": "atom.models.qwen3_next_mtp:Qwen3NextMTP", "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5MoeForConditionalGeneration_", "Qwen3_5ForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5ForConditionalGeneration_", "KimiK25ForConditionalGeneration": "atom.plugin.vllm.models.kimi_k25:KimiK25ForConditionalGeneration_", @@ -121,7 +124,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + from atom.config import get_current_atom_config _set_framework_backbone("vllm") self.config = vllm_config.model_config.hf_config @@ -147,12 +150,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): spec_method = speculative_config.method self.is_mtp = spec_method == "mtp" - _prepare_env(atom_config=self.atom_config) - main_model_arch = vllm_config.model_config.architectures[0] model_arch = _select_model_arch(vllm_config) self.is_mtp_draft_model = self.is_mtp and model_arch != main_model_arch + if self.is_mtp_draft_model: + self.atom_config = get_current_atom_config() + else: + self.atom_config = generate_atom_config_for_plugin_mode(vllm_config) self.model_arch = model_arch + _prepare_env(atom_config=self.atom_config) model_cls = _get_atom_model_cls(model_arch) module_remapping = getattr(model_cls, "packed_modules_mapping", {}) weights_mapper = getattr(model_cls, "hf_to_atom_mapper", {}) @@ -182,9 +188,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logger.info(f"Construct ATOM model {model_arch} for vLLM plugin mode") self.model = model_cls(self.atom_config) - self._adapt_mtp_layers_for_vllm() - # Mirror nested attributes required by vLLM speculative decoding. - self._expose_spec_decode_attrs() + + if model_arch in _MTP_MASK_INPUT_ARCH: + self._adapt_mtp_layers_for_vllm() + # Mirror nested attributes required by vLLM speculative decoding. 
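+            # (For instance, vLLM's proposer reads
+            # target_wrapper.model.embed_tokens for embedding sharing; see the
+            # embed_tokens aliasing added to Qwen3NextForCausalLM above.)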
+ self._expose_spec_decode_attrs() # For sparse MLA, register the Indexer's DeepseekV32IndexerCache as # a virtual subclass of vLLM's AttentionLayerBase so vLLM can discover @@ -192,7 +200,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._register_indexer_caches_with_vllm() if self.model is None: - model_arch = vllm_config.model_config.architectures[0] raise ValueError( f"The model {model_arch} is not supported by model impl backend atom" ) @@ -309,8 +316,7 @@ def _register_indexer_caches_with_vllm(self): if prefix not in vllm_sfc: vllm_sfc[prefix] = module logger.info( - f"Registered indexer cache in vLLM static_forward_context: " - f"{prefix}" + f"Registered indexer cache in vLLM static_forward_context: {prefix}" ) else: logger.warning( @@ -397,7 +403,6 @@ def forward( inputs_embeds=inputs_embeds, **model_kwargs, ) - if not self.pp_group.is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) @@ -412,7 +417,7 @@ def load_weights( is_mtp_draft_model = self.model_arch in { "DeepSeekMTPModel", - "Qwen3NextMTPModel", + "Qwen3NextMTP", } draft_hf_config = None if is_mtp_draft_model: @@ -452,7 +457,6 @@ class ATOMMoEForCausalLM(ATOMModelBase, VllmModelForTextGeneration): ... class ATOMForConditionalGeneration( ATOMModelBase, VllmModelForTextGeneration, SupportsMultiModal, SupportsMRoPE ): - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: """ diff --git a/atom/plugin/vllm/register.py b/atom/plugin/vllm/register.py index 9ef76e601..91e241e9a 100644 --- a/atom/plugin/vllm/register.py +++ b/atom/plugin/vllm/register.py @@ -30,6 +30,7 @@ "GlmMoeDsaForCausalLM": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "DeepSeekMTPModel": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "Qwen3NextForCausalLM": "atom.models.qwen3_next:Qwen3NextForCausalLMVllm", + "Qwen3NextMTP": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "Qwen3_5ForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5MoeForConditionalGeneration", "KimiK25ForConditionalGeneration": "atom.plugin.vllm.models.kimi_k25:KimiK25ForConditionalGeneration", From 580f0fdeab0048e5bd79895922ee41eea3c054b1 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:05:17 +0000 Subject: [PATCH 02/15] add recipe for qwen3-next-mtp Signed-off-by: ganyi --- recipes/atom_vllm/Qwen3Next.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/recipes/atom_vllm/Qwen3Next.md b/recipes/atom_vllm/Qwen3Next.md index e22f80d1c..97e8bdaa4 100644 --- a/recipes/atom_vllm/Qwen3Next.md +++ b/recipes/atom_vllm/Qwen3Next.md @@ -17,6 +17,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener ```bash export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ @@ -31,8 +32,25 @@ vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --no-enable-prefix-caching ``` -**Important**: `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1` is required for Qwen3-Next because it uses a hybrid architecture with both linear attention (GatedDeltaNet) and full attention layers. This env var ensures full attention layers use vLLM's default implementation. 
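+Once the server is up, a quick sanity check of the endpoint (illustrative request; adjust host, port, and prompt to your deployment):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
+    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+    "max_tokens": 32
+  }'
+```
+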
+### Qwen3-Next-80B-A3B-Instruct-FP8 MTP (TP=1, MI355X) +```bash +export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 +export ATOM_USE_CUSTOM_ALL_GATHER=0 +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ + --host localhost \ + --port 8000 \ + --tensor-parallel-size 1 \ + --kv-cache-dtype fp8 \ + --gpu_memory_utilization 0.9 \ + --async-scheduling \ + --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ + --max-model-len 16384 \ + --max-num-batched-tokens 32768 \ + --speculative-config '{"num_speculative_tokens":1, "method": "mtp"}' \ + --no-enable-prefix-caching +``` ## Step 3: Performance Benchmark Users can use the default vllm bench commands for performance benchmarking. @@ -70,9 +88,6 @@ lm_eval --model local-completions \ --num_fewshot 3 ``` -## Key Environment Variables - -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: **Required** - disables ATOM attention plugin to use vLLM's implementation for full attention layers ## Architecture Notes From 598be9a919fde691765e518eb596d5eaa0961040 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:06:59 +0000 Subject: [PATCH 03/15] modify some qwen3.5 recipe Signed-off-by: ganyi --- recipes/atom_vllm/Qwen3.5.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 94a900e07..2b30b540a 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -71,7 +71,6 @@ vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ **Important**: The following three environment variables are required for Qwen3.5: -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: Disables ATOM attention plugin to use vLLM's implementation for full attention layers (required because Qwen3.5 uses a hybrid architecture with both linear attention (GatedDeltaNet) and full attention layers) - `ATOM_USE_CUSTOM_ALL_GATHER=0`: Disables custom all-gather for compatibility with Qwen3.5 model architecture - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations, which can significantly improve TTFT (Time To First Token) performance. **Note**: This optimization may introduce a risk of accuracy degradation. For accuracy-critical workloads, consider validating with your specific use case. 
@@ -133,7 +132,6 @@ Reference result (TP=4): ## Key Environment Variables -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: **Required** - disables ATOM attention plugin to use vLLM's implementation for full attention layers - `ATOM_USE_CUSTOM_ALL_GATHER=0`: **Required** - disables custom all-gather for compatibility with Qwen3.5 model architecture - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations - **Benefit**: Significantly improves TTFT (Time To First Token) performance by reducing communication overhead during tensor parallelism all-reduce operations From 9a6381e9f17aaf59089045d8d5ccbf54baba71d4 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:35:34 +0000 Subject: [PATCH 04/15] black Signed-off-by: ganyi --- atom/model_loader/loader.py | 6 +++--- atom/plugin/vllm/model_wrapper.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 08261e271..36c9db22b 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,9 +388,9 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): - n_routed = getattr( - hf_config, "n_routed_experts", None - ) or getattr(hf_config, "num_experts", None) + n_routed = getattr(hf_config, "n_routed_experts", None) or getattr( + hf_config, "num_experts", None + ) name = name.replace( maybe_matching_name, f"mlp.experts.{n_routed}.", diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index 31d546805..e540a1274 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -125,6 +125,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() from atom.config import get_current_atom_config + _set_framework_backbone("vllm") self.config = vllm_config.model_config.hf_config From ce844440494039f4663da5671da5f56f8a5c7ac3 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 13:54:58 +0000 Subject: [PATCH 05/15] remove redundant code Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index 3f49a80df..d89b6bbdc 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -27,13 +27,6 @@ def __init__(self, atom_config: Config, prefix: str = ""): config: Qwen3NextConfig = atom_config.hf_config self.config = config - # Qwen3NextDecoderLayer's MoE block needs these attributes, which - # Qwen3NextModel.__init__ sets but which are absent from the raw - # HF config. Set them here so the MTP predictor works standalone. 
- if not hasattr(config, "n_shared_experts"): - config.n_shared_experts = 1 - if not hasattr(config, "n_routed_experts"): - config.n_routed_experts = config.num_experts self.vocab_size = config.vocab_size @@ -196,7 +189,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: from atom.model_ops.topK import is_rocm_aiter_fusion_shared_expert_enabled n_routed = getattr(self.config, "n_routed_experts", self.config.num_experts) - n_shared = getattr(self.config, "n_shared_experts", 0) + n_shared = getattr(self.config, "n_shared_experts", 1) return FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", From 8f83eb716a09c4a8d2325b08bcfe2ab6930835b5 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 14:16:10 +0000 Subject: [PATCH 06/15] remove redundant code Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 4 ---- atom/plugin/vllm/model_wrapper.py | 1 - 2 files changed, 5 deletions(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index d89b6bbdc..d89df525b 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -137,10 +137,6 @@ def remap_mtp_weight_name(self, name: str) -> str | None: def __init__(self, atom_config: Config, prefix: str = ""): super().__init__() config = atom_config.hf_config - if not hasattr(config, "n_shared_experts"): - config.n_shared_experts = 1 - if not hasattr(config, "n_routed_experts"): - config.n_routed_experts = config.num_experts if atom_config.enable_prefix_caching: raise ValueError("Qwen3NextMTP currently does not support prefix caching") self.config = config diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index e540a1274..7197d0d65 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -144,7 +144,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.ignore_unexpected_suffixes: list[str] = [] self.vllm_config = vllm_config - self.atom_config = generate_atom_config_for_plugin_mode(vllm_config) self.is_mtp = False speculative_config = getattr(vllm_config, "speculative_config", None) if speculative_config is not None: From 8b53857aca6162c80f93fadd46553489956353c6 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 14:57:45 +0000 Subject: [PATCH 07/15] add spec decode convert for vllm plugin Signed-off-by: ganyi --- atom/plugin/config.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/atom/plugin/config.py b/atom/plugin/config.py index 07aafa4a5..8eb48da4c 100644 --- a/atom/plugin/config.py +++ b/atom/plugin/config.py @@ -1,3 +1,4 @@ +import copy from typing import Any, Optional from dataclasses import dataclass @@ -71,6 +72,45 @@ def _normalize_sglang_parallel_config( return tp_size, 1, 0, tp_rank +def _build_atom_speculative_config_from_vllm(vllm_spec_config: Any): + """Translate vLLM's SpeculativeConfig into ATOM's SpeculativeConfig. + + Reuses vLLM's already-loaded draft hf_config (skips a second disk fetch + in ATOM SpeculativeConfig.__post_init__) but still runs ATOM's + hf_config_override on it — so MTP model_type remap, n_routed_experts + backfill (Qwen families), and architecture rewrite all land on the + draft config in one place. Mirrors how standalone ATOM MTP exposes + the draft hf_config via atom_config.speculative_config. + + The draft hf_config is deepcopied first because hf_config_override + mutates `architectures` to ATOM's standalone naming (e.g. 
+ "Qwen3NextMTPModel"), which differs from vLLM's registry name + ("Qwen3NextMTP"). Mutating in place would make vLLM's later draft + architecture lookup fail. + """ + if vllm_spec_config is None: + return None + + from atom.config import SpeculativeConfig + + draft_model_config = getattr(vllm_spec_config, "draft_model_config", None) + draft_hf_config = getattr(draft_model_config, "hf_config", None) + if draft_hf_config is not None: + draft_hf_config = copy.deepcopy(draft_hf_config) + model_path = getattr(draft_model_config, "model", None) or getattr( + vllm_spec_config, "model", None + ) + + return SpeculativeConfig( + method=getattr(vllm_spec_config, "method", "") or "", + model=model_path, + num_speculative_tokens=getattr( + vllm_spec_config, "num_speculative_tokens", None + ), + draft_model_hf_config=draft_hf_config, + ) + + def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: from atom.config import Config, CompilationConfig @@ -117,6 +157,10 @@ def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: max_num_batched_tokens = vllm_scheduler_config.max_num_batched_tokens + atom_speculative_config = _build_atom_speculative_config_from_vllm( + getattr(config, "speculative_config", None) + ) + return Config( model=vllm_model_config.model, trust_remote_code=getattr(vllm_model_config, "trust_remote_code", False), @@ -140,6 +184,7 @@ def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: master_addr=None, enable_dp_attention=False, plugin_config=plugin_config, + speculative_config=atom_speculative_config, ) From 885c3291e0c5f7c5ba6edb49d8bea7a0b0940a59 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 15:09:02 +0000 Subject: [PATCH 08/15] remove vllm related branch Signed-off-by: ganyi --- atom/models/qwen3_next.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index c42dd3b9e..5fd55e072 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -495,20 +495,17 @@ def __init__( self.config = config self.quant_config = quant_config - self.speculative_config = speculative_config - # When running as a vLLM plugin, Qwen3NextDecoderLayer instantiates - # this module without forwarding speculative_config. That left - # self.num_spec=0 even with MTP enabled, so get_state_shape() (the - # instance method vLLM's MambaBase.get_kv_cache_spec uses to size each - # layer's KV cache) allocated conv_state with only `kernel_size-1` - # token rows. During spec decode, causal_conv1d_update writes - # `kernel_size-1 + num_spec` rows per slot and the extra row spilled - # into the page-adjacent ssm_state, corrupting layer 0's recurrent - # state. Pull the spec config from the vLLM config as a fallback. - if is_vllm() and self.speculative_config is None: - vllm_spec_config = get_current_vllm_config().speculative_config - if vllm_spec_config is not None: - self.speculative_config = vllm_spec_config + # Qwen3NextDecoderLayer instantiates this module without forwarding + # speculative_config, so fall back to atom_config.speculative_config + # (populated by both standalone ATOM and the vLLM plugin path's + # _generate_atom_config_from_vllm_config). 
Without a correct num_spec, + # get_state_shape() (used by vLLM's MambaBase.get_kv_cache_spec to + # size each layer's KV cache) sizes conv_state with only + # `kernel_size-1` token rows, but causal_conv1d_update writes + # `kernel_size-1 + num_spec` rows per slot during spec decode — the + # extra row spills into the page-adjacent ssm_state and corrupts + # layer 0's recurrent state. + self.speculative_config = speculative_config or atom_config.speculative_config self.num_spec = ( self.speculative_config.num_speculative_tokens if self.speculative_config From a7063090eef6299bfa4a27cfaeec1c2ca91bc7e3 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 15:21:58 +0000 Subject: [PATCH 09/15] use atom spec decode config for plugin loading Signed-off-by: ganyi --- atom/model_loader/loader.py | 5 +---- atom/plugin/vllm/model_wrapper.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 36c9db22b..0763179f5 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,12 +388,9 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): - n_routed = getattr(hf_config, "n_routed_experts", None) or getattr( - hf_config, "num_experts", None - ) name = name.replace( maybe_matching_name, - f"mlp.experts.{n_routed}.", + f"mlp.experts.{hf_config.n_routed_experts}.", ) for k in packed_modules_mapping: # We handle the experts below in expert_params_mapping diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index 7197d0d65..4eada7d4c 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -422,7 +422,7 @@ def load_weights( draft_hf_config = None if is_mtp_draft_model: draft_model_config = getattr( - getattr(self.vllm_config, "speculative_config", None), + getattr(self.atom_config, "speculative_config", None), "draft_model_config", None, ) From 4dca1352fafc63e6a21a5d4d455549692a209a1d Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 01:59:26 +0000 Subject: [PATCH 10/15] remove unnecessary changes in modeling Signed-off-by: ganyi --- atom/models/qwen3_next.py | 24 ++---------------------- atom/models/qwen3_next_mtp.py | 13 +++---------- 2 files changed, 5 insertions(+), 32 deletions(-) diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index 5fd55e072..40ad2380a 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -279,7 +279,6 @@ def __init__( atom_config, quant_config=None, prefix: str = "", - layer_num: int | None = None, ) -> None: super().__init__() if hasattr(atom_config.hf_config, "text_config"): @@ -381,15 +380,6 @@ def __init__( k_norm=self.k_norm, ) - # For MTP, the prefix is e.g. "mtp.layers.0.self_attn" so - # extract_layer_index(prefix) returns 0, which would collide with the - # target model's layer 0 KV cache slot. Allow callers (e.g. - # Qwen3NextDecoderLayer) to pass an explicit `layer_num` so MTP can - # use absolute indices (mtp_start_layer_idx + idx) and get its own - # KV cache slot. 
- attn_layer_num = ( - layer_num if layer_num is not None else extract_layer_index(prefix) - ) self.attn = Attention( self.num_heads, self.head_dim, @@ -398,7 +388,7 @@ def __init__( kv_cache_dtype=atom_config.kv_cache_dtype, quant_config=quant_config, use_mla=False, - layer_num=attn_layer_num, + layer_num=extract_layer_index(prefix), config=atom_config, prefix=f"{prefix}", **fusion_kwargs, @@ -495,16 +485,7 @@ def __init__( self.config = config self.quant_config = quant_config - # Qwen3NextDecoderLayer instantiates this module without forwarding - # speculative_config, so fall back to atom_config.speculative_config - # (populated by both standalone ATOM and the vLLM plugin path's - # _generate_atom_config_from_vllm_config). Without a correct num_spec, - # get_state_shape() (used by vLLM's MambaBase.get_kv_cache_spec to - # size each layer's KV cache) sizes conv_state with only - # `kernel_size-1` token rows, but causal_conv1d_update writes - # `kernel_size-1 + num_spec` rows per slot during spec decode — the - # extra row spills into the page-adjacent ssm_state and corrupts - # layer 0's recurrent state. + self.speculative_config = speculative_config or atom_config.speculative_config self.num_spec = ( self.speculative_config.num_speculative_tokens @@ -799,7 +780,6 @@ def __init__( atom_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - layer_num=layer_num, ) else: raise ValueError(f"Invalid layer_type {self.layer_type}") diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index d89df525b..dcdc603b2 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -38,10 +38,6 @@ def __init__(self, atom_config: Config, prefix: str = ""): config.hidden_size, ) - # Pass the layer's HF-style prefix so the quant_config exclude list - # (which contains "mtp.fc" in Qwen3-Next FP8 checkpoints) is honored; - # without it the lookup uses "" and falls back to the global FP8 spec, - # which makes fc FP8 even though the source weight is BF16. self.fc = ColumnParallelLinear( self.config.hidden_size * 2, self.config.hidden_size, @@ -50,18 +46,15 @@ def __init__(self, atom_config: Config, prefix: str = ""): prefix=f"{prefix}.fc", ) - # Use 0-indexed prefix (matches checkpoint's mtp.layers.0.* weight - # names and vLLM's reference impl), but keep layer_num as the - # absolute index so the attention layer gets a KV cache slot that - # doesn't collide with the target model's layers. 
self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( atom_config, layer_type="full_attention", prefix=f"{prefix}.layers.{idx}", - layer_num=self.mtp_start_layer_idx + idx, + layer_num=idx, ) - for idx in range(self.num_mtp_layers) + for idx in range( + self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From 809e9319e1905e3928cb27bc790308ada4d2cf7c Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 02:00:43 +0000 Subject: [PATCH 11/15] format Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index dcdc603b2..5c547d955 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -54,7 +54,8 @@ def __init__(self, atom_config: Config, prefix: str = ""): layer_num=idx, ) for idx in range( - self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) + self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers + ) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From d104a466abb717b5cd0de69bb6fc91d3368ef9a0 Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 02:21:18 +0000 Subject: [PATCH 12/15] add qwen3next mtp into benchmark Signed-off-by: ganyi --- .github/benchmark/oot_benchmark_models.json | 18 ++++++++++++++++++ .github/benchmark/oot_models_accuracy.json | 12 ++++++++++++ .../atom-vllm-accuracy-validation.yaml | 15 +++++++++++++++ .github/workflows/atom-vllm-benchmark.yaml | 16 ++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 1b54c6265..9a478cbf1 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -268,6 +268,24 @@ "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + }, + { + "tp_size": 1, + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET)", + "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp1", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", + "bench_args": "", + "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + }, + { + "tp_size": 4, + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET)", + "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp4", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", + "bench_args": "", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" } ] }, diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index 1050e3f85..c2b9da043 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ 
b/.github/benchmark/oot_models_accuracy.json @@ -55,6 +55,18 @@ "accuracy_baseline_model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "_baseline_note": "Using Qwen3-235B baseline as proxy; needs CI measurement for Qwen3.5 specific baseline" }, + { + "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", + "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "extraArgs": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.80, + "accuracy_baseline": 0.81, + "accuracy_baseline_model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "_baseline_note": "Qwen3-Next-80B-A3B-Instruct-FP8 baseline with TP4 (no MTP) as proxy; needs CI measurement for MTP-specific baseline" + }, { "model_name": "Llama-3.1-8B-Instruct TP1", "model_path": "meta-llama/Llama-3.1-8B-Instruct", diff --git a/.github/workflows/atom-vllm-accuracy-validation.yaml b/.github/workflows/atom-vllm-accuracy-validation.yaml index 0b2d7e7a0..b940f3753 100644 --- a/.github/workflows/atom-vllm-accuracy-validation.yaml +++ b/.github/workflows/atom-vllm-accuracy-validation.yaml @@ -24,6 +24,11 @@ on: required: false type: boolean default: false + run_qwen3_next_80b_mtp_tp4: + description: "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4" + required: false + type: boolean + default: false run_qwen35_397b_fp8_tp8: description: "Qwen3.5-397B-A17B-FP8 TP8" required: false @@ -137,6 +142,7 @@ jobs: RUN_QWEN3_MOE_TP8: ${{ inputs.run_qwen3_moe_tp8 }} RUN_QWEN3_NEXT_80B_TP1: ${{ inputs.run_qwen3_next_80b_tp1 }} RUN_QWEN3_NEXT_80B_TP4: ${{ inputs.run_qwen3_next_80b_tp4 }} + RUN_QWEN3_NEXT_80B_MTP_TP4: ${{ inputs.run_qwen3_next_80b_mtp_tp4 }} RUN_QWEN35_397B_FP8_TP8: ${{ inputs.run_qwen35_397b_fp8_tp8 }} RUN_QWEN35_397B_TP8: ${{ inputs.run_qwen35_397b_tp8 }} RUN_QWEN35_397B_FP4_TP4: ${{ inputs.run_qwen35_397b_fp4_tp4 }} @@ -190,6 +196,15 @@ jobs: "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", "runner": "linux-atom-mi35x-4", }, + { + "toggle_env": "RUN_QWEN3_NEXT_80B_MTP_TP4", + "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", + "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "extra_args": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "accuracy_test_threshold": 0.80, + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "runner": "linux-atom-mi35x-4", + }, { "toggle_env": "RUN_QWEN35_397B_FP8_TP8", "model_name": "Qwen3.5-397B-A17B-FP8 TP8", diff --git a/.github/workflows/atom-vllm-benchmark.yaml b/.github/workflows/atom-vllm-benchmark.yaml index a12b91483..fa3d96707 100644 --- a/.github/workflows/atom-vllm-benchmark.yaml +++ b/.github/workflows/atom-vllm-benchmark.yaml @@ -36,6 +36,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -69,6 +71,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - 
Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -102,6 +106,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -135,6 +141,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -168,6 +176,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -201,6 +211,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -234,6 +246,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -267,6 +281,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) From e262d7e4c8e306d8a5fad1a815125e7e9438a4dc Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:01:44 +0800 Subject: [PATCH 13/15] [ci] disable FP8 blockscale weight preshuffle for Qwen3.5/Qwen3-Next Add ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 to all Qwen3.5 and Qwen3-Next model configs across benchmark, nightly accuracy, and recipe files. 
Co-Authored-By: Claude Opus 4 --- .github/benchmark/oot_benchmark_models.json | 20 +++++++++---------- .github/benchmark/oot_models_accuracy.json | 18 ++++++++--------- .../atom-vllm-accuracy-validation.yaml | 14 ++++++------- recipes/atom_vllm/Qwen3.5.md | 3 +++ recipes/atom_vllm/Qwen3Next.md | 2 ++ 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 9a478cbf1..4b9c7a9a9 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -181,7 +181,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 4 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 8, @@ -192,7 +192,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, @@ -213,7 +213,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, @@ -231,7 +231,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-tp1-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -240,7 +240,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-tp4-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 1, @@ -249,7 +249,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp1", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + 
"env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 2, @@ -258,7 +258,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp2", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 2 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -267,7 +267,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp4", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 1, @@ -276,7 +276,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -285,7 +285,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index c2b9da043..226c374ed 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ b/.github/benchmark/oot_models_accuracy.json @@ -3,7 +3,7 @@ "model_name": "Qwen3-235B-A22B-Instruct-2507-FP8 TP8+EP8", "model_path": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", "extraArgs": "--tensor-parallel-size 8 --enable-expert-parallel", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.87, @@ -14,7 +14,7 @@ "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8 TP4", "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extraArgs": "--tensor-parallel-size 4 --attention-backend ROCM_AITER_FA", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": 
"ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.76, @@ -25,7 +25,7 @@ "model_name": "Qwen3.5-397B-A17B-FP8 TP8", "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", "extraArgs": "--tensor-parallel-size 8 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.83, @@ -36,7 +36,7 @@ "model_name": "Qwen3.5-397B-A17B TP8", "model_path": "Qwen/Qwen3.5-397B-A17B", "extraArgs": "--tensor-parallel-size 8 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.83, @@ -47,7 +47,7 @@ "model_name": "Qwen3.5-397B-A17B-MXFP4 TP4", "model_path": "amd/Qwen3.5-397B-A17B-MXFP4", "extraArgs": "--tensor-parallel-size 4 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.82, @@ -59,10 +59,10 @@ "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extraArgs": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", - "accuracy_threshold": 0.80, + "accuracy_threshold": 0.8, "accuracy_baseline": 0.81, "accuracy_baseline_model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "_baseline_note": "Qwen3-Next-80B-A3B-Instruct-FP8 baseline with TP4 (no MTP) as proxy; needs CI measurement for MTP-specific baseline" @@ -169,7 +169,7 @@ "runner": "linux-atom-mi35x-1", "test_level": "nightly", "accuracy_threshold": 0.88, - "accuracy_baseline": 0.90, + "accuracy_baseline": 0.9, "accuracy_baseline_model": "openai/gpt-oss-120b" }, { @@ -181,7 +181,7 @@ "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.88, - "accuracy_baseline": 0.90, + "accuracy_baseline": 0.9, "accuracy_baseline_model": "openai/gpt-oss-120b" }, { diff --git a/.github/workflows/atom-vllm-accuracy-validation.yaml b/.github/workflows/atom-vllm-accuracy-validation.yaml index b940f3753..dc448212e 100644 --- a/.github/workflows/atom-vllm-accuracy-validation.yaml +++ b/.github/workflows/atom-vllm-accuracy-validation.yaml @@ -175,7 +175,7 @@ jobs: "model_path": 
"Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", "extra_args": "--tensor-parallel-size 8 --enable-expert-parallel", "accuracy_test_threshold": 0.87, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -184,7 +184,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 1", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-1", }, { @@ -193,7 +193,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 4", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { @@ -202,7 +202,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "accuracy_test_threshold": 0.80, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { @@ -211,7 +211,7 @@ jobs: "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", "extra_args": "--tensor-parallel-size 8", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -220,7 +220,7 @@ jobs: "model_path": "Qwen/Qwen3.5-397B-A17B", "extra_args": "--tensor-parallel-size 8", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -229,7 +229,7 @@ jobs: "model_path": "amd/Qwen3.5-397B-A17B-MXFP4", "extra_args": "--tensor-parallel-size 4", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 2b30b540a..93848beca 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -18,6 +18,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve 
Qwen/Qwen3.5-35B-A3B-FP8 \ --host localhost \ @@ -37,6 +38,7 @@ vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \ export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ --host localhost \ @@ -56,6 +58,7 @@ vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ --host localhost \ diff --git a/recipes/atom_vllm/Qwen3Next.md b/recipes/atom_vllm/Qwen3Next.md index 97e8bdaa4..019e297f6 100644 --- a/recipes/atom_vllm/Qwen3Next.md +++ b/recipes/atom_vllm/Qwen3Next.md @@ -18,6 +18,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ @@ -37,6 +38,7 @@ vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ From f3d17e7c86f4ccb5055b5257fc8f8cb4a3898d73 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:42:24 +0800 Subject: [PATCH 14/15] [ci] fix Qwen3-Next MTP benchmark label from MET to AW Co-Authored-By: Claude Opus 4 --- .github/benchmark/oot_benchmark_models.json | 8 +++--- .github/workflows/atom-vllm-benchmark.yaml | 32 ++++++++++----------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 4b9c7a9a9..f06a8e3c3 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -271,18 +271,18 @@ }, { "tp_size": 1, - "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET)", + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW)", "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp1", - "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-aw", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, - "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET)", + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW)", "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp4", - "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-aw", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "env_vars": 
"AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" diff --git a/.github/workflows/atom-vllm-benchmark.yaml b/.github/workflows/atom-vllm-benchmark.yaml index fa3d96707..d7a178d51 100644 --- a/.github/workflows/atom-vllm-benchmark.yaml +++ b/.github/workflows/atom-vllm-benchmark.yaml @@ -36,8 +36,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -71,8 +71,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -106,8 +106,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -141,8 +141,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -176,8 +176,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -211,8 +211,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -246,8 +246,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP 
TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -281,8 +281,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) From 9b9a77c0a6200cacbd9f68bb69e91fc8f431382a Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:47:41 +0800 Subject: [PATCH 15/15] [docs] fix Qwen3.5 recipe: update env var count and add preshuffle doc Remove stale "three" count (now variable list), add ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 to both the Important section and Key Environment Variables section. Co-Authored-By: Claude Opus 4 --- recipes/atom_vllm/Qwen3.5.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 93848beca..4e3b8c077 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -72,9 +72,10 @@ vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ --no-enable-prefix-caching ``` -**Important**: The following three environment variables are required for Qwen3.5: +**Important**: The following environment variables are required for Qwen3.5: - `ATOM_USE_CUSTOM_ALL_GATHER=0`: Disables custom all-gather for compatibility with Qwen3.5 model architecture +- `ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0`: Disables FP8 blockscale weight preshuffle - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations, which can significantly improve TTFT (Time To First Token) performance. **Note**: This optimization may introduce a risk of accuracy degradation. For accuracy-critical workloads, consider validating with your specific use case. ## Step 3: Performance Benchmark @@ -136,6 +137,7 @@ Reference result (TP=4): ## Key Environment Variables - `ATOM_USE_CUSTOM_ALL_GATHER=0`: **Required** - disables custom all-gather for compatibility with Qwen3.5 model architecture +- `ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0`: **Required** - disables FP8 blockscale weight preshuffle - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations - **Benefit**: Significantly improves TTFT (Time To First Token) performance by reducing communication overhead during tensor parallelism all-reduce operations - **Risk**: May cause slight accuracy degradation due to lower quantization precision
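
For reference, the MTP CI entry touched above implies a launch command along these lines. This is only a sketch assembled from the `qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-aw` benchmark entry in this patch; host, port, and any flags not shown in that entry are not specified here.

```bash
# Environment mirrored from the Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) CI entry.
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
export ATOM_USE_CUSTOM_ALL_GATHER=0
export ATOM_USE_FLYDSL_GDR=0
export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0

# Serve with one MTP speculative token, matching the CI extra_args.
vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \
  --trust-remote-code \
  --tensor-parallel-size 4 \
  --max-num-batched-tokens 32768 \
  --max-model-len 16384 \
  --speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}'
```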