From 3af7ccb4f01661673e59342943400d015aeb68d1 Mon Sep 17 00:00:00 2001 From: ganyi Date: Wed, 13 May 2026 01:45:37 +0000 Subject: [PATCH 01/15] mtp 1 acc right Signed-off-by: ganyi --- atom/model_loader/loader.py | 5 +- atom/models/qwen3_next.py | 33 ++++++++++- atom/models/qwen3_next_mtp.py | 37 ++++++++++-- atom/plugin/attention.py | 36 +++++++++++- atom/plugin/attention_mha.py | 58 +++++++++++++------ .../vllm/attention_backend/attention_gdn.py | 9 ++- atom/plugin/vllm/model_wrapper.py | 30 +++++----- atom/plugin/vllm/register.py | 1 + 8 files changed, 165 insertions(+), 44 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 0763179f5..08261e271 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,9 +388,12 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): + n_routed = getattr( + hf_config, "n_routed_experts", None + ) or getattr(hf_config, "num_experts", None) name = name.replace( maybe_matching_name, - f"mlp.experts.{hf_config.n_routed_experts}.", + f"mlp.experts.{n_routed}.", ) for k in packed_modules_mapping: # We handle the experts below in expert_params_mapping diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index f8abcb867..c42dd3b9e 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -279,6 +279,7 @@ def __init__( atom_config, quant_config=None, prefix: str = "", + layer_num: int | None = None, ) -> None: super().__init__() if hasattr(atom_config.hf_config, "text_config"): @@ -380,6 +381,15 @@ def __init__( k_norm=self.k_norm, ) + # For MTP, the prefix is e.g. "mtp.layers.0.self_attn" so + # extract_layer_index(prefix) returns 0, which would collide with the + # target model's layer 0 KV cache slot. Allow callers (e.g. + # Qwen3NextDecoderLayer) to pass an explicit `layer_num` so MTP can + # use absolute indices (mtp_start_layer_idx + idx) and get its own + # KV cache slot. + attn_layer_num = ( + layer_num if layer_num is not None else extract_layer_index(prefix) + ) self.attn = Attention( self.num_heads, self.head_dim, @@ -388,7 +398,7 @@ def __init__( kv_cache_dtype=atom_config.kv_cache_dtype, quant_config=quant_config, use_mla=False, - layer_num=extract_layer_index(prefix), + layer_num=attn_layer_num, config=atom_config, prefix=f"{prefix}", **fusion_kwargs, @@ -486,6 +496,19 @@ def __init__( self.config = config self.quant_config = quant_config self.speculative_config = speculative_config + # When running as a vLLM plugin, Qwen3NextDecoderLayer instantiates + # this module without forwarding speculative_config. That left + # self.num_spec=0 even with MTP enabled, so get_state_shape() (the + # instance method vLLM's MambaBase.get_kv_cache_spec uses to size each + # layer's KV cache) allocated conv_state with only `kernel_size-1` + # token rows. During spec decode, causal_conv1d_update writes + # `kernel_size-1 + num_spec` rows per slot and the extra row spilled + # into the page-adjacent ssm_state, corrupting layer 0's recurrent + # state. Pull the spec config from the vLLM config as a fallback. 
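+        # Illustrative arithmetic (hypothetical values): with a conv kernel
+        # size of 4 and num_spec=1, each slot needs 4 conv_state rows
+        # (kernel_size - 1 + num_spec); sizing for num_spec=0 reserves only
+        # 3, and that fourth row is the one that spilled into ssm_state.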
+ if is_vllm() and self.speculative_config is None: + vllm_spec_config = get_current_vllm_config().speculative_config + if vllm_spec_config is not None: + self.speculative_config = vllm_spec_config self.num_spec = ( self.speculative_config.num_speculative_tokens if self.speculative_config @@ -779,6 +802,7 @@ def __init__( atom_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + layer_num=layer_num, ) else: raise ValueError(f"Invalid layer_type {self.layer_type}") @@ -863,7 +887,6 @@ def forward( residual: torch.Tensor | None, ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention - if self.input_layernorm.use_fused_quant: if residual is None: residual = hidden_states @@ -1059,6 +1082,11 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight + # Expose embed_tokens at this level for vLLM MTP embedding sharing. + # vLLM's proposer accesses target_wrapper.model.embed_tokens, where + # target_wrapper.model = this class (Qwen3NextForCausalLM). + self.embed_tokens = self.model.embed_tokens + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors ) @@ -1132,6 +1160,7 @@ def get_mamba_state_shape_from_config( if vllm_config.speculative_config else 0 ) + return MambaStateShapeCalculator.gated_delta_net_state_shape( tp_size, hf_config.linear_num_key_heads, diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index 2a5f0737e..3f49a80df 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -27,6 +27,13 @@ def __init__(self, atom_config: Config, prefix: str = ""): config: Qwen3NextConfig = atom_config.hf_config self.config = config + # Qwen3NextDecoderLayer's MoE block needs these attributes, which + # Qwen3NextModel.__init__ sets but which are absent from the raw + # HF config. Set them here so the MTP predictor works standalone. + if not hasattr(config, "n_shared_experts"): + config.n_shared_experts = 1 + if not hasattr(config, "n_routed_experts"): + config.n_routed_experts = config.num_experts self.vocab_size = config.vocab_size @@ -38,6 +45,10 @@ def __init__(self, atom_config: Config, prefix: str = ""): config.hidden_size, ) + # Pass the layer's HF-style prefix so the quant_config exclude list + # (which contains "mtp.fc" in Qwen3-Next FP8 checkpoints) is honored; + # without it the lookup uses "" and falls back to the global FP8 spec, + # which makes fc FP8 even though the source weight is BF16. self.fc = ColumnParallelLinear( self.config.hidden_size * 2, self.config.hidden_size, @@ -46,16 +57,18 @@ def __init__(self, atom_config: Config, prefix: str = ""): prefix=f"{prefix}.fc", ) + # Use 0-indexed prefix (matches checkpoint's mtp.layers.0.* weight + # names and vLLM's reference impl), but keep layer_num as the + # absolute index so the attention layer gets a KV cache slot that + # doesn't collide with the target model's layers. 
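+        # Example (hypothetical layer counts): with a 48-layer target model
+        # and a single MTP layer, idx=0 keeps the checkpoint-facing prefix
+        # "mtp.layers.0" while layer_num becomes mtp_start_layer_idx + 0,
+        # i.e. 48, a KV cache slot past all of the target model's layers.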
self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( atom_config, layer_type="full_attention", prefix=f"{prefix}.layers.{idx}", - layer_num=idx, - ) - for idx in range( - self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers + layer_num=self.mtp_start_layer_idx + idx, ) + for idx in range(self.num_mtp_layers) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -131,6 +144,10 @@ def remap_mtp_weight_name(self, name: str) -> str | None: def __init__(self, atom_config: Config, prefix: str = ""): super().__init__() config = atom_config.hf_config + if not hasattr(config, "n_shared_experts"): + config.n_shared_experts = 1 + if not hasattr(config, "n_routed_experts"): + config.n_routed_experts = config.num_experts if atom_config.enable_prefix_caching: raise ValueError("Qwen3NextMTP currently does not support prefix caching") self.config = config @@ -171,9 +188,19 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) + # Mirror target's get_expert_mapping: when shared-expert fusion is on, + # the loader rewrites `mlp.shared_expert.*` to `mlp.experts.{N}.*` + # (where N == n_routed_experts), so the expert_mapping must include + # an extra slot for that fused shared-expert. Without this, MTP's + # shared_expert weights get silently dropped during loading. + from atom.model_ops.topK import is_rocm_aiter_fusion_shared_expert_enabled + + n_routed = getattr(self.config, "n_routed_experts", self.config.num_experts) + n_shared = getattr(self.config, "n_shared_experts", 0) return FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, + num_experts=n_routed + + (n_shared if is_rocm_aiter_fusion_shared_expert_enabled() else 0), ) diff --git a/atom/plugin/attention.py b/atom/plugin/attention.py index 9c674c1cd..81ded7cfc 100644 --- a/atom/plugin/attention.py +++ b/atom/plugin/attention.py @@ -283,6 +283,28 @@ def init_method_under_plugin_mode( i64_kwargs = {"dtype": torch.int64, "device": device} self.positions = CpuGpuBuffer(max_num_batched_tokens, **i64_kwargs) + # Bump reorder_batch_threshold so multi-token spec-decode requests + # (MTP / EAGLE) are routed through the decode path. Mirrors vLLM's + # AttentionMetadataBuilder._init_reorder_batch_threshold(supports_spec_as_decode=True). 
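+        # Worked example: num_speculative_tokens=1 without parallel drafting
+        # gives max_num_queries_for_spec = 1 + 1 * 1 = 2, so the threshold
+        # rises from 1 to 2; with parallel drafting and 2 speculative tokens
+        # it would be 1 + 2 * 2 = 5.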
+ speculative_config = getattr(config, "speculative_config", None) + if ( + getattr(self, "reorder_batch_threshold", None) is not None + and speculative_config is not None + and getattr(speculative_config, "num_speculative_tokens", None) is not None + ): + parallel_drafting = getattr(speculative_config, "parallel_drafting", False) + max_num_queries_for_spec = 1 + (2 if parallel_drafting else 1) * ( + speculative_config.num_speculative_tokens + ) + self.reorder_batch_threshold = max( + self.reorder_batch_threshold, max_num_queries_for_spec + ) + logger.info( + "Spec decode: bumped reorder_batch_threshold to %d (num_spec_tokens=%d)", + self.reorder_batch_threshold, + speculative_config.num_speculative_tokens, + ) + return init_method_under_plugin_mode @@ -300,7 +322,7 @@ def setup_attn_metadata_builder_base_class_and_attributes(class_dict: dict): needs_generic = True # align with vllm rocm aiter fa - class_dict["_cudagraph_support"] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + class_dict["_cudagraph_support"] = AttentionCGSupport.UNIFORM_BATCH class_dict["reorder_batch_threshold"] = 1 return base_class, generic_base, needs_generic, class_dict @@ -324,9 +346,12 @@ def build( from vllm.v1.attention.backends.utils import split_decodes_prefills_and_extends - # here assume the decode num token is 1 per request + # decode_threshold tracks reorder_batch_threshold so MTP/EAGLE + # multi-token verification (query_len > 1) routes through decode. + decode_threshold = getattr(self, "reorder_batch_threshold", 1) or 1 split_ret = split_decodes_prefills_and_extends( - common_attn_metadata=common_attn_metadata, decode_threshold=1 + common_attn_metadata=common_attn_metadata, + decode_threshold=decode_threshold, ) ( @@ -351,6 +376,11 @@ def build( query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] num_computed_tokens_cpu = common_attn_metadata._num_computed_tokens_cpu + # In async spec-decode mode (auto-enabled for MTP/EAGLE), vLLM sets + # _num_computed_tokens_cpu to None because the GPU seq_lens is the + # authoritative source. Reconstruct from CPU tensors we already have. + if num_computed_tokens_cpu is None: + num_computed_tokens_cpu = seq_lens - query_lens_cpu prefill_max_query_len = decode_max_query_len = ( common_attn_metadata.max_query_len diff --git a/atom/plugin/attention_mha.py b/atom/plugin/attention_mha.py index 16c88949d..100c492ab 100644 --- a/atom/plugin/attention_mha.py +++ b/atom/plugin/attention_mha.py @@ -234,15 +234,27 @@ def paged_attention_triton_plugin_mode( v_cache: torch.Tensor, k_scale: torch.Tensor, v_scale: torch.Tensor, + num_decodes: int, out: torch.Tensor, attn_metadata: "AttentionMetaData", ps: bool = True, ): - o = out - num_seqs, num_q_heads_total, head_size = q.shape + # q.shape[0] == num_decodes * max_query_len for MTP (one row per decode + # token, query_len > 1). For non-MTP it equals num_decodes (query_len = 1). + # pa_decode_gluon handles multi-token causal masking internally when + # `query_length > 1` is passed; intermediate buffers must be sized + # `num_decodes` (not q.shape[0]) and `query_group_size` must include + # the max_qlen multiplier — mirroring server-mode `paged_attention_triton`. 
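+        # Illustrative shapes (hypothetical sizes): 8 decode requests with one
+        # speculative token each give max_qlen=2 and q.shape[0]=16, while the
+        # intermediate buffers below are allocated with num_decodes=8 rows and
+        # query_group_size = 2 * (num_q_heads_total // num_kv_heads).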
+ _, num_q_heads_total, head_size = q.shape num_blocks, num_kv_heads, _, block_size, _ = k_cache.shape - query_group_size = num_q_heads_total // num_kv_heads + decode_metadata = attn_metadata.plugin_metadata.decode_metadata + max_qlen = decode_metadata.max_query_len if decode_metadata is not None else 1 assert num_q_heads_total % num_kv_heads == 0 + + seq_lens = attn_metadata.plugin_metadata.seq_lens[:num_decodes] + block_tables = attn_metadata.plugin_metadata.block_table[:num_decodes] + + query_group_size = max_qlen * (num_q_heads_total // num_kv_heads) context_partition_size = 256 # use_ps = self.adopt_persistent_kernel( @@ -250,7 +262,9 @@ def paged_attention_triton_plugin_mode( # ) use_ps = True if use_ps: - max_context_partition_num = get_recommended_splits(num_seqs, num_kv_heads) + max_context_partition_num = get_recommended_splits( + num_decodes, num_kv_heads + ) else: max_context_partition_num = _NO_PS_FIXED_SPLITS @@ -258,9 +272,8 @@ def paged_attention_triton_plugin_mode( max_context_partition_num = 1 context_partition_size = 128 - # Output buffers (same as Triton) intermediate_shape = ( - num_seqs, + num_decodes, num_kv_heads, max_context_partition_num, query_group_size, @@ -283,21 +296,19 @@ def paged_attention_triton_plugin_mode( k_scale = k_scale.unsqueeze(-1) v_scale = v_scale.unsqueeze(-1) - num_decode_seqs = q.shape[0] - seq_lens_decode = attn_metadata.plugin_metadata.seq_lens[:num_decode_seqs] - block_tables_decode = attn_metadata.plugin_metadata.block_table[ - :num_decode_seqs - ] - + # Kernel takes natural q layout [batch * query_length, num_q_heads, head_size]. + # Internally it derives batch_size = q.shape[0] // query_length and reshapes + # to [batch, query_length, num_kv_heads, group, head_size]. See + # aiter/aiter/ops/triton/gluon/pa_decode_gluon.py:5371-5377 and 5542-5544. 
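+        # Continuing the illustration above: q arrives as
+        # [16, num_q_heads_total, head_size]; with query_length=2 the kernel
+        # derives batch_size = 16 // 2 = 8 and views it as
+        # [8, 2, num_kv_heads, group, head_size].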
torch.ops.aiter.pa_decode_gluon( - o, + out, q, k_cache, v_cache, - seq_lens_decode, - block_tables_decode, + seq_lens, + block_tables, self.scale, - 1, # query_lenth + max_qlen, # query_length — handles multi-token causal mask internally max_context_partition_num, context_partition_size, compute_type, @@ -312,8 +323,7 @@ def paged_attention_triton_plugin_mode( sliding_window=self.sliding_window, ps=use_ps, ) - - return o + return out def paged_attention_asm_plugin_mode( self, @@ -327,6 +337,11 @@ def paged_attention_asm_plugin_mode( attn_metadata: "AttentionMetaData", out: torch.Tensor, ): + decode_metadata = attn_metadata.plugin_metadata.decode_metadata + max_qlen = decode_metadata.max_query_len if decode_metadata is not None else 1 + qo_indptr = ( + decode_metadata.query_start_loc if decode_metadata is not None else None + ) aiter.pa_fwd_asm( Q=q, K=k_cache, @@ -336,9 +351,11 @@ def paged_attention_asm_plugin_mode( block_tables_stride0=attn_metadata.plugin_metadata.block_table[ :num_decodes ].stride(0), + max_qlen=max_qlen, K_QScale=k_scale, V_QScale=v_scale, out_=out[:num_decode_tokens], + qo_indptr=qo_indptr, high_precision=0, ) @@ -706,12 +723,13 @@ def forward_impl_plugin_mode( extend_tokens_slice = slice( num_decode_tokens, num_decode_tokens + num_extend_tokens ) + extend_reqs_slice = slice(num_decodes, num_decodes + num_extends) extend_querys = query[extend_tokens_slice] extend_keys = key[extend_tokens_slice] extend_values = value[extend_tokens_slice] extend_outputs = output[extend_tokens_slice] extend_block_table = attn_metadata.plugin_metadata.block_table[ - extend_tokens_slice + extend_reqs_slice ] extend_slot_mapping = attn_metadata.plugin_metadata.slot_mapping[ extend_tokens_slice @@ -745,6 +763,7 @@ def forward_impl_plugin_mode( v_cache=new_value_cache, k_scale=k_scale, v_scale=v_scale, + num_decodes=num_decodes, out=output_actual_tokens[:num_decode_tokens], attn_metadata=attn_metadata, ) @@ -757,6 +776,7 @@ def forward_impl_plugin_mode( v_cache=new_value_cache, k_scale=k_scale, v_scale=v_scale, + num_decodes=num_decodes, out=output_actual_tokens[:num_decode_tokens], attn_metadata=attn_metadata, ) diff --git a/atom/plugin/vllm/attention_backend/attention_gdn.py b/atom/plugin/vllm/attention_backend/attention_gdn.py index b6158a086..87a2f2f9f 100644 --- a/atom/plugin/vllm/attention_backend/attention_gdn.py +++ b/atom/plugin/vllm/attention_backend/attention_gdn.py @@ -22,6 +22,7 @@ from atom.model_ops.fla_ops.fused_sigmoid_gating import ( fused_sigmoid_gating_delta_rule_update, ) + from atom.utils import envs from torch import nn @@ -385,7 +386,13 @@ def forward( ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to( ssm_state.dtype ) - core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0) + # Only write directly when there are no spec tokens. With spec + # decode active, mixed_qkv was index_select'd by non_spec_token_indx + # so core_attn_out_non_spec has fewer rows than num_actual_tokens. + # The merge below (index_copy_) handles the scatter back to the + # correct slot positions. 
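+            # E.g. (hypothetical counts): 14 actual tokens of which 4 are
+            # speculative leaves core_attn_out_non_spec with 10 rows; copying
+            # them into core_attn_out[:14] would land rows in the wrong slots,
+            # so the spec-decode case defers to the index_copy_ merge below.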
+ if spec_sequence_masks is None: + core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0) elif attn_metadata.num_decodes > 0: o = core_attn_out[: attn_metadata.num_decode_tokens] if USE_FLYDSL_GDR: diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index c2b990c18..31d546805 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -35,7 +35,9 @@ logger = logging.getLogger("atom") - +_MTP_MASK_INPUT_ARCH: set[str] = { + "DeepSeekMTPModel", +} _ATOM_MODEL_CLASSES: dict[str, str] = { "LlamaForCausalLM": "atom.models.llama:LlamaForCausalLM", "Qwen3ForCausalLM": "atom.models.qwen3:Qwen3ForCausalLM", @@ -47,6 +49,7 @@ "GlmMoeDsaForCausalLM": "atom.models.deepseek_v2:GlmMoeDsaForCausalLM", "DeepSeekMTPModel": "atom.models.deepseek_mtp:DeepSeekMTP", "Qwen3NextForCausalLM": "atom.models.qwen3_next:Qwen3NextForCausalLM", + "Qwen3NextMTP": "atom.models.qwen3_next_mtp:Qwen3NextMTP", "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5MoeForConditionalGeneration_", "Qwen3_5ForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5ForConditionalGeneration_", "KimiK25ForConditionalGeneration": "atom.plugin.vllm.models.kimi_k25:KimiK25ForConditionalGeneration_", @@ -121,7 +124,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + from atom.config import get_current_atom_config _set_framework_backbone("vllm") self.config = vllm_config.model_config.hf_config @@ -147,12 +150,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): spec_method = speculative_config.method self.is_mtp = spec_method == "mtp" - _prepare_env(atom_config=self.atom_config) - main_model_arch = vllm_config.model_config.architectures[0] model_arch = _select_model_arch(vllm_config) self.is_mtp_draft_model = self.is_mtp and model_arch != main_model_arch + if self.is_mtp_draft_model: + self.atom_config = get_current_atom_config() + else: + self.atom_config = generate_atom_config_for_plugin_mode(vllm_config) self.model_arch = model_arch + _prepare_env(atom_config=self.atom_config) model_cls = _get_atom_model_cls(model_arch) module_remapping = getattr(model_cls, "packed_modules_mapping", {}) weights_mapper = getattr(model_cls, "hf_to_atom_mapper", {}) @@ -182,9 +188,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logger.info(f"Construct ATOM model {model_arch} for vLLM plugin mode") self.model = model_cls(self.atom_config) - self._adapt_mtp_layers_for_vllm() - # Mirror nested attributes required by vLLM speculative decoding. - self._expose_spec_decode_attrs() + + if model_arch in _MTP_MASK_INPUT_ARCH: + self._adapt_mtp_layers_for_vllm() + # Mirror nested attributes required by vLLM speculative decoding. 
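+            # (For instance, vLLM's proposer reads
+            # target_wrapper.model.embed_tokens for embedding sharing; see the
+            # embed_tokens aliasing added to Qwen3NextForCausalLM above.)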
+ self._expose_spec_decode_attrs() # For sparse MLA, register the Indexer's DeepseekV32IndexerCache as # a virtual subclass of vLLM's AttentionLayerBase so vLLM can discover @@ -192,7 +200,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._register_indexer_caches_with_vllm() if self.model is None: - model_arch = vllm_config.model_config.architectures[0] raise ValueError( f"The model {model_arch} is not supported by model impl backend atom" ) @@ -309,8 +316,7 @@ def _register_indexer_caches_with_vllm(self): if prefix not in vllm_sfc: vllm_sfc[prefix] = module logger.info( - f"Registered indexer cache in vLLM static_forward_context: " - f"{prefix}" + f"Registered indexer cache in vLLM static_forward_context: {prefix}" ) else: logger.warning( @@ -397,7 +403,6 @@ def forward( inputs_embeds=inputs_embeds, **model_kwargs, ) - if not self.pp_group.is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) @@ -412,7 +417,7 @@ def load_weights( is_mtp_draft_model = self.model_arch in { "DeepSeekMTPModel", - "Qwen3NextMTPModel", + "Qwen3NextMTP", } draft_hf_config = None if is_mtp_draft_model: @@ -452,7 +457,6 @@ class ATOMMoEForCausalLM(ATOMModelBase, VllmModelForTextGeneration): ... class ATOMForConditionalGeneration( ATOMModelBase, VllmModelForTextGeneration, SupportsMultiModal, SupportsMRoPE ): - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: """ diff --git a/atom/plugin/vllm/register.py b/atom/plugin/vllm/register.py index 9ef76e601..91e241e9a 100644 --- a/atom/plugin/vllm/register.py +++ b/atom/plugin/vllm/register.py @@ -30,6 +30,7 @@ "GlmMoeDsaForCausalLM": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "DeepSeekMTPModel": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "Qwen3NextForCausalLM": "atom.models.qwen3_next:Qwen3NextForCausalLMVllm", + "Qwen3NextMTP": ATOM_MOE_CAUSAL_LM_MODEL_WRAPPER, "Qwen3_5ForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5:Qwen3_5MoeForConditionalGeneration", "KimiK25ForConditionalGeneration": "atom.plugin.vllm.models.kimi_k25:KimiK25ForConditionalGeneration", From 580f0fdeab0048e5bd79895922ee41eea3c054b1 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:05:17 +0000 Subject: [PATCH 02/15] add recipe for qwen3-next-mtp Signed-off-by: ganyi --- recipes/atom_vllm/Qwen3Next.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/recipes/atom_vllm/Qwen3Next.md b/recipes/atom_vllm/Qwen3Next.md index e22f80d1c..97e8bdaa4 100644 --- a/recipes/atom_vllm/Qwen3Next.md +++ b/recipes/atom_vllm/Qwen3Next.md @@ -17,6 +17,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener ```bash export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ @@ -31,8 +32,25 @@ vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --no-enable-prefix-caching ``` -**Important**: `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1` is required for Qwen3-Next because it uses a hybrid architecture with both linear attention (GatedDeltaNet) and full attention layers. This env var ensures full attention layers use vLLM's default implementation. 
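+Once the server is up, a quick sanity check of the endpoint (illustrative request; adjust host, port, and prompt to your deployment):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
+    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+    "max_tokens": 32
+  }'
+```
+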
+### Qwen3-Next-80B-A3B-Instruct-FP8 MTP (TP=1, MI355X) +```bash +export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 +export ATOM_USE_CUSTOM_ALL_GATHER=0 +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ + --host localhost \ + --port 8000 \ + --tensor-parallel-size 1 \ + --kv-cache-dtype fp8 \ + --gpu_memory_utilization 0.9 \ + --async-scheduling \ + --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ + --max-model-len 16384 \ + --max-num-batched-tokens 32768 \ + --speculative-config '{"num_speculative_tokens":1, "method": "mtp"}' \ + --no-enable-prefix-caching +``` ## Step 3: Performance Benchmark Users can use the default vllm bench commands for performance benchmarking. @@ -70,9 +88,6 @@ lm_eval --model local-completions \ --num_fewshot 3 ``` -## Key Environment Variables - -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: **Required** - disables ATOM attention plugin to use vLLM's implementation for full attention layers ## Architecture Notes From 598be9a919fde691765e518eb596d5eaa0961040 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:06:59 +0000 Subject: [PATCH 03/15] modify some qwen3.5 recipe Signed-off-by: ganyi --- recipes/atom_vllm/Qwen3.5.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 94a900e07..2b30b540a 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -71,7 +71,6 @@ vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ **Important**: The following three environment variables are required for Qwen3.5: -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: Disables ATOM attention plugin to use vLLM's implementation for full attention layers (required because Qwen3.5 uses a hybrid architecture with both linear attention (GatedDeltaNet) and full attention layers) - `ATOM_USE_CUSTOM_ALL_GATHER=0`: Disables custom all-gather for compatibility with Qwen3.5 model architecture - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations, which can significantly improve TTFT (Time To First Token) performance. **Note**: This optimization may introduce a risk of accuracy degradation. For accuracy-critical workloads, consider validating with your specific use case. 
@@ -133,7 +132,6 @@ Reference result (TP=4): ## Key Environment Variables -- `ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1`: **Required** - disables ATOM attention plugin to use vLLM's implementation for full attention layers - `ATOM_USE_CUSTOM_ALL_GATHER=0`: **Required** - disables custom all-gather for compatibility with Qwen3.5 model architecture - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations - **Benefit**: Significantly improves TTFT (Time To First Token) performance by reducing communication overhead during tensor parallelism all-reduce operations From 9a6381e9f17aaf59089045d8d5ccbf54baba71d4 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 08:35:34 +0000 Subject: [PATCH 04/15] black Signed-off-by: ganyi --- atom/model_loader/loader.py | 6 +++--- atom/plugin/vllm/model_wrapper.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 08261e271..36c9db22b 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,9 +388,9 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): - n_routed = getattr( - hf_config, "n_routed_experts", None - ) or getattr(hf_config, "num_experts", None) + n_routed = getattr(hf_config, "n_routed_experts", None) or getattr( + hf_config, "num_experts", None + ) name = name.replace( maybe_matching_name, f"mlp.experts.{n_routed}.", diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index 31d546805..e540a1274 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -125,6 +125,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() from atom.config import get_current_atom_config + _set_framework_backbone("vllm") self.config = vllm_config.model_config.hf_config From ce844440494039f4663da5671da5f56f8a5c7ac3 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 13:54:58 +0000 Subject: [PATCH 05/15] remove redundant code Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index 3f49a80df..d89b6bbdc 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -27,13 +27,6 @@ def __init__(self, atom_config: Config, prefix: str = ""): config: Qwen3NextConfig = atom_config.hf_config self.config = config - # Qwen3NextDecoderLayer's MoE block needs these attributes, which - # Qwen3NextModel.__init__ sets but which are absent from the raw - # HF config. Set them here so the MTP predictor works standalone. 
- if not hasattr(config, "n_shared_experts"): - config.n_shared_experts = 1 - if not hasattr(config, "n_routed_experts"): - config.n_routed_experts = config.num_experts self.vocab_size = config.vocab_size @@ -196,7 +189,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: from atom.model_ops.topK import is_rocm_aiter_fusion_shared_expert_enabled n_routed = getattr(self.config, "n_routed_experts", self.config.num_experts) - n_shared = getattr(self.config, "n_shared_experts", 0) + n_shared = getattr(self.config, "n_shared_experts", 1) return FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", From 8f83eb716a09c4a8d2325b08bcfe2ab6930835b5 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 14:16:10 +0000 Subject: [PATCH 06/15] remove redundant code Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 4 ---- atom/plugin/vllm/model_wrapper.py | 1 - 2 files changed, 5 deletions(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index d89b6bbdc..d89df525b 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -137,10 +137,6 @@ def remap_mtp_weight_name(self, name: str) -> str | None: def __init__(self, atom_config: Config, prefix: str = ""): super().__init__() config = atom_config.hf_config - if not hasattr(config, "n_shared_experts"): - config.n_shared_experts = 1 - if not hasattr(config, "n_routed_experts"): - config.n_routed_experts = config.num_experts if atom_config.enable_prefix_caching: raise ValueError("Qwen3NextMTP currently does not support prefix caching") self.config = config diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index e540a1274..7197d0d65 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -144,7 +144,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.ignore_unexpected_suffixes: list[str] = [] self.vllm_config = vllm_config - self.atom_config = generate_atom_config_for_plugin_mode(vllm_config) self.is_mtp = False speculative_config = getattr(vllm_config, "speculative_config", None) if speculative_config is not None: From 8b53857aca6162c80f93fadd46553489956353c6 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 14:57:45 +0000 Subject: [PATCH 07/15] add spec decode convert for vllm plugin Signed-off-by: ganyi --- atom/plugin/config.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/atom/plugin/config.py b/atom/plugin/config.py index 07aafa4a5..8eb48da4c 100644 --- a/atom/plugin/config.py +++ b/atom/plugin/config.py @@ -1,3 +1,4 @@ +import copy from typing import Any, Optional from dataclasses import dataclass @@ -71,6 +72,45 @@ def _normalize_sglang_parallel_config( return tp_size, 1, 0, tp_rank +def _build_atom_speculative_config_from_vllm(vllm_spec_config: Any): + """Translate vLLM's SpeculativeConfig into ATOM's SpeculativeConfig. + + Reuses vLLM's already-loaded draft hf_config (skips a second disk fetch + in ATOM SpeculativeConfig.__post_init__) but still runs ATOM's + hf_config_override on it — so MTP model_type remap, n_routed_experts + backfill (Qwen families), and architecture rewrite all land on the + draft config in one place. Mirrors how standalone ATOM MTP exposes + the draft hf_config via atom_config.speculative_config. + + The draft hf_config is deepcopied first because hf_config_override + mutates `architectures` to ATOM's standalone naming (e.g. 
+ "Qwen3NextMTPModel"), which differs from vLLM's registry name + ("Qwen3NextMTP"). Mutating in place would make vLLM's later draft + architecture lookup fail. + """ + if vllm_spec_config is None: + return None + + from atom.config import SpeculativeConfig + + draft_model_config = getattr(vllm_spec_config, "draft_model_config", None) + draft_hf_config = getattr(draft_model_config, "hf_config", None) + if draft_hf_config is not None: + draft_hf_config = copy.deepcopy(draft_hf_config) + model_path = getattr(draft_model_config, "model", None) or getattr( + vllm_spec_config, "model", None + ) + + return SpeculativeConfig( + method=getattr(vllm_spec_config, "method", "") or "", + model=model_path, + num_speculative_tokens=getattr( + vllm_spec_config, "num_speculative_tokens", None + ), + draft_model_hf_config=draft_hf_config, + ) + + def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: from atom.config import Config, CompilationConfig @@ -117,6 +157,10 @@ def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: max_num_batched_tokens = vllm_scheduler_config.max_num_batched_tokens + atom_speculative_config = _build_atom_speculative_config_from_vllm( + getattr(config, "speculative_config", None) + ) + return Config( model=vllm_model_config.model, trust_remote_code=getattr(vllm_model_config, "trust_remote_code", False), @@ -140,6 +184,7 @@ def _generate_atom_config_from_vllm_config(config: Any) -> PluginConfig: master_addr=None, enable_dp_attention=False, plugin_config=plugin_config, + speculative_config=atom_speculative_config, ) From 885c3291e0c5f7c5ba6edb49d8bea7a0b0940a59 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 15:09:02 +0000 Subject: [PATCH 08/15] remove vllm related branch Signed-off-by: ganyi --- atom/models/qwen3_next.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index c42dd3b9e..5fd55e072 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -495,20 +495,17 @@ def __init__( self.config = config self.quant_config = quant_config - self.speculative_config = speculative_config - # When running as a vLLM plugin, Qwen3NextDecoderLayer instantiates - # this module without forwarding speculative_config. That left - # self.num_spec=0 even with MTP enabled, so get_state_shape() (the - # instance method vLLM's MambaBase.get_kv_cache_spec uses to size each - # layer's KV cache) allocated conv_state with only `kernel_size-1` - # token rows. During spec decode, causal_conv1d_update writes - # `kernel_size-1 + num_spec` rows per slot and the extra row spilled - # into the page-adjacent ssm_state, corrupting layer 0's recurrent - # state. Pull the spec config from the vLLM config as a fallback. - if is_vllm() and self.speculative_config is None: - vllm_spec_config = get_current_vllm_config().speculative_config - if vllm_spec_config is not None: - self.speculative_config = vllm_spec_config + # Qwen3NextDecoderLayer instantiates this module without forwarding + # speculative_config, so fall back to atom_config.speculative_config + # (populated by both standalone ATOM and the vLLM plugin path's + # _generate_atom_config_from_vllm_config). 
Without a correct num_spec, + # get_state_shape() (used by vLLM's MambaBase.get_kv_cache_spec to + # size each layer's KV cache) sizes conv_state with only + # `kernel_size-1` token rows, but causal_conv1d_update writes + # `kernel_size-1 + num_spec` rows per slot during spec decode — the + # extra row spills into the page-adjacent ssm_state and corrupts + # layer 0's recurrent state. + self.speculative_config = speculative_config or atom_config.speculative_config self.num_spec = ( self.speculative_config.num_speculative_tokens if self.speculative_config From a7063090eef6299bfa4a27cfaeec1c2ca91bc7e3 Mon Sep 17 00:00:00 2001 From: ganyi Date: Thu, 14 May 2026 15:21:58 +0000 Subject: [PATCH 09/15] use atom spec decode config for plugin loading Signed-off-by: ganyi --- atom/model_loader/loader.py | 5 +---- atom/plugin/vllm/model_wrapper.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py index 36c9db22b..0763179f5 100644 --- a/atom/model_loader/loader.py +++ b/atom/model_loader/loader.py @@ -388,12 +388,9 @@ def _submit(fn, *args): is_rocm_aiter_fusion_shared_expert_enabled() and maybe_matching_name is not None ): - n_routed = getattr(hf_config, "n_routed_experts", None) or getattr( - hf_config, "num_experts", None - ) name = name.replace( maybe_matching_name, - f"mlp.experts.{n_routed}.", + f"mlp.experts.{hf_config.n_routed_experts}.", ) for k in packed_modules_mapping: # We handle the experts below in expert_params_mapping diff --git a/atom/plugin/vllm/model_wrapper.py b/atom/plugin/vllm/model_wrapper.py index 7197d0d65..4eada7d4c 100644 --- a/atom/plugin/vllm/model_wrapper.py +++ b/atom/plugin/vllm/model_wrapper.py @@ -422,7 +422,7 @@ def load_weights( draft_hf_config = None if is_mtp_draft_model: draft_model_config = getattr( - getattr(self.vllm_config, "speculative_config", None), + getattr(self.atom_config, "speculative_config", None), "draft_model_config", None, ) From 4dca1352fafc63e6a21a5d4d455549692a209a1d Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 01:59:26 +0000 Subject: [PATCH 10/15] remove unnecessary changes in modeling Signed-off-by: ganyi --- atom/models/qwen3_next.py | 24 ++---------------------- atom/models/qwen3_next_mtp.py | 13 +++---------- 2 files changed, 5 insertions(+), 32 deletions(-) diff --git a/atom/models/qwen3_next.py b/atom/models/qwen3_next.py index 5fd55e072..40ad2380a 100644 --- a/atom/models/qwen3_next.py +++ b/atom/models/qwen3_next.py @@ -279,7 +279,6 @@ def __init__( atom_config, quant_config=None, prefix: str = "", - layer_num: int | None = None, ) -> None: super().__init__() if hasattr(atom_config.hf_config, "text_config"): @@ -381,15 +380,6 @@ def __init__( k_norm=self.k_norm, ) - # For MTP, the prefix is e.g. "mtp.layers.0.self_attn" so - # extract_layer_index(prefix) returns 0, which would collide with the - # target model's layer 0 KV cache slot. Allow callers (e.g. - # Qwen3NextDecoderLayer) to pass an explicit `layer_num` so MTP can - # use absolute indices (mtp_start_layer_idx + idx) and get its own - # KV cache slot. 
- attn_layer_num = ( - layer_num if layer_num is not None else extract_layer_index(prefix) - ) self.attn = Attention( self.num_heads, self.head_dim, @@ -398,7 +388,7 @@ def __init__( kv_cache_dtype=atom_config.kv_cache_dtype, quant_config=quant_config, use_mla=False, - layer_num=attn_layer_num, + layer_num=extract_layer_index(prefix), config=atom_config, prefix=f"{prefix}", **fusion_kwargs, @@ -495,16 +485,7 @@ def __init__( self.config = config self.quant_config = quant_config - # Qwen3NextDecoderLayer instantiates this module without forwarding - # speculative_config, so fall back to atom_config.speculative_config - # (populated by both standalone ATOM and the vLLM plugin path's - # _generate_atom_config_from_vllm_config). Without a correct num_spec, - # get_state_shape() (used by vLLM's MambaBase.get_kv_cache_spec to - # size each layer's KV cache) sizes conv_state with only - # `kernel_size-1` token rows, but causal_conv1d_update writes - # `kernel_size-1 + num_spec` rows per slot during spec decode — the - # extra row spills into the page-adjacent ssm_state and corrupts - # layer 0's recurrent state. + self.speculative_config = speculative_config or atom_config.speculative_config self.num_spec = ( self.speculative_config.num_speculative_tokens @@ -799,7 +780,6 @@ def __init__( atom_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - layer_num=layer_num, ) else: raise ValueError(f"Invalid layer_type {self.layer_type}") diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index d89df525b..dcdc603b2 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -38,10 +38,6 @@ def __init__(self, atom_config: Config, prefix: str = ""): config.hidden_size, ) - # Pass the layer's HF-style prefix so the quant_config exclude list - # (which contains "mtp.fc" in Qwen3-Next FP8 checkpoints) is honored; - # without it the lookup uses "" and falls back to the global FP8 spec, - # which makes fc FP8 even though the source weight is BF16. self.fc = ColumnParallelLinear( self.config.hidden_size * 2, self.config.hidden_size, @@ -50,18 +46,15 @@ def __init__(self, atom_config: Config, prefix: str = ""): prefix=f"{prefix}.fc", ) - # Use 0-indexed prefix (matches checkpoint's mtp.layers.0.* weight - # names and vLLM's reference impl), but keep layer_num as the - # absolute index so the attention layer gets a KV cache slot that - # doesn't collide with the target model's layers. 
self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( atom_config, layer_type="full_attention", prefix=f"{prefix}.layers.{idx}", - layer_num=self.mtp_start_layer_idx + idx, + layer_num=idx, ) - for idx in range(self.num_mtp_layers) + for idx in range( + self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From 809e9319e1905e3928cb27bc790308ada4d2cf7c Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 02:00:43 +0000 Subject: [PATCH 11/15] format Signed-off-by: ganyi --- atom/models/qwen3_next_mtp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/atom/models/qwen3_next_mtp.py b/atom/models/qwen3_next_mtp.py index dcdc603b2..5c547d955 100644 --- a/atom/models/qwen3_next_mtp.py +++ b/atom/models/qwen3_next_mtp.py @@ -54,7 +54,8 @@ def __init__(self, atom_config: Config, prefix: str = ""): layer_num=idx, ) for idx in range( - self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) + self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers + ) ) self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From d104a466abb717b5cd0de69bb6fc91d3368ef9a0 Mon Sep 17 00:00:00 2001 From: ganyi Date: Fri, 15 May 2026 02:21:18 +0000 Subject: [PATCH 12/15] add qwen3next mtp into benchmark Signed-off-by: ganyi --- .github/benchmark/oot_benchmark_models.json | 18 ++++++++++++++++++ .github/benchmark/oot_models_accuracy.json | 12 ++++++++++++ .../atom-vllm-accuracy-validation.yaml | 15 +++++++++++++++ .github/workflows/atom-vllm-benchmark.yaml | 16 ++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 1b54c6265..9a478cbf1 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -268,6 +268,24 @@ "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + }, + { + "tp_size": 1, + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET)", + "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp1", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", + "bench_args": "", + "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + }, + { + "tp_size": 4, + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET)", + "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp4", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", + "bench_args": "", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" } ] }, diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index 1050e3f85..c2b9da043 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ 
b/.github/benchmark/oot_models_accuracy.json @@ -55,6 +55,18 @@ "accuracy_baseline_model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "_baseline_note": "Using Qwen3-235B baseline as proxy; needs CI measurement for Qwen3.5 specific baseline" }, + { + "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", + "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "extraArgs": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.80, + "accuracy_baseline": 0.81, + "accuracy_baseline_model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "_baseline_note": "Qwen3-Next-80B-A3B-Instruct-FP8 baseline with TP4 (no MTP) as proxy; needs CI measurement for MTP-specific baseline" + }, { "model_name": "Llama-3.1-8B-Instruct TP1", "model_path": "meta-llama/Llama-3.1-8B-Instruct", diff --git a/.github/workflows/atom-vllm-accuracy-validation.yaml b/.github/workflows/atom-vllm-accuracy-validation.yaml index 0b2d7e7a0..b940f3753 100644 --- a/.github/workflows/atom-vllm-accuracy-validation.yaml +++ b/.github/workflows/atom-vllm-accuracy-validation.yaml @@ -24,6 +24,11 @@ on: required: false type: boolean default: false + run_qwen3_next_80b_mtp_tp4: + description: "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4" + required: false + type: boolean + default: false run_qwen35_397b_fp8_tp8: description: "Qwen3.5-397B-A17B-FP8 TP8" required: false @@ -137,6 +142,7 @@ jobs: RUN_QWEN3_MOE_TP8: ${{ inputs.run_qwen3_moe_tp8 }} RUN_QWEN3_NEXT_80B_TP1: ${{ inputs.run_qwen3_next_80b_tp1 }} RUN_QWEN3_NEXT_80B_TP4: ${{ inputs.run_qwen3_next_80b_tp4 }} + RUN_QWEN3_NEXT_80B_MTP_TP4: ${{ inputs.run_qwen3_next_80b_mtp_tp4 }} RUN_QWEN35_397B_FP8_TP8: ${{ inputs.run_qwen35_397b_fp8_tp8 }} RUN_QWEN35_397B_TP8: ${{ inputs.run_qwen35_397b_tp8 }} RUN_QWEN35_397B_FP4_TP4: ${{ inputs.run_qwen35_397b_fp4_tp4 }} @@ -190,6 +196,15 @@ jobs: "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", "runner": "linux-atom-mi35x-4", }, + { + "toggle_env": "RUN_QWEN3_NEXT_80B_MTP_TP4", + "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", + "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "extra_args": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", + "accuracy_test_threshold": 0.80, + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "runner": "linux-atom-mi35x-4", + }, { "toggle_env": "RUN_QWEN35_397B_FP8_TP8", "model_name": "Qwen3.5-397B-A17B-FP8 TP8", diff --git a/.github/workflows/atom-vllm-benchmark.yaml b/.github/workflows/atom-vllm-benchmark.yaml index a12b91483..fa3d96707 100644 --- a/.github/workflows/atom-vllm-benchmark.yaml +++ b/.github/workflows/atom-vllm-benchmark.yaml @@ -36,6 +36,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -69,6 +71,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - 
Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -102,6 +106,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -135,6 +141,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -168,6 +176,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -201,6 +211,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -234,6 +246,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -267,6 +281,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) From e262d7e4c8e306d8a5fad1a815125e7e9438a4dc Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:01:44 +0800 Subject: [PATCH 13/15] [ci] disable FP8 blockscale weight preshuffle for Qwen3.5/Qwen3-Next Add ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 to all Qwen3.5 and Qwen3-Next model configs across benchmark, nightly accuracy, and recipe files. 
Co-Authored-By: Claude Opus 4 --- .github/benchmark/oot_benchmark_models.json | 20 +++++++++---------- .github/benchmark/oot_models_accuracy.json | 18 ++++++++--------- .../atom-vllm-accuracy-validation.yaml | 14 ++++++------- recipes/atom_vllm/Qwen3.5.md | 3 +++ recipes/atom_vllm/Qwen3Next.md | 2 ++ 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 9a478cbf1..4b9c7a9a9 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -181,7 +181,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 4 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 8, @@ -192,7 +192,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, @@ -213,7 +213,7 @@ "1024x8192" ], "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend ROCM_AITER_FA --gpu-memory-utilization 0.8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, @@ -231,7 +231,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-tp1-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -240,7 +240,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-tp4-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 1, @@ -249,7 +249,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp1", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + 
"env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 2, @@ -258,7 +258,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp2", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 2 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -267,7 +267,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-aw-tp4", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 1, @@ -276,7 +276,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, @@ -285,7 +285,7 @@ "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" } ] }, diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index c2b9da043..226c374ed 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ b/.github/benchmark/oot_models_accuracy.json @@ -3,7 +3,7 @@ "model_name": "Qwen3-235B-A22B-Instruct-2507-FP8 TP8+EP8", "model_path": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", "extraArgs": "--tensor-parallel-size 8 --enable-expert-parallel", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.87, @@ -14,7 +14,7 @@ "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8 TP4", "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extraArgs": "--tensor-parallel-size 4 --attention-backend ROCM_AITER_FA", - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": 
"ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.76, @@ -25,7 +25,7 @@ "model_name": "Qwen3.5-397B-A17B-FP8 TP8", "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", "extraArgs": "--tensor-parallel-size 8 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.83, @@ -36,7 +36,7 @@ "model_name": "Qwen3.5-397B-A17B TP8", "model_path": "Qwen/Qwen3.5-397B-A17B", "extraArgs": "--tensor-parallel-size 8 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.83, @@ -47,7 +47,7 @@ "model_name": "Qwen3.5-397B-A17B-MXFP4 TP4", "model_path": "amd/Qwen3.5-397B-A17B-MXFP4", "extraArgs": "--tensor-parallel-size 4 --attention-backend ROCM_AITER_FA", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.82, @@ -59,10 +59,10 @@ "model_name": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4", "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extraArgs": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", - "accuracy_threshold": 0.80, + "accuracy_threshold": 0.8, "accuracy_baseline": 0.81, "accuracy_baseline_model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "_baseline_note": "Qwen3-Next-80B-A3B-Instruct-FP8 baseline with TP4 (no MTP) as proxy; needs CI measurement for MTP-specific baseline" @@ -169,7 +169,7 @@ "runner": "linux-atom-mi35x-1", "test_level": "nightly", "accuracy_threshold": 0.88, - "accuracy_baseline": 0.90, + "accuracy_baseline": 0.9, "accuracy_baseline_model": "openai/gpt-oss-120b" }, { @@ -181,7 +181,7 @@ "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.88, - "accuracy_baseline": 0.90, + "accuracy_baseline": 0.9, "accuracy_baseline_model": "openai/gpt-oss-120b" }, { diff --git a/.github/workflows/atom-vllm-accuracy-validation.yaml b/.github/workflows/atom-vllm-accuracy-validation.yaml index b940f3753..dc448212e 100644 --- a/.github/workflows/atom-vllm-accuracy-validation.yaml +++ b/.github/workflows/atom-vllm-accuracy-validation.yaml @@ -175,7 +175,7 @@ jobs: "model_path": 
"Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", "extra_args": "--tensor-parallel-size 8 --enable-expert-parallel", "accuracy_test_threshold": 0.87, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -184,7 +184,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 1", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-1", }, { @@ -193,7 +193,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 4", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { @@ -202,7 +202,7 @@ jobs: "model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", "extra_args": "--tensor-parallel-size 4 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "accuracy_test_threshold": 0.80, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { @@ -211,7 +211,7 @@ jobs: "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", "extra_args": "--tensor-parallel-size 8", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -220,7 +220,7 @@ jobs: "model_path": "Qwen/Qwen3.5-397B-A17B", "extra_args": "--tensor-parallel-size 8", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-8", }, { @@ -229,7 +229,7 @@ jobs: "model_path": "amd/Qwen3.5-397B-A17B-MXFP4", "extra_args": "--tensor-parallel-size 4", "accuracy_test_threshold": 0.83, - "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0", + "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0", "runner": "linux-atom-mi35x-4", }, { diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 2b30b540a..93848beca 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -18,6 +18,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve 
Qwen/Qwen3.5-35B-A3B-FP8 \ --host localhost \ @@ -37,6 +38,7 @@ vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \ export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ --host localhost \ @@ -56,6 +58,7 @@ vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ --host localhost \ diff --git a/recipes/atom_vllm/Qwen3Next.md b/recipes/atom_vllm/Qwen3Next.md index 97e8bdaa4..019e297f6 100644 --- a/recipes/atom_vllm/Qwen3Next.md +++ b/recipes/atom_vllm/Qwen3Next.md @@ -18,6 +18,7 @@ The ATOM vLLM plugin backend keeps the standard vLLM CLI, server APIs, and gener export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ @@ -37,6 +38,7 @@ vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_CUSTOM_ALL_GATHER=0 export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \ --host localhost \ From f3d17e7c86f4ccb5055b5257fc8f8cb4a3898d73 Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:42:24 +0800 Subject: [PATCH 14/15] [ci] fix Qwen3-Next MTP benchmark label from MET to AW Co-Authored-By: Claude Opus 4 --- .github/benchmark/oot_benchmark_models.json | 8 +++--- .github/workflows/atom-vllm-benchmark.yaml | 32 ++++++++++----------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 4b9c7a9a9..f06a8e3c3 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -271,18 +271,18 @@ }, { "tp_size": 1, - "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET)", + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW)", "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp1", - "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-met", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp1-aw", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" }, { "tp_size": 4, - "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET)", + "display": "Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW)", "dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-mtp-tp4", - "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-met", + "prefix": "qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-aw", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 32768 --max-model-len 16384 --speculative-config '{\"num_speculative_tokens\":1, \"method\": \"mtp\"}'", "env_vars": 
"AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=0\nATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0" diff --git a/.github/workflows/atom-vllm-benchmark.yaml b/.github/workflows/atom-vllm-benchmark.yaml index fa3d96707..d7a178d51 100644 --- a/.github/workflows/atom-vllm-benchmark.yaml +++ b/.github/workflows/atom-vllm-benchmark.yaml @@ -36,8 +36,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -71,8 +71,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -106,8 +106,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -141,8 +141,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -176,8 +176,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -211,8 +211,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -246,8 +246,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP 
TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) @@ -281,8 +281,8 @@ on: - Qwen3.5-397B-A17B TP8 (OOB) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (MET) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (MET) - - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (MET) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP1 (AW) + - Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP1 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP2 (AW) - Qwen3-Next-80B-A3B-Instruct-FP8 TP4 (AW) From 9b9a77c0a6200cacbd9f68bb69e91fc8f431382a Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Fri, 15 May 2026 14:47:41 +0800 Subject: [PATCH 15/15] [docs] fix Qwen3.5 recipe: update env var count and add preshuffle doc Remove stale "three" count (now variable list), add ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0 to both the Important section and Key Environment Variables section. Co-Authored-By: Claude Opus 4 --- recipes/atom_vllm/Qwen3.5.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/atom_vllm/Qwen3.5.md b/recipes/atom_vllm/Qwen3.5.md index 93848beca..4e3b8c077 100644 --- a/recipes/atom_vllm/Qwen3.5.md +++ b/recipes/atom_vllm/Qwen3.5.md @@ -72,9 +72,10 @@ vllm serve amd/Qwen3.5-397B-A17B-MXFP4 \ --no-enable-prefix-caching ``` -**Important**: The following three environment variables are required for Qwen3.5: +**Important**: The following environment variables are required for Qwen3.5: - `ATOM_USE_CUSTOM_ALL_GATHER=0`: Disables custom all-gather for compatibility with Qwen3.5 model architecture +- `ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0`: Disables FP8 blockscale weight preshuffle - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations, which can significantly improve TTFT (Time To First Token) performance. **Note**: This optimization may introduce a risk of accuracy degradation. For accuracy-critical workloads, consider validating with your specific use case. ## Step 3: Performance Benchmark @@ -136,6 +137,7 @@ Reference result (TP=4): ## Key Environment Variables - `ATOM_USE_CUSTOM_ALL_GATHER=0`: **Required** - disables custom all-gather for compatibility with Qwen3.5 model architecture +- `ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0`: **Required** - disables FP8 blockscale weight preshuffle - `AITER_QUICK_REDUCE_QUANTIZATION=INT4`: **Performance optimization** - enables INT4 quantization for quick reduce operations - **Benefit**: Significantly improves TTFT (Time To First Token) performance by reducing communication overhead during tensor parallelism all-reduce operations - **Risk**: May cause slight accuracy degradation due to lower quantization precision
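
For reference, the MTP CI entry touched above implies a launch command along these lines. This is only a sketch assembled from the `qwen3-next-80b-a3b-instruct-fp8-mtp-tp4-aw` benchmark entry in this patch; host, port, and any flags not shown in that entry are not specified here.

```bash
# Environment mirrored from the Qwen3-Next-80B-A3B-Instruct-FP8-MTP TP4 (AW) CI entry.
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
export ATOM_USE_CUSTOM_ALL_GATHER=0
export ATOM_USE_FLYDSL_GDR=0
export ATOM_FP8_BLOCKSCALE_WEIGHT_PRESHUFFLE=0

# Serve with one MTP speculative token, matching the CI extra_args.
vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \
  --trust-remote-code \
  --tensor-parallel-size 4 \
  --max-num-batched-tokens 32768 \
  --max-model-len 16384 \
  --speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}'
```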