diff --git a/.gitignore b/.gitignore index 3e9609afe1..5f049abe88 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ __pycache__/ *.so # Distribution / packaging +.*/ +tests/ .Python triton-rerope/ develop-eggs/ diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 126bf5e800..15ac1effb3 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -62,6 +62,7 @@ class ModelConfig: window_size: List[int] = field(default_factory=list) attn_sink: bool = False qk_norm: bool = False + qk_norm_type: str = 'per_head' size_per_head: int = 128 group_size: int = 32 data_type: str = None @@ -82,6 +83,7 @@ class ModelConfig: routed_scale: float = 1.0 topk_group: int = 1 topk_method: str = 'greedy' + scoring_func: str = 'softmax' moe_group_num: int = 1 # MLA q_lora_rank: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 63af6c0de8..1ebf5dc4d7 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -87,8 +87,8 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s expert_weight_type = weight_type - # ONLY experts are in mxfp4 - if model_arch == 'GptOssForCausalLM': + # ONLY experts are quantized, attention weights remain in native dtype + if model_arch in ('GptOssForCausalLM', 'MiniMaxM2ForCausalLM'): weight_type = dtype config.model_config.model_arch = model_arch @@ -160,13 +160,19 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - if engine_config.model_format in ['awq', 'gptq', 'compressed-tensors']: + if engine_config.model_format in ['awq', 'gptq']: # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) if not group_size: group_size = 128 assert group_size == 128, (f'model format is "{engine_config.model_format}" ' f'but group_size is {group_size}. Currently, only 128 ' 'is supported') + elif engine_config.model_format == 'compressed-tensors': + if not group_size: + group_size = 128 + assert group_size in [32, 128], (f'model format is "{engine_config.model_format}" ' + f'but group_size is {group_size}. Currently, only ' + '32 and 128 are supported') input_model_name = get_input_model_registered_name(model_path, engine_config.model_format) input_policy = get_input_policy(engine_config.model_format) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 27f53ca452..694fb5a1ee 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -22,6 +22,35 @@ def permute_v2(x: torch.Tensor, size_per_head: int = 128): return x.view(-1, head_num, 2, size_per_head // 2).transpose(2, 3).reshape(x.shape) +def permute_v2_partial(x: torch.Tensor, size_per_head: int, rotary_dim: int): + """Permute only the first `rotary_dim` dims within each head for + TurboMind's interleaved RoPE layout. Non-RoPE dims are left as-is. + + This is needed for partial rotary models (e.g. MiniMax-M2.1 where + rotary_dim=64 but head_dim=128). + """ + assert x.size(-1) > 1 + output_dims = x.size(-1) + head_num = output_dims // size_per_head + non_rope_dim = size_per_head - rotary_dim + + # reshape to (*, head_num, size_per_head) + orig_shape = x.shape + x = x.view(-1, head_num, size_per_head) if x.dim() >= 2 else x.view(head_num, size_per_head) + + # split each head into rope part and non-rope part + rope_part = x[..., :rotary_dim] # (..., head_num, rotary_dim) + rest_part = x[..., rotary_dim:] # (..., head_num, non_rope_dim) + + # permute only the rope part: interleave first/second halves + rope_part = rope_part.view(*rope_part.shape[:-1], 2, rotary_dim // 2) + rope_part = rope_part.transpose(-2, -1).reshape(*rope_part.shape[:-2], rotary_dim) + + # recombine + x = torch.cat([rope_part, rest_part], dim=-1) + return x.reshape(orig_shape) + + def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int): """ Contract: x.size(-1) is output dims @@ -181,14 +210,27 @@ def __init__(self, model: BaseOutputModel): self.qk_norm = model.model_config.qk_norm self.attn_sink = model.model_config.attn_sink self.group_size = max(1, model.model_config.group_size) + # rotary_dim for partial rotary models (e.g. MiniMax-M2.1) + rope_param = getattr(model.attention_config, 'rope_param', None) + self.rotary_dim = rope_param.dim if rope_param else self.head_dim + + def _permute(self, x): + """Permute Q/K weights for TurboMind's interleaved RoPE layout. + + Uses partial permutation when rotary_dim < head_dim to avoid + corrupting non-RoPE dimensions. + """ + if self.rotary_dim < self.head_dim: + return permute_v2_partial(x, self.head_dim, self.rotary_dim) + return permute_v2(x, self.head_dim) def _reorder_and_merge(self, qkvo, gs: int): q, k, v, o = qkvo # reorder output dim for tm's rotary embedding layout if self.model.permute_qk: if gs == 1: - q = permute_v2(q, self.head_dim) - k = permute_v2(k, self.head_dim) + q = self._permute(q) + k = self._permute(k) else: assert gs % self.head_dim == 0 qkv = merge_qkv_v2(q, k, v, self.tp) @@ -254,10 +296,25 @@ def apply(self, i: int, r: BaseReader): if self.qk_norm: q, k = r.qk_norm(i) if self.model.permute_qk: - q = permute_v2(q, self.head_dim) - k = permute_v2(k, self.head_dim) - self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1]) - self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1]) + q = self._permute(q) + k = self._permute(k) + head_num = self.model.model_config.head_num + kv_head_num = self.model.model_config.kv_head_num + # C++ allocates per-head buffers: (local_head_num * head_dim) + # Shared QK norm (head_dim,): broadcast to per-head size + # Per-head QK norm (num_heads * head_dim,): split across TP + if q.numel() == self.head_dim: + q = q.repeat(head_num // self.tp) + if k.numel() == self.head_dim: + k = k.repeat(kv_head_num // self.tp) + # Handle repeat_kv: replicate per-head K norm weights + if self.model.repeat_kv and k.numel() > self.head_dim: + k = k.view(-1, self.head_dim).repeat_interleave( + self.model.repeat_kv, dim=0).reshape(-1) + q_name = self._attn.format(i, 'q_norm', '')[:-1] + k_name = self._attn.format(i, 'k_norm', '')[:-1] + self.model.save_split(q, q_name, split_dim=-1, split_num=self.tp) + self.model.save_split(k, k_name, split_dim=-1, split_num=self.tp) if self.attn_sink: sinks = r.attn_sinks(i) self.model.save_split(sinks, self._attn.format(i, 'sinks', '')[:-1], split_dim=-1, split_num=self.tp) diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py index 31babc5ec5..4a78291dbf 100644 --- a/lmdeploy/turbomind/deploy/parameter.py +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -32,11 +32,10 @@ def pack_u4_row(x: torch.Tensor) -> torch.Tensor: def generate_zero_point(g): - weight_shapes = g('weight_shape') + weight_scales = g('weight_scale') result = [] - for weight_shape in weight_shapes: - row, col = weight_shape - tensor = torch.full((row, col // 128), 8, dtype=torch.uint8) + for scale in weight_scales: + tensor = torch.full(scale.shape, 8, dtype=torch.uint8) result.append(tensor) return (*result, ) diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index ecdb6400ee..bb77a22f4a 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -9,6 +9,7 @@ from .llama import LlamaModel # noqa: F401 from .llava import LlavaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 +from .minimax_m2 import MiniMaxM2Model # noqa: F401 from .mixtral import MixtralModel # noqa: F401 from .molmo import MolmoModel # noqa: F401 from .qwen import QwenModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/minimax_m2.py b/lmdeploy/turbomind/deploy/source_model/minimax_m2.py new file mode 100644 index 0000000000..3f8df2734d --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/minimax_m2.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class MiniMaxM2Reader(LlamaReader): + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for x in ['w1', 'w2', 'w3']: + name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i, kind): + if kind == 'bias': + return self.params.get( + f'model.layers.{i}.block_sparse_moe.e_score_correction_bias') + return self.params.get( + f'model.layers.{i}.block_sparse_moe.gate.{kind}') + + def qk_norm(self, i: int): + result = [] + for x in ['q', 'k']: + name = f'model.layers.{i}.self_attn.{x}_norm.weight' + result.append(self.transform(self.params.get(name), 'weight')) + return (*result, ) + + +@INPUT_MODELS.register_module(name='minimax-m2') +class MiniMaxM2Model(LlamaModel): + + Reader = MiniMaxM2Reader + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info.update( + qk_norm=True, + qk_norm_type='per_token', + expert_num=cfg['num_local_experts'], + expert_inter_size=cfg['intermediate_size'], + experts_per_token=cfg['num_experts_per_tok'], + inter_size=0, + norm_topk_prob=True, + expert_router_bias=True, + scoring_func=cfg.get('scoring_func', 'sigmoid'), + ) + rotary_dim = cfg.get('rotary_dim', None) + if rotary_dim is not None: + info['rope_param'].dim = rotary_dim + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index da328e808b..13bc3a2b63 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -56,6 +56,8 @@ # mixtral MixtralForCausalLM='mixtral', MolmoForCausalLM='molmo', + # minimax-m2 + MiniMaxM2ForCausalLM='minimax-m2', ) diff --git a/src/turbomind/kernels/gemm/kernel/sm70_884_4.cu b/src/turbomind/kernels/gemm/kernel/sm70_884_4.cu index 707fd4a289..5e495e5dbb 100644 --- a/src/turbomind/kernels/gemm/kernel/sm70_884_4.cu +++ b/src/turbomind/kernels/gemm/kernel/sm70_884_4.cu @@ -51,6 +51,24 @@ void Registry::sm70_884_4() // clang-format on } + // U4 grouped with group_size=32 (for compressed-tensors models like MiniMax M2.1) + if constexpr (1) { + // clang-format off + using C = Config_U4_g; + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } + if constexpr (1) { // clang-format off using C = Config_MXF4; diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index ef8230a7df..0c990e3a53 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -271,7 +271,9 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int top_k, bool softmax, bool norm_topk, - float routed_scale) + float routed_scale, + const float* router_bias, + bool use_sigmoid) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 @@ -298,6 +300,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] float data[items_per_thread]; int idxs[items_per_thread]; + float orig_data[items_per_thread]; #if 0 PRAGMA_UNROLL @@ -428,6 +431,35 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] // } // } + // Sigmoid mode: apply sigmoid to logits, save original scores, then add bias for top-k selection + if (use_sigmoid) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (data[i] > -std::numeric_limits::infinity()) { + data[i] = 1.0f / (1.0f + expf(-data[i])); + } + else { + data[i] = 0.0f; + } + orig_data[i] = data[i]; // save original sigmoid scores (without bias) + } + // Add bias for top-k selection only + if (router_bias && ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Array bias_vec; + Ldg(bias_vec, &router_bias[e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + data[i + c] += bias_vec[c]; + } + } + } + } + } + unsigned mask = (unsigned)-1; float max_logit; @@ -505,6 +537,26 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] } sum_prob = fdividef(1.f, sum_prob); } + else if (use_sigmoid) { + // Use original sigmoid scores (without bias) for weight computation + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + sum_prob += orig_data[i]; + } + } + // Reduce across threads + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + sum_prob = fdividef(1.f, sum_prob); + // Replace data with orig_data for the selected experts + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = orig_data[i]; + } + } else { sum_prob = 1.f; } @@ -583,6 +635,8 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n bool softmax, bool norm_topk, float routed_scale, + const float* router_bias, + bool use_sigmoid, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -616,12 +670,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n experts_per_token, softmax, norm_topk, - routed_scale); + routed_scale, + router_bias, + use_sigmoid); return true; }; - if (!softmax && norm_topk) { + if (!softmax && !use_sigmoid && norm_topk) { // norm top-k is part of softmax impl TM_CHECK(0) << softmax << " " << norm_topk; } diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 1946d3ed39..9f03fb4516 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -27,6 +27,8 @@ void invokeMoeGate_V2(int* f2n, bool softmax, bool norm_topk, float routed_scale, + const float* router_bias, + bool use_sigmoid, cudaStream_t st); void invokeMoeDispatch(Ref out_, // diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index ed263ddacf..5e61fb4594 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -129,7 +129,8 @@ __global__ void RMSNormQK(T* data, // int n, int token_num, float eps, - float inv_dim) + float inv_dim, + int w_stride) { static_assert((max_dim & (max_dim - 1)) == 0); @@ -170,7 +171,7 @@ __global__ void RMSNormQK(T* data, // Array w; if (di < dim) { - Ldg(w, &weight[di]); + Ldg(w, &weight[hi * w_stride + di]); PRAGMA_UNROLL for (int i = 0; i < vec_size; ++i) { vec[i] = (T)((float)vec[i] * sum) * w[i]; @@ -209,7 +210,7 @@ void invokeQkRMSNorm(void* data, const int grid_dim = cdiv(threads, block_dim); kernel::RMSNormQK<<>>( - (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim); + (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim, 0); }; TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); @@ -224,6 +225,20 @@ void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st) TM_CHECK(x.stride(1) == head_dim); + // Detect per-head vs shared QK norm based on weight size + // shared: w.shape(0) == head_dim, per-head: w.shape(0) == head_num * head_dim + const int w_size = w.shape(0); + int w_stride; + if (w_size == head_dim) { + w_stride = 0; // shared: same weight for all heads + } + else { + TM_CHECK(w_size == head_num * head_dim) << "qk_norm weight size " << w_size + << " must be head_dim (" << head_dim + << ") or head_num*head_dim (" << head_num * head_dim << ")"; + w_stride = head_dim; // per-head: stride by head_dim + } + auto data = x.raw_data(); auto stride = x.stride(0); @@ -243,7 +258,7 @@ void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st) const int grid_dim = cdiv(threads, block_dim); kernel::RMSNormQK<<>>( - (T*)data, stride, (const T*)w.raw_data(), head_dim, head_num, token_num, eps, 1.f / head_dim); + (T*)data, stride, (const T*)w.raw_data(), head_dim, head_num, token_num, eps, 1.f / head_dim, w_stride); }; TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); diff --git a/src/turbomind/models/llama/LlamaDenseWeight.cc b/src/turbomind/models/llama/LlamaDenseWeight.cc index 6035de7e2e..ad2b5f32cb 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.cc +++ b/src/turbomind/models/llama/LlamaDenseWeight.cc @@ -282,10 +282,10 @@ LlamaAttentionWeight::LlamaAttentionWeight(int hidden_dim, hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp_size, data_type, bias, weight_type, group_size); register_module("w_qkv", qkv, tp_rank); if (qk_norm) { - q_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; - kv_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; - register_parameter("q_norm", q_a_layernorm); - register_parameter("k_norm", kv_a_layernorm); + q_a_layernorm = Tensor{{head_num * head_dim / tp_size}, data_type, kDEVICE}; + kv_a_layernorm = Tensor{{kv_head_num * head_dim / tp_size}, data_type, kDEVICE}; + register_parameter(std::to_string(tp_rank) + ".q_norm", q_a_layernorm); + register_parameter(std::to_string(tp_rank) + ".k_norm", kv_a_layernorm); } } else { diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 169b898142..45d712e01c 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -37,9 +37,10 @@ struct ModelParam { DataType weight_type; DataType expert_weight_type; int group_size; - MLAParam mla; - bool qk_norm; - int tune_layer_num; + MLAParam mla; + bool qk_norm; + std::string qk_norm_type; // "per_head" (default) or "per_token" (MiniMax-M2 style) + int tune_layer_num; ActivationType act_type; @@ -65,6 +66,7 @@ struct MoeParam { int topk_group; std::string topk_method; + std::string scoring_func; // "softmax" or "sigmoid" int n_group; std::vector expert_num; diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index c4246b1aa7..3d869c68bf 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -8,6 +8,7 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" @@ -62,15 +63,17 @@ MoeFfnLayer::MoeFfnLayer(const ModelParam& model, const MoeParam& param, const E accum_ = {max_expert_num * kMoeGateMaxTiles, kDEVICE}; } -Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate) +Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate, bool apply_bias) { auto& weight = gate.weight; TM_CHECK_EQ(input.shape(1), weight.shape(0)); Tensor_ logits{{input.shape(0), weight.shape(1)}, kDEVICE}; linear_.Forward(input, gate, logits); sync_check_cuda_error(); - ApplyBias(logits, gate.bias, core::Context::stream().handle()); - sync_check_cuda_error(); + if (apply_bias) { + ApplyBias(logits, gate.bias, core::Context::stream().handle()); + sync_check_cuda_error(); + } return logits; } @@ -84,7 +87,9 @@ void MoeFfnLayer::Forward(ForwardParam& p) FT_CHECK(expert_num); - auto logits = Gate(p.input, moe.gate); + const bool use_sigmoid = (param_.scoring_func == "sigmoid"); + + auto logits = Gate(p.input, moe.gate, !use_sigmoid); TM_DEBUG_TENSOR(logits, "logits", 2); @@ -96,6 +101,9 @@ void MoeFfnLayer::Forward(ForwardParam& p) // dump_logits(tokens, layer_id); bool softmax = true; + if (use_sigmoid) { + softmax = false; + } if (param_.topk_method == "group_limited_greedy") { invokeMoeSoftmaxMaskTopKGroups( logits.data(), tokens, expert_num, expert_num / param_.n_group, param_.topk_group, st); @@ -103,6 +111,21 @@ void MoeFfnLayer::Forward(ForwardParam& p) softmax = false; } + const float* router_bias = nullptr; + if (use_sigmoid && moe.gate.bias) { + // The gate bias tensor may be stored in the model's data_type (e.g. float16), + // but the kernel expects float32. Convert and cache on first use. + if (!router_bias_f32_) { + const auto bias_size = moe.gate.bias.size(); + const auto src_2d = moe.gate.bias.view({1, bias_size}); + Tensor dst_2d{{1, bias_size}, kFloat, kDEVICE}; + invokeCastFloat2D(src_2d, dst_2d, st); + sync_check_cuda_error(); + router_bias_f32_ = dst_2d.buffer(); + } + router_bias = router_bias_f32_.data(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_.data(), f2E_.data(), @@ -119,6 +142,8 @@ void MoeFfnLayer::Forward(ForwardParam& p) softmax, param_.norm_topk_prob, param_.routed_scale, + router_bias, + use_sigmoid, st); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 939cd9c60e..8207ab5340 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -27,7 +27,7 @@ class MoeFfnLayer { void Combine(ForwardParam& p); private: - Tensor_ Gate(const Tensor& input, const LlamaDenseWeight& gate); + Tensor_ Gate(const Tensor& input, const LlamaDenseWeight& gate, bool apply_bias = true); void dump_logits(int token_num, int layer_id, int expert_num); @@ -57,6 +57,8 @@ class MoeFfnLayer { Tensor temp_; Tensor_ shared_scales_; + + Buffer_ router_bias_f32_; // cached float32 copy of router bias for sigmoid mode /////////////////////////////////////////////////////// }; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index d09b696d1a..71931b0a57 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -603,15 +603,32 @@ void UnifiedAttentionLayer::qk_norm(Tensor& qkv, const WeightType& weights) const auto token_num = qkv.shape(0); - auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_}); + if (model_param_.qk_norm_type == "per_token") { + // MiniMax-M2 style: RMS norm across all heads combined (per-token, not per-head). + // Q shape: (tokens, local_heads * head_dim), K shape: (tokens, local_kv_heads * head_dim) + const int q_dim = local_head_num_ * size_per_head_; + const int k_dim = local_kv_head_num_ * size_per_head_; + + auto q = qkv.view({token_num, -1}).slice({0, 0}, {-1, (int)q_dim}); + invokeRMSNorm(q, q, weights.q_a_layernorm, model_param_.norm_eps, stream); + sync_check_cuda_error(); - auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1}); - invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream); - sync_check_cuda_error(); + auto k = qkv.view({token_num, -1}).slice({0, (int)q_dim}, {-1, (int)k_dim}); + invokeRMSNorm(k, k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); + sync_check_cuda_error(); + } + else { + // Standard per-head QK norm (e.g. Qwen3) + auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_}); - auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1}); - invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); - sync_check_cuda_error(); + auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1}); + invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream); + sync_check_cuda_error(); + + auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1}); + invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); + sync_check_cuda_error(); + } check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); check_cuda_error(cudaStreamWaitEvent(stream, aux_event_)); diff --git a/src/turbomind/turbomind.cc b/src/turbomind/turbomind.cc index 45bfab1bcb..4195bed225 100644 --- a/src/turbomind/turbomind.cc +++ b/src/turbomind/turbomind.cc @@ -390,9 +390,10 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac model_param_.window_size.push_back(it->as()); } - model_param_.attn_bias = model["attn_bias"].as(0); - model_param_.qk_norm = model["qk_norm"].as(); - model_param_.group_size = model["group_size"].as(0); + model_param_.attn_bias = model["attn_bias"].as(0); + model_param_.qk_norm = model["qk_norm"].as(); + model_param_.qk_norm_type = model["qk_norm_type"].as("per_head"); + model_param_.group_size = model["group_size"].as(0); attn_param_.softmax_scale = attention["softmax_scale"].as(0); // logn attn for qwen model @@ -451,6 +452,7 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac moe_param_.topk_method = model["topk_method"].as("greedy"); moe_param_.n_group = model["moe_group_num"].as(1); moe_param_.router_bias = model["expert_router_bias"].as(); + moe_param_.scoring_func = model["scoring_func"].as("softmax"); YAML::Node expert_num = model["expert_num"]; for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { moe_param_.expert_num.push_back(it->as());