Skip to content

Commit 4988f6e

Browse files
michaelw9999 and CISC
authored
Add arch support for cohere2-MoE (ggml-org#24260)
* Add arch support for cohere2-MoE
* Removed redundant gating_func checks
* Changed ffn lookup to prefer prefix_dense_intermediate_size
* Renamed arch to cohere2moe
* Removed redundant lmhead check and chat template changes
* Removed lm_head.weight check from modify tensors, load output tensor not required, fallback to token_embd.weight
* Changed to (routed+shared)*0.5 for shared expert combined avg
* fixed sliding_window_pattern issue and pattern
* Fixed transformers crash 'first_k_dense_replace' error
* Remove comment
* Removed cohere2-moe as a tokenizer type and kept as tiny_aya. Renamed North-Mini-Code-1.0.
* Fixed MTP fail, changed to use iSWA
* Fixed remaining todos: cohere2moe renamed, changed swa parsing to use get_key_or_arr, removed extra get_arr use
* Force metadata usage
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Remove Cohere2 checkpoint comment
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Remove MTP comment
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Regenerate cohere2moe tokenizer hash
* Add cohere2moe to Llama Model Saver supported list
* Check for zero-bias tensors and add support for Command to use LayerNorm
* Map expert_selection_fn to sigmoid in base.py instead of command.py
* use bools for foundnorm/foundnormrms
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---------
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent f05cf46 commit 4988f6e

13 files changed

Lines changed: 632 additions & 7 deletions

conversion/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"ChatGLMModel": "chatglm",
4141
"CodeShellForCausalLM": "codeshell",
4242
"CogVLMForCausalLM": "cogvlm",
43+
"Cohere2MoeForCausalLM": "command_r",
4344
"Cohere2ForCausalLM": "command_r",
4445
"CohereForCausalLM": "command_r",
4546
"DbrxForCausalLM": "dbrx",

conversion/base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,7 +1195,7 @@ def set_gguf_parameters(self):
11951195
self.gguf_writer.add_embedding_length(n_embd)
11961196
logger.info(f"gguf: embedding length = {n_embd}")
11971197

1198-
if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
1198+
if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
11991199
self.gguf_writer.add_feed_forward_length(n_ff)
12001200
logger.info(f"gguf: feed forward length = {n_ff}")
12011201

@@ -1280,7 +1280,7 @@ def set_gguf_parameters(self):
12801280
self.gguf_writer.add_expert_group_used_count(n_group_used)
12811281
logger.info(f"gguf: expert groups used count = {n_group_used}")
12821282

1283-
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
1283+
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
12841284
if score_func == "sigmoid":
12851285
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
12861286
elif score_func == "softmax":
@@ -1495,6 +1495,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
14951495
if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
14961496
# ref: https://huggingface.co/CohereLabs/tiny-aya-base
14971497
res = "tiny_aya"
1498+
if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
1499+
# ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
1500+
res = "cohere2moe"
14981501
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
14991502
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
15001503
res = "qwen2"

conversion/command_r.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import re
34
from typing import Iterable, TYPE_CHECKING
45

56
import torch
@@ -55,3 +56,122 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
5556
return
5657

5758
yield from super().modify_tensors(data_torch, name, bid)
59+
60+
61+
@ModelBase.register("Cohere2MoeForCausalLM")
class Cohere2MoeModel(TextModel):
    """Converter for ``Cohere2MoeForCausalLM`` checkpoints (GGUF arch ``COHERE2MOE``).

    On top of the base text-model conversion this class:

    * extends the block count by ``num_nextn_predict_layers`` unless
      ``no_mtp`` is set,
    * writes the Cohere2-MoE specific GGUF metadata (sliding window, expert
      sizes, leading dense block count, shared experts, ...),
    * drops all-zero ``.bias`` tensors and rejects non-zero ones,
    * stacks the per-expert ``down_proj`` / ``gate_proj`` / ``up_proj``
      weights of each layer into one tensor per projection.
    """

    model_arch = gguf.MODEL_ARCH.COHERE2MOE
    # Value of ``num_hidden_layers``; layers with an index >= this value are
    # treated as MTP layers by ``filter_tensors``. It is mirrored onto the class
    # in ``index_tensors`` because ``filter_tensors`` is a classmethod.
    _n_main_layers: int | None = None
    _expert_tensor_re = re.compile(
        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
    )
    # Precompiled once instead of handing the pattern to ``re.match`` per tensor.
    _mtp_layer_re = re.compile(r"model\.layers\.(\d+)\.")
    # Per-expert projections that get stacked, in emission order.
    _expert_proj_names = ("down_proj", "gate_proj", "up_proj")

    def __init__(self, *args, **kwargs):
        """Initialise the converter and the per-layer expert staging buffers."""
        super().__init__(*args, **kwargs)
        if (n_nextn := self._get_nextn_layer_count()) > 0 and not self.no_mtp:
            # The extra prediction layers are converted as additional blocks,
            # so the tensor name map has to be rebuilt for the larger count.
            self.block_count += n_nextn
            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        # One dict per block: original tensor name -> tensor, held until every
        # expert of that block has been seen.
        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]

    def _get_nextn_layer_count(self) -> int:
        """Return ``num_nextn_predict_layers`` as an int, treating missing/``None`` as 0."""
        return int(self.hparams.get("num_nextn_predict_layers", 0) or 0)

    def _set_vocab_gpt2(self) -> None:
        """Write a ``gpt2`` tokenizer (tokens, types, pre-tokenizer, merges) to the GGUF."""
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        """Write the base metadata followed by the Cohere2-MoE specific keys.

        Raises:
            KeyError: if a required hyper-parameter (``intermediate_size``,
                ``logit_scale``, ``sliding_window``, ``layer_types``,
                ``vocab_size``, ``head_dim``) is missing.
            ValueError: if shared experts are present and
                ``shared_expert_combination_strategy`` is not ``"average"``.
        """
        hparams = self.hparams
        expert_intermediate_size = hparams["intermediate_size"]
        mlp_layer_types = hparams.get("mlp_layer_types")
        # ``or 0`` so that an explicit null in the config behaves like a missing key.
        n_dense_lead = hparams.get("first_k_dense_replace", 0) or 0
        if mlp_layer_types is not None:
            # Number of leading "dense" entries; all of them if no other type appears.
            n_dense_lead = next(
                (i for i, t in enumerate(mlp_layer_types) if t != "dense"),
                len(mlp_layer_types),
            )

        super().set_gguf_parameters()

        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))

        # None-safe, matching the handling of num_nextn_predict_layers in __init__.
        num_shared_experts = int(hparams.get("num_shared_experts", 0) or 0)
        if num_shared_experts > 0:
            if hparams.get("shared_expert_combination_strategy", "average") != "average":
                raise ValueError("Cohere2 MoE only supports average shared expert combination")
            self.gguf_writer.add_expert_shared_count(num_shared_experts)
            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)

        if (n_nextn := self._get_nextn_layer_count()) > 0 and not self.no_mtp:
            self.gguf_writer.add_nextn_predict_layers(n_nextn)

        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)

    def index_tensors(self, remote_hf_model_id: str | None = None):
        """Record ``num_hidden_layers`` before delegating to the base indexer.

        ``text_config`` entries, when present, override the top-level
        hyper-parameters for this lookup.
        """
        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
        self._n_main_layers = hparams.get("num_hidden_layers")
        # filter_tensors is a classmethod, so the value must live on the class.
        type(self)._n_main_layers = self._n_main_layers
        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)

    @classmethod
    def filter_tensors(cls, item):
        """Apply the base filter, then the ``no_mtp`` / ``mtp_only`` selection.

        Returns ``None`` to drop the tensor, otherwise the ``(name, gen)`` pair
        produced by the base filter.
        """
        if (titem := super().filter_tensors(item)) is None:
            return None
        name, gen = titem

        if cls._n_main_layers is not None:
            m = cls._mtp_layer_re.match(name)
            is_mtp = m is not None and int(m.group(1)) >= cls._n_main_layers
            if is_mtp and cls.no_mtp:
                return None
            # In mtp_only mode, keep the embedding, final norm and LM head
            # alongside the MTP layers.
            if cls.mtp_only and not is_mtp and name not in (
                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
            ):
                return None

        return name, gen

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Skip zero biases, stack per-expert weights, pass everything else through.

        Per-expert tensors are buffered per layer and nothing is yielded for
        them until the layer's full set has arrived; then one stacked tensor
        per projection is yielded.

        Raises:
            ValueError: if a ``.bias`` tensor contains a non-zero value.
        """
        if name.endswith(".bias"):
            if torch.any(data_torch != 0):
                raise ValueError(f"Bias tensor {name!r} is not zero.")
            logger.debug(f"Skipping bias tensor {name!r}.")
            return

        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
            n_experts = self.hparams["num_experts"]
            layer_idx = int(m.group(1))
            assert bid is None or bid == layer_idx

            layer_experts = self._experts[layer_idx]
            layer_experts[name] = data_torch

            # Cheap size check first: with fewer entries than required, the
            # expected name set cannot be a subset, so skip building it.
            if len(layer_experts) < n_experts * len(self._expert_proj_names):
                return

            expected = {
                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
                for xid in range(n_experts)
                for w_name in self._expert_proj_names
            }
            if expected.issubset(layer_experts):
                for w_name in self._expert_proj_names:
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
                        # pop() releases the staged reference as soon as it is consumed.
                        datas.append(layer_experts.pop(ename))

                    stacked = torch.stack(datas, dim=0)
                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"

                    yield from super().modify_tensors(stacked, merged_name, layer_idx)
            return

        yield from super().modify_tensors(data_torch, name, bid)

    def prepare_tensors(self):
        """Run the base tensor preparation, then check that no expert tensor is left staged.

        Raises:
            ValueError: if any per-expert tensor was buffered but never merged.
        """
        super().prepare_tensors()

        experts = [k for d in self._experts for k in d.keys()]
        if len(experts) > 0:
            raise ValueError(f"Unprocessed experts: {experts}")

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ class TOKENIZER_TYPE(IntEnum):
100100
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
101101
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
102102
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
103+
{"name": "cohere2moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
103104
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
104105
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
105106
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },

gguf-py/gguf/constants.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ class MODEL_ARCH(IntEnum):
457457
XVERSE = auto()
458458
COMMAND_R = auto()
459459
COHERE2 = auto()
460+
COHERE2MOE = auto()
460461
DBRX = auto()
461462
OLMO = auto()
462463
OLMO2 = auto()
@@ -1012,6 +1013,7 @@ class MODEL_TENSOR(IntEnum):
10121013
MODEL_ARCH.XVERSE: "xverse",
10131014
MODEL_ARCH.COMMAND_R: "command-r",
10141015
MODEL_ARCH.COHERE2: "cohere2",
1016+
MODEL_ARCH.COHERE2MOE: "cohere2moe",
10151017
MODEL_ARCH.DBRX: "dbrx",
10161018
MODEL_ARCH.OLMO: "olmo",
10171019
MODEL_ARCH.OLMO2: "olmo2",
@@ -2872,6 +2874,33 @@ class MODEL_TENSOR(IntEnum):
28722874
MODEL_TENSOR.FFN_DOWN,
28732875
MODEL_TENSOR.FFN_UP,
28742876
],
2877+
MODEL_ARCH.COHERE2MOE: [
2878+
MODEL_TENSOR.TOKEN_EMBD,
2879+
MODEL_TENSOR.OUTPUT_NORM,
2880+
MODEL_TENSOR.OUTPUT,
2881+
MODEL_TENSOR.ATTN_NORM,
2882+
MODEL_TENSOR.ATTN_Q,
2883+
MODEL_TENSOR.ATTN_K,
2884+
MODEL_TENSOR.ATTN_V,
2885+
MODEL_TENSOR.ATTN_OUT,
2886+
MODEL_TENSOR.FFN_GATE,
2887+
MODEL_TENSOR.FFN_DOWN,
2888+
MODEL_TENSOR.FFN_UP,
2889+
MODEL_TENSOR.FFN_GATE_INP,
2890+
MODEL_TENSOR.FFN_GATE_EXP,
2891+
MODEL_TENSOR.FFN_GATE_UP_EXP,
2892+
MODEL_TENSOR.FFN_DOWN_EXP,
2893+
MODEL_TENSOR.FFN_UP_EXP,
2894+
MODEL_TENSOR.FFN_GATE_SHEXP,
2895+
MODEL_TENSOR.FFN_DOWN_SHEXP,
2896+
MODEL_TENSOR.FFN_UP_SHEXP,
2897+
MODEL_TENSOR.NEXTN_EH_PROJ,
2898+
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
2899+
MODEL_TENSOR.NEXTN_ENORM,
2900+
MODEL_TENSOR.NEXTN_HNORM,
2901+
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
2902+
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
2903+
],
28752904
MODEL_ARCH.DBRX: [
28762905
MODEL_TENSOR.TOKEN_EMBD,
28772906
MODEL_TENSOR.OUTPUT_NORM,

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
6666
{ LLM_ARCH_XVERSE, "xverse" },
6767
{ LLM_ARCH_COMMAND_R, "command-r" },
6868
{ LLM_ARCH_COHERE2, "cohere2" },
69+
{ LLM_ARCH_COHERE2MOE, "cohere2moe" },
6970
{ LLM_ARCH_DBRX, "dbrx" },
7071
{ LLM_ARCH_OLMO, "olmo" },
7172
{ LLM_ARCH_OLMO2, "olmo2" },

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ enum llm_arch {
7171
LLM_ARCH_XVERSE,
7272
LLM_ARCH_COMMAND_R,
7373
LLM_ARCH_COHERE2,
74+
LLM_ARCH_COHERE2MOE,
7475
LLM_ARCH_DBRX,
7576
LLM_ARCH_OLMO,
7677
LLM_ARCH_OLMO2,

src/llama-model-saver.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
1818
case LLM_ARCH_GEMMA3:
1919
case LLM_ARCH_GEMMA3N:
2020
case LLM_ARCH_COHERE2:
21+
case LLM_ARCH_COHERE2MOE:
2122
case LLM_ARCH_OLMO2:
2223
case LLM_ARCH_BITNET:
2324
case LLM_ARCH_T5:

src/llama-model.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
157157
return new llama_model_command_r(params);
158158
case LLM_ARCH_COHERE2:
159159
return new llama_model_cohere2(params);
160+
case LLM_ARCH_COHERE2MOE:
161+
return new llama_model_cohere2moe(params);
160162
case LLM_ARCH_DBRX:
161163
return new llama_model_dbrx(params);
162164
case LLM_ARCH_OLMO:
@@ -1467,9 +1469,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
14671469
}
14681470
ml.done_getting_tensors();
14691471

1472+
// Tied NVFP4 output is valid when no separate LM-head scale tensors are present.
1473+
// If sidecar scales exist, the output weight must be an actual output tensor.
14701474
GGML_ASSERT(!(output && tok_embd &&
14711475
strcmp(output->name, tok_embd->name) == 0 &&
1472-
output->type == GGML_TYPE_NVFP4));
1476+
output->type == GGML_TYPE_NVFP4 &&
1477+
(output_s || output_in_s)));
14731478
// populate tensors_by_name
14741479
for (auto & [_, ctx_ptr] : ml.ctx_map) {
14751480
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
@@ -1844,6 +1849,7 @@ void llama_model::print_info() const {
18441849
}
18451850

18461851
if (arch == LLM_ARCH_MELLUM ||
1852+
arch == LLM_ARCH_COHERE2MOE ||
18471853
arch == LLM_ARCH_QWEN3MOE ||
18481854
arch == LLM_ARCH_OPENAI_MOE ||
18491855
arch == LLM_ARCH_QWEN3VLMOE ||
@@ -2389,6 +2395,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
23892395
case LLM_ARCH_XVERSE:
23902396
case LLM_ARCH_COMMAND_R:
23912397
case LLM_ARCH_COHERE2:
2398+
case LLM_ARCH_COHERE2MOE:
23922399
case LLM_ARCH_OLMO:
23932400
case LLM_ARCH_ARCTIC:
23942401
case LLM_ARCH_DEEPSEEK:

src/models/cohere2.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
122122
// feed-forward network
123123
{
124124
cur = build_ffn(ffn_inp,
125-
model.layers[il].ffn_up, NULL, NULL,
126-
model.layers[il].ffn_gate, NULL, NULL,
127-
model.layers[il].ffn_down, NULL, NULL,
125+
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
126+
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
127+
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
128128
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
129129
cb(cur, "ffn_out", il);
130130
}

0 commit comments

Comments
 (0)