|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +import re |
3 | 4 | from typing import Iterable, TYPE_CHECKING |
4 | 5 |
|
5 | 6 | import torch |
@@ -55,3 +56,122 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter |
55 | 56 | return |
56 | 57 |
|
57 | 58 | yield from super().modify_tensors(data_torch, name, bid) |
| 59 | + |
| 60 | + |
@ModelBase.register("Cohere2MoeForCausalLM")
class Cohere2MoeModel(TextModel):
    """Converter for ``Cohere2MoeForCausalLM`` checkpoints.

    Per-expert MLP weights are buffered per layer and emitted as one stacked
    tensor per projection once every expert of that layer has been seen.
    Layers whose index is at or beyond ``num_hidden_layers`` are classified as
    MTP layers (driven by ``num_nextn_predict_layers``) and are kept or
    dropped according to the ``no_mtp`` / ``mtp_only`` flags.
    """

    model_arch = gguf.MODEL_ARCH.COHERE2MOE
    # Number of regular (non-MTP) layers. Filled in by index_tensors() and
    # mirrored on the class because filter_tensors() is a classmethod.
    _n_main_layers: int | None = None
    _expert_tensor_re = re.compile(
        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
    )
    # Extracts the layer index from the start of a tensor name.
    _layer_prefix_re = re.compile(r"model\.layers\.(\d+)\.")
    # Expert projections, in the order the merged tensors are emitted.
    _expert_proj_names = ("down_proj", "gate_proj", "up_proj")

    def __init__(self, *args, **kwargs):
        """Initialise the converter and the per-layer expert buffers.

        When ``num_nextn_predict_layers`` is positive and ``no_mtp`` is not
        set, the block count is grown by that many layers and the tensor name
        map is rebuilt for the new count.
        """
        super().__init__(*args, **kwargs)
        # `or 0` also covers a key that is present but set to None.
        n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
        if n_nextn > 0 and not self.no_mtp:
            self.block_count += n_nextn
            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        # One dict per block: expert tensor name -> tensor, held until the layer is complete.
        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]

    def _set_vocab_gpt2(self) -> None:
        """Write the base vocabulary as a ``gpt2`` tokenizer, then the special vocab with merges."""
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        """Write the base parameters followed by the Cohere2 MoE specific ones.

        Raises:
            KeyError: if a required hyperparameter (e.g. ``intermediate_size``,
                ``logit_scale``, ``sliding_window``, ``layer_types``,
                ``vocab_size``, ``head_dim``) is missing.
            ValueError: if shared experts are configured with a combination
                strategy other than ``"average"``.
        """
        hparams = self.hparams
        expert_intermediate_size = hparams["intermediate_size"]

        # The number of leading dense layers comes from `mlp_layer_types` when
        # present (index of the first non-"dense" entry, or the full length if
        # all are dense); otherwise from `first_k_dense_replace`.
        mlp_layer_types = hparams.get("mlp_layer_types")
        if mlp_layer_types is not None:
            n_dense_lead = next(
                (i for i, t in enumerate(mlp_layer_types) if t != "dense"),
                len(mlp_layer_types),
            )
        else:
            # `or 0`: dict.get only applies its default when the key is absent,
            # so an explicit null in the config must be normalised here.
            n_dense_lead = int(hparams.get("first_k_dense_replace", 0) or 0)

        super().set_gguf_parameters()

        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))

        # Null-safe, matching how __init__ reads num_nextn_predict_layers.
        num_shared_experts = int(hparams.get("num_shared_experts", 0) or 0)
        if num_shared_experts > 0:
            if hparams.get("shared_expert_combination_strategy", "average") != "average":
                raise ValueError("Cohere2 MoE only supports average shared expert combination")
            self.gguf_writer.add_expert_shared_count(num_shared_experts)
            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)

        n_nextn = int(hparams.get("num_nextn_predict_layers", 0) or 0)
        if n_nextn > 0 and not self.no_mtp:
            self.gguf_writer.add_nextn_predict_layers(n_nextn)

        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)

    def index_tensors(self, remote_hf_model_id: str | None = None):
        """Record the main layer count, then delegate to the base implementation.

        ``num_hidden_layers`` is looked up with ``text_config`` entries taking
        precedence over top-level ones. The value is stored on the class as
        well as the instance so the ``filter_tensors`` classmethod can see it.
        """
        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
        self._n_main_layers = hparams.get("num_hidden_layers")
        type(self)._n_main_layers = self._n_main_layers
        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)

    @classmethod
    def filter_tensors(cls, item):
        """Apply the base filter, then the ``no_mtp`` / ``mtp_only`` selection.

        Returns ``None`` to drop the tensor, otherwise the ``(name, gen)`` pair
        produced by the base filter. The MTP selection only applies once
        ``_n_main_layers`` is known.
        """
        if (titem := super().filter_tensors(item)) is None:
            return None
        name, gen = titem

        if cls._n_main_layers is not None:
            # A tensor is MTP when its layer index is past the main layers.
            m = cls._layer_prefix_re.match(name)
            is_mtp = m is not None and int(m.group(1)) >= cls._n_main_layers
            if is_mtp and cls.no_mtp:
                return None
            # In mtp_only mode the embedding, final norm and output head are still kept.
            if cls.mtp_only and not is_mtp and name not in (
                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
            ):
                return None

        return name, gen

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Drop zero biases, merge per-expert weights, pass everything else through.

        Per-expert tensors are buffered; once all ``num_experts`` x 3
        projections of a layer are present they are stacked along a new
        leading dimension and forwarded as
        ``model.layers.{i}.mlp.experts.{proj}.weight``.

        Raises:
            ValueError: if a ``.bias`` tensor contains a non-zero value.
        """
        if name.endswith(".bias"):
            if torch.any(data_torch != 0):
                raise ValueError(f"Bias tensor {name!r} is not zero.")
            logger.debug(f"Skipping bias tensor {name!r}.")
            return

        m = self._expert_tensor_re.fullmatch(name)
        if m is None:
            yield from super().modify_tensors(data_torch, name, bid)
            return

        n_experts = self.hparams["num_experts"]
        layer_idx = int(m.group(1))
        assert bid is None or bid == layer_idx

        pending = self._experts[layer_idx]
        pending[name] = data_torch

        # Cheap gate: the layer cannot be complete with fewer buffered tensors
        # than expected names, so skip building the name lists until then.
        if len(pending) < n_experts * len(self._expert_proj_names):
            return

        prefix = f"model.layers.{layer_idx}.mlp.experts"
        names_by_proj = {
            w_name: [f"{prefix}.{xid}.{w_name}.weight" for xid in range(n_experts)]
            for w_name in self._expert_proj_names
        }
        if not all(ename in pending for enames in names_by_proj.values() for ename in enames):
            return

        for w_name, enames in names_by_proj.items():
            # pop() releases the buffered per-expert tensors as they are merged.
            stacked = torch.stack([pending.pop(ename) for ename in enames], dim=0)
            merged_name = f"{prefix}.{w_name}.weight"
            yield from super().modify_tensors(stacked, merged_name, layer_idx)

    def prepare_tensors(self):
        """Run the base preparation, then fail if any expert tensor was left unmerged.

        Raises:
            ValueError: listing the names of expert tensors still buffered.
        """
        super().prepare_tensors()

        experts = [k for d in self._experts for k in d.keys()]
        if experts:
            raise ValueError(f"Unprocessed experts: {experts}")
0 commit comments