Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
94659f3
convert: add dsv4 conversion
am17an Jun 1, 2026
9295a5a
add basic setup
am17an Jun 1, 2026
76e7b32
add llm_graph_input_dsv4
am17an Jun 2, 2026
67e1f16
add save-load state
am17an Jun 3, 2026
a34132b
add sinkhorn eps - correction by @fairydreaming
am17an Jun 4, 2026
02f6dff
add rope fix
am17an Jun 4, 2026
e38b69f
cleanup dead code
am17an Jun 4, 2026
33196bf
fix bugs
am17an Jun 4, 2026
1dd87e1
support pro model: added by @fairydreaming
am17an Jun 5, 2026
9703c1c
remove redundant V cache
am17an Jun 5, 2026
177861e
Chat template
pwilkin Jun 5, 2026
53a6b19
remove debugging leftovers
sszymczy Jun 5, 2026
c377a35
Add mechanism for inlining templates based on architecture
pwilkin Jun 5, 2026
7e42764
s/deepseek-v4-flash/deepseek4/g
am17an Jun 10, 2026
54112dc
s/deepseek-v4-flash/deepseek4/g continued
sszymczy Jun 10, 2026
4fc95b6
enable graph reuse
am17an Jun 12, 2026
acdd432
enable FA
am17an Jun 12, 2026
8ecf393
fix test llama archs
am17an Jun 12, 2026
e4b24ba
rename
am17an Jun 12, 2026
0f8e29c
compatibility with antirez ds4 GGUFs
sszymczy Jun 15, 2026
6efe7de
simplified set_gguf_parameters() by calling super class method, repla…
sszymczy Jun 15, 2026
7b62f72
reserve worst-case kv-cache
am17an Jun 15, 2026
646ca19
revert max split inputs
am17an Jun 16, 2026
eb4854c
address review comments
am17an Jun 16, 2026
6d78606
add padding to enable FA
am17an Jun 16, 2026
5c4870d
pad only the final value of plan.n_kv to 256
sszymczy Jun 16, 2026
c0d0324
remove built-in cpp chat template
ngxson Jun 18, 2026
c54dfe8
cont: remove cpp built-in template
ngxson Jun 18, 2026
aa56a11
rm outdated test
ngxson Jun 18, 2026
7b2dfad
replace ggml_view_3d() with ggml_reshape_3d()
fairydreaming Jun 18, 2026
4ecddb8
only support n_seq=1 for now
am17an Jun 20, 2026
73f8890
remove unused var
am17an Jun 20, 2026
e7ab657
cont: remove unused var
am17an Jun 20, 2026
6903be7
use scale bias
am17an Jun 26, 2026
3dcda81
use correct ptr for can_reuse
am17an Jun 27, 2026
26728ef
remove gen-chat-inline-templates.py
am17an Jun 27, 2026
5c18871
simplify graph reuse
am17an Jun 27, 2026
ea4dc4f
cont: cleanup
am17an Jun 27, 2026
683fd55
remove unused inputs
am17an Jun 27, 2026
bd54e8c
enable partial checkpointing
am17an Jun 27, 2026
0994be7
add correct shape for kq_mask + set llama_model_n_swa to 0 for dsv4
am17an Jun 28, 2026
000c117
precompute source_idx + add comment about dummy write
am17an Jun 28, 2026
e16065f
support multi-seq
am17an Jun 28, 2026
2333185
remove restored_trim_pos
am17an Jun 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
"DFlashDraftModel": "qwen",
"DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
Expand Down
15 changes: 14 additions & 1 deletion conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1273,7 +1273,7 @@
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
if (n_experts := self.find_hparam(["num_local_experts", "num_experts", "n_routed_experts"], optional=True)) is not None:
self.gguf_writer.add_expert_count(n_experts)
logger.info(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
Expand All @@ -1291,6 +1291,8 @@
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
elif score_func == "sqrtsoftplus":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SQRTSOFTPLUS)
else:
raise ValueError(f"Unsupported expert score gating function value: {score_func}")
logger.info(f"gguf: expert score gating function = {score_func}")
Expand Down Expand Up @@ -1341,15 +1343,15 @@

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1346 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1346:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1347 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1347:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

tokpre = self.get_vocab_base_pre(tokenizer)

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1351 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1351:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

Check warning on line 1352 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1352:52: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

Check warning on line 1354 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1354:64: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

for i in range(vocab_size):
if i not in reverse_vocab:
Expand All @@ -1362,7 +1364,7 @@
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]

Check warning on line 1367 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1367:102: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

Expand Down Expand Up @@ -1726,14 +1728,14 @@
def _set_vocab_hybriddna(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1731 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1731:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1732 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1732:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1734 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1734:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
# k-mer's own id (llama.cpp strips it on detokenization)
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]

Check warning on line 1738 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1738:39: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
Expand Down Expand Up @@ -2600,6 +2602,17 @@
return cls._wrap_fn(func)(*args, **kwargs)


# Teach LazyTorchTensor how to resolve the "F8_E8M0" dtype name.
if not hasattr(torch, "float8_e8m0fnu"):
    # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
    # that know the format can decode them explicitly.
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
else:
    _torch_float8_e8m0 = torch.float8_e8m0fnu
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
    # Both numpy-side maps (regular and byteswap) use uint8 for this dtype.
    # NOTE(review): presumably because numpy has no native e8m0 type - confirm.
    LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8
    LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8


def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
Expand Down
312 changes: 311 additions & 1 deletion conversion/deepseek.py

Large diffs are not rendered by default.

104 changes: 102 additions & 2 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class LLM:
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval"
HASH_LAYER_COUNT = "{arch}.hash_layer_count"
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
Expand Down Expand Up @@ -179,8 +180,12 @@ class Attention:
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale"
OUTPUT_GROUP_COUNT = "{arch}.attention.output_group_count"
OUTPUT_LORA_RANK = "{arch}.attention.output_lora_rank"
OUTPUT_SCALE = "{arch}.attention.output_scale"
VALUE_SCALE = "{arch}.attention.value_scale"
COMPRESS_RATIOS = "{arch}.attention.compress_ratios"
COMPRESS_ROPE_FREQ_BASE = "{arch}.attention.compress_rope_freq_base"
TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
Expand All @@ -195,6 +200,11 @@ class Indexer:
KEY_LENGTH = "{arch}.attention.indexer.key_length"
TOP_K = "{arch}.attention.indexer.top_k"

class HyperConnection:
    # Metadata key templates for the hyper-connection settings. The "{arch}"
    # placeholder is filled in by callers through str.format(arch=...).
    COUNT = "{arch}.hyper_connection.count"
    SINKHORN_ITERATIONS = "{arch}.hyper_connection.sinkhorn_iterations"
    EPSILON = "{arch}.hyper_connection.epsilon"

class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
Expand Down Expand Up @@ -469,6 +479,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
DEEPSEEK32 = auto()
DEEPSEEK4 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
Expand Down Expand Up @@ -554,6 +565,9 @@ class MODEL_TENSOR(IntEnum):
DENSE_2_OUT = auto() # embeddinggemma 2_Dense
DENSE_3_OUT = auto() # embeddinggemma 3_Dense
OUTPUT_NORM = auto()
HC_HEAD_FN = auto()
HC_HEAD_BASE = auto()
HC_HEAD_SCALE = auto()
ROPE_FREQS = auto()
ROPE_FACTORS_LONG = auto()
ROPE_FACTORS_SHORT = auto()
Expand Down Expand Up @@ -593,6 +607,7 @@ class MODEL_TENSOR(IntEnum):
FFN_DOWN_CHEXP = auto()
FFN_UP_CHEXP = auto()
FFN_EXP_PROBS_B = auto()
FFN_GATE_TID2EID = auto()
MOE_LATENT_DOWN = auto() # nemotron 3 super
MOE_LATENT_UP = auto() # nemotron 3 super
ATTN_Q_NORM = auto()
Expand Down Expand Up @@ -680,6 +695,20 @@ class MODEL_TENSOR(IntEnum):
ATTN_V_B = auto()
ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto()
ATTN_KV = auto()
ATTN_KV_NORM = auto()
ATTN_OUT_A = auto()
ATTN_OUT_B = auto()
HC_ATTN_FN = auto()
HC_ATTN_BASE = auto()
HC_ATTN_SCALE = auto()
HC_FFN_FN = auto()
HC_FFN_BASE = auto()
HC_FFN_SCALE = auto()
ATTN_COMPRESSOR_WKV = auto()
ATTN_COMPRESSOR_WGATE = auto()
ATTN_COMPRESSOR_APE = auto()
ATTN_COMPRESSOR_NORM = auto()
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
Expand Down Expand Up @@ -741,6 +770,10 @@ class MODEL_TENSOR(IntEnum):
INDEXER_PROJ = auto()
INDEXER_ATTN_K = auto()
INDEXER_ATTN_Q_B = auto()
INDEXER_COMPRESSOR_WKV = auto()
INDEXER_COMPRESSOR_WGATE = auto()
INDEXER_COMPRESSOR_APE = auto()
INDEXER_COMPRESSOR_NORM = auto()
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
Expand Down Expand Up @@ -1026,6 +1059,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
MODEL_ARCH.DEEPSEEK32: "deepseek32",
MODEL_ARCH.DEEPSEEK4: "deepseek4",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
Expand Down Expand Up @@ -1110,6 +1144,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 3_Dense
MODEL_TENSOR.HC_HEAD_FN: "output_hc_fn",
MODEL_TENSOR.HC_HEAD_BASE: "output_hc_base",
MODEL_TENSOR.HC_HEAD_SCALE: "output_hc_scale",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
Expand Down Expand Up @@ -1151,6 +1188,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.FFN_GATE_TID2EID: "blk.{bid}.ffn_gate_tid2eid",
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
Expand Down Expand Up @@ -1236,6 +1274,20 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.ATTN_KV: "blk.{bid}.attn_kv",
MODEL_TENSOR.ATTN_KV_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.ATTN_OUT_A: "blk.{bid}.attn_output_a",
MODEL_TENSOR.ATTN_OUT_B: "blk.{bid}.attn_output_b",
MODEL_TENSOR.HC_ATTN_FN: "blk.{bid}.hc_attn_fn",
MODEL_TENSOR.HC_ATTN_BASE: "blk.{bid}.hc_attn_base",
MODEL_TENSOR.HC_ATTN_SCALE: "blk.{bid}.hc_attn_scale",
MODEL_TENSOR.HC_FFN_FN: "blk.{bid}.hc_ffn_fn",
MODEL_TENSOR.HC_FFN_BASE: "blk.{bid}.hc_ffn_base",
MODEL_TENSOR.HC_FFN_SCALE: "blk.{bid}.hc_ffn_scale",
MODEL_TENSOR.ATTN_COMPRESSOR_WKV: "blk.{bid}.attn_compressor_kv",
MODEL_TENSOR.ATTN_COMPRESSOR_WGATE: "blk.{bid}.attn_compressor_gate",
MODEL_TENSOR.ATTN_COMPRESSOR_APE: "blk.{bid}.attn_compressor_ape",
MODEL_TENSOR.ATTN_COMPRESSOR_NORM: "blk.{bid}.attn_compressor_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
Expand Down Expand Up @@ -1297,6 +1349,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj",
MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k",
MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b",
MODEL_TENSOR.INDEXER_COMPRESSOR_WKV: "blk.{bid}.indexer_compressor_kv",
MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE: "blk.{bid}.indexer_compressor_gate",
MODEL_TENSOR.INDEXER_COMPRESSOR_APE: "blk.{bid}.indexer_compressor_ape",
MODEL_TENSOR.INDEXER_COMPRESSOR_NORM: "blk.{bid}.indexer_compressor_norm",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
Expand Down Expand Up @@ -3137,6 +3193,49 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.DEEPSEEK4: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.HC_HEAD_FN,
MODEL_TENSOR.HC_HEAD_BASE,
MODEL_TENSOR.HC_HEAD_SCALE,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_SINKS,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV,
MODEL_TENSOR.ATTN_KV_NORM,
MODEL_TENSOR.ATTN_OUT_A,
MODEL_TENSOR.ATTN_OUT_B,
MODEL_TENSOR.HC_ATTN_FN,
MODEL_TENSOR.HC_ATTN_BASE,
MODEL_TENSOR.HC_ATTN_SCALE,
MODEL_TENSOR.HC_FFN_FN,
MODEL_TENSOR.HC_FFN_BASE,
MODEL_TENSOR.HC_FFN_SCALE,
MODEL_TENSOR.ATTN_COMPRESSOR_WKV,
MODEL_TENSOR.ATTN_COMPRESSOR_WGATE,
MODEL_TENSOR.ATTN_COMPRESSOR_APE,
MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
MODEL_TENSOR.INDEXER_PROJ,
MODEL_TENSOR.INDEXER_ATTN_Q_B,
MODEL_TENSOR.INDEXER_COMPRESSOR_WKV,
MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE,
MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_TID2EID,
MODEL_TENSOR.FFN_EXP_PROBS_B,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
Expand Down Expand Up @@ -4436,8 +4535,9 @@ class GGMLQuantizationType(IntEnum):


class ExpertGatingFuncType(IntEnum):
    """Integer identifiers for the expert score gating function.

    These are the values passed to ``add_expert_gating_func`` when a model's
    scoring function is recorded. Each member name must be defined exactly
    once: ``Enum`` raises ``TypeError`` at class creation when a name is reused.
    """
    SOFTMAX = 1
    SIGMOID = 2
    # NOTE(review): value 3 is deliberately left undefined here - presumably it
    # is reserved for a gating type that is not exposed in this enum; confirm
    # against the runtime's definition before assigning it.
    SQRTSOFTPLUS = 4


# TODO: add GGMLFileType from ggml_ftype in ggml.h
Expand Down
24 changes: 24 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,9 @@ def add_leading_dense_block_count(self, length: int) -> None:
def add_full_attention_interval(self, interval: int) -> None:
    """Write ``interval`` as a uint32 under ``{arch}.full_attention_interval``."""
    key = Keys.LLM.FULL_ATTENTION_INTERVAL.format(arch=self.arch)
    self.add_uint32(key, interval)

def add_hash_layer_count(self, count: int) -> None:
    """Write ``count`` as a uint32 under ``{arch}.hash_layer_count``."""
    key = Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch)
    self.add_uint32(key, count)

def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
if isinstance(length, int):
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
Expand Down Expand Up @@ -940,6 +943,27 @@ def add_relative_attn_buckets_count(self, value: int) -> None:
def add_sliding_window(self, value: int) -> None:
    """Write ``value`` as a uint32 under ``{arch}.attention.sliding_window``."""
    key = Keys.Attention.SLIDING_WINDOW.format(arch=self.arch)
    self.add_uint32(key, value)

def add_attention_output_group_count(self, count: int) -> None:
    """Write ``count`` as a uint32 under ``{arch}.attention.output_group_count``."""
    key = Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch)
    self.add_uint32(key, count)

def add_attention_output_lora_rank(self, length: int) -> None:
    """Write ``length`` as a uint32 under ``{arch}.attention.output_lora_rank``."""
    key = Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch)
    self.add_uint32(key, length)

def add_attention_compress_ratios(self, values: Sequence[int]) -> None:
    """Write ``values`` as an array under ``{arch}.attention.compress_ratios``."""
    key = Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch)
    self.add_array(key, values)

def add_attention_compress_rope_freq_base(self, value: float) -> None:
    """Write ``value`` as a float32 under ``{arch}.attention.compress_rope_freq_base``."""
    key = Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch)
    self.add_float32(key, value)

def add_hyper_connection_count(self, count: int) -> None:
    """Write ``count`` as a uint32 under ``{arch}.hyper_connection.count``."""
    key = Keys.HyperConnection.COUNT.format(arch=self.arch)
    self.add_uint32(key, count)

def add_hyper_connection_sinkhorn_iterations(self, count: int) -> None:
    """Write ``count`` as a uint32 under ``{arch}.hyper_connection.sinkhorn_iterations``."""
    key = Keys.HyperConnection.SINKHORN_ITERATIONS.format(arch=self.arch)
    self.add_uint32(key, count)

def add_hyper_connection_epsilon(self, value: float) -> None:
    """Write ``value`` as a float32 under ``{arch}.hyper_connection.epsilon``."""
    key = Keys.HyperConnection.EPSILON.format(arch=self.arch)
    self.add_float32(key, value)

def add_attention_scale(self, value: float) -> None:
    """Write ``value`` as a float32 under ``{arch}.attention.scale``."""
    key = Keys.Attention.SCALE.format(arch=self.arch)
    self.add_float32(key, value)

Expand Down
Loading
Loading