
Commit e626de2

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	docs/ops.md
#	docs/ops/WebGPU.csv
#	embd_res/templates/stepfun-ai-Step-3.5-Flash.jinja
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl
#	src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
#	tools/mtmd/CMakeLists.txt

2 parents: 07c45ce + 77d6ae4

26 files changed: 618 additions, 29 deletions

common/chat.cpp

Lines changed: 6 additions & 5 deletions
@@ -2057,6 +2057,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
         if (has_reasoning_content && has_tool_calls) {
             auto adjusted_message = msg;
             adjusted_message["thinking"] = msg.at("reasoning_content");
+            adjusted_message.erase("content");
             adjusted_messages.push_back(adjusted_message);
         } else {
             adjusted_messages.push_back(msg);
@@ -3154,15 +3155,15 @@ static common_chat_params common_chat_templates_apply_jinja(
     }
 
     // Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
-    // Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
-    // Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
+    // Detect via XML markers: <tool_call>, <function=...>, and <parameter=...> blocks.
+    // Also matches Step-3.5-Flash and Nemotron 3 Nano, which use the same output format.
     if (src.find("<tool_call>") != std::string::npos &&
-        src.find("<function>") != std::string::npos &&
         src.find("<function=") != std::string::npos &&
-        src.find("<parameters>") != std::string::npos &&
         src.find("<parameter=") != std::string::npos) {
         workaround::func_args_not_string(params.messages);
-        // Nemotron 3 Nano 30B A3B
+        // Models with <think> support (Step-3.5-Flash, Nemotron 3 Nano) use the
+        // Nemotron v3 PEG parser for streaming and schema-aware parameter parsing.
+        // Qwen3-Coder has no <think> in its template.
         if (src.find("<think>") != std::string::npos) {
             return common_chat_params_init_nemotron_v3(tmpl, params);
         }
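For context, here is a minimal sketch of the resulting detection logic, condensed from the hunk above (the helper names are illustrative, not the actual functions in common/chat.cpp):

// Condensed sketch of the template detection above (illustrative helper names).
#include <string>

static bool is_qwen3_coder_xml(const std::string & src) {
    // all three opening markers must appear in the Jinja template source
    return src.find("<tool_call>")  != std::string::npos &&
           src.find("<function=")  != std::string::npos &&
           src.find("<parameter=") != std::string::npos;
}

static bool wants_nemotron_v3(const std::string & src) {
    // Step-3.5-Flash and Nemotron 3 Nano also emit <think>; Qwen3-Coder does not,
    // so the <think> marker is what routes a template to the Nemotron v3 PEG parser.
    return is_qwen3_coder_xml(src) && src.find("<think>") != std::string::npos;
}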

convert_hf_to_gguf.py

Lines changed: 67 additions & 0 deletions
@@ -1163,6 +1163,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
+        if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a":
+            # ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat
+            res = "jais-2"
         if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
             # ref: https://huggingface.co/WisdomShell/CodeShell-7B
             res = "codeshell"
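How `chkhsh` values like the one above are produced: `get_vocab_base_pre()` hashes the token ids that the tokenizer emits for a fixed probe string. A hedged paraphrase of that logic (probe text and tokenizer loading elided):

# Sketch of the chkhsh computation in get_vocab_base_pre(), paraphrased:
# the pre-tokenizer is identified by hashing the token ids produced for a
# fixed probe string `chktxt`.
from hashlib import sha256

def pretok_hash(tokenizer, chktxt: str) -> str:
    chktok = tokenizer.encode(chktxt)                # token ids of the probe text
    return sha256(str(chktok).encode()).hexdigest()  # compared against known digests

# e.g. inceptionai/Jais-2-8B-Chat hashes to "bc5108ee..." -> res = "jais-2"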
@@ -3730,6 +3733,13 @@ class Ernie4_5Model(TextModel):
     def set_vocab(self):
         self._set_vocab_sentencepiece()
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+            if "add_prefix_space" in tokenizer_config_json:
+                self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
@@ -3739,6 +3749,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if (head_dim := self.hparams.get("head_dim")) is None:
             head_dim = self.hparams["hidden_size"] // num_heads
 
+        if "mlp_AR" in name or "vision_model" in name:
+            # skip vision model and projector tensors
+            return
+
         if "ernie." in name:
             name = name.replace("ernie.", "model.")
         # split the qkv weights
@@ -3848,6 +3862,48 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("PaddleOCRVLForConditionalGeneration")
+class PaddleOCRModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.PADDLEOCR
+
+
+@ModelBase.register("PaddleOCRVisionModel")
+class PaddleOCRVisionModel(MmprojModel):
+    # PaddleOCR-VL uses a modified version of Siglip
+    min_pixels: int = 0
+    max_pixels: int = 0
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.min_pixels = self.preprocessor_config["min_pixels"]
+        self.max_pixels = self.preprocessor_config["max_pixels"]
+        self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR)
+        self.gguf_writer.add_vision_max_pixels(self.max_pixels)
+        self.gguf_writer.add_vision_min_pixels(self.min_pixels)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        name = name.replace("visual.", "model.")
+
+        if "vision_model" in name or "mlp_AR" in name:
+            if "packing_position_embedding" in name:
+                return  # unused
+            elif "vision_model.head" in name:
+                # we don't yet support image embeddings for this model
+                return
+            else:
+                yield from super().modify_tensors(data_torch, name, bid)
+        return  # skip other tensors
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -8633,6 +8689,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Jais2ForCausalLM")
+class Jais2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.JAIS2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"])
+        self.gguf_writer.add_rope_dimension_count(head_dim)
+
+
 @ModelBase.register("JAISLMHeadModel")
 class JaisModel(TextModel):
     model_arch = gguf.MODEL_ARCH.JAIS
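The `head_dim` fallback follows the usual HF convention of dividing the hidden size across attention heads; a worked example with hypothetical hyperparameters:

# Hypothetical Jais-2 hparams, for illustration only.
hparams = {"hidden_size": 4096, "num_attention_heads": 32}
head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"])
print(head_dim)  # 128 -> written to GGUF as the RoPE dimension count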

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -114,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "gemma",     "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
     {"name": "gemma-2",   "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
+    {"name": "jais-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", },
     {"name": "t5",        "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },

ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp

Lines changed: 5 additions & 3 deletions
@@ -57,6 +57,8 @@ layout (push_constant) uniform parameter
     uint nbi1;
     uint ne11;
 #else
+    uint base_work_group_z;
+    uint num_batches;
     uint k_split;
     uint ne02;
     uint ne12;
@@ -108,7 +110,7 @@ void main() {
     const uint ic = gl_WorkGroupID.y;
 
 #ifdef MUL_MAT_ID
-    const uint expert_idx = gl_GlobalInvocationID.z;
+    const uint expert_idx = gl_WorkGroupID.z;
     if (ic * BN >= data_expert_count[expert_idx]) {
         return;
     }
@@ -118,7 +120,7 @@ void main() {
 #endif
 
 #ifndef MUL_MAT_ID
-    const uint batch_idx = gl_GlobalInvocationID.z;
+    const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
 
     const uint i13 = batch_idx / p.ne12;
     const uint i12 = batch_idx % p.ne12;
@@ -276,7 +278,7 @@ void main() {
     const uint dc = ic * BN + warp_c * WN;
 
 #ifndef MUL_MAT_ID
-    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
 #endif
 
     [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
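The two new push constants let the host split a large batch dimension across several dispatches: `gl_NumWorkGroups.z` only describes the current dispatch, so the shader now takes the true batch count (`num_batches`) and a per-dispatch z offset (`base_work_group_z`) explicitly. A hedged host-side sketch of the pattern this enables (hypothetical variable names, not the actual ggml-vulkan dispatch code):

// Hypothetical host loop: split the batch dimension when it exceeds the
// device limit on workgroup count in z, offsetting each chunk via push constants.
#include <algorithm>
const uint32_t max_z = props.limits.maxComputeWorkGroupCount[2];
for (uint32_t base = 0; base < num_batches; base += max_z) {
    push.base_work_group_z = base;         // added to gl_WorkGroupID.z in the shader
    push.num_batches       = num_batches;  // replaces gl_NumWorkGroups.z in the shader
    const uint32_t nz = std::min(num_batches - base, max_z);
    vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push), &push);
    vkCmdDispatch(cmd, groups_x, groups_y, nz);
}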

gguf-py/gguf/constants.py

Lines changed: 32 additions & 0 deletions
@@ -435,6 +435,7 @@ class MODEL_ARCH(IntEnum):
     T5 = auto()
     T5ENCODER = auto()
     JAIS = auto()
+    JAIS2 = auto()
     NEMOTRON = auto()
     NEMOTRON_H = auto()
     NEMOTRON_H_MOE = auto()
@@ -472,6 +473,7 @@ class MODEL_ARCH(IntEnum):
     RND1 = auto()
     PANGU_EMBED = auto()
     MISTRAL3 = auto()
+    PADDLEOCR = auto()
     MIMO2 = auto()
     STEP35 = auto()
     LLAMA_EMBED = auto()
@@ -874,6 +876,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.T5: "t5",
     MODEL_ARCH.T5ENCODER: "t5encoder",
     MODEL_ARCH.JAIS: "jais",
+    MODEL_ARCH.JAIS2: "jais2",
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.NEMOTRON_H: "nemotron_h",
     MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
@@ -912,6 +915,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.RND1: "rnd1",
     MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
     MODEL_ARCH.MISTRAL3: "mistral3",
+    MODEL_ARCH.PADDLEOCR: "paddleocr",
     MODEL_ARCH.MIMO2: "mimo2",
     MODEL_ARCH.STEP35: "step35",
     MODEL_ARCH.LLAMA_EMBED: "llama-embed",
@@ -2817,6 +2821,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.JAIS2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.NEMOTRON: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -3171,6 +3188,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.PADDLEOCR: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.FALCON_H1: [
         # Token embedding
         MODEL_TENSOR.TOKEN_EMBD,
@@ -3832,6 +3863,7 @@ class VisionProjectorType:
     VOXTRAL = "voxtral"
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
+    PADDLEOCR = "paddleocr"
     KIMIK25 = "kimik25"
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
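A quick illustrative consistency check of the new registrations, assuming gguf-py's public tables from gguf/constants.py. Note that JAIS2's tensor list carries no FFN_GATE, i.e. a non-gated feed-forward, while PADDLEOCR's LLaMA-style block does:

# Illustrative sanity check against the tables added above.
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSOR, MODEL_TENSORS

assert MODEL_ARCH_NAMES[MODEL_ARCH.JAIS2] == "jais2"
assert MODEL_ARCH_NAMES[MODEL_ARCH.PADDLEOCR] == "paddleocr"
assert MODEL_TENSOR.FFN_GATE not in MODEL_TENSORS[MODEL_ARCH.JAIS2]      # non-gated FFN
assert MODEL_TENSOR.FFN_GATE in MODEL_TENSORS[MODEL_ARCH.PADDLEOCR]      # gated FFN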

gguf-py/gguf/tensor_mapping.py

Lines changed: 3 additions & 0 deletions
@@ -1325,6 +1325,7 @@ class TensorNameMap:
         "multi_modal_projector.linear_{bid}",
         "mm_projector.proj.linear_{bid}",  # Kimi-K2.5
         "visual.merger.mlp.{bid}",         # qwen2vl
+        "mlp_AR.linear_{bid}",             # PaddleOCR-VL
         "merger.mlp.{bid}",
     ),
 
@@ -1574,6 +1575,7 @@ class TensorNameMap:
         "mm_projector.pre_norm",  # Kimi-K2.5
         "pre_mm_projector_norm",
         "model.vision.linear_proj.norm1",  # cogvlm
+        "mlp_AR.pre_norm",                 # PaddleOCR-VL
         "merger.ln_q",
     ),
 
@@ -1599,6 +1601,7 @@ class TensorNameMap:
 
     MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
         "resampler.attn.out_proj",
+        "model.vision_model.head.attention.out_proj",
     ),
 
     MODEL_TENSOR.V_RESMPL_KV: (
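For reference, the `{bid}` placeholders in these entries are instantiated once per block id when `TensorNameMap` is built, so the new PaddleOCR-VL names match checkpoint tensors block by block. A minimal sketch of that expansion (mirroring the mapping logic, not a verbatim excerpt):

# Sketch of {bid} template expansion, as TensorNameMap does internally.
template = "mlp_AR.linear_{bid}"        # new PaddleOCR-VL projector entry
for bid in range(2):
    # each instantiated name is keyed back to the GGUF tensor for that block
    print(template.format(bid=bid))     # mlp_AR.linear_0, mlp_AR.linear_1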

src/llama-arch.cpp

Lines changed: 17 additions & 0 deletions
@@ -79,6 +79,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5,             "t5"             },
     { LLM_ARCH_T5ENCODER,      "t5encoder"      },
     { LLM_ARCH_JAIS,           "jais"           },
+    { LLM_ARCH_JAIS2,          "jais2"          },
     { LLM_ARCH_NEMOTRON,       "nemotron"       },
     { LLM_ARCH_NEMOTRON_H,     "nemotron_h"     },
     { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
@@ -120,6 +121,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RND1,           "rnd1"           },
     { LLM_ARCH_PANGU_EMBED,    "pangu-embedded" },
     { LLM_ARCH_MISTRAL3,       "mistral3"       },
+    { LLM_ARCH_PADDLEOCR,      "paddleocr"      },
     { LLM_ARCH_MIMO2,          "mimo2"          },
     { LLM_ARCH_STEP35,         "step35"         },
     { LLM_ARCH_LLAMA_EMBED,    "llama-embed"    },
@@ -738,6 +740,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_ERNIE4_5:
+        case LLM_ARCH_PADDLEOCR:
         case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
@@ -1791,6 +1794,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_FFN_GATE,
                 LLM_TENSOR_FFN_DOWN,
             };
+        case LLM_ARCH_JAIS2:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_DOWN,
+            };
         case LLM_ARCH_NEMOTRON_H:
             return {
                 LLM_TENSOR_TOKEN_EMBD,

src/llama-arch.h

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,7 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_JAIS2,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_NEMOTRON_H,
     LLM_ARCH_NEMOTRON_H_MOE,
@@ -124,6 +125,7 @@ enum llm_arch {
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_MISTRAL3,
+    LLM_ARCH_PADDLEOCR,
     LLM_ARCH_MIMO2,
     LLM_ARCH_STEP35,
     LLM_ARCH_LLAMA_EMBED,

src/llama-graph.cpp

Lines changed: 6 additions & 5 deletions
@@ -1128,8 +1128,8 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
-            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
+            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
@@ -1724,7 +1724,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
     ggml_tensor * cur;
 
-    if (cparams.flash_attn && kq_b == nullptr) {
+    const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
+    if (use_flash_attn) {
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
 
         if (v_trans) {
@@ -1984,8 +1985,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
-            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
+            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
        }
    }
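`ggml_mul_mat_set_prec()` is the public ggml hook used in both hunks above: it marks a single matmul node for F32 accumulation instead of changing precision globally, so only the architectures that need it pay the cost. A minimal sketch (context setup elided; the weight and input tensors are illustrative):

// Request full-precision accumulation for one matmul node in the graph.
#include "ggml.h"

ggml_tensor * cur = ggml_mul_mat(ctx, w_down, hidden);  // illustrative tensors
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);              // avoid F16 accumulator issues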
