Skip to content

Commit 4c0cfcc

Browse files
committed
Merge remote-tracking branch 'origin/master' into first-line-titles
* origin/master: webui: MCP Diagnostics improvements (ggml-org#21803) Remove extra conditional check on debug mode. (ggml-org#21798) sycl: disable Q1_0 in backend and cleanup unused variables (ggml-org#21807) mtmd: fix crash when sending image under 2x2 pixels (ggml-org#21711) mtmd: qwen3 audio support (qwen3-omni and qwen3-asr) (ggml-org#19441) convert : force f16 or f32 on step3-vl conv weights (ggml-org#21646) mtmd: add gemma 4 test (vision + audio) [no ci] (ggml-org#21806) mtmd: add Gemma 4 audio conformer encoder support (ggml-org#21421) fix: Proper messages rendering for "Show raw output" (ggml-org#21672) docs: add guide on how to add multimodal support (ggml-org#21778)
2 parents 1532fd4 + 227ed28 commit 4c0cfcc

42 files changed

Lines changed: 2445 additions & 428 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

convert_hf_to_gguf.py

Lines changed: 161 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4258,9 +4258,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
42584258
yield from super().modify_tensors(data_torch, name, bid)
42594259

42604260

4261-
@ModelBase.register("Qwen2_5OmniModel")
4262-
class Qwen25OmniModel(Qwen2VLVisionModel):
4263-
has_vision_encoder = True
4261+
class Qwen25AudioModel(MmprojModel):
42644262
has_audio_encoder = True
42654263

42664264
def __init__(self, *args, **kwargs):
@@ -4276,12 +4274,6 @@ def set_gguf_parameters(self):
42764274
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
42774275
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
42784276

4279-
def get_vision_config(self) -> dict[str, Any] | None:
4280-
return self.global_config["thinker_config"].get("vision_config")
4281-
4282-
def get_audio_config(self) -> dict[str, Any] | None:
4283-
return self.global_config["thinker_config"].get("audio_config")
4284-
42854277
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
42864278
# SinusoidsPositionEmbedding
42874279
assert self.hparams_audio is not None
@@ -4312,7 +4304,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
43124304
# this tensor is left unused in transformers code
43134305
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
43144306
return
4315-
yield from super().modify_tensors(data_torch, name, bid)
4307+
yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
4308+
4309+
return # skip other tensors
4310+
4311+
4312+
@ModelBase.register("Qwen2_5OmniModel")
4313+
class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel):
4314+
has_audio_encoder = True
4315+
has_vision_encoder = True
4316+
4317+
def get_vision_config(self) -> dict[str, Any] | None:
4318+
return self.global_config["thinker_config"].get("vision_config")
4319+
4320+
def get_audio_config(self) -> dict[str, Any] | None:
4321+
return self.global_config["thinker_config"].get("audio_config")
4322+
4323+
def set_gguf_parameters(self):
4324+
super().set_gguf_parameters()
4325+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
4326+
4327+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4328+
if "visual." in name:
4329+
yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
4330+
elif "audio_tower." in name:
4331+
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
4332+
return # skip other tensors
43164333

43174334

43184335
@ModelBase.register("InternVisionModel")
@@ -4816,7 +4833,10 @@ def set_gguf_parameters(self):
48164833
class Qwen3VLVisionModel(MmprojModel):
48174834
def __init__(self, *args, **kwargs):
48184835
super().__init__(*args, **kwargs)
4819-
assert self.hparams_vision is not None
4836+
if self.hparams_vision is None:
4837+
logger.info("No vision config found, skipping vision tensor processing")
4838+
return
4839+
48204840
# Compute image_size if not present
48214841
if "image_size" not in self.hparams_vision:
48224842
# For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
@@ -4837,7 +4857,9 @@ def __init__(self, *args, **kwargs):
48374857

48384858
def set_gguf_parameters(self):
48394859
super().set_gguf_parameters()
4840-
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
4860+
# in case mixed modalities, the arch will be handled by subclass
4861+
if not self.has_audio_encoder:
4862+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
48414863
self.gguf_writer.add_vision_use_gelu(True)
48424864

48434865
if self.hparams_vision is not None:
@@ -4925,11 +4947,64 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49254947
return
49264948

49274949
if name.startswith("visual."):
4928-
yield from super().modify_tensors(data_torch, name, bid)
4929-
return
4950+
yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
4951+
return # skip other tensors
49304952

4931-
# Fall back to parent class for other tensors
4932-
yield from super().modify_tensors(data_torch, name, bid)
4953+
4954+
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
4955+
class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
4956+
has_audio_encoder = True
4957+
has_vision_encoder = True
4958+
4959+
def get_vision_config(self) -> dict[str, Any] | None:
4960+
if self.has_vision_encoder:
4961+
return self.global_config["thinker_config"].get("vision_config")
4962+
else:
4963+
return None
4964+
4965+
def get_audio_config(self) -> dict[str, Any] | None:
4966+
if self.has_audio_encoder:
4967+
return self.global_config["thinker_config"].get("audio_config")
4968+
else:
4969+
return None
4970+
4971+
def set_gguf_parameters(self):
4972+
if self.has_vision_encoder:
4973+
Qwen3VLVisionModel.set_gguf_parameters(self)
4974+
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
4975+
if self.has_audio_encoder:
4976+
Qwen25AudioModel.set_gguf_parameters(self)
4977+
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
4978+
4979+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4980+
if "visual." in name:
4981+
if not self.has_vision_encoder:
4982+
raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
4983+
# need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
4984+
name = name.replace("thinker.visual.", "model.visual.")
4985+
if ".merger_list." in name:
4986+
name = name.replace(".merger_list.", ".deepstack_merger_list.")
4987+
name = name.replace(".ln_q", ".norm")
4988+
name = name.replace(".mlp.0", ".linear_fc1")
4989+
name = name.replace(".mlp.2", ".linear_fc2")
4990+
elif ".merger." in name:
4991+
name = name.replace(".ln_q", ".norm")
4992+
name = name.replace(".mlp.0", ".linear_fc1")
4993+
name = name.replace(".mlp.2", ".linear_fc2")
4994+
yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
4995+
elif "audio_tower." in name:
4996+
if not self.has_audio_encoder:
4997+
raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
4998+
if "conv2d" in name and name.endswith(".bias"):
4999+
# transform conv2d bias [n_embd] --> [1, 1, n_embd]
5000+
data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
5001+
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
5002+
5003+
5004+
@ModelBase.register("Qwen3ASRForConditionalGeneration")
5005+
class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
5006+
has_audio_encoder = True
5007+
has_vision_encoder = False
49335008

49345009

49355010
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
@@ -4992,6 +5067,8 @@ def set_gguf_parameters(self):
49925067
def tensor_force_quant(self, name, new_name, bid, n_dims):
49935068
if ".position_embd." in new_name:
49945069
return gguf.GGMLQuantizationType.F32
5070+
if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"):
5071+
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
49955072
return super().tensor_force_quant(name, new_name, bid, n_dims)
49965073

49975074
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -5030,9 +5107,10 @@ class Qwen3VLTextModel(Qwen3Model):
50305107

50315108
def set_gguf_parameters(self):
50325109
super().set_gguf_parameters()
5033-
5034-
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
5035-
vision_config = self.hparams.get("vision_config", {})
5110+
if "thinker_config" in self.hparams:
5111+
vision_config = self.hparams["thinker_config"].get("vision_config", {})
5112+
else:
5113+
vision_config = self.hparams.get("vision_config", {})
50365114
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
50375115
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
50385116

@@ -5101,6 +5179,70 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
51015179
yield from super().modify_tensors(data_torch, name, bid)
51025180

51035181

5182+
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
5183+
class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
5184+
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
5185+
5186+
def set_vocab(self):
5187+
super().set_vocab()
5188+
# correct BOS/EOS tokens
5189+
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
5190+
tokenizer_config = json.load(f)
5191+
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
5192+
for token_id, data in added_tokens.items():
5193+
if data.get("content") == "<|im_end|>":
5194+
self.gguf_writer.add_bos_token_id(int(token_id))
5195+
self.gguf_writer.add_eos_token_id(int(token_id))
5196+
break
5197+
5198+
def set_gguf_parameters(self):
5199+
super().set_gguf_parameters()
5200+
self.gguf_writer.add_num_deepstack_layers(0)
5201+
5202+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5203+
# Skip vision and audio tensors - they go in the mmproj file
5204+
if "visual." in name or "audio_tower." in name \
5205+
or "talker." in name or "code2wav." in name:
5206+
return
5207+
5208+
name = name.replace("thinker.", "")
5209+
yield from super().modify_tensors(data_torch, name, bid)
5210+
5211+
5212+
@ModelBase.register("Qwen3ASRForConditionalGeneration")
5213+
class Qwen3ASRTextModel(Qwen3VLTextModel):
5214+
model_arch = gguf.MODEL_ARCH.QWEN3VL
5215+
5216+
def set_gguf_parameters(self):
5217+
super().set_gguf_parameters()
5218+
self.gguf_writer.add_num_deepstack_layers(0)
5219+
5220+
def set_vocab(self):
5221+
super().set_vocab()
5222+
# fix chat template, use correct chatml format
5223+
self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
5224+
# correct BOS/EOS tokens
5225+
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
5226+
tokenizer_config = json.load(f)
5227+
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
5228+
for token_id, data in added_tokens.items():
5229+
if data.get("content") == "<|im_end|>":
5230+
self.gguf_writer.add_bos_token_id(int(token_id))
5231+
self.gguf_writer.add_eos_token_id(int(token_id))
5232+
break
5233+
5234+
def modify_tensors(self, data_torch, name, bid):
5235+
# qwen3-omni
5236+
name = name.replace("thinker.", "")
5237+
5238+
# Skip vision and audio tensors - they go in the mmproj file
5239+
if "visual." in name or "audio_tower." in name \
5240+
or "talker." in name or "code2wav." in name:
5241+
return
5242+
5243+
yield from super().modify_tensors(data_torch, name, bid)
5244+
5245+
51045246
class _LinearAttentionVReorderBase(Qwen3NextModel):
51055247
model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses
51065248
"""reorders V heads from grouped to tiled order for ggml broadcast

docs/development/HOWTO-add-model.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Adding a model requires few steps:
55
1. Convert the model to GGUF
66
2. Define the model architecture in `llama.cpp`
77
3. Build the GGML graph implementation
8+
4. Optional: Add multimodal encoder implementation
89

910
After following these steps, you can open PR.
1011

@@ -114,6 +115,21 @@ Some `ggml` backends do not support all operations. Backend implementations can
114115

115116
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
116117

118+
### 4. Optional: Add multimodal encoder implementation
119+
120+
If the new model supports multimodal inputs, you will need to add a new encoder definition in `libmtmd`. You can find more information about llama.cpp's multimodal support in [the docs](../multimodal.md) and in the `tools/mtmd` source directory.
121+
122+
1. In the conversion script, make sure you add a subclass that extends `MmprojModel` or another class that inherits from the same base class.
123+
2. Add the encoder definition in `clip.cpp`.
124+
3. Implement the preprocessor in `mtmd.cpp`. In most cases, you can reuse an existing preprocessor.
125+
4. Implement the encoder GGML graph, either in a dedicated file if the model is truly different from existing ones, or by reusing an existing implementation (for example: siglip, pixtral, or qwen) and adding a model-specific projector.
126+
127+
Note:
128+
- Many multimodal encoders are based on models that are already supported. Make sure to read the existing encoder definitions in `tools/mtmd/models` before adding a new one. In `libmtmd`, it is generally better to extend an existing model than to duplicate code.
129+
- To debug the multimodal preprocessor and encoder, you can use [llama-mtmd-debug](/tools/mtmd/debug/mtmd-debug.cpp).
130+
- Adding a model-specific API or CLI is an anti-pattern in `libmtmd`. The goal of `libmtmd` is to provide an easy-to-use, model-agnostic library for multimodal pipelines.
131+
- In most cases, `llama-mtmd-cli` should not be modified. If a model requires a specific prompt, either let the user provide it or bake it into the Jinja chat template.
132+
117133
## GGUF specification
118134

119135
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

docs/multimodal.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
9494
# Moondream2 20250414 version
9595
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
9696

97+
# Gemma 4
98+
(tool_name) -hf ggml-org/gemma-4-E2B-it-GGUF
99+
(tool_name) -hf ggml-org/gemma-4-E4B-it-GGUF
100+
(tool_name) -hf ggml-org/gemma-4-26B-A4B-it-GGUF
101+
(tool_name) -hf ggml-org/gemma-4-31B-it-GGUF
97102
```
98103

99104
**Audio models**:
@@ -118,6 +123,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
118123
# Capabilities: audio input, vision input
119124
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
120125
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
126+
127+
# Gemma 4
128+
# Capabilities: audio input, vision input
129+
(tool_name) -hf ggml-org/gemma-4-E2B-it-GGUF
130+
(tool_name) -hf ggml-org/gemma-4-E4B-it-GGUF
121131
```
122132

123133
## Finding more models:

ggml/src/ggml-cuda/ssm-conv.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
134134
switch (nc) {
135135
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
136136
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
137+
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
137138
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
138-
default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
139+
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
139140
}
140141
}
141142

ggml/src/ggml-sycl/convert.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ static void dequantize_row_nvfp4_sycl(const void * vx, dst_t * y, const int64_t
488488
const int nb = k / QK_NVFP4;
489489
stream->parallel_for(
490490
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
491-
[=](sycl::nd_item<3> item_ct1) {
491+
[=](sycl::nd_item<3> /*item_ct1*/) {
492492
dequantize_block_nvfp4(vx, y, k);
493493
});
494494
}

ggml/src/ggml-sycl/dequantize.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define GGML_SYCL_DEQUANTIZE_HPP
1515

1616
#include "common.hpp"
17+
#include "convert.hpp"
1718

1819
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
1920
typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,

ggml/src/ggml-sycl/element_wise.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
355355
const int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
356356
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
357357
sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
358-
[=](sycl::nd_item<3> item_ct1) {
358+
[=](sycl::nd_item<3> /*item_ct1*/) {
359359
acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
360360
});
361361
}

ggml/src/ggml-sycl/gated_delta_net.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -176,14 +176,12 @@ static void launch_gated_delta_net(const float * q_d,
176176
const sycl::uint3 neqk1_magic = init_fastdiv_values(neqk1);
177177
const sycl::uint3 rq3_magic = init_fastdiv_values(rq3);
178178

179-
int cc = ggml_sycl_info().devices[ggml_sycl_get_device()].cc;
180-
181179
switch (S_v) {
182180
case 16:
183181
{
184182
constexpr int sv = 16;
185183
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
186-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
184+
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
187185
gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
188186
n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
189187
sb3, neqk1_magic, rq3_magic, scale);
@@ -194,7 +192,7 @@ static void launch_gated_delta_net(const float * q_d,
194192
{
195193
constexpr int sv = 32;
196194
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
197-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
195+
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
198196
gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
199197
n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
200198
sb3, neqk1_magic, rq3_magic, scale);
@@ -205,7 +203,7 @@ static void launch_gated_delta_net(const float * q_d,
205203
{
206204
constexpr int sv = 64;
207205
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
208-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
206+
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
209207
gated_delta_net_sycl<sv, KDA>(
210208
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
211209
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
@@ -217,7 +215,7 @@ static void launch_gated_delta_net(const float * q_d,
217215
{
218216
constexpr int sv = 128;
219217
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
220-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
218+
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
221219
gated_delta_net_sycl<sv, KDA>(
222220
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
223221
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);

0 commit comments

Comments
 (0)