Skip to content

Commit 1c5c739

Browse files
Merge pull request #508 from janhq/update-dev-from-master-2026-05-07-01-03
Sync master with upstream release b9049
2 parents a0592e9 + 2496f9c commit 1c5c739

54 files changed

Lines changed: 4283 additions & 3368 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/gguf-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ jobs:
2929
uses: actions/setup-python@v6
3030
with:
3131
python-version: '3.11'
32+
pip-install: poetry==2.4.0
3233
- name: Install dependencies
3334
run: |
3435
cd gguf-py
35-
python -m pip install poetry==2.3.2
3636
poetry install
3737
3838
- name: Build package

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@
105105
__pycache__/
106106
*/poetry.lock
107107
poetry.toml
108+
poetry.lock
109+
uv.lock
108110

109111
# Nix
110112

common/fit.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,24 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data(
109109
ret.back().total = total;
110110
}
111111
for (size_t i = 0; i < nd; i++) {
112+
ggml_backend_dev_t dev = llama_model_get_device(model, i);
113+
112114
size_t free;
113115
size_t total;
114-
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
116+
ggml_backend_dev_memory(dev, &free, &total);
115117

116-
// devices can return 0 bytes for free and total memory if they do not
117-
// have any to report. in this case, we will use the host memory as a fallback
118-
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
118+
// Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
119+
// the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
120+
// not assign anything to a device with an unknown memory budget.
119121
if (free == 0 && total == 0) {
120-
free = ret.back().free;
121-
total = ret.back().total;
122+
const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
123+
if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
124+
LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
125+
__func__, ggml_backend_dev_name(dev));
126+
} else {
127+
free = ret.back().free;
128+
total = ret.back().total;
129+
}
122130
}
123131
ret[i].free = free;
124132
ret[i].total = total;

convert_hf_to_gguf.py

Lines changed: 876 additions & 554 deletions
Large diffs are not rendered by default.

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ class TOKENIZER_TYPE(IntEnum):
175175
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
176176
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
177177
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
178+
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
178179
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
179180
# jina-v2-de variants
180181
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},

docs/multimodal/minicpmv4.6.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
## MiniCPM-V 4.6
2+
3+
### Prepare models and code
4+
5+
Download the [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from Hugging Face into a folder named "MiniCPM-V-4_6".
6+
7+
The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
8+
9+
### Build llama.cpp
10+
11+
If the steps below do not match your environment, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).
12+
13+
Clone llama.cpp:
14+
```bash
15+
git clone https://github.com/ggml-org/llama.cpp
16+
cd llama.cpp
17+
```
18+
19+
Build llama.cpp using `CMake`:
20+
```bash
21+
cmake -B build
22+
cmake --build build --config Release
23+
```
24+
25+
26+
### Usage of MiniCPM-V 4.6
27+
28+
Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.
29+
30+
```bash
31+
# language model
32+
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf
33+
34+
# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
35+
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf
36+
37+
# optional: quantize to Q4_K_M
38+
./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
39+
```
40+
41+
42+
Run inference on Linux or macOS:
43+
```bash
44+
# run in single-turn mode
45+
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
46+
47+
# run in conversation mode
48+
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
49+
```

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2965,6 +2965,45 @@ struct ggml_cplan ggml_graph_plan(
29652965
return cplan;
29662966
}
29672967

2968+
2969+
// Try to fuse the current node with subsequent nodes for better performance.
2970+
// Returns the number of additional nodes consumed by the fusion (>= 1), which the caller adds to node_n, or 0 if no fusion was applied.
2971+
static bool ggml_cpu_disable_fusion = false; // initialized once in ggml_cpu_init(), read-only afterwards
2972+
2973+
static int ggml_cpu_try_fuse_ops(
2974+
const struct ggml_cgraph * cgraph,
2975+
const int node_n,
2976+
const struct ggml_compute_params * params,
2977+
const struct ggml_cplan * cplan) {
2978+
2979+
if (ggml_cpu_disable_fusion || cplan->use_ref) {
2980+
return 0;
2981+
}
2982+
2983+
struct ggml_tensor * node = cgraph->nodes[node_n];
2984+
2985+
if (node->op == GGML_OP_RMS_NORM) {
2986+
// RMS_NORM + MUL fusion
2987+
const enum ggml_op fuse_ops[] = { GGML_OP_RMS_NORM, GGML_OP_MUL };
2988+
if (ggml_can_fuse(cgraph, node_n, fuse_ops, 2)) {
2989+
struct ggml_tensor * mul_node = cgraph->nodes[node_n + 1];
2990+
const struct ggml_tensor * mul_w = (mul_node->src[0] == node)
2991+
? mul_node->src[1] : mul_node->src[0];
2992+
if (node->src[0]->type == GGML_TYPE_F32 &&
2993+
mul_node->type == GGML_TYPE_F32 &&
2994+
mul_w->type == GGML_TYPE_F32 &&
2995+
mul_w->ne[0] == node->ne[0] &&
2996+
mul_w->nb[0] == sizeof(float)) {
2997+
2998+
ggml_compute_forward_rms_norm_mul_fused(params, node, mul_node);
2999+
return 1;
3000+
}
3001+
}
3002+
}
3003+
3004+
return 0;
3005+
}
3006+
29683007
static thread_ret_t ggml_graph_compute_thread(void * data) {
29693008
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
29703009
struct ggml_threadpool * tp = state->threadpool;
@@ -3001,7 +3040,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
30013040
continue;
30023041
}
30033042

3004-
ggml_compute_forward(&params, node);
3043+
// TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time
3044+
// Try fused ops, fall back to normal compute
3045+
const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, &params, cplan);
3046+
if (n_fused > 0) {
3047+
node_n += n_fused;
3048+
} else {
3049+
ggml_compute_forward(&params, node);
3050+
}
30053051

30063052
if (state->ith == 0 && cplan->abort_callback &&
30073053
cplan->abort_callback(cplan->abort_callback_data)) {
@@ -3763,6 +3809,11 @@ void ggml_cpu_init(void) {
37633809
ggml_init_riscv_arch_features();
37643810
#endif
37653811

3812+
{
3813+
const char * env = getenv("GGML_CPU_DISABLE_FUSION");
3814+
ggml_cpu_disable_fusion = (env != NULL && atoi(env) == 1);
3815+
}
3816+
37663817
is_first_call = false;
37673818
}
37683819

ggml/src/ggml-cpu/ops.cpp

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3713,11 +3713,27 @@ void ggml_compute_forward_norm(
37133713

37143714
// ggml_compute_forward_group_rms_norm
37153715

3716+
// fusion kinds that can be combined with the rms_norm computation in a single pass.
3717+
// extend this enum when adding new fused variants (e.g. FUSE_ADD, FUSE_MUL_ADD, ...).
3718+
enum ggml_rms_norm_fuse_op {
3719+
GGML_RMS_NORM_FUSE_OP_NONE,
3720+
GGML_RMS_NORM_FUSE_OP_MUL,
3721+
};
3722+
3723+
template <ggml_rms_norm_fuse_op FUSE_OP>
37163724
static void ggml_compute_forward_rms_norm_f32(
37173725
const ggml_compute_params * params,
3718-
ggml_tensor * dst) {
3726+
ggml_tensor * dst_rms_norm,
3727+
ggml_tensor * dst_fused = nullptr) {
37193728

3720-
const ggml_tensor * src0 = dst->src[0];
3729+
const ggml_tensor * src0 = dst_rms_norm->src[0];
3730+
const ggml_tensor * src1 = nullptr;
3731+
ggml_tensor * dst = dst_rms_norm;
3732+
3733+
if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
3734+
src1 = (dst_fused->src[0] == dst_rms_norm) ? dst_fused->src[1] : dst_fused->src[0];
3735+
dst = dst_fused;
3736+
}
37213737

37223738
GGML_ASSERT(ggml_are_same_shape(src0, dst));
37233739

@@ -3726,11 +3742,10 @@ static void ggml_compute_forward_rms_norm_f32(
37263742
const int ith = params->ith;
37273743
const int nth = params->nth;
37283744

3729-
GGML_TENSOR_UNARY_OP_LOCALS
3745+
GGML_TENSOR_BINARY_OP_LOCALS
37303746

37313747
float eps;
3732-
memcpy(&eps, dst->op_params, sizeof(float));
3733-
3748+
memcpy(&eps, dst_rms_norm->op_params, sizeof(float));
37343749
GGML_ASSERT(eps >= 0.0f);
37353750

37363751
// TODO: optimize
@@ -3740,25 +3755,32 @@ static void ggml_compute_forward_rms_norm_f32(
37403755
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
37413756

37423757
ggml_float sum = 0.0;
3758+
// worth switching to explicit SIMD?
37433759
for (int64_t i00 = 0; i00 < ne00; i00++) {
37443760
sum += (ggml_float)(x[i00] * x[i00]);
37453761
}
37463762

3747-
const float mean = sum/ne00;
3748-
3749-
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
3750-
3751-
memcpy(y, x, ne00 * sizeof(float));
3752-
// for (int i00 = 0; i00 < ne00; i00++) {
3753-
// y[i00] = x[i00];
3754-
// }
3755-
3763+
const float mean = sum/ne00;
37563764
const float scale = 1.0f/sqrtf(mean + eps);
37573765

37583766
// if you hit this, likely you got an inf somewhere earlier
37593767
assert(scale > 0.0f);
37603768

3761-
ggml_vec_scale_f32(ne00, y, scale);
3769+
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
3770+
3771+
if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
3772+
const int64_t i11 = i01 % ne11;
3773+
const int64_t i12 = i02 % ne12;
3774+
const int64_t i13 = i03 % ne13;
3775+
const float * w = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
3776+
3777+
for (int64_t i00 = 0; i00 < ne00; i00++) {
3778+
y[i00] = x[i00] * scale * w[i00];
3779+
}
3780+
} else {
3781+
memcpy(y, x, ne00 * sizeof(float));
3782+
ggml_vec_scale_f32(ne00, y, scale);
3783+
}
37623784
}
37633785
}
37643786
}
@@ -3773,7 +3795,31 @@ void ggml_compute_forward_rms_norm(
37733795
switch (src0->type) {
37743796
case GGML_TYPE_F32:
37753797
{
3776-
ggml_compute_forward_rms_norm_f32(params, dst);
3798+
ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_NONE>(params, dst);
3799+
} break;
3800+
default:
3801+
{
3802+
GGML_ABORT("fatal error");
3803+
}
3804+
}
3805+
}
3806+
3807+
// Fused RMS_NORM + MUL: computes dst = rms_norm(src0) * src1 in a single pass.
3808+
// This avoids materializing the intermediate rms_norm result in memory.
3809+
void ggml_compute_forward_rms_norm_mul_fused(
3810+
const ggml_compute_params * params,
3811+
ggml_tensor * dst_rms_norm,
3812+
ggml_tensor * dst_mul) {
3813+
3814+
GGML_ASSERT(dst_mul != nullptr);
3815+
GGML_ASSERT(dst_mul->src[0] == dst_rms_norm || dst_mul->src[1] == dst_rms_norm);
3816+
3817+
const ggml_tensor * src0 = dst_rms_norm->src[0];
3818+
3819+
switch (src0->type) {
3820+
case GGML_TYPE_F32:
3821+
{
3822+
ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_MUL>(params, dst_rms_norm, dst_mul);
37773823
} break;
37783824
default:
37793825
{

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void ggml_compute_forward_concat(const struct ggml_compute_params * params, stru
4444
void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
4545
void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
4646
void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
47+
void ggml_compute_forward_rms_norm_mul_fused(const struct ggml_compute_params * params, struct ggml_tensor * dst_rms_norm, struct ggml_tensor * dst_mul);
4748
void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
4849
void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
4950
void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ struct ggml_backend_opencl_context {
389389
ADRENO_GPU_GEN adreno_gen;
390390

391391
cl_int alignment;
392+
size_t global_mem_size;
392393
size_t max_alloc_size;
393394
size_t max_workgroup_size;
394395
bool fp16_support;
@@ -3386,6 +3387,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
33863387
backend_ctx->alignment = base_align_in_bits / 8u;
33873388
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
33883389

3390+
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
3391+
GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
3392+
33893393
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
33903394
GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
33913395

@@ -6356,11 +6360,16 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
63566360
}
63576361

63586362
static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
6359-
// no memory to report
6360-
*free = 0;
6361-
*total = 0;
6363+
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
6364+
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx;
63626365

6363-
GGML_UNUSED(dev);
6366+
static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
6367+
6368+
// OpenCL does not provide reliable currently-free device memory.
6369+
// Use total/global memory as a best-effort upper bound.
6370+
// Improved safety: Reduce by a 1GiB extra margin for common --fit
6371+
*total = backend_ctx->global_mem_size;
6372+
*free = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
63646373
}
63656374

63666375
static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {

0 commit comments

Comments
 (0)