Skip to content

Commit a0592e9

Browse files
Merge pull request #507 from janhq/update-dev-from-master-2026-05-06-00-59
Sync master with upstream release b9037
2 parents 2c66a5c + bbeb89d commit a0592e9

41 files changed

Lines changed: 1082 additions & 769 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

common/arg.cpp

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ std::vector<std::string> common_arg::get_env() const {
248248

249249
// Helper function to parse tensor buffer override strings
250250
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
251+
ggml_backend_load_all();
252+
251253
std::map<std::string, ggml_backend_buffer_type_t> buft_list;
252254
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
253255
auto * dev = ggml_backend_dev_get(i);
@@ -425,6 +427,10 @@ static bool parse_bool_value(const std::string & value) {
425427
}
426428
}
427429

430+
// Reports that a removed command-line argument was used.
//
// Always throws std::invalid_argument; the message is the fixed prefix
// "the argument has been removed. " followed by `msg`. Never returns.
//
// msg: hint for the user, typically naming the replacement argument(s).
[[noreturn]] static void arg_removed(const std::string & msg) {
    throw std::invalid_argument("the argument has been removed. " + msg);
}
433+
428434
//
429435
// CLI argument parsing functions
430436
//
@@ -803,6 +809,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
803809
if (dev_names.size() == 1 && dev_names[0] == "none") {
804810
devices.push_back(nullptr);
805811
} else {
812+
ggml_backend_load_all();
806813
for (const auto & device : dev_names) {
807814
auto * dev = ggml_backend_dev_by_name(device.c_str());
808815
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
@@ -820,6 +827,7 @@ static void add_rpc_devices(const std::string & servers) {
820827
if (rpc_servers.empty()) {
821828
throw std::invalid_argument("no RPC servers specified");
822829
}
830+
ggml_backend_load_all();
823831
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
824832
if (!rpc_reg) {
825833
throw std::invalid_argument("failed to find RPC backend");
@@ -1016,9 +1024,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
10161024

10171025
params.use_color = tty_can_use_colors();
10181026

1019-
// load dynamic backends
1020-
ggml_backend_load_all();
1021-
10221027
common_params_context ctx_arg(params);
10231028
ctx_arg.print_usage = print_usage;
10241029
ctx_arg.ex = ex;
@@ -2275,6 +2280,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
22752280
{"--list-devices"},
22762281
"print list of available devices and exit",
22772282
[](common_params &) {
2283+
ggml_backend_load_all();
22782284
std::vector<ggml_backend_dev_t> devices;
22792285
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
22802286
auto * dev = ggml_backend_dev_get(i);
@@ -3715,35 +3721,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
37153721
{"--draft", "--draft-n", "--draft-max"}, "N",
37163722
"the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max",
37173723
[](common_params & /*params*/, int /*value*/) {
3718-
throw std::invalid_argument("the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max");
3724+
arg_removed("use --spec-draft-n-max or --spec-ngram-mod-n-max");
37193725
}
37203726
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
37213727
add_opt(common_arg(
37223728
{"--draft-min", "--draft-n-min"}, "N",
37233729
"the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min",
37243730
[](common_params & /*params*/, int /*value*/) {
3725-
throw std::invalid_argument("the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min");
3731+
arg_removed("use --spec-draft-n-min or --spec-ngram-mod-n-min");
37263732
}
37273733
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
37283734
add_opt(common_arg(
37293735
{"--spec-ngram-size-n"}, "N",
37303736
"the argument has been removed. use the respective --spec-ngram-*-size-n or --spec-ngram-mod-n-match",
37313737
[](common_params & /*params*/, int /*value*/) {
3732-
throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-n");
3738+
arg_removed("use the respective --spec-ngram-*-size-n");
37333739
}
37343740
).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));
37353741
add_opt(common_arg(
37363742
{"--spec-ngram-size-m"}, "N",
37373743
"the argument has been removed. use the respective --spec-ngram-*-size-m",
37383744
[](common_params & /*params*/, int /*value*/) {
3739-
throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-m");
3745+
arg_removed("use the respective --spec-ngram-*-size-m");
37403746
}
37413747
).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));
37423748
add_opt(common_arg(
37433749
{"--spec-ngram-min-hits"}, "N",
37443750
"the argument has been removed. use the respective --spec-ngram-*-min-hits",
37453751
[](common_params & /*params*/, int /*value*/) {
3746-
throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-min-hits");
3752+
arg_removed("use the respective --spec-ngram-*-min-hits");
37473753
}
37483754
).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));
37493755

common/speculative.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ struct common_speculative_state_draft : public common_speculative_state {
252252

253253
size_t create_checkpoint(int n_tokens_prompt) {
254254
int slot_id = 0;
255-
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
255+
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
256256

257257
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
258258
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
259259
ckpt.n_tokens = n_tokens_prompt;
260260
ckpt.data.resize(checkpoint_size);
261261

262-
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
262+
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
263263
if (n != checkpoint_size) {
264264
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
265265
}
@@ -272,7 +272,7 @@ struct common_speculative_state_draft : public common_speculative_state {
272272
size_t restore_checkpoint() {
273273
int slot_id = 0;
274274
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
275-
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
275+
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
276276
if (n != ckpt.size()) {
277277
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
278278
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());

examples/save-load-state/save-load-state.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,12 @@ int main(int argc, char ** argv) {
3838
std::string result0;
3939
std::string result1;
4040
std::string result2;
41+
std::string result3;
4142

4243
// init
44+
45+
ggml_backend_load_all();
46+
4347
auto llama_init = common_init_from_params(params);
4448

4549
auto * model = llama_init->model();
@@ -213,11 +217,83 @@ int main(int argc, char ** argv) {
213217
n_past += 1;
214218
}
215219

220+
// test on-device state save/load
221+
auto params_ctx4 = common_context_params_to_llama(params);
222+
params_ctx4.n_seq_max = 2;
223+
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
224+
225+
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
226+
227+
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
228+
229+
printf("\nsingle seq run: %s", params.prompt.c_str());
230+
231+
// load state (rng, logits, embedding and kv_cache) from file
232+
n_token_count_out = 0;
233+
234+
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
235+
fprintf(stderr, "\n%s : failed to load state\n", __func__);
236+
return 1;
237+
}
238+
239+
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
240+
241+
// restore state (last tokens)
242+
n_past = n_token_count_out;
243+
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
244+
return 1;
245+
}
246+
++n_past;
247+
248+
// save seq 0 and load into seq 1
249+
{
250+
// save kv of seq 0
251+
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
252+
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
253+
if (ncopy != seq_store.size()) {
254+
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
255+
return 1;
256+
}
257+
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
258+
259+
// erase whole kv
260+
llama_memory_clear(llama_get_memory(ctx4), true);
261+
fprintf(stderr, "%s : kv cache cleared\n", __func__);
262+
263+
// restore kv into seq 1
264+
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
265+
if (nset != seq_store.size()) {
266+
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
267+
return 1;
268+
}
269+
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
270+
}
271+
272+
// fourth run
273+
for (auto i = 0; i < params.n_predict; i++) {
274+
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
275+
auto next_token_str = common_token_to_piece(ctx4, next_token);
276+
277+
printf("%s", next_token_str.c_str());
278+
result3 += next_token_str;
279+
280+
common_batch_clear(batch);
281+
common_batch_add(batch, next_token, n_past, {1}, true);
282+
283+
if (llama_decode(ctx4, batch)) {
284+
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
285+
llama_batch_free(batch);
286+
return 1;
287+
}
288+
n_past += 1;
289+
}
290+
216291
printf("\n");
217292

218293
llama_sampler_free(smpl);
219294
llama_sampler_free(smpl2);
220295
llama_sampler_free(smpl3);
296+
llama_sampler_free(smpl4);
221297

222298
llama_batch_free(batch);
223299

@@ -226,12 +302,18 @@ int main(int argc, char ** argv) {
226302

227303
llama_free(ctx2);
228304
llama_free(ctx3);
305+
llama_free(ctx4);
229306

230307
if (result0 != result2) {
231308
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
232309
return 1;
233310
}
234311

312+
if (result0 != result3) {
313+
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
314+
return 1;
315+
}
316+
235317
fprintf(stderr, "\n%s : success\n", __func__);
236318

237319
return 0;

ggml/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
44

55
### GGML Version
66
set(GGML_VERSION_MAJOR 0)
7-
set(GGML_VERSION_MINOR 10)
8-
set(GGML_VERSION_PATCH 2)
7+
set(GGML_VERSION_MINOR 11)
8+
set(GGML_VERSION_PATCH 0)
99
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
1010

1111
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

ggml/include/ggml.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,12 @@ extern "C" {
438438
GGML_PREC_F32 = 10,
439439
};
440440

441+
// op hint: optional per-op marker that lets a backend choose a specialized code path
442+
enum ggml_op_hint {
443+
GGML_HINT_NONE = 0, // no hint
444+
GGML_HINT_SRC0_IS_HADAMARD = 1, // src0 of a mul_mat is a Hadamard matrix; the CPU backend then runs a fast Walsh-Hadamard transform on src1 instead of the generic product
445+
};
446+
441447
// model file types
442448
enum ggml_ftype {
443449
GGML_FTYPE_UNKNOWN = -1,
@@ -1419,6 +1425,11 @@ extern "C" {
14191425
struct ggml_tensor * a,
14201426
enum ggml_prec prec);
14211427

1428+
// change the hint of a matrix multiplication
1429+
GGML_API void ggml_mul_mat_set_hint(
1430+
struct ggml_tensor * a,
1431+
enum ggml_op_hint hint);
1432+
14221433
// indirect matrix multiplication
14231434
GGML_API struct ggml_tensor * ggml_mul_mat_id(
14241435
struct ggml_context * ctx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,12 @@ void ggml_compute_forward_mul_mat(
12451245
const struct ggml_tensor * src0 = dst->src[0];
12461246
const struct ggml_tensor * src1 = dst->src[1];
12471247

1248+
const int32_t hint = ggml_get_op_params_i32(dst, 1);
1249+
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !params->use_ref) {
1250+
ggml_compute_forward_fwht(params, dst);
1251+
return;
1252+
}
1253+
12481254
GGML_TENSOR_BINARY_OP_LOCALS
12491255

12501256
const int ith = params->ith;

ggml/src/ggml-cpu/ops.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11212,3 +11212,91 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
1121211212
}
1121311213
}
1121411214
}
11215+
11216+
static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, ggml_tensor * dst) {
11217+
const ggml_tensor * src0 = dst->src[0];
11218+
const ggml_tensor * src1 = dst->src[1];
11219+
11220+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
11221+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
11222+
11223+
GGML_TENSOR_BINARY_OP_LOCALS
11224+
11225+
const int ith = params->ith;
11226+
const int nth = params->nth;
11227+
11228+
const int64_t n = ne10;
11229+
GGML_ASSERT((n & (n - 1)) == 0); // must be power of 2
11230+
11231+
const int64_t nr = ne11 * ne12 * ne13;
11232+
const int64_t rows_per_thread = (nr + nth - 1) / nth;
11233+
const int64_t start_row = ith * rows_per_thread;
11234+
const int64_t end_row = MIN(start_row + rows_per_thread, nr);
11235+
11236+
const float scale = 1.0f / sqrtf((float)n);
11237+
11238+
#if defined(GGML_SIMD)
11239+
const GGML_F32_VEC v_minus_one = GGML_F32_VEC_SET1(-1.0f);
11240+
#endif
11241+
11242+
for (int64_t r = start_row; r < end_row; r++) {
11243+
const int64_t i13 = r / (ne11 * ne12);
11244+
const int64_t i12 = (r - i13 * ne11 * ne12) / ne11;
11245+
const int64_t i11 = r - i13 * ne11 * ne12 - i12 * ne11;
11246+
11247+
const float * src_row = (const float *) ((const char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13);
11248+
float * dst_row = (float *) ((char *) dst->data + i11 * nb1 + i12 * nb2 + i13 * nb3);
11249+
11250+
for (int64_t j = 0; j < n; j++) {
11251+
dst_row[j] = src_row[j] * scale;
11252+
}
11253+
11254+
// Scalar passes
11255+
#if defined(GGML_SIMD)
11256+
const int step = GGML_F32_EPR;
11257+
#else
11258+
const int step = n;
11259+
#endif
11260+
for (int64_t len = 1; len < step && len < n; len <<= 1) {
11261+
for (int64_t i = 0; i < n; i += 2 * len) {
11262+
for (int64_t j = 0; j < len; j++) {
11263+
float u = dst_row[i + j];
11264+
float v = dst_row[i + len + j];
11265+
dst_row[i + j] = u + v;
11266+
dst_row[i + len + j] = u - v;
11267+
}
11268+
}
11269+
}
11270+
11271+
// SIMD passes using GGML_F32_VEC_* macros for multi-architecture support
11272+
#if defined(GGML_SIMD)
11273+
for (int64_t len = step; len < n; len <<= 1) {
11274+
for (int64_t i = 0; i < n; i += 2 * len) {
11275+
for (int64_t j = 0; j < len; j += step) {
11276+
GGML_F32_VEC u = GGML_F32_VEC_LOAD(dst_row + i + j);
11277+
GGML_F32_VEC v = GGML_F32_VEC_LOAD(dst_row + i + len + j);
11278+
11279+
GGML_F32_VEC_STORE(dst_row + i + j, GGML_F32_VEC_ADD(u, v));
11280+
GGML_F32_VEC_STORE(dst_row + i + len + j, GGML_F32_VEC_FMA(u, v, v_minus_one));
11281+
}
11282+
}
11283+
}
11284+
#endif
11285+
}
11286+
}
11287+
11288+
void ggml_compute_forward_fwht(const ggml_compute_params * params, ggml_tensor * dst) {
11289+
const ggml_tensor * src1 = dst->src[1];
11290+
11291+
switch (src1->type) {
11292+
case GGML_TYPE_F32:
11293+
{
11294+
ggml_compute_forward_fwht_f32(params, dst);
11295+
}
11296+
break;
11297+
default:
11298+
{
11299+
GGML_ABORT("fatal error - fwht is F32 only");
11300+
}
11301+
}
11302+
}

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
111111
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
112112
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
113113
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
114+
void ggml_compute_forward_fwht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
114115
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
115116
#ifdef __cplusplus
116117
}

0 commit comments

Comments
 (0)