Skip to content

Commit b1da22e

Browse files
authored
Merge branch 'ggml-org:master' into feat/qlora-training-v2
2 parents 8a30a71 + bf76ac7 commit b1da22e

30 files changed

Lines changed: 664 additions & 72 deletions

common/arg.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ std::vector<std::string> common_arg::get_env() const {
248248

249249
// Helper function to parse tensor buffer override strings
250250
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
251+
ggml_backend_load_all();
252+
251253
std::map<std::string, ggml_backend_buffer_type_t> buft_list;
252254
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
253255
auto * dev = ggml_backend_dev_get(i);
@@ -803,6 +805,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
803805
if (dev_names.size() == 1 && dev_names[0] == "none") {
804806
devices.push_back(nullptr);
805807
} else {
808+
ggml_backend_load_all();
806809
for (const auto & device : dev_names) {
807810
auto * dev = ggml_backend_dev_by_name(device.c_str());
808811
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
@@ -820,6 +823,7 @@ static void add_rpc_devices(const std::string & servers) {
820823
if (rpc_servers.empty()) {
821824
throw std::invalid_argument("no RPC servers specified");
822825
}
826+
ggml_backend_load_all();
823827
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
824828
if (!rpc_reg) {
825829
throw std::invalid_argument("failed to find RPC backend");
@@ -1016,9 +1020,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
10161020

10171021
params.use_color = tty_can_use_colors();
10181022

1019-
// load dynamic backends
1020-
ggml_backend_load_all();
1021-
10221023
common_params_context ctx_arg(params);
10231024
ctx_arg.print_usage = print_usage;
10241025
ctx_arg.ex = ex;
@@ -2275,6 +2276,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
22752276
{"--list-devices"},
22762277
"print list of available devices and exit",
22772278
[](common_params &) {
2279+
ggml_backend_load_all();
22782280
std::vector<ggml_backend_dev_t> devices;
22792281
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
22802282
auto * dev = ggml_backend_dev_get(i);

common/speculative.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ struct common_speculative_state_draft : public common_speculative_state {
252252

253253
size_t create_checkpoint(int n_tokens_prompt) {
254254
int slot_id = 0;
255-
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
255+
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
256256

257257
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
258258
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
259259
ckpt.n_tokens = n_tokens_prompt;
260260
ckpt.data.resize(checkpoint_size);
261261

262-
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
262+
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
263263
if (n != checkpoint_size) {
264264
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
265265
}
@@ -272,7 +272,7 @@ struct common_speculative_state_draft : public common_speculative_state {
272272
size_t restore_checkpoint() {
273273
int slot_id = 0;
274274
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
275-
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
275+
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
276276
if (n != ckpt.size()) {
277277
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
278278
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());

examples/save-load-state/save-load-state.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,12 @@ int main(int argc, char ** argv) {
3838
std::string result0;
3939
std::string result1;
4040
std::string result2;
41+
std::string result3;
4142

4243
// init
44+
45+
ggml_backend_load_all();
46+
4347
auto llama_init = common_init_from_params(params);
4448

4549
auto * model = llama_init->model();
@@ -213,11 +217,83 @@ int main(int argc, char ** argv) {
213217
n_past += 1;
214218
}
215219

220+
// test on-device state save/load
221+
auto params_ctx4 = common_context_params_to_llama(params);
222+
params_ctx4.n_seq_max = 2;
223+
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
224+
225+
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
226+
227+
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
228+
229+
printf("\nsingle seq run: %s", params.prompt.c_str());
230+
231+
// load state (rng, logits, embedding and kv_cache) from file
232+
n_token_count_out = 0;
233+
234+
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
235+
fprintf(stderr, "\n%s : failed to load state\n", __func__);
236+
return 1;
237+
}
238+
239+
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
240+
241+
// restore state (last tokens)
242+
n_past = n_token_count_out;
243+
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
244+
return 1;
245+
}
246+
++n_past;
247+
248+
// save seq 0 and load into seq 1
249+
{
250+
// save kv of seq 0
251+
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
252+
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
253+
if (ncopy != seq_store.size()) {
254+
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
255+
return 1;
256+
}
257+
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
258+
259+
// erase whole kv
260+
llama_memory_clear(llama_get_memory(ctx4), true);
261+
fprintf(stderr, "%s : kv cache cleared\n", __func__);
262+
263+
// restore kv into seq 1
264+
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
265+
if (nset != seq_store.size()) {
266+
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
267+
return 1;
268+
}
269+
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
270+
}
271+
272+
// fourth run
273+
for (auto i = 0; i < params.n_predict; i++) {
274+
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
275+
auto next_token_str = common_token_to_piece(ctx4, next_token);
276+
277+
printf("%s", next_token_str.c_str());
278+
result3 += next_token_str;
279+
280+
common_batch_clear(batch);
281+
common_batch_add(batch, next_token, n_past, {1}, true);
282+
283+
if (llama_decode(ctx4, batch)) {
284+
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
285+
llama_batch_free(batch);
286+
return 1;
287+
}
288+
n_past += 1;
289+
}
290+
216291
printf("\n");
217292

218293
llama_sampler_free(smpl);
219294
llama_sampler_free(smpl2);
220295
llama_sampler_free(smpl3);
296+
llama_sampler_free(smpl4);
221297

222298
llama_batch_free(batch);
223299

@@ -226,12 +302,18 @@ int main(int argc, char ** argv) {
226302

227303
llama_free(ctx2);
228304
llama_free(ctx3);
305+
llama_free(ctx4);
229306

230307
if (result0 != result2) {
231308
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
232309
return 1;
233310
}
234311

312+
if (result0 != result3) {
313+
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
314+
return 1;
315+
}
316+
235317
fprintf(stderr, "\n%s : success\n", __func__);
236318

237319
return 0;

ggml/include/ggml.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,12 @@ extern "C" {
438438
GGML_PREC_F32 = 10,
439439
};
440440

441+
// op hint
442+
enum ggml_op_hint {
443+
GGML_HINT_NONE = 0, // zero value; the CPU mul_mat path takes no shortcut for it
444+
GGML_HINT_SRC0_IS_HADAMARD = 1, // CPU mul_mat reads this from the op params and, unless use_ref is set, runs ggml_compute_forward_fwht instead of the regular multiplication
445+
};
446+
441447
// model file types
442448
enum ggml_ftype {
443449
GGML_FTYPE_UNKNOWN = -1,
@@ -1420,6 +1426,11 @@ extern "C" {
14201426
struct ggml_tensor * a,
14211427
enum ggml_prec prec);
14221428

1429+
// change the hint of a matrix multiplication
1430+
GGML_API void ggml_mul_mat_set_hint(
1431+
struct ggml_tensor * a, // NOTE(review): presumably the tensor returned by ggml_mul_mat (the CPU backend reads the hint from the mul_mat result's op params) - confirm against the definition
1432+
enum ggml_op_hint hint); // one of the ggml_op_hint values
1433+
14231434
// indirect matrix multiplication
14241435
GGML_API struct ggml_tensor * ggml_mul_mat_id(
14251436
struct ggml_context * ctx,

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -578,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
578578

579579
# Fetch KleidiAI sources:
580580
include(FetchContent)
581-
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
582-
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
583-
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
581+
set(KLEIDIAI_COMMIT_TAG "v1.24.0")
582+
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
583+
set(KLEIDIAI_RELEASE_ARCHIVE_MD5 "2f02ebe29573d45813e671eb304f2a00")
584584

585585
set(KLEIDIAI_FETCH_ARGS
586586
URL ${KLEIDIAI_DOWNLOAD_URL}
587-
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
587+
URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
588588
)
589589
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
590590
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,12 @@ void ggml_compute_forward_mul_mat(
12451245
const struct ggml_tensor * src0 = dst->src[0];
12461246
const struct ggml_tensor * src1 = dst->src[1];
12471247

1248+
const int32_t hint = ggml_get_op_params_i32(dst, 1);
1249+
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !params->use_ref) {
1250+
ggml_compute_forward_fwht(params, dst);
1251+
return;
1252+
}
1253+
12481254
GGML_TENSOR_BINARY_OP_LOCALS
12491255

12501256
const int ith = params->ith;

ggml/src/ggml-cpu/ops.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11302,3 +11302,91 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
1130211302
}
1130311303
}
1130411304
}
11305+
11306+
static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, ggml_tensor * dst) {
11307+
const ggml_tensor * src0 = dst->src[0];
11308+
const ggml_tensor * src1 = dst->src[1];
11309+
11310+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
11311+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
11312+
11313+
GGML_TENSOR_BINARY_OP_LOCALS
11314+
11315+
const int ith = params->ith;
11316+
const int nth = params->nth;
11317+
11318+
const int64_t n = ne10;
11319+
GGML_ASSERT((n & (n - 1)) == 0); // must be power of 2
11320+
11321+
const int64_t nr = ne11 * ne12 * ne13;
11322+
const int64_t rows_per_thread = (nr + nth - 1) / nth;
11323+
const int64_t start_row = ith * rows_per_thread;
11324+
const int64_t end_row = MIN(start_row + rows_per_thread, nr);
11325+
11326+
const float scale = 1.0f / sqrtf((float)n);
11327+
11328+
#if defined(GGML_SIMD)
11329+
const GGML_F32_VEC v_minus_one = GGML_F32_VEC_SET1(-1.0f);
11330+
#endif
11331+
11332+
for (int64_t r = start_row; r < end_row; r++) {
11333+
const int64_t i13 = r / (ne11 * ne12);
11334+
const int64_t i12 = (r - i13 * ne11 * ne12) / ne11;
11335+
const int64_t i11 = r - i13 * ne11 * ne12 - i12 * ne11;
11336+
11337+
const float * src_row = (const float *) ((const char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13);
11338+
float * dst_row = (float *) ((char *) dst->data + i11 * nb1 + i12 * nb2 + i13 * nb3);
11339+
11340+
for (int64_t j = 0; j < n; j++) {
11341+
dst_row[j] = src_row[j] * scale;
11342+
}
11343+
11344+
// Scalar passes
11345+
#if defined(GGML_SIMD)
11346+
const int step = GGML_F32_EPR;
11347+
#else
11348+
const int step = n;
11349+
#endif
11350+
for (int64_t len = 1; len < step && len < n; len <<= 1) {
11351+
for (int64_t i = 0; i < n; i += 2 * len) {
11352+
for (int64_t j = 0; j < len; j++) {
11353+
float u = dst_row[i + j];
11354+
float v = dst_row[i + len + j];
11355+
dst_row[i + j] = u + v;
11356+
dst_row[i + len + j] = u - v;
11357+
}
11358+
}
11359+
}
11360+
11361+
// SIMD passes using GGML_F32_VEC_* macros for multi-architecture support
11362+
#if defined(GGML_SIMD)
11363+
for (int64_t len = step; len < n; len <<= 1) {
11364+
for (int64_t i = 0; i < n; i += 2 * len) {
11365+
for (int64_t j = 0; j < len; j += step) {
11366+
GGML_F32_VEC u = GGML_F32_VEC_LOAD(dst_row + i + j);
11367+
GGML_F32_VEC v = GGML_F32_VEC_LOAD(dst_row + i + len + j);
11368+
11369+
GGML_F32_VEC_STORE(dst_row + i + j, GGML_F32_VEC_ADD(u, v));
11370+
GGML_F32_VEC_STORE(dst_row + i + len + j, GGML_F32_VEC_FMA(u, v, v_minus_one));
11371+
}
11372+
}
11373+
}
11374+
#endif
11375+
}
11376+
}
11377+
11378+
void ggml_compute_forward_fwht(const ggml_compute_params * params, ggml_tensor * dst) {
11379+
const ggml_tensor * src1 = dst->src[1];
11380+
11381+
switch (src1->type) {
11382+
case GGML_TYPE_F32:
11383+
{
11384+
ggml_compute_forward_fwht_f32(params, dst);
11385+
}
11386+
break;
11387+
default:
11388+
{
11389+
GGML_ABORT("fatal error - fwht is F32 only");
11390+
}
11391+
}
11392+
}

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
112112
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
113113
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
114114
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
115+
void ggml_compute_forward_fwht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
115116
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
116117
#ifdef __cplusplus
117118
}

ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
282282
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
283283
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
284284
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
285+
bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst);
285286
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
286287

287288
// finds the Metal buffer that contains the tensor data on the GPU device

0 commit comments

Comments
 (0)