Skip to content

Commit d6e7b03

Browse files
authored
llama : add option to save memory in device buffers (#22679)
* llama : add option to save memory in device buffers * tests : extend llama-save-load-state
1 parent fa59546 commit d6e7b03

11 files changed

Lines changed: 402 additions & 58 deletions

File tree

common/speculative.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ struct common_speculative_state_draft : public common_speculative_state {
252252

253253
size_t create_checkpoint(int n_tokens_prompt) {
254254
int slot_id = 0;
255-
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
255+
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
256256

257257
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
258258
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
259259
ckpt.n_tokens = n_tokens_prompt;
260260
ckpt.data.resize(checkpoint_size);
261261

262-
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
262+
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
263263
if (n != checkpoint_size) {
264264
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
265265
}
@@ -272,7 +272,7 @@ struct common_speculative_state_draft : public common_speculative_state {
272272
size_t restore_checkpoint() {
273273
int slot_id = 0;
274274
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
275-
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
275+
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
276276
if (n != ckpt.size()) {
277277
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
278278
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());

examples/save-load-state/save-load-state.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
3838
std::string result0;
3939
std::string result1;
4040
std::string result2;
41+
std::string result3;
4142

4243
// init
4344
auto llama_init = common_init_from_params(params);
@@ -213,11 +214,83 @@ int main(int argc, char ** argv) {
213214
n_past += 1;
214215
}
215216

217+
// test on-device state save/load
218+
auto params_ctx4 = common_context_params_to_llama(params);
219+
params_ctx4.n_seq_max = 2;
220+
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
221+
222+
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
223+
224+
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
225+
226+
printf("\nsingle seq run: %s", params.prompt.c_str());
227+
228+
// load state (rng, logits, embedding and kv_cache) from file
229+
n_token_count_out = 0;
230+
231+
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
232+
fprintf(stderr, "\n%s : failed to load state\n", __func__);
233+
return 1;
234+
}
235+
236+
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
237+
238+
// restore state (last tokens)
239+
n_past = n_token_count_out;
240+
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
241+
return 1;
242+
}
243+
++n_past;
244+
245+
// save seq 0 and load into seq 1
246+
{
247+
// save kv of seq 0
248+
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
249+
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
250+
if (ncopy != seq_store.size()) {
251+
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
252+
return 1;
253+
}
254+
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
255+
256+
// erase whole kv
257+
llama_memory_clear(llama_get_memory(ctx4), true);
258+
fprintf(stderr, "%s : kv cache cleared\n", __func__);
259+
260+
// restore kv into seq 1
261+
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
262+
if (nset != seq_store.size()) {
263+
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
264+
return 1;
265+
}
266+
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
267+
}
268+
269+
// fourth run
270+
for (auto i = 0; i < params.n_predict; i++) {
271+
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
272+
auto next_token_str = common_token_to_piece(ctx4, next_token);
273+
274+
printf("%s", next_token_str.c_str());
275+
result3 += next_token_str;
276+
277+
common_batch_clear(batch);
278+
common_batch_add(batch, next_token, n_past, {1}, true);
279+
280+
if (llama_decode(ctx4, batch)) {
281+
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
282+
llama_batch_free(batch);
283+
return 1;
284+
}
285+
n_past += 1;
286+
}
287+
216288
printf("\n");
217289

218290
llama_sampler_free(smpl);
219291
llama_sampler_free(smpl2);
220292
llama_sampler_free(smpl3);
293+
llama_sampler_free(smpl4);
221294

222295
llama_batch_free(batch);
223296

@@ -226,12 +299,18 @@ int main(int argc, char ** argv) {
226299

227300
llama_free(ctx2);
228301
llama_free(ctx3);
302+
llama_free(ctx4);
229303

230304
if (result0 != result2) {
231305
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
232306
return 1;
233307
}
234308

309+
if (result0 != result3) {
310+
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
311+
return 1;
312+
}
313+
235314
fprintf(stderr, "\n%s : success\n", __func__);
236315

237316
return 0;

ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
282282
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
283283
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
284284
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
285+
bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst);
285286
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
286287

287288
// finds the Metal buffer that contains the tensor data on the GPU device

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#import "ggml-metal-device.h"
22

33
#import "ggml-impl.h"
4+
#import "ggml-backend-impl.h"
45

56
#include <Foundation/Foundation.h>
67

@@ -1737,6 +1738,47 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
17371738
}
17381739
}
17391740

1741+
// Copy the bytes of `src` into `dst`, where `dst` lives in `buf_dst`.
// Returns true on success; returns false when either tensor cannot be
// resolved to a Metal buffer id (the caller is expected to fall back to a
// host-memory copy path in that case).
// NOTE(review): assumes `src->buffer->context` is a ggml_metal_buffer_t,
// i.e. the caller has already verified src resides in a Metal buffer —
// confirm against the call sites in ggml-metal.cpp.
bool ggml_metal_buffer_cpy_tensor(ggml_metal_buffer_t buf_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_metal_buffer_t buf_src = (ggml_metal_buffer_t)src->buffer->context;

    // number of bytes to transfer is determined by the source tensor
    const size_t size = ggml_nbytes(src);

    // if both buffers are shared, we can use memcpy directly
    if (buf_dst->is_shared && buf_src->is_shared) {
        memcpy(dst->data, src->data, size);
        return true;
    }

    // for private buffers, we need to use Metal blit commands
    @autoreleasepool {
        // resolve the Metal buffer + offset backing each tensor
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf_src, src);
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf_dst, dst);

        // either tensor may not be backed by a Metal buffer - signal the
        // caller so it can use a different copy path
        if (bid_src.metal == nil || bid_dst.metal == nil) {
            return false;
        }

        // unretained references are safe here: we wait for completion below,
        // so all referenced objects outlive the command buffer
        id<MTLCommandBuffer> cmd_buf = [buf_dst->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        // synchronous copy: block until the GPU has finished the blit
        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }

    return true;
}
1781+
17401782
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
17411783
if (buf->is_shared) {
17421784
memset(buf->all_data, value, buf->all_size);

ggml/src/ggml-metal/ggml-metal.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
1818
static int g_devices = 1;
1919

20+
// forward declaration
21+
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);
22+
2023
////////////////////////////////////////////////////////////////////////////////
2124
// backend interface
2225
////////////////////////////////////////////////////////////////////////////////
@@ -68,11 +71,11 @@ static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t bu
6871

6972
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
7073

71-
GGML_UNUSED(buffer);
72-
GGML_UNUSED(src);
73-
GGML_UNUSED(dst);
74+
if (!ggml_backend_buffer_is_metal(src->buffer)) {
75+
return false;
76+
}
7477

75-
return false;
78+
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
7679
}
7780

7881
static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -144,11 +147,11 @@ static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t b
144147

145148
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
146149

147-
GGML_UNUSED(buffer);
148-
GGML_UNUSED(src);
149-
GGML_UNUSED(dst);
150+
if (!ggml_backend_buffer_is_metal(src->buffer)) {
151+
return false;
152+
}
150153

151-
return false;
154+
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
152155
}
153156

154157
static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {

include/llama.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,9 @@ extern "C" {
864864
// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
865865
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
866866

867+
// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
868+
#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
869+
867870
typedef uint32_t llama_state_seq_flags;
868871

869872
LLAMA_API size_t llama_state_seq_get_size_ext(

0 commit comments

Comments
 (0)