Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
/examples/parallel/ @ggerganov
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
/examples/save-load-state/ @ggerganov
/examples/speculative-simple/ @ggerganov
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
Expand Down
8 changes: 4 additions & 4 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {

(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
Expand Down
1 change: 0 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ else()
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
Expand Down
5 changes: 0 additions & 5 deletions examples/save-load-state/CMakeLists.txt

This file was deleted.

19 changes: 15 additions & 4 deletions ggml/src/ggml-metal/ggml-metal-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);

const int nth = std::min(1024, ne0);
int nth = std::min(256, ne0);

ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
// when rows are small, we can batch them together in a single threadgroup
int nrptg = 1;
if (nth < 256) {
nrptg = std::min((256 + nth - 1) / nth, ne1);
if (nrptg * nth > 256) {
nrptg = 256 / nth;
}
}

const int nw0 = (ne1 + nrptg - 1) / nrptg;

ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);

return 1;
}
Expand Down Expand Up @@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
nk0 = ne10/ggml_blck_size(op->type);
}

int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
int nth = std::min<int>(nk0*ne11, 256);

// when rows are small, we can batch them together in a single threadgroup
int nrptg = 1;
Expand All @@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
nrptg = (nth + nk0 - 1)/nk0;
nth = nk0;

if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
if (nrptg*nth > 256) {
nrptg--;
}
}
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-metal/ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -7486,7 +7486,11 @@ kernel void kernel_concat(

const int i3 = tgpig.z;
const int i2 = tgpig.y;
const int i1 = tgpig.x;
const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;

if (i1 >= args.ne1) {
return;
}

int o[4] = {0, 0, 0, 0};
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
Expand Down
44 changes: 22 additions & 22 deletions ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;

var<workgroup> q_shmem: array<f32, Q_TILE * HEAD_DIM_QK>;
var<workgroup> kv_shmem: array<f32, KV_TILE * KV_STAGE_STRIDE>;
var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
var<workgroup> q_shmem: array<Q_TYPE, Q_TILE * HEAD_DIM_QK>;
var<workgroup> kv_shmem: array<KV_TYPE, KV_TILE * KV_STAGE_STRIDE>;
var<workgroup> p_shmem: array<KV_TYPE, Q_TILE * KV_TILE>;

@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
Expand Down Expand Up @@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,

let head = f32(head_idx);
let slope = select(1.0,
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
pow(params.m0, head + 1.0),
head < params.n_head_log2),
params.max_bias > 0.0);
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
pow(params.m0, head + 1.0),
head < params.n_head_log2),
params.max_bias > 0.0);

for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
let q_tile_row = elem_idx / HEAD_DIM_QK;
Expand All @@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
q_shmem[elem_idx] = select(
0.0,
f32(Q[global_q_row_offset + q_col]) * params.scale,
Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale,
head_q_row < params.seq_len_q);
}

Expand Down Expand Up @@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
let k4 = K[k_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = f32(k4.x);
kv_shmem[kv_off + 1u] = f32(k4.y);
kv_shmem[kv_off + 2u] = f32(k4.z);
kv_shmem[kv_off + 3u] = f32(k4.w);
kv_shmem[kv_off + 0u] = KV_TYPE(k4.x);
kv_shmem[kv_off + 1u] = KV_TYPE(k4.y);
kv_shmem[kv_off + 2u] = KV_TYPE(k4.z);
kv_shmem[kv_off + 3u] = KV_TYPE(k4.w);
}

workgroupBarrier();
Expand All @@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
var dot_val = 0.0;
for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
let q_off = q_base + chunk * 4u;
let qv = vec4<f32>(
let qv = vec4<Q_TYPE>(
q_shmem[q_off + 0u],
q_shmem[q_off + 1u],
q_shmem[q_off + 2u],
q_shmem[q_off + 3u]);
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let kv = vec4<f32>(
let kv = vec4<KV_TYPE>(
kv_shmem[kv_off + 0u],
kv_shmem[kv_off + 1u],
kv_shmem[kv_off + 2u],
kv_shmem[kv_off + 3u]);
dot_val += dot(qv, kv);
dot_val += dot(vec4<f32>(qv), vec4<f32>(kv));
}
#ifdef LOGIT_SOFTCAP
dot_val = params.logit_softcap * tanh(dot_val);
Expand All @@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let kv_local = sg_inv_id + slot * subgroup_size;
if (row_active && kv_local < kv_count) {
let p = exp(local_scores[slot] - new_max);
p_shmem[subgroup_p_offset + kv_local] = p;
p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p);
local_sum += p;
}
}
Expand All @@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
let v4 = V[v_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = f32(v4.x);
kv_shmem[kv_off + 1u] = f32(v4.y);
kv_shmem[kv_off + 2u] = f32(v4.z);
kv_shmem[kv_off + 3u] = f32(v4.w);
kv_shmem[kv_off + 0u] = KV_TYPE(v4.x);
kv_shmem[kv_off + 1u] = KV_TYPE(v4.y);
kv_shmem[kv_off + 2u] = KV_TYPE(v4.z);
kv_shmem[kv_off + 3u] = KV_TYPE(v4.w);
}

workgroupBarrier();
Expand All @@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
let p = p_shmem[subgroup_p_offset + kv_local];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let v4 = vec4<f32>(
let v4 = vec4<KV_TYPE>(
kv_shmem[kv_off + 0u],
kv_shmem[kv_off + 1u],
kv_shmem[kv_off + 2u],
kv_shmem[kv_off + 3u]);
acc += p * v4;
acc += f32(p) * vec4<f32>(v4);
}
out_regs[reg_idx] = acc;
}
Expand Down
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED
llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model)

# Test state save/load functionality
llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model)

if (NOT GGML_BACKEND_DL)
# these tests use the backends directly and cannot be built with dynamic loading
llama_build_and_test(test-barrier.cpp)
Expand Down
Loading
Loading