Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2808,7 +2808,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.embd_normalize = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
Expand Down
30 changes: 1 addition & 29 deletions common/chat-peg-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,35 +358,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
if (is_potential_container) {
value_content = normalize_container_value(value_content);
}

// Try to parse as JSON value (number, bool, null, object, array)
try {
ordered_json parsed = ordered_json::parse(value_content);
if (parsed.is_string()) {
// Don't add closing quote yet (added by arg_close) for monotonic streaming
std::string escaped = parsed.dump();
if (!escaped.empty() && escaped.back() == '"') {
escaped.pop_back();
}
value_to_add = escaped;
closing_quote_pending = true;
} else {
// Non-string values: use raw content to preserve whitespace for monotonicity
value_to_add = value_content;
}
} catch (...) {
if (node.is_partial && is_potential_container) {
// Partial container: pass through the already-normalized content
value_to_add = value_content;
} else {
// Not valid JSON - treat as string value
if (!closing_quote_pending) {
value_to_add = "\"";
closing_quote_pending = true;
}
value_to_add += escape_json_string_inner(value_content);
}
}
value_to_add += value_content;
}

args_target() += value_to_add;
Expand Down
2 changes: 1 addition & 1 deletion common/chat-peg-parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {

// Use for schema-declared string types - won't be treated as potential JSON container
// Wraps parser `p` with the TOOL_ARG_STRING_VALUE tag and returns the result of tag().
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }


// Return a parser that parses the prefix of a string, up to a given delimiter.
Expand Down
4 changes: 2 additions & 2 deletions common/ngram-map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ void common_ngram_map_draft(common_ngram_map & map,
sum_occur += curr_occur;
}

LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
LOG_DBG("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
key_offset,
max_occur, sum_occur, slot_max,
curr_key.values[0].value_idx, curr_key.values[0].value_num,
Expand All @@ -482,7 +482,7 @@ void common_ngram_map_draft(common_ngram_map & map,
// Log the tokens of each of the COMMON_NGRAM_MAX_VALUES value slots whose value_idx is non-zero
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
if (curr_key.values[v].value_idx != 0) {
LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
LOG_DBG("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
}
}

Expand Down
157 changes: 147 additions & 10 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ void main() {
if (idx + (num_iter-1)*num_threads < p.ne) {
[[unroll]] for (uint i = 0; i < num_iter; ++i) {

#if defined(DATA_D_BF16)
#if defined(DATA_A_BF16)
data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx])));
#elif defined(DATA_D_BF16)
float f = float(data_a[get_aoffset() + idx]);
data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f));
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
Expand All @@ -35,7 +37,9 @@ void main() {
continue;
}

#if defined(DATA_D_BF16)
#if defined(DATA_A_BF16)
data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx])));
#elif defined(DATA_D_BF16)
float f = float(data_a[get_aoffset() + idx]);
data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f));
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ void main() {
return;
}

#if defined(DATA_D_BF16)
#if defined(DATA_A_BF16)
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + src0_idx(idx)])));
#elif defined(DATA_D_BF16)
float f = float(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(fp32_to_bf16(f));
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
Expand Down
7 changes: 5 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03,
// Per-row offset in shared memory
const uint ix = i0;
#else
const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
const uint ix = p.a_offset + i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
#endif
return ix;
}
Expand Down Expand Up @@ -48,6 +48,7 @@ void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_
idst = i1*p.nb11 + i0;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
idst += p.d_offset;

if (i0 >= p.n_dims) {
rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]);
Expand Down Expand Up @@ -84,6 +85,7 @@ void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_
idst = i1*p.nb11 + i0/2;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
idst += p.d_offset;

if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
Expand Down Expand Up @@ -121,6 +123,7 @@ void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope
idst = i1*p.nb11 + i0/2;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
idst += p.d_offset;

if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
Expand Down Expand Up @@ -176,7 +179,7 @@ void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rop
return;
}

const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint idst = p.d_offset + i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);

const int sect_dims = p.sections[0] + p.sections[1];
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ struct rope_params {
uint nb11;
uint nb12;
uint nb13;

uint a_offset;
uint d_offset;
};

#endif // !defined(GGML_ROPE_PARAMS)
12 changes: 11 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@

layout(constant_id = 0) const uint BLOCK_SIZE = 32;
layout(constant_id = 1) const uint TOKENS_PER_WG = 16;
layout(constant_id = 2) const bool APPLY_BIAS = false;
layout(constant_id = 3) const bool APPLY_SILU = false;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;

layout(binding = 0) readonly buffer Src0 { float src0[]; };
layout(binding = 1) readonly buffer Src1 { float src1[]; };
layout(binding = 2) buffer Dst { float dst[]; };
layout(binding = 2) readonly buffer Bias { float bias[]; };
layout(binding = 3) buffer Dst { float dst[]; };

layout(push_constant) uniform PushConstants {
uint nb01; uint nb02;
Expand Down Expand Up @@ -45,6 +48,13 @@ void main() {
}
}

if (APPLY_BIAS) {
sum += bias[i1];
}
if (APPLY_SILU) {
sum = sum / (1.0f + exp(-sum));
}

const uint dst_idx = i3 * (dst_nb2 / 4) + i2 * (dst_nb1 / 4) + i1;
dst[dst_idx] = sum;
}
2 changes: 2 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -731,13 +731,15 @@ void process_shaders() {
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
string_to_spv("cpy_bf16_f32","copy.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}});
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("contig_cpy_f32_i32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
string_to_spv("contig_cpy_i32_f32", "contig_copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
string_to_spv("contig_cpy_bf16_f32","contig_copy.comp",{{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}});
string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});

Expand Down
24 changes: 22 additions & 2 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4851,6 +4851,21 @@ struct test_rope : public test_case {

a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
ggml_set_name(a, "view_of_a");
} else if (v == 2) {
// second-half slice along dim 0 (mimics build_rope_2d in clip.cpp).
// The non-zero view offset (ne_a[0] * elem_size) often produces a
// non-aligned buffer offset, which exercises backends' alignment paths.
auto ne = ne_a; ne[0] *= 2;
a = ggml_new_tensor(ctx, type, 4, ne.data());
if (forward) {
ggml_set_param(a);
}
ggml_set_name(a, "a");

a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3],
a->nb[1], a->nb[2], a->nb[3],
ne_a[0] * ggml_element_size(a));
ggml_set_name(a, "view_of_a");
} else {
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
if (forward) {
Expand Down Expand Up @@ -4913,8 +4928,6 @@ struct test_rope : public test_case {
} else {
out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}

// TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
}
ggml_set_name(out, "out");

Expand Down Expand Up @@ -8687,6 +8700,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
}

// build_rope_2d-style: ROPE on a non-contiguous view
// that starts at a non-zero offset along dim 0
// (e.g. gemma4v vision second-half view).
for (int rmode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION }) {
test_cases.emplace_back(new test_rope(type, { 36, 16, 2457, 1}, 36, rmode, 512, fs, ef, af, ff, 2, fw));
}
}

all = false;
Expand Down
2 changes: 1 addition & 1 deletion tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4527,7 +4527,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
}
}

int embd_normalize = 2; // default to Euclidean/L2 norm
int embd_normalize = params.embd_normalize;
if (body.count("embd_normalize") != 0) {
embd_normalize = body.at("embd_normalize");
if (meta->pooling_type == LLAMA_POOLING_TYPE_NONE) {
Expand Down
Loading