Skip to content

Commit 113ba08

Browse files
committed
ggml-ve: run F16 weights as BF16 + compile the VEBP/Q4K N>1 prompt graph (ggml-org#72)
VEBP keeps token_embd as F16 (tied, so it's both the embedding and the lm_head). The VE has no F16 path, so GET_ROWS + that MUL_MAT were refused and the graph fragmented. Two changes make VEBP fully self-contained and let its prompt graph compile:

1. F16 -> BF16 on HBM upload (same 2-byte size, strides unchanged), served by the existing BF16 GET_ROWS / matvec / colmajor paths. get_rows/mul_mat supports + dispatch accept F16; the graph compiler maps src_type F16->BF16 and uploads the converted copy. Conversion uses the F16C row helpers in 1M chunks (a per-element call loop was ~10 s for the 621M-elem token_embd, charged to the first prompt eval).

2. Removed a stale guard that refused Q4_K/VEBP MUL_MAT when ne[1] != 1. It dated from when N>1 was refused entirely; now the codegen loops the matvec _inner over the n_tok columns, so it handles N>1. The guard was rejecting EVERY VEBP prompt graph (Qcur is VEBP, ne[1]=N) -> the prompt silently ran on the interpreter. (A new first-execute verbose log made this visible.)

Also fixed N>1 codegen vectorization found while profiling (the .L showed it was NOT a table overflow — it was unvectorized loops): the MUL `src1[e % period]` modulo forced scalar code -> restored the nested broadcast form; added `restrict` to the element-wise / RMS_NORM / GLU pointers (void*-cast aliasing).

Measured (GGML_VE_HBM=1, -fa on, -ctk/-ctv bf16, warm, run twice):
  Llama-3.2-3B prompt 57 -> 65, decode 48 -> 56 (vectorization fixes)
  Bonsai-VEBP prompt 9.8 -> 12.65 (1.29x, V.OP 12% -> 92%), decode 33.5 -> 38.7

All outputs token-for-token identical to the interpreter. VEBP's prompt gain is smaller than Llama's 16x because its per-token ternary matvec is compute-bound (V.OP already 92%), so the fork/join fusion saves a smaller fraction. Further gains need a batched ternary matmul (read the weight once across N columns) — a follow-up.
1 parent fb9264c commit 113ba08

6 files changed

Lines changed: 138 additions & 37 deletions

File tree

ggml/src/ggml-ve/common.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ enum ggml_ve_hbm_format {
8989
GGML_VE_HBM_VEBP_WS = 7, // VEBP interleaved sign plane [blk][word][256]
9090
GGML_VE_HBM_VEBP_WN = 8, // VEBP interleaved nonzero plane
9191
GGML_VE_HBM_VEBP_WSCALE = 9, // VEBP interleaved group scales [blk][grp][256]
92+
GGML_VE_HBM_BF16_FROM_F16 = 10, // F16 weight converted to BF16 on upload
93+
// (VE has no F16 path; BF16 is the same
94+
// 2-byte size, so strides are unchanged)
9295
};
9396

9497
#endif // GGML_VE_COMMON_HPP

ggml/src/ggml-ve/ggml-ve.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,10 @@ ggml_status backend_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph)
419419
if (cg2) {
420420
new_entry.cg = cg2;
421421
new_entry.executable = gc.execute(cg2, ctx, cgraph);
422+
if (gc_verbose) {
423+
fprintf(stderr, "[VE-GC] first execute sig=%016lx n_nodes=%d -> executable=%d\n",
424+
(unsigned long) sig, cgraph->n_nodes, (int) new_entry.executable);
425+
}
422426
} else if (gc_verbose) {
423427
fprintf(stderr, "[VE-GC] compile failed for sig=%016lx\n", (unsigned long) sig);
424428
}

ggml/src/ggml-ve/graph_compiler.cpp

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,12 @@ bool GraphCompiler::trace_one(ggml_tensor * node) {
229229
}
230230
out_idx = assign_buffer(src, out_kind);
231231
for (int i = 0; i < 4; ++i) out_ne[i] = src->ne[i];
232-
if (slot == 0) op.src_type = src->type;
232+
// F16 weights are uploaded as BF16 (the VE has no F16 path); map the
233+
// type here so MUL_MAT classification and the GET_ROWS codegen take the
234+
// BF16 path, matching the BF16-converted HBM copy uploaded at execute.
235+
if (slot == 0) {
236+
op.src_type = (src->type == GGML_TYPE_F16) ? GGML_TYPE_BF16 : src->type;
237+
}
233238
};
234239
fill_src(0, node->src[0], op.src0_idx, op.src0_kind, op.src0_ne);
235240

@@ -309,12 +314,11 @@ bool GraphCompiler::trace_one(ggml_tensor * node) {
309314
else if (op.src_type == GGML_TYPE_Q4_K) op.type = OpType::MUL_MAT_Q4K;
310315
else if (op.src_type == GGML_TYPE_VEBP) op.type = OpType::MUL_MAT_VEBP;
311316
else op.type = OpType::MUL_MAT_F32;
312-
// Q4_K / VEBP inners are matvec-only (N=1). The compiler already
313-
// refuses N>1 MUL_MAT above (prompt eval), but guard explicitly.
314-
if ((op.type == OpType::MUL_MAT_Q4K || op.type == OpType::MUL_MAT_VEBP)
315-
&& node->src[1] && node->src[1]->ne[1] != 1) {
316-
return false;
317-
}
317+
// N>1 prompt eval is handled by looping the matvec _inner over the
318+
// n_tok activation columns (see gen_op_code MUL_MAT_VEBP/Q4K), so
319+
// Q4_K / VEBP MUL_MAT no longer needs to be matvec-only. (Removing
320+
// this guard is what lets the VEBP/Q4K prompt graph compile instead
321+
// of being exiled to the interpreter.)
318322
break;
319323
case GGML_OP_ROPE: {
320324
int32_t mode = 0, n_dims = 0;
@@ -805,8 +809,8 @@ std::string GraphCompiler::gen_op_code(const TracedOp & op, int idx) const {
805809
ss << " float eps = " << flit(op.p.rms_norm.eps) << ";\n";
806810
ss << " #pragma omp for\n";
807811
ss << " for (long r = 0; r < rows; r++) {\n";
808-
ss << " const float* x = (const float*)" << src0 << " + r * cols;\n";
809-
ss << " float* y = (float*)" << dst << " + r * cols;\n";
812+
ss << " const float * restrict x = (const float*)" << src0 << " + r * cols;\n";
813+
ss << " float * restrict y = (float*)" << dst << " + r * cols;\n";
810814
ss << " float sumsq = 0.f;\n";
811815
ss << " for (int j = 0; j < cols; j++) sumsq += x[j] * x[j];\n";
812816
ss << " float inv = 1.f / sqrtf(sumsq / cols + eps);\n";
@@ -822,8 +826,8 @@ std::string GraphCompiler::gen_op_code(const TracedOp & op, int idx) const {
822826
ss << " float eps = " << flit(op.p.rms_norm.eps) << ";\n";
823827
ss << " #pragma omp for\n";
824828
ss << " for (long r = 0; r < rows; r++) {\n";
825-
ss << " const float* x = (const float*)" << src0 << " + r * cols;\n";
826-
ss << " float* y = (float*)" << dst << " + r * cols;\n";
829+
ss << " const float * restrict x = (const float*)" << src0 << " + r * cols;\n";
830+
ss << " float * restrict y = (float*)" << dst << " + r * cols;\n";
827831
ss << " float sumsq = 0.f;\n";
828832
ss << " for (int j = 0; j < cols; j++) sumsq += x[j] * x[j];\n";
829833
ss << " float inv = 1.f / sqrtf(sumsq / cols + eps);\n";
@@ -870,22 +874,38 @@ std::string GraphCompiler::gen_op_code(const TracedOp & op, int idx) const {
870874
int64_t src1_period = op.src1_ne[0] * op.src1_ne[1] * op.src1_ne[2] * op.src1_ne[3];
871875
if (src1_period == 0) src1_period = op.src1_ne[0];
872876
if (src1_period <= 0) src1_period = 1;
873-
ss << " #pragma omp for\n";
874-
ss << " for (int64_t e = 0; e < " << elem_n << "; e++) {\n";
875-
ss << " ((float*)" << dst << ")[e] = "
876-
<< "((float*)" << src0 << ")[e] * "
877-
<< "((float*)" << src1 << ")[e % " << src1_period << "LL];\n";
877+
// Nested (broadcast-block) form, NOT `src1[e % period]`: a modulo in
878+
// the inner loop trips NCC's "loop division overhead" and forces
879+
// scalar code. The inner loop runs the (compile-time-constant)
880+
// period with contiguous src1[i], so it vectorises; the outer loop
881+
// (reps = total/period, runtime) is shared across the team.
882+
// `restrict` kills the void*-cast aliasing ("Dependency unknown").
883+
ss << " {\n";
884+
ss << " float * restrict yv = (float*)" << dst << ";\n";
885+
ss << " const float * restrict av = (const float*)" << src0 << ";\n";
886+
ss << " const float * restrict bv = (const float*)" << src1 << ";\n";
887+
ss << " const int64_t period = " << src1_period << "LL;\n";
888+
ss << " const int64_t reps = (" << elem_n << ") / period;\n";
889+
ss << " #pragma omp for\n";
890+
ss << " for (int64_t r = 0; r < reps; r++) {\n";
891+
ss << " const float * restrict ar = av + r*period;\n";
892+
ss << " float * restrict yr = yv + r*period;\n";
893+
ss << " for (int64_t i = 0; i < period; i++) yr[i] = ar[i] * bv[i];\n";
894+
ss << " }\n";
878895
ss << " }\n";
879896
break;
880897
}
881898

882899
case OpType::ADD:
883900
// src0 and src1 are the same shape (residual + branch). Element-wise
884901
// over the full per-token range (pt*n_tok when scaling, else full).
885-
ss << " #pragma omp for\n";
886-
ss << " for (int64_t e = 0; e < " << elem_n << "; e++) {\n";
887-
ss << " ((float*)" << dst << ")[e] = "
888-
<< "((float*)" << src0 << ")[e] + ((float*)" << src1 << ")[e];\n";
902+
// restrict pointers so NCC vectorises (no assumed aliasing).
903+
ss << " {\n";
904+
ss << " float * restrict yv = (float*)" << dst << ";\n";
905+
ss << " const float * restrict av = (const float*)" << src0 << ";\n";
906+
ss << " const float * restrict bv = (const float*)" << src1 << ";\n";
907+
ss << " #pragma omp for\n";
908+
ss << " for (int64_t e = 0; e < " << elem_n << "; e++) yv[e] = av[e] + bv[e];\n";
889909
ss << " }\n";
890910
break;
891911

@@ -1222,9 +1242,9 @@ std::string GraphCompiler::gen_op_code(const TracedOp & op, int idx) const {
12221242
// size-independent. Clamp before expf — the VE's vectorised expf
12231243
// returns NaN past |x|~88 (see CLAUDE.md).
12241244
ss << " {\n";
1225-
ss << " float* y = (float*)" << dst << ";\n";
1226-
ss << " float* gate = (float*)" << src0 << ";\n";
1227-
ss << " float* up = (float*)" << src1 << ";\n";
1245+
ss << " float * restrict y = (float*)" << dst << ";\n";
1246+
ss << " const float * restrict gate = (const float*)" << src0 << ";\n";
1247+
ss << " const float * restrict up = (const float*)" << src1 << ";\n";
12281248
ss << " long total = (long)(" << elem_n << ");\n";
12291249
ss << " #pragma omp for\n";
12301250
ss << " for (long i = 0; i < total; i++) {\n";
@@ -1790,9 +1810,15 @@ bool GraphCompiler::execute(CompiledGraph * graph,
17901810
// 875-node decode graph was refused over this one weight.
17911811
size_t nb = ggml_nbytes(c);
17921812
const char * nm = (c->name && c->name[0]) ? c->name : nullptr;
1793-
VEDAdeviceptr w_hbm = nm
1794-
? bctx->cache().get_or_upload_by_name(nm, c->data, nb)
1795-
: bctx->cache().get_or_upload(c->data, nb);
1813+
// F16 weights (e.g. a tied/F16 token_embd) have no VE F16 path:
1814+
// convert to BF16 once on upload (same byte size, so the view
1815+
// offset below is unchanged). The codegen treats this slot as
1816+
// BF16 (src_type is mapped F16->BF16 at trace).
1817+
VEDAdeviceptr w_hbm = (c->type == GGML_TYPE_F16)
1818+
? bctx->cache().get_or_upload_f16_as_bf16(nm, c->data, nb)
1819+
: (nm
1820+
? bctx->cache().get_or_upload_by_name(nm, c->data, nb)
1821+
: bctx->cache().get_or_upload(c->data, nb));
17961822
if (w_hbm) {
17971823
size_t off = (const uint8_t *) src_for_addr->data - (const uint8_t *) c->data;
17981824
hbm = w_hbm + off;

ggml/src/ggml-ve/hbm_cache.hpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class hbm_weight_cache {
6464
for (auto & e : entries_) {
6565
if (e.host_data == host_data && e.size == size &&
6666
e.format != GGML_VE_HBM_FP32_COLMAJOR &&
67-
e.format != GGML_VE_HBM_BF16_COLMAJOR) {
67+
e.format != GGML_VE_HBM_BF16_COLMAJOR &&
68+
e.format != GGML_VE_HBM_BF16_FROM_F16) {
6869
hits_++;
6970
return e.vptr;
7071
}
@@ -101,6 +102,56 @@ class hbm_weight_cache {
101102
return v;
102103
}
103104

105+
// F16 weight -> BF16 on upload. The VE has no F16 vector path, so we
106+
// re-pack each F16 element to BF16 (both are 2 bytes, so the byte size and
107+
// every row stride are identical) and serve it through the normal BF16
108+
// kernels — this is what lets VEBP's F16 token_embd run GET_ROWS and the
109+
// tied lm_head MUL_MAT on the VE instead of fragmenting onto the CPU.
110+
// Keyed by name with a distinct format so it never aliases a raw entry.
111+
VEDAdeviceptr get_or_upload_f16_as_bf16(const char * tensor_name,
112+
const void * host_f16,
113+
size_t size_bytes) {
114+
if (tensor_name != nullptr) {
115+
for (auto & e : entries_) {
116+
if (e.format == GGML_VE_HBM_BF16_FROM_F16 &&
117+
e.name == tensor_name && e.size == size_bytes) {
118+
hits_++;
119+
return e.vptr;
120+
}
121+
}
122+
}
123+
for (auto & e : entries_) {
124+
if (e.format == GGML_VE_HBM_BF16_FROM_F16 &&
125+
e.host_data == host_f16 && e.size == size_bytes) {
126+
hits_++;
127+
return e.vptr;
128+
}
129+
}
130+
const size_t n = size_bytes / sizeof(uint16_t);
131+
std::vector<uint16_t> bf16(n);
132+
const ggml_fp16_t * src = (const ggml_fp16_t *) host_f16;
133+
// Convert via ggml's row helpers (F16C-accelerated F16->F32, then a
134+
// shift+round F32->BF16) in chunks. A per-element ggml_fp16_to_fp32 /
135+
// ggml_fp32_to_bf16 call loop is ~16 ns/elem (~10 s for a 621M-element
136+
// token_embd) — slow enough to dominate the first prompt eval; the row
137+
// helpers bring it to well under a second.
138+
{
139+
const size_t CHUNK = 1u << 20; // 1M elems -> 4 MB f32 scratch
140+
std::vector<float> f32(CHUNK < n ? CHUNK : n);
141+
for (size_t off = 0; off < n; off += CHUNK) {
142+
const size_t cnt = (n - off < CHUNK) ? (n - off) : CHUNK;
143+
ggml_fp16_to_fp32_row(src + off, f32.data(), (int64_t) cnt);
144+
ggml_fp32_to_bf16_row(f32.data(),
145+
(ggml_bf16_t *) (bf16.data() + off),
146+
(int64_t) cnt);
147+
}
148+
}
149+
VEDAdeviceptr v = upload(bf16.data(), size_bytes);
150+
if (v == 0) return 0;
151+
record(v, size_bytes, host_f16, tensor_name, GGML_VE_HBM_BF16_FROM_F16);
152+
return v;
153+
}
154+
104155
// Q4_K canonical-split upload WITH PRE-DECODED HEADERS.
105156
//
106157
// Pre-decoding moves the per-block scale work (h2f conversion of d/dmin,

ggml/src/ggml-ve/ops/get_rows.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ bool get_rows_supports(const ggml_tensor * op) {
2121
const ggml_tensor * src = op->src[0]; // embeddings
2222
const ggml_tensor * idx = op->src[1]; // i32 indices
2323
if (src == nullptr || idx == nullptr) return false;
24-
if (src->type != GGML_TYPE_BF16 && src->type != GGML_TYPE_F32) return false;
24+
// F16 src (e.g. a tied/F16 token_embd) is accepted: the VE has no F16 path,
25+
// so it's converted to BF16 once at HBM upload and served by the BF16 kernel.
26+
if (src->type != GGML_TYPE_BF16 && src->type != GGML_TYPE_F32 &&
27+
src->type != GGML_TYPE_F16) return false;
2528
if (idx->type != GGML_TYPE_I32) return false;
2629
// Need contiguous indices and output.
2730
if (!ggml_is_contiguous(idx) || !ggml_is_contiguous(op)) return false;
@@ -37,8 +40,12 @@ bool get_rows(backend_context * ctx, ggml_tensor * dst) {
3740

3841
// src (embedding table) is typically a CPU-side BF16 weight on first
3942
// call; resolve_in uploads it via hbm_weight_cache so subsequent calls
40-
// re-use the cached HBM ptr.
41-
const VEDAdeviceptr src_hbm = ctx->resolve_in(src);
43+
// re-use the cached HBM ptr. F16 weights are converted to BF16 once on
44+
// upload (same byte size) and served by the BF16 kernel below.
45+
const VEDAdeviceptr src_hbm =
46+
(src->type == GGML_TYPE_F16)
47+
? ctx->cache().get_or_upload_f16_as_bf16(src->name, src->data, ggml_nbytes(src))
48+
: ctx->resolve_in(src);
4249
const VEDAdeviceptr dst_hbm = ctx->resolve_out(dst);
4350
if (src_hbm == 0 || dst_hbm == 0) return false;
4451

@@ -72,7 +79,7 @@ bool get_rows(backend_context * ctx, ggml_tensor * dst) {
7279
}
7380
ctx->enqueue_hbm_free(idx_tmp);
7481

75-
kernel_id kid = (src->type == GGML_TYPE_BF16)
82+
kernel_id kid = (src->type == GGML_TYPE_BF16 || src->type == GGML_TYPE_F16)
7683
? K_GET_ROWS_BF16_F32_HBM_HBM
7784
: K_GET_ROWS_F32_F32_HBM_HBM;
7885
VEDAfunction fn = ctx->fn(kid);

ggml/src/ggml-ve/ops/mul_mat.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,11 @@ bool mul_mat_supports(const ggml_tensor * op) {
3838
const ggml_tensor * x = op->src[1];
3939
if (w == nullptr || x == nullptr) return rej("missing srcs");
4040
if (x->type != GGML_TYPE_F32) return rej("x not f32");
41-
if (w->type != GGML_TYPE_F32 && w->type != GGML_TYPE_BF16) return rej("w type");
41+
// F16 weights (e.g. a tied/F16 token_embd used as the lm_head) are accepted:
42+
// the VE has no F16 path, so they're converted to BF16 once at HBM upload
43+
// (same byte size) and run through the BF16 matvec/colmajor paths below.
44+
if (w->type != GGML_TYPE_F32 && w->type != GGML_TYPE_BF16 &&
45+
w->type != GGML_TYPE_F16) return rej("w type");
4246

4347
// Shapes: dst = src0 @ src1^T -> K matches, M = src0 row dim, N = src1 row dim.
4448
const int64_t K = w->ne[0];
@@ -62,9 +66,9 @@ bool mul_mat_supports(const ggml_tensor * op) {
6266
// BF16 N>1 : ve_bf16_matmul_hbm_full (sgemv_packed_bf16_unr per batch)
6367
// F32 N=1 : ve_f32_matvec_hbm_full
6468
// F32 N>1 : ve_f32_sgemm_batched_cblas_hbm (NLC cblas_sgemm)
65-
if (w->type == GGML_TYPE_BF16 && (K % 16) != 0) {
69+
if ((w->type == GGML_TYPE_BF16 || w->type == GGML_TYPE_F16) && (K % 16) != 0) {
6670
// The packed BF16 sgemv unrolls 16 across K, so K must be a multiple
67-
// of 16 to stay inside the row.
71+
// of 16 to stay inside the row. (F16 is converted to BF16, same rule.)
6872
return rej("BF16 K%16 != 0");
6973
}
7074
if (M <= 0 || K <= 0 || N <= 0) return rej("zero dim");
@@ -108,7 +112,13 @@ bool mul_mat(backend_context * ctx, ggml_tensor * dst) {
108112
const uint64_t K = (uint64_t) w->ne[0];
109113
const uint64_t N = (uint64_t) x->ne[1];
110114

111-
const VEDAdeviceptr w_vptr = ctx->resolve_in(w);
115+
// F16 weight -> BF16 once on upload (same byte size); downstream treats it
116+
// as BF16 (w_is_bf16). All other types go through the normal resolver.
117+
const bool w_is_bf16 = (w->type == GGML_TYPE_BF16 || w->type == GGML_TYPE_F16);
118+
const VEDAdeviceptr w_vptr =
119+
(w->type == GGML_TYPE_F16)
120+
? ctx->cache().get_or_upload_f16_as_bf16(w->name, w->data, ggml_nbytes(w))
121+
: ctx->resolve_in(w);
112122
const VEDAdeviceptr x_vptr = ctx->resolve_in(x);
113123
const VEDAdeviceptr y_vptr = ctx->resolve_out(dst);
114124
if (w_vptr == 0 || x_vptr == 0 || y_vptr == 0) return false;
@@ -149,7 +159,7 @@ bool mul_mat(backend_context * ctx, ggml_tensor * dst) {
149159
(std::getenv("GGML_VE_COLMAJOR_N1") != nullptr);
150160
if (colmajor_enabled
151161
&& (N > 1 || n1_colmajor)
152-
&& w->type == GGML_TYPE_BF16
162+
&& w_is_bf16
153163
&& ctx->dev() && ctx->dev()->colmajor
154164
&& ctx->fn(K_BF16_TO_F32_COLMAJOR_HBM) != 0
155165
&& ctx->fn(K_F32_SGEMM_BATCHED_CBLAS_HBM_NOTRANS) != 0) {
@@ -196,7 +206,7 @@ bool mul_mat(backend_context * ctx, ggml_tensor * dst) {
196206
VEDAfunction fn = 0;
197207
bool include_N = false;
198208

199-
if (w->type == GGML_TYPE_BF16) {
209+
if (w_is_bf16) {
200210
if (N == 1) {
201211
fn = ctx->fn(K_BF16_MATVEC_HBM_FULL);
202212
} else {

0 commit comments

Comments
 (0)