20 changes: 20 additions & 0 deletions dflash/src/internal.h
@@ -71,6 +71,26 @@ struct TargetLayer {
ggml_tensor * ssm_dt_bias = nullptr; // [dt_rank] per-head alpha bias
ggml_tensor * ssm_norm = nullptr; // [head_v_dim]
ggml_tensor * ssm_out = nullptr; // output projection after delta-net

// NVFP4 per-tensor weight scales (optional; 1.0f = no scaling).
// Each corresponds to a weight tensor above: result = mul_mat(w, x) * scale.
// Stored as host-side floats (read from the GGUF at load time) and applied
// via ggml_scale(), which bakes the scalar into the op at graph-build time
// with no extra kernel launches, unlike ggml_mul() with a [1]-shaped GPU
// tensor, which adds 768 kernel launches per forward pass and causes
// catastrophic overhead in batched DDTree verify mode.
float w_gate_s = 1.0f;
float w_up_s = 1.0f;
float w_down_s = 1.0f;
float wq_s = 1.0f;
float wk_s = 1.0f;
float wv_s = 1.0f;
float wo_s = 1.0f;
float wqkv_s = 1.0f;
float wqkv_gate_s = 1.0f;
float ssm_beta_s = 1.0f;
float ssm_alpha_s = 1.0f;
float ssm_out_s = 1.0f;
};

// CPU-side embedder: keeps a mmap of the GGUF alive and knows how to
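To make the ggml_scale() / ggml_mul() trade-off described above concrete: because matrix multiplication is linear, mul_mat(w, x) * s gives the same result as mul_mat(w * s, x), so applying the per-tensor scale to the matmul output is equivalent to rescaling the weight itself. The sketch below is illustrative only, not part of this diff; w, x and scale_host are placeholder names.

    // (a) ggml_scale: the float is stored as an op parameter of the graph node,
    //     so no extra GPU tensor is needed.
    ggml_tensor * y_fast = ggml_scale(ctx, ggml_mul_mat(ctx, w, x), scale_host);

    // (b) ggml_mul with a [1]-shaped device tensor: the scale lives in GPU memory
    //     and the broadcast multiply is scheduled as its own node; this is the
    //     variant measured above at 768 extra kernel launches per forward pass.
    ggml_tensor * scale_dev = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ggml_tensor * y_slow   = ggml_mul(ctx, ggml_mul_mat(ctx, w, x), scale_dev);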
58 changes: 58 additions & 0 deletions dflash/src/qwen35/gguf_target_loader.cpp
@@ -474,6 +474,8 @@ bool load_target_gguf_partial(const std::string & path,
L.ssm_norm = fnd("ssm_norm.weight");
L.ssm_out = fnd("ssm_out.weight");

// NVFP4 per-tensor weight scales are read after the mmap is loaded (below).

// Sanity: each layer must be EITHER full-attn OR deltanet, not both, not neither.
const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm;
const bool has_ssm = L.wqkv && L.wqkv_gate && L.ssm_conv1d && L.ssm_out;
@@ -575,6 +577,62 @@
total += sz;
}

// ── 4b. Read NVFP4 per-tensor weight scales (optional; 1.0 for non-NVFP4).
//
// Scale tensors are F32 shape [1] — a single float per matmul weight.
// We read the value from mmap into host-side floats so the graph builder
// can use ggml_scale() (scalar baked in at graph-build time, zero kernel launches) instead
// of ggml_mul() with a [1]-shaped GPU tensor. The ggml_mul approach adds
// 768 kernel launches per forward pass and causes catastrophic overhead
// (~1000ms vs ~30ms) in batched DDTree verify mode.
//
// LibertAI convention: "blk.N.ffn_gate.scale"
// Heretic convention: "blk.N.ffn_gate.weight.scale"
{
auto read_scale = [&](int il, const char * base) -> float {
char sname[128];
// Try "base.scale" first (LibertAI), then "base.weight.scale" (heretic)
std::snprintf(sname, sizeof(sname), "blk.%d.%s.scale", il, base);
int64_t stid = gguf_find_tensor(gctx, sname);
if (stid < 0) {
std::snprintf(sname, sizeof(sname), "blk.%d.%s.weight.scale", il, base);
stid = gguf_find_tensor(gctx, sname);
}
if (stid < 0) return 1.0f;
const size_t soff = data_start + gguf_get_tensor_offset(gctx, stid);
if (soff + sizeof(float) > mm.len) return 1.0f;
float val;
std::memcpy(&val, (const uint8_t *)mm.addr + soff, sizeof(float));
return val;
};

int n_scales = 0;
for (int il = 0; il < (int)n_layer; il++) {
TargetLayer & L = out.layers[il];
L.w_gate_s = read_scale(il, "ffn_gate");
L.w_up_s = read_scale(il, "ffn_up");
L.w_down_s = read_scale(il, "ffn_down");
L.wq_s = read_scale(il, "attn_q");
L.wk_s = read_scale(il, "attn_k");
L.wv_s = read_scale(il, "attn_v");
L.wo_s = read_scale(il, "attn_output");
L.wqkv_s = read_scale(il, "attn_qkv");
L.wqkv_gate_s = read_scale(il, "attn_gate");
L.ssm_beta_s = read_scale(il, "ssm_beta");
L.ssm_alpha_s = read_scale(il, "ssm_alpha");
L.ssm_out_s = read_scale(il, "ssm_out");
// Count non-trivial scales for the summary message.
auto count_s = [&](float s) { if (s != 1.0f) n_scales++; };
count_s(L.w_gate_s); count_s(L.w_up_s); count_s(L.w_down_s);
count_s(L.wq_s); count_s(L.wk_s); count_s(L.wv_s);
count_s(L.wo_s); count_s(L.wqkv_s); count_s(L.wqkv_gate_s);
count_s(L.ssm_beta_s); count_s(L.ssm_alpha_s); count_s(L.ssm_out_s);
}
if (n_scales > 0) {
std::printf("[loader] read %d NVFP4 per-tensor scale2 values (host-side, using ggml_scale)\n", n_scales);
}
}

gguf_free(gctx);

if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
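As an aside, a quick way to check which of the two naming conventions ("base.scale" vs "base.weight.scale") a given quantized GGUF actually uses is to list every tensor whose name ends in ".scale". A minimal standalone sketch using the same gguf C API as the loader; the header path and program layout are assumptions, not part of this PR:

    #include <cstdio>
    #include <cstring>
    #include "gguf.h"   // or wherever this repo's ggml exposes the gguf C API

    int main(int argc, char ** argv) {
        if (argc < 2) { std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }
        gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        gguf_context * gctx = gguf_init_from_file(argv[1], params);
        if (!gctx) { std::fprintf(stderr, "failed to open %s\n", argv[1]); return 1; }
        for (int64_t i = 0; i < gguf_get_n_tensors(gctx); i++) {
            const char * name = gguf_get_tensor_name(gctx, i);
            const size_t len  = std::strlen(name);
            if (len >= 6 && std::strcmp(name + len - 6, ".scale") == 0) {
                // e.g. blk.0.ffn_gate.scale (LibertAI) or blk.0.ffn_gate.weight.scale (heretic)
                std::printf("%s\n", name);
            }
        }
        gguf_free(gctx);
        return 0;
    }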
33 changes: 21 additions & 12 deletions dflash/src/qwen35/qwen35_target_graph.cpp
@@ -416,13 +416,21 @@ static ggml_tensor * rms_norm_mul(ggml_context * ctx, ggml_tensor * x,
return ggml_mul(ctx, n, weight);
}

// NVFP4 scale2: if weight has a per-tensor scale, multiply the matmul result
// by that scale. No-op when scale==1.0f (non-NVFP4 models).
static ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result,
float scale) {
if (scale == 1.0f) return mm_result;
return ggml_scale(ctx, mm_result, scale);
}

static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
const TargetLayer & L) {
ggml_tensor * gate = ggml_mul_mat(ctx, L.w_gate, cur); // [inter, n_tokens]
ggml_tensor * gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_gate, cur), L.w_gate_s); // [inter, n_tokens]
gate = ggml_silu(ctx, gate);
ggml_tensor * up = ggml_mul_mat(ctx, L.w_up, cur);
ggml_tensor * up = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_up, cur), L.w_up_s);
ggml_tensor * gu = ggml_mul(ctx, gate, up);
return ggml_mul_mat(ctx, L.w_down, gu); // [hidden, n_tokens]
return apply_scale2(ctx, ggml_mul_mat(ctx, L.w_down, gu), L.w_down_s); // [hidden, n_tokens]
}

// Full-attention block (matches llama.cpp's build_layer_attn for qwen35)
@@ -456,7 +464,7 @@ static ggml_tensor * build_full_attn_block(
const int n_head_kv = w.n_head_kv;
const int q_dim = head_dim * n_head;
// ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens]
ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur);
ggml_tensor * QG = apply_scale2(ctx, ggml_mul_mat(ctx, L.wq, cur), L.wq_s);
// Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves
QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens);

@@ -478,8 +486,8 @@ static ggml_tensor * build_full_attn_block(
gate = ggml_cont_2d(ctx, gate, q_dim, n_tokens); // [q_dim, n_tokens]

// ── K and V projections
ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens]
ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, n_tokens]
ggml_tensor * Kcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wk, cur), L.wk_s);
ggml_tensor * Vcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wv, cur), L.wv_s);

Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens);
Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, w.rms_eps);
@@ -610,7 +618,7 @@ static ggml_tensor * build_full_attn_block(
attn = ggml_mul(ctx, attn, gate_sig);

// ── Output projection
attn = ggml_mul_mat(ctx, L.wo, attn); // [hidden, n_tokens]
attn = apply_scale2(ctx, ggml_mul_mat(ctx, L.wo, attn), L.wo_s);
return attn;
}

@@ -645,22 +653,22 @@ static ggml_tensor * build_delta_net_block(
const int n_seq_tokens = n_tokens;

// ── qkv_mixed = wqkv @ cur [10240, n_tokens]
ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur);
ggml_tensor * qkv_mixed = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv, cur), L.wqkv_s);
qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);

// ── z = wqkv_gate @ cur [inner, n_tokens]
ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur);
ggml_tensor * z = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv_gate, cur), L.wqkv_gate_s);

// ── beta = ssm_beta @ cur [dt_rank, n_tokens]
ggml_tensor * beta = ggml_mul_mat(ctx, L.ssm_beta, cur);
ggml_tensor * beta = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_beta, cur), L.ssm_beta_s);
beta = ggml_reshape_4d(ctx, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
beta = ggml_sigmoid(ctx, beta);

// ── alpha = ssm_alpha @ cur [dt_rank, n_tokens]
// alpha = alpha + ssm_dt_bias (per-head bias)
// alpha = softplus(alpha)
// g = alpha * ssm_a (-A_log.exp() * softplus)
ggml_tensor * alpha = ggml_mul_mat(ctx, L.ssm_alpha, cur);
ggml_tensor * alpha = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_alpha, cur), L.ssm_alpha_s);
alpha = ggml_reshape_3d(ctx, alpha, num_v_heads, n_seq_tokens, n_seqs);
alpha = ggml_add(ctx, alpha, L.ssm_dt_bias);
alpha = ggml_softplus(ctx, alpha);
@@ -885,7 +893,7 @@ static ggml_tensor * build_delta_net_block(
head_v_dim * num_v_heads, n_seq_tokens, n_seqs);

// Output projection
ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat);
ggml_tensor * out = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_out, flat), L.ssm_out_s);
out = ggml_reshape_2d(ctx, out, w.n_embd, n_seq_tokens * n_seqs);
return out;
}
@@ -1462,4 +1470,5 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
return true;
}


} // namespace dflash27b