Commit 6a98465

phazeidavide221 authored and committed
feat(dflash): add NVFP4 per-tensor scale2 support
Add support for NVFP4-quantized GGUF models (e.g. LibertAI Qwen3.6-27B-NVFP4) by loading per-tensor weight scales and applying them in the target graph.

Scale values are read as host-side floats from the GGUF mmap at load time and applied via ggml_scale() — a compile-time scalar multiply with zero extra kernel launches. This avoids ggml_mul() with [1]-shaped GPU tensors, which adds 768 kernel launches per forward pass and causes ~30x overhead in batched DDTree verify mode (1001ms -> 43ms per step on RTX 5090).

Supports both naming conventions:
- LibertAI: blk.N.ffn_gate.scale
- Heretic:  blk.N.ffn_gate.weight.scale

Non-NVFP4 models (Q4_K_M, etc.) are unaffected — scale fields default to 1.0f and apply_scale2() returns early with zero overhead.

Also removes the DFLASH27B_USE_BLACKWELL_CONSUMER_FIX CMake workaround, which incorrectly assumed consumer Blackwell GPUs (RTX 5090) lack FP4 MMA instructions. The RTX 5090 fully supports sm_120a and native FP4 tensor cores.

Note: full native FP4 MMA performance requires upstream PR ggml-org#22196 to be merged into the Luce-Org llama.cpp submodule fork. Without it, NVFP4 models still work correctly via the generic dequant-to-Q8_1 fallback path.
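For readers skimming the diffs below, the two graph-construction options the message contrasts look roughly like this in ggml terms (an illustrative sketch, not code from this commit; scaled_matmul_fast, scaled_matmul_slow, and their arguments are placeholder names):

    #include "ggml.h"

    // Option taken here: ggml_scale() takes a plain host-side float, which is
    // recorded in the node's op params when the graph is built; no extra
    // input tensor is created for the scale.
    static ggml_tensor * scaled_matmul_fast(ggml_context * ctx, ggml_tensor * w,
                                            ggml_tensor * x, float w_scale) {
        return ggml_scale(ctx, ggml_mul_mat(ctx, w, x), w_scale);
    }

    // Option avoided: ggml_mul() against an F32 [1] tensor. The scale becomes
    // a real GPU operand that must be broadcast, so every scaled matmul drags
    // an extra elementwise kernel into the graph, which is the per-forward-pass
    // launch count the commit message measures.
    static ggml_tensor * scaled_matmul_slow(ggml_context * ctx, ggml_tensor * w,
                                            ggml_tensor * x, ggml_tensor * s) {
        return ggml_mul(ctx, ggml_mul_mat(ctx, w, x), s); // s: F32, shape [1]
    }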
1 parent 230ff17 commit 6a98465

3 files changed

Lines changed: 99 additions & 12 deletions

dflash/src/internal.h

Lines changed: 20 additions & 0 deletions
@@ -71,6 +71,26 @@ struct TargetLayer {
     ggml_tensor * ssm_dt_bias = nullptr; // [dt_rank] per-head alpha bias
     ggml_tensor * ssm_norm    = nullptr; // [head_v_dim]
     ggml_tensor * ssm_out     = nullptr; // output projection after delta-net
+
+    // NVFP4 per-tensor weight scales (optional; 1.0f = no scaling).
+    // Each corresponds to a weight tensor above: result = mul_mat(w, x) * scale.
+    // Stored as host-side floats (read from the GGUF at load time) and applied
+    // via ggml_scale() — a compile-time scalar multiply with zero extra kernel
+    // launches, unlike ggml_mul() with a [1]-shaped GPU tensor which adds 768
+    // kernel launches per forward pass and causes catastrophic overhead in
+    // batched DDTree verify mode.
+    float w_gate_s    = 1.0f;
+    float w_up_s      = 1.0f;
+    float w_down_s    = 1.0f;
+    float wq_s        = 1.0f;
+    float wk_s        = 1.0f;
+    float wv_s        = 1.0f;
+    float wo_s        = 1.0f;
+    float wqkv_s      = 1.0f;
+    float wqkv_gate_s = 1.0f;
+    float ssm_beta_s  = 1.0f;
+    float ssm_alpha_s = 1.0f;
+    float ssm_out_s   = 1.0f;
 };
 
 // CPU-side embedder: keeps a mmap of the GGUF alive and knows how to

dflash/src/qwen35/gguf_target_loader.cpp

Lines changed: 58 additions & 0 deletions
@@ -474,6 +474,8 @@ bool load_target_gguf_partial(const std::string & path,
         L.ssm_norm = fnd("ssm_norm.weight");
         L.ssm_out  = fnd("ssm_out.weight");
 
+        // NVFP4 per-tensor weight scales are read after the mmap is loaded (below).
+
         // Sanity: each layer must be EITHER full-attn OR deltanet, not both, not neither.
         const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm;
         const bool has_ssm  = L.wqkv && L.wqkv_gate && L.ssm_conv1d && L.ssm_out;
@@ -575,6 +577,62 @@
         total += sz;
     }
 
+    // ── 4b. Read NVFP4 per-tensor weight scales (optional; 1.0 for non-NVFP4).
+    //
+    // Scale tensors are F32 shape [1] — a single float per matmul weight.
+    // We read the value from mmap into host-side floats so the graph builder
+    // can use ggml_scale() (compile-time scalar, zero kernel launches) instead
+    // of ggml_mul() with a [1]-shaped GPU tensor. The ggml_mul approach adds
+    // 768 kernel launches per forward pass and causes catastrophic overhead
+    // (~1000ms vs ~30ms) in batched DDTree verify mode.
+    //
+    // LibertAI convention: "blk.N.ffn_gate.scale"
+    // Heretic convention:  "blk.N.ffn_gate.weight.scale"
+    {
+        auto read_scale = [&](int il, const char * base) -> float {
+            char sname[128];
+            // Try "base.scale" first (LibertAI), then "base.weight.scale" (heretic)
+            std::snprintf(sname, sizeof(sname), "blk.%d.%s.scale", il, base);
+            int64_t stid = gguf_find_tensor(gctx, sname);
+            if (stid < 0) {
+                std::snprintf(sname, sizeof(sname), "blk.%d.%s.weight.scale", il, base);
+                stid = gguf_find_tensor(gctx, sname);
+            }
+            if (stid < 0) return 1.0f;
+            const size_t soff = data_start + gguf_get_tensor_offset(gctx, stid);
+            if (soff + sizeof(float) > mm.len) return 1.0f;
+            float val;
+            std::memcpy(&val, (const uint8_t *)mm.addr + soff, sizeof(float));
+            return val;
+        };
+
+        int n_scales = 0;
+        for (int il = 0; il < (int)n_layer; il++) {
+            TargetLayer & L = out.layers[il];
+            L.w_gate_s    = read_scale(il, "ffn_gate");
+            L.w_up_s      = read_scale(il, "ffn_up");
+            L.w_down_s    = read_scale(il, "ffn_down");
+            L.wq_s        = read_scale(il, "attn_q");
+            L.wk_s        = read_scale(il, "attn_k");
+            L.wv_s        = read_scale(il, "attn_v");
+            L.wo_s        = read_scale(il, "attn_output");
+            L.wqkv_s      = read_scale(il, "attn_qkv");
+            L.wqkv_gate_s = read_scale(il, "attn_gate");
+            L.ssm_beta_s  = read_scale(il, "ssm_beta");
+            L.ssm_alpha_s = read_scale(il, "ssm_alpha");
+            L.ssm_out_s   = read_scale(il, "ssm_out");
+            // Count non-trivial scales for the summary message.
+            auto count_s = [&](float s) { if (s != 1.0f) n_scales++; };
+            count_s(L.w_gate_s); count_s(L.w_up_s); count_s(L.w_down_s);
+            count_s(L.wq_s); count_s(L.wk_s); count_s(L.wv_s);
+            count_s(L.wo_s); count_s(L.wqkv_s); count_s(L.wqkv_gate_s);
+            count_s(L.ssm_beta_s); count_s(L.ssm_alpha_s); count_s(L.ssm_out_s);
+        }
+        if (n_scales > 0) {
+            std::printf("[loader] read %d NVFP4 per-tensor scale2 values (host-side, using ggml_scale)\n", n_scales);
+        }
+    }
+
     gguf_free(gctx);
 
     if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
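A quick way to check which scale-naming convention a given GGUF uses is to list its tensor names. A minimal standalone sketch against the public gguf API (not part of this commit; it assumes the gguf.h header shipped in the llama.cpp tree):

    #include "gguf.h"
    #include <cstdio>
    #include <cstring>

    int main(int argc, char ** argv) {
        if (argc != 2) { std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }

        // Metadata-only open: no tensor data is loaded into memory.
        gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        gguf_context * gctx = gguf_init_from_file(argv[1], params);
        if (!gctx) { std::fprintf(stderr, "failed to open %s\n", argv[1]); return 1; }

        // Print every tensor whose name ends in ".scale": LibertAI-style names
        // look like blk.0.ffn_gate.scale, Heretic-style like
        // blk.0.ffn_gate.weight.scale.
        const int64_t n = gguf_get_n_tensors(gctx);
        for (int64_t i = 0; i < n; i++) {
            const char * name = gguf_get_tensor_name(gctx, i);
            const size_t len = std::strlen(name);
            if (len > 6 && std::strcmp(name + len - 6, ".scale") == 0) {
                std::printf("%s\n", name);
            }
        }
        gguf_free(gctx);
        return 0;
    }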

dflash/src/qwen35/qwen35_target_graph.cpp

Lines changed: 21 additions & 12 deletions
@@ -416,13 +416,21 @@ static ggml_tensor * rms_norm_mul(ggml_context * ctx, ggml_tensor * x,
     return ggml_mul(ctx, n, weight);
 }
 
+// NVFP4 scale2: if weight has a per-tensor scale, multiply the matmul result
+// by that scale. No-op when scale==1.0f (non-NVFP4 models).
+static ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result,
+                                  float scale) {
+    if (scale == 1.0f) return mm_result;
+    return ggml_scale(ctx, mm_result, scale);
+}
+
 static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
                                       const TargetLayer & L) {
-    ggml_tensor * gate = ggml_mul_mat(ctx, L.w_gate, cur); // [inter, n_tokens]
+    ggml_tensor * gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_gate, cur), L.w_gate_s); // [inter, n_tokens]
     gate = ggml_silu(ctx, gate);
-    ggml_tensor * up = ggml_mul_mat(ctx, L.w_up, cur);
+    ggml_tensor * up = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_up, cur), L.w_up_s);
     ggml_tensor * gu = ggml_mul(ctx, gate, up);
-    return ggml_mul_mat(ctx, L.w_down, gu); // [hidden, n_tokens]
+    return apply_scale2(ctx, ggml_mul_mat(ctx, L.w_down, gu), L.w_down_s); // [hidden, n_tokens]
 }
 
 // Full-attention block (matches llama.cpp's build_layer_attn for qwen35)
@@ -456,7 +464,7 @@ static ggml_tensor * build_full_attn_block(
     const int n_head_kv = w.n_head_kv;
     const int q_dim = head_dim * n_head;
     // ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens]
-    ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur);
+    ggml_tensor * QG = apply_scale2(ctx, ggml_mul_mat(ctx, L.wq, cur), L.wq_s);
     // Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves
     QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens);
 
@@ -478,8 +486,8 @@
     gate = ggml_cont_2d(ctx, gate, q_dim, n_tokens); // [q_dim, n_tokens]
 
     // ── K and V projections
-    ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens]
-    ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, n_tokens]
+    ggml_tensor * Kcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wk, cur), L.wk_s);
+    ggml_tensor * Vcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wv, cur), L.wv_s);
 
     Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens);
     Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, w.rms_eps);
@@ -610,7 +618,7 @@ static ggml_tensor * build_full_attn_block(
     attn = ggml_mul(ctx, attn, gate_sig);
 
     // ── Output projection
-    attn = ggml_mul_mat(ctx, L.wo, attn); // [hidden, n_tokens]
+    attn = apply_scale2(ctx, ggml_mul_mat(ctx, L.wo, attn), L.wo_s);
     return attn;
 }
 
@@ -645,22 +653,22 @@ static ggml_tensor * build_delta_net_block(
     const int n_seq_tokens = n_tokens;
 
     // ── qkv_mixed = wqkv @ cur [10240, n_tokens]
-    ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur);
+    ggml_tensor * qkv_mixed = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv, cur), L.wqkv_s);
     qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);
 
     // ── z = wqkv_gate @ cur [inner, n_tokens]
-    ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur);
+    ggml_tensor * z = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv_gate, cur), L.wqkv_gate_s);
 
     // ── beta = ssm_beta @ cur [dt_rank, n_tokens]
-    ggml_tensor * beta = ggml_mul_mat(ctx, L.ssm_beta, cur);
+    ggml_tensor * beta = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_beta, cur), L.ssm_beta_s);
     beta = ggml_reshape_4d(ctx, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
     beta = ggml_sigmoid(ctx, beta);
 
     // ── alpha = ssm_alpha @ cur [dt_rank, n_tokens]
     //    alpha = alpha + ssm_dt_bias (per-head bias)
     //    alpha = softplus(alpha)
     //    g = alpha * ssm_a (-A_log.exp() * softplus)
-    ggml_tensor * alpha = ggml_mul_mat(ctx, L.ssm_alpha, cur);
+    ggml_tensor * alpha = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_alpha, cur), L.ssm_alpha_s);
     alpha = ggml_reshape_3d(ctx, alpha, num_v_heads, n_seq_tokens, n_seqs);
     alpha = ggml_add(ctx, alpha, L.ssm_dt_bias);
     alpha = ggml_softplus(ctx, alpha);
@@ -885,7 +893,7 @@ static ggml_tensor * build_delta_net_block(
                                  head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
 
     // Output projection
-    ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat);
+    ggml_tensor * out = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_out, flat), L.ssm_out_s);
     out = ggml_reshape_2d(ctx, out, w.n_embd, n_seq_tokens * n_seqs);
     return out;
 }
@@ -1462,4 +1470,5 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
     return true;
 }
 
+
 } // namespace dflash27b
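The structural difference behind apply_scale2() is easy to see on a toy graph: a ggml_scale node carries its scalar in the node's op params and has no second source tensor, while ggml_mul carries the [1]-shaped scale as a real operand. A small self-contained check (again a sketch, not part of the commit; assumes the ggml.h header from the llama.cpp tree):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        ggml_init_params ip = { /*mem_size =*/ 16 * 1024 * 1024,
                                /*mem_buffer =*/ nullptr,
                                /*no_alloc =*/ false };
        ggml_context * ctx = ggml_init(ip);

        ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_tensor * s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

        ggml_tensor * a = ggml_scale(ctx, x, 0.0312f); // scalar folded into op params
        ggml_tensor * b = ggml_mul(ctx, x, s);         // scale is a second operand

        // a->src[1] is NULL; b->src[1] points at s, the extra operand that
        // becomes an extra broadcast-multiply launch on the GPU backend.
        std::printf("ggml_scale src1 = %p\n", (void *) a->src[1]);
        std::printf("ggml_mul   src1 = %p\n", (void *) b->src[1]);

        ggml_free(ctx);
        return 0;
    }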
