
Commit 94a4dfd

feat(dflash): add NVFP4 per-tensor scale2 support
Add support for NVFP4-quantized GGUF models (e.g. LibertAI Qwen3.6-27B-NVFP4) by
loading per-tensor weight scales and applying them in the target graph.

Scale values are read as host-side floats from the GGUF mmap at load time and
applied via ggml_scale() — a compile-time scalar multiply with zero extra kernel
launches. This avoids ggml_mul() with [1]-shaped GPU tensors, which adds 768
kernel launches per forward pass and causes ~30x overhead in batched DDTree
verify mode (1001ms with ggml_mul vs 43ms with ggml_scale per step on RTX 5090).

Supports both naming conventions:
- LibertAI: blk.N.ffn_gate.scale
- Heretic:  blk.N.ffn_gate.weight.scale

Non-NVFP4 models (Q4_K_M, etc.) are unaffected — scale fields default to 1.0f
and apply_scale2() returns early with zero overhead.

Also removes the DFLASH27B_USE_BLACKWELL_CONSUMER_FIX CMake workaround, which
incorrectly assumed consumer Blackwell GPUs (RTX 5090) lack FP4 MMA
instructions. The RTX 5090 fully supports sm_120a and native FP4 tensor cores.

Note: full native FP4 MMA performance requires upstream PR ggml-org#22196 to be
merged into the Luce-Org llama.cpp submodule fork. Without it, NVFP4 models
still work correctly via the generic dequant-to-Q8_1 fallback path.
1 parent abdde79 commit 94a4dfd

4 files changed

Lines changed: 98 additions & 40 deletions
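
As a sketch of the core idea (illustrative only; the commit's actual helper is
apply_scale2() in qwen35_target_graph.cpp below), here are the two graph
constructions the commit message contrasts:

```cpp
// Illustrative sketch, not code from this commit.

// (a) [1]-shaped GPU tensor + ggml_mul: the scale becomes a broadcast
//     elementwise-multiply node in the graph, i.e. one extra kernel launch
//     for every scaled matmul on every forward pass.
ggml_tensor * scale_via_mul(ggml_context * ctx, ggml_tensor * mm,
                            ggml_tensor * scale_1d /* F32, shape [1] */) {
    return ggml_mul(ctx, mm, scale_1d);
}

// (b) Host-side float + ggml_scale: the scalar is baked into the node's op
//     parameters at graph-build time, and a scale of 1.0f is skipped
//     entirely, so non-NVFP4 models build exactly the same graph as before.
ggml_tensor * scale_via_const(ggml_context * ctx, ggml_tensor * mm, float s) {
    return (s == 1.0f) ? mm : ggml_scale(ctx, mm, s);
}
```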


dflash/CMakeLists.txt

Lines changed: 0 additions & 28 deletions
```diff
@@ -69,34 +69,6 @@ else()
   endif()
 endif()
 
-# Consumer Blackwell workaround: skip sm_12x→sm_12xa replacement and FP4
-# mmq kernels that can trigger illegal-instruction faults on consumer chips.
-# By default, auto-enable when the resolved CUDA arch list includes a 12x
-# entry. Set DFLASH27B_USE_BLACKWELL_CONSUMER_FIX=ON to force this behavior
-# explicitly (for cross-compiles or custom arch lists).
-option(DFLASH27B_USE_BLACKWELL_CONSUMER_FIX
-  "Enable ggml consumer-Blackwell workaround (skip sm_12x→sm_12xa, exclude FP4 mmq kernels)" OFF)
-if(DFLASH27B_USE_BLACKWELL_CONSUMER_FIX)
-  set(_dflash_is_consumer_blackwell ON)
-endif()
-
-if(NOT DEFINED _dflash_is_consumer_blackwell)
-  set(_dflash_is_consumer_blackwell OFF)
-  # Iterate the resolved dflash27b arch list, not raw CMAKE_CUDA_ARCHITECTURES,
-  # which is empty on the default path (the project supplies its own list above).
-  foreach(_arch IN LISTS _dflash27b_archs)
-    string(REGEX REPLACE "[^0-9]" "" _dflash_arch_num "${_arch}")
-    if(_dflash_arch_num MATCHES "^12[0-9]$")
-      set(_dflash_is_consumer_blackwell ON)
-      break()
-    endif()
-  endforeach()
-endif()
-
-if(_dflash_is_consumer_blackwell)
-  set(GGML_CUDA_BLACKWELL_CONSUMER ON CACHE BOOL
-    "Skip sm_12X→sm_12Xa for consumer Blackwell (no FP4)" FORCE)
-endif()
 # Use only the ggml subtree of llama.cpp (skip libllama).
 add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)
```

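Context for the removal (not part of the diff): consumer Blackwell parts report
compute capability 12.0, and the sm_120a target does include native FP4 MMA,
which is why the auto-enabled workaround was wrong. A minimal host-side check
of what a given card reports, as an illustrative sketch:

```cpp
// Illustrative sketch, not part of this commit: print the device's compute
// capability. An RTX 5090 reports 12.0 (the sm_120 family); building with a
// CUDA arch list containing "120a" enables the arch-specific instructions,
// including native FP4 tensor-core MMA.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::fprintf(stderr, "no CUDA device found\n");
        return 1;
    }
    std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}
```
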
dflash/src/gguf_target_loader.cpp

Lines changed: 58 additions & 0 deletions
```diff
@@ -471,6 +471,8 @@ bool load_target_gguf_partial(const std::string & path,
         L.ssm_norm = fnd("ssm_norm.weight");
         L.ssm_out  = fnd("ssm_out.weight");
 
+        // NVFP4 per-tensor weight scales are read after the mmap is loaded (below).
+
         // Sanity: each layer must be EITHER full-attn OR deltanet, not both, not neither.
         const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm;
         const bool has_ssm  = L.wqkv && L.wqkv_gate && L.ssm_conv1d && L.ssm_out;
@@ -572,6 +574,62 @@
         total += sz;
     }
 
+    // ── 4b. Read NVFP4 per-tensor weight scales (optional; 1.0 for non-NVFP4).
+    //
+    // Scale tensors are F32 shape [1] — a single float per matmul weight.
+    // We read the value from mmap into host-side floats so the graph builder
+    // can use ggml_scale() (compile-time scalar, zero kernel launches) instead
+    // of ggml_mul() with a [1]-shaped GPU tensor. The ggml_mul approach adds
+    // 768 kernel launches per forward pass and causes catastrophic overhead
+    // (~1000ms vs ~30ms) in batched DDTree verify mode.
+    //
+    // LibertAI convention: "blk.N.ffn_gate.scale"
+    // Heretic convention:  "blk.N.ffn_gate.weight.scale"
+    {
+        auto read_scale = [&](int il, const char * base) -> float {
+            char sname[128];
+            // Try "base.scale" first (LibertAI), then "base.weight.scale" (Heretic).
+            std::snprintf(sname, sizeof(sname), "blk.%d.%s.scale", il, base);
+            int64_t stid = gguf_find_tensor(gctx, sname);
+            if (stid < 0) {
+                std::snprintf(sname, sizeof(sname), "blk.%d.%s.weight.scale", il, base);
+                stid = gguf_find_tensor(gctx, sname);
+            }
+            if (stid < 0) return 1.0f;
+            const size_t soff = data_start + gguf_get_tensor_offset(gctx, stid);
+            if (soff + sizeof(float) > mm.len) return 1.0f;
+            float val;
+            std::memcpy(&val, (const uint8_t *)mm.addr + soff, sizeof(float));
+            return val;
+        };
+
+        int n_scales = 0;
+        for (int il = 0; il < (int)n_layer; il++) {
+            TargetLayer & L = out.layers[il];
+            L.w_gate_s    = read_scale(il, "ffn_gate");
+            L.w_up_s      = read_scale(il, "ffn_up");
+            L.w_down_s    = read_scale(il, "ffn_down");
+            L.wq_s        = read_scale(il, "attn_q");
+            L.wk_s        = read_scale(il, "attn_k");
+            L.wv_s        = read_scale(il, "attn_v");
+            L.wo_s        = read_scale(il, "attn_output");
+            L.wqkv_s      = read_scale(il, "attn_qkv");
+            L.wqkv_gate_s = read_scale(il, "attn_gate");
+            L.ssm_beta_s  = read_scale(il, "ssm_beta");
+            L.ssm_alpha_s = read_scale(il, "ssm_alpha");
+            L.ssm_out_s   = read_scale(il, "ssm_out");
+            // Count non-trivial scales for the summary message.
+            auto count_s = [&](float s) { if (s != 1.0f) n_scales++; };
+            count_s(L.w_gate_s); count_s(L.w_up_s); count_s(L.w_down_s);
+            count_s(L.wq_s); count_s(L.wk_s); count_s(L.wv_s);
+            count_s(L.wo_s); count_s(L.wqkv_s); count_s(L.wqkv_gate_s);
+            count_s(L.ssm_beta_s); count_s(L.ssm_alpha_s); count_s(L.ssm_out_s);
+        }
+        if (n_scales > 0) {
+            std::printf("[loader] read %d NVFP4 per-tensor scale2 values (host-side, using ggml_scale)\n", n_scales);
+        }
+    }
+
     gguf_free(gctx);
 
     if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
```
dflash/src/internal.h

Lines changed: 20 additions & 0 deletions
```diff
@@ -71,6 +71,26 @@ struct TargetLayer {
     ggml_tensor * ssm_dt_bias = nullptr; // [dt_rank] per-head alpha bias
     ggml_tensor * ssm_norm    = nullptr; // [head_v_dim]
     ggml_tensor * ssm_out     = nullptr; // output projection after delta-net
+
+    // NVFP4 per-tensor weight scales (optional; 1.0f = no scaling).
+    // Each corresponds to a weight tensor above: result = mul_mat(w, x) * scale.
+    // Stored as host-side floats (read from the GGUF at load time) and applied
+    // via ggml_scale() — a compile-time scalar multiply with zero extra kernel
+    // launches, unlike ggml_mul() with a [1]-shaped GPU tensor which adds 768
+    // kernel launches per forward pass and causes catastrophic overhead in
+    // batched DDTree verify mode.
+    float w_gate_s    = 1.0f;
+    float w_up_s      = 1.0f;
+    float w_down_s    = 1.0f;
+    float wq_s        = 1.0f;
+    float wk_s        = 1.0f;
+    float wv_s        = 1.0f;
+    float wo_s        = 1.0f;
+    float wqkv_s      = 1.0f;
+    float wqkv_gate_s = 1.0f;
+    float ssm_beta_s  = 1.0f;
+    float ssm_alpha_s = 1.0f;
+    float ssm_out_s   = 1.0f;
 };
 
 // CPU-side embedder: keeps a mmap of the GGUF alive and knows how to
```

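The "result = mul_mat(w, x) * scale" comment works because a per-tensor scalar
commutes with the matmul: scaling the output is mathematically identical to
scaling the weights, not an approximation. A tiny self-contained check of that
identity (illustrative values):

```cpp
#include <cstdio>

// Check that (s * W) @ x == s * (W @ x) for a 2x2 example, the identity that
// lets the scale be applied to the small matmul output instead of the large
// quantized weight tensor.
int main() {
    const float W[2][2] = {{1, 2}, {3, 4}};
    const float x[2]    = {5, 6};
    const float s       = 0.25f;
    for (int i = 0; i < 2; i++) {
        const float wx  = W[i][0] * x[0] + W[i][1] * x[1];             // (W @ x)[i]
        const float swx = (s * W[i][0]) * x[0] + (s * W[i][1]) * x[1]; // ((s*W) @ x)[i]
        std::printf("row %d: s*(Wx) = %g, (s*W)x = %g\n", i, s * wx, swx);
    }
    return 0;
}
```
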
dflash/src/qwen35_target_graph.cpp

Lines changed: 20 additions & 12 deletions
```diff
@@ -678,6 +678,14 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
 
 // ─── Helpers ─────────────────────────────────────────────────────────
 
+// NVFP4 scale2: if the weight has a per-tensor scale, multiply the matmul
+// result by that scale. No-op when the scale is 1.0f (non-NVFP4 models).
+static ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result,
+                                  float scale) {
+    if (scale == 1.0f) return mm_result;
+    return ggml_scale(ctx, mm_result, scale);
+}
+
 static ggml_tensor * rms_norm_mul(ggml_context * ctx, ggml_tensor * x,
                                   ggml_tensor * weight, float eps) {
     ggml_tensor * n = ggml_rms_norm(ctx, x, eps);
@@ -686,10 +694,10 @@ static ggml_tensor * rms_norm_mul(ggml_context * ctx, ggml_tensor * x,
 
 static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
                                       const TargetLayer & L) {
-    ggml_tensor * gate = ggml_mul_mat(ctx, L.w_gate, cur); // [inter, n_tokens]
-    ggml_tensor * up   = ggml_mul_mat(ctx, L.w_up, cur);
+    ggml_tensor * gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_gate, cur), L.w_gate_s);
+    ggml_tensor * up   = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_up, cur), L.w_up_s);
     ggml_tensor * gu = ggml_swiglu_split(ctx, gate, up);
-    return ggml_mul_mat(ctx, L.w_down, gu); // [hidden, n_tokens]
+    return apply_scale2(ctx, ggml_mul_mat(ctx, L.w_down, gu), L.w_down_s);
 }
 
 // Full-attention block (matches llama.cpp's build_layer_attn for qwen35)
@@ -721,7 +729,7 @@ static ggml_tensor * build_full_attn_block(
     const int q_dim = n_head * head_dim;
 
     // ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens]
-    ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur);
+    ggml_tensor * QG = apply_scale2(ctx, ggml_mul_mat(ctx, L.wq, cur), L.wq_s);
     // Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves
     QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens);
 
@@ -743,8 +751,8 @@ static ggml_tensor * build_full_attn_block(
     gate = ggml_cont_2d(ctx, gate, q_dim, n_tokens); // [q_dim, n_tokens]
 
     // ── K and V projections
-    ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens]
-    ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, n_tokens]
+    ggml_tensor * Kcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wk, cur), L.wk_s);
+    ggml_tensor * Vcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wv, cur), L.wv_s);
 
     Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens);
     Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, EPS);
@@ -850,7 +858,7 @@ static ggml_tensor * build_full_attn_block(
     attn = ggml_mul(ctx, attn, gate_sig);
 
     // ── Output projection
-    attn = ggml_mul_mat(ctx, L.wo, attn); // [hidden, n_tokens]
+    attn = apply_scale2(ctx, ggml_mul_mat(ctx, L.wo, attn), L.wo_s);
     return attn;
 }
 
@@ -885,22 +893,22 @@ static ggml_tensor * build_delta_net_block(
     const int n_seq_tokens = n_tokens;
 
     // ── qkv_mixed = wqkv @ cur [conv_channels, n_tokens]
-    ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur);
+    ggml_tensor * qkv_mixed = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv, cur), L.wqkv_s);
     qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);
 
     // ── z = wqkv_gate @ cur [inner, n_tokens]
-    ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur);
+    ggml_tensor * z = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv_gate, cur), L.wqkv_gate_s);
 
     // ── beta = ssm_beta @ cur [dt_rank, n_tokens]
-    ggml_tensor * beta = ggml_mul_mat(ctx, L.ssm_beta, cur);
+    ggml_tensor * beta = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_beta, cur), L.ssm_beta_s);
     beta = ggml_reshape_4d(ctx, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
     beta = ggml_sigmoid(ctx, beta);
 
     // ── alpha = ssm_alpha @ cur [dt_rank, n_tokens]
     //    alpha = alpha + ssm_dt_bias (per-head bias)
     //    alpha = softplus(alpha)
     //    g = alpha * ssm_a (-A_log.exp() * softplus)
-    ggml_tensor * alpha = ggml_mul_mat(ctx, L.ssm_alpha, cur);
+    ggml_tensor * alpha = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_alpha, cur), L.ssm_alpha_s);
    alpha = ggml_reshape_3d(ctx, alpha, num_v_heads, n_seq_tokens, n_seqs);
     alpha = ggml_add(ctx, alpha, L.ssm_dt_bias);
     alpha = ggml_softplus(ctx, alpha);
@@ -1131,7 +1139,7 @@ static ggml_tensor * build_delta_net_block(
                               head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
 
     // Output projection
-    ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat);
+    ggml_tensor * out = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_out, flat), L.ssm_out_s);
     out = ggml_reshape_2d(ctx, out, w.n_embd, n_seq_tokens * n_seqs);
     return out;
 }
```

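One way to see the "returns early with zero overhead" claim concretely: with
scale == 1.0f, apply_scale2() adds no node to the compute graph at all, while a
real scale2 adds exactly one GGML_OP_SCALE node per matmul. A standalone
sketch, assuming a recent ggml that exposes ggml_graph_n_nodes():

```cpp
#include "ggml.h"
#include <cstdio>
#include <initializer_list>

// apply_scale2 reproduced from the diff above.
static ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm, float s) {
    if (s == 1.0f) return mm;
    return ggml_scale(ctx, mm, s);
}

int main() {
    ggml_init_params params = { 16u * 1024 * 1024, nullptr, /*no_alloc=*/true };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);

    for (float s : { 1.0f, 0.0312f }) {  // plain model vs an NVFP4-like scale2
        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, apply_scale2(ctx, ggml_mul_mat(ctx, w, x), s));
        std::printf("scale = %g -> %d node(s)\n", s, ggml_graph_n_nodes(gf));
    }
    ggml_free(ctx);  // expected output: 1 node for s == 1.0f, 2 nodes otherwise
    return 0;
}
```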