diff --git a/dflash/src/internal.h b/dflash/src/internal.h
index 7a65b41a..ca92f1b2 100644
--- a/dflash/src/internal.h
+++ b/dflash/src/internal.h
@@ -71,6 +71,26 @@ struct TargetLayer {
     ggml_tensor * ssm_dt_bias = nullptr; // [dt_rank] per-head alpha bias
     ggml_tensor * ssm_norm    = nullptr; // [head_v_dim]
     ggml_tensor * ssm_out     = nullptr; // output projection after delta-net
+
+    // NVFP4 per-tensor weight scales (optional; 1.0f = no scaling).
+    // Each corresponds to a weight tensor above: result = mul_mat(w, x) * scale.
+    // Stored as host-side floats (read from the GGUF at load time) and applied
+    // via ggml_scale() — a graph-build-time scalar multiply with zero extra
+    // kernel launches, unlike ggml_mul() with a [1]-shaped GPU tensor, which
+    // adds 768 kernel launches per forward pass and causes catastrophic
+    // overhead in batched DDTree verify mode.
+    float w_gate_s    = 1.0f;
+    float w_up_s      = 1.0f;
+    float w_down_s    = 1.0f;
+    float wq_s        = 1.0f;
+    float wk_s        = 1.0f;
+    float wv_s        = 1.0f;
+    float wo_s        = 1.0f;
+    float wqkv_s      = 1.0f;
+    float wqkv_gate_s = 1.0f;
+    float ssm_beta_s  = 1.0f;
+    float ssm_alpha_s = 1.0f;
+    float ssm_out_s   = 1.0f;
 };
 
 // CPU-side embedder: keeps a mmap of the GGUF alive and knows how to
diff --git a/dflash/src/qwen35/gguf_target_loader.cpp b/dflash/src/qwen35/gguf_target_loader.cpp
index 5f861982..f5a5f1de 100644
--- a/dflash/src/qwen35/gguf_target_loader.cpp
+++ b/dflash/src/qwen35/gguf_target_loader.cpp
@@ -474,6 +474,8 @@ bool load_target_gguf_partial(const std::string & path,
         L.ssm_norm = fnd("ssm_norm.weight");
         L.ssm_out  = fnd("ssm_out.weight");
 
+        // NVFP4 per-tensor weight scales are read after the mmap is loaded (below).
+
         // Sanity: each layer must be EITHER full-attn OR deltanet, not both, not neither.
         const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm;
         const bool has_ssm  = L.wqkv && L.wqkv_gate && L.ssm_conv1d && L.ssm_out;
@@ -575,6 +577,62 @@ bool load_target_gguf_partial(const std::string & path,
         total += sz;
     }
 
+    // ── 4b. Read NVFP4 per-tensor weight scales (optional; 1.0 for non-NVFP4).
+    //
+    // Scale tensors are F32 shape [1] — a single float per matmul weight.
+    // We read the value from mmap into host-side floats so the graph builder
+    // can use ggml_scale() (a graph-build-time scalar, zero kernel launches)
+    // instead of ggml_mul() with a [1]-shaped GPU tensor. The ggml_mul
+    // approach adds 768 kernel launches per forward pass and causes
+    // catastrophic overhead (~1000ms vs ~30ms) in batched DDTree verify mode.
+    //
+    // LibertAI convention: "blk.N.ffn_gate.scale"
+    // Heretic convention:  "blk.N.ffn_gate.weight.scale"
+    {
+        auto read_scale = [&](int il, const char * base) -> float {
+            char sname[128];
+            // Try "base.scale" first (LibertAI), then "base.weight.scale" (Heretic).
+            std::snprintf(sname, sizeof(sname), "blk.%d.%s.scale", il, base);
+            int64_t stid = gguf_find_tensor(gctx, sname);
+            if (stid < 0) {
+                std::snprintf(sname, sizeof(sname), "blk.%d.%s.weight.scale", il, base);
+                stid = gguf_find_tensor(gctx, sname);
+            }
+            if (stid < 0) return 1.0f;
+            const size_t soff = data_start + gguf_get_tensor_offset(gctx, stid);
+            if (soff + sizeof(float) > mm.len) return 1.0f;
+            float val;
+            std::memcpy(&val, (const uint8_t *)mm.addr + soff, sizeof(float));
+            return val;
+        };
+
+        int n_scales = 0;
+        for (int il = 0; il < (int)n_layer; il++) {
+            TargetLayer & L = out.layers[il];
+            L.w_gate_s    = read_scale(il, "ffn_gate");
+            L.w_up_s      = read_scale(il, "ffn_up");
+            L.w_down_s    = read_scale(il, "ffn_down");
+            L.wq_s        = read_scale(il, "attn_q");
+            L.wk_s        = read_scale(il, "attn_k");
+            L.wv_s        = read_scale(il, "attn_v");
+            L.wo_s        = read_scale(il, "attn_output");
+            L.wqkv_s      = read_scale(il, "attn_qkv");
+            L.wqkv_gate_s = read_scale(il, "attn_gate");
+            L.ssm_beta_s  = read_scale(il, "ssm_beta");
+            L.ssm_alpha_s = read_scale(il, "ssm_alpha");
+            L.ssm_out_s   = read_scale(il, "ssm_out");
+            // Count non-trivial scales for the summary message.
+            auto count_s = [&](float s) { if (s != 1.0f) n_scales++; };
+            count_s(L.w_gate_s);   count_s(L.w_up_s);      count_s(L.w_down_s);
+            count_s(L.wq_s);       count_s(L.wk_s);        count_s(L.wv_s);
+            count_s(L.wo_s);       count_s(L.wqkv_s);      count_s(L.wqkv_gate_s);
+            count_s(L.ssm_beta_s); count_s(L.ssm_alpha_s); count_s(L.ssm_out_s);
+        }
+        if (n_scales > 0) {
+            std::printf("[loader] read %d NVFP4 per-tensor scale2 values (host-side, using ggml_scale)\n", n_scales);
+        }
+    }
+
     gguf_free(gctx);
 
     if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
diff --git a/dflash/src/qwen35/qwen35_target_graph.cpp b/dflash/src/qwen35/qwen35_target_graph.cpp
index 239ec1f4..3d55417f 100644
--- a/dflash/src/qwen35/qwen35_target_graph.cpp
+++ b/dflash/src/qwen35/qwen35_target_graph.cpp
@@ -416,13 +416,21 @@ static ggml_tensor * rms_norm_mul(ggml_context * ctx, ggml_tensor * x,
     return ggml_mul(ctx, n, weight);
 }
 
+// NVFP4 scale2: if a weight has a per-tensor scale, multiply the matmul result
+// by that scale. No-op when scale == 1.0f (non-NVFP4 models).
+static ggml_tensor * apply_scale2(ggml_context * ctx, ggml_tensor * mm_result,
+                                  float scale) {
+    if (scale == 1.0f) return mm_result;
+    return ggml_scale(ctx, mm_result, scale);
+}
+
 static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur,
                                       const TargetLayer & L) {
-    ggml_tensor * gate = ggml_mul_mat(ctx, L.w_gate, cur); // [inter, n_tokens]
+    ggml_tensor * gate = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_gate, cur), L.w_gate_s); // [inter, n_tokens]
     gate = ggml_silu(ctx, gate);
-    ggml_tensor * up = ggml_mul_mat(ctx, L.w_up, cur);
+    ggml_tensor * up = apply_scale2(ctx, ggml_mul_mat(ctx, L.w_up, cur), L.w_up_s);
     ggml_tensor * gu = ggml_mul(ctx, gate, up);
-    return ggml_mul_mat(ctx, L.w_down, gu); // [hidden, n_tokens]
+    return apply_scale2(ctx, ggml_mul_mat(ctx, L.w_down, gu), L.w_down_s); // [hidden, n_tokens]
 }
 
 // Full-attention block (matches llama.cpp's build_layer_attn for qwen35)
@@ -456,7 +464,7 @@ static ggml_tensor * build_full_attn_block(
     const int n_head_kv = w.n_head_kv;
     const int q_dim     = head_dim * n_head;
     // ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens]
-    ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur);
+    ggml_tensor * QG = apply_scale2(ctx, ggml_mul_mat(ctx, L.wq, cur), L.wq_s);
     // Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves
     QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens);
@@ -478,8 +486,8 @@ static ggml_tensor * build_full_attn_block(
     gate = ggml_cont_2d(ctx, gate, q_dim, n_tokens); // [q_dim, n_tokens]
 
     // ── K and V projections
-    ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens]
-    ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, n_tokens]
+    ggml_tensor * Kcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wk, cur), L.wk_s);
+    ggml_tensor * Vcur = apply_scale2(ctx, ggml_mul_mat(ctx, L.wv, cur), L.wv_s);
 
     Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens);
     Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, w.rms_eps);
@@ -610,7 +618,7 @@ static ggml_tensor * build_full_attn_block(
     attn = ggml_mul(ctx, attn, gate_sig);
 
     // ── Output projection
-    attn = ggml_mul_mat(ctx, L.wo, attn); // [hidden, n_tokens]
+    attn = apply_scale2(ctx, ggml_mul_mat(ctx, L.wo, attn), L.wo_s);
 
     return attn;
 }
@@ -645,14 +653,14 @@ static ggml_tensor * build_delta_net_block(
     const int n_seq_tokens = n_tokens;
 
     // ── qkv_mixed = wqkv @ cur  [10240, n_tokens]
-    ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur);
+    ggml_tensor * qkv_mixed = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv, cur), L.wqkv_s);
     qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);
 
     // ── z = wqkv_gate @ cur  [inner, n_tokens]
-    ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur);
+    ggml_tensor * z = apply_scale2(ctx, ggml_mul_mat(ctx, L.wqkv_gate, cur), L.wqkv_gate_s);
 
     // ── beta = ssm_beta @ cur  [dt_rank, n_tokens]
-    ggml_tensor * beta = ggml_mul_mat(ctx, L.ssm_beta, cur);
+    ggml_tensor * beta = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_beta, cur), L.ssm_beta_s);
     beta = ggml_reshape_4d(ctx, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
     beta = ggml_sigmoid(ctx, beta);
@@ -660,7 +668,7 @@ static ggml_tensor * build_delta_net_block(
     // alpha = alpha + ssm_dt_bias  (per-head bias)
     // alpha = softplus(alpha)
     // g = alpha * ssm_a  (-A_log.exp() * softplus)
-    ggml_tensor * alpha = ggml_mul_mat(ctx, L.ssm_alpha, cur);
+    ggml_tensor * alpha = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_alpha, cur), L.ssm_alpha_s);
     alpha = ggml_reshape_3d(ctx, alpha, num_v_heads, n_seq_tokens, n_seqs);
     alpha = ggml_add(ctx, alpha, L.ssm_dt_bias);
     alpha = ggml_softplus(ctx, alpha);
@@ -885,7 +893,7 @@ static ggml_tensor * build_delta_net_block(
                               head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
 
     // Output projection
-    ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat);
+    ggml_tensor * out = apply_scale2(ctx, ggml_mul_mat(ctx, L.ssm_out, flat), L.ssm_out_s);
     out = ggml_reshape_2d(ctx, out, w.n_embd, n_seq_tokens * n_seqs);
     return out;
 }
@@ -1462,4 +1470,5 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
     return true;
 }
+
 } // namespace dflash27b
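
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of the two graph constructions the loader comment contrasts, using only the public ggml API on the CPU backend. The point it illustrates is that ggml_scale() takes the host-side float directly when the graph is built, whereas ggml_mul() needs a [1]-shaped F32 tensor that has to be allocated as one more tensor per weight and broadcast against the matmul result. The tensor shapes, the 0.0123f scale value, and all variable names are illustrative only; nothing here is taken from the dflash sources, and the kernel-launch numbers quoted in the patch are the patch author's measurements, not reproduced here.

    // Hedged sketch: the two ways of applying an NVFP4 per-tensor scale to a
    // matmul result. Builds both graph variants; it does not run them.
    // Compile and link against ggml, e.g.: g++ scale_sketch.cpp -lggml
    #include <cstdio>
    #include "ggml.h"

    int main() {
        ggml_init_params ip = { /*.mem_size   =*/ 16u * 1024 * 1024,
                                /*.mem_buffer =*/ nullptr,
                                /*.no_alloc   =*/ false };
        ggml_context * ctx = ggml_init(ip);

        // Toy weight [64 x 128] and activations [64 x 4] (ne0 is the inner dim).
        ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128);
        ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);

        // Variant A (what the patch uses): the scale is a plain float handed to
        // ggml_scale() at graph-build time; no extra tensor exists for it.
        ggml_tensor * y_scale = ggml_scale(ctx, ggml_mul_mat(ctx, w, x), 0.0123f);

        // Variant B (what the patch moves away from): the scale is a [1]-shaped
        // F32 tensor that must be allocated and broadcast-multiplied against the
        // matmul result with ggml_mul().
        ggml_tensor * s     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        ggml_tensor * y_mul = ggml_mul(ctx, ggml_mul_mat(ctx, w, x), s);

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y_scale);
        ggml_build_forward_expand(gf, y_mul);
        std::printf("built both graph variants\n");

        ggml_free(ctx);
        return 0;
    }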
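
A second note, also outside the patch: a reviewer who wants to confirm which tensor-name convention a given GGUF uses can dump the per-tensor scales with the public gguf API alone, reading the 4-byte F32 payload by offset the same way the patch's read_scale() lambda does, but with plain stdio instead of the project's mmap helper. The layer index 0, the tool itself, and the base-name list are illustrative; the two name patterns are the ones the patch probes, and the sketch assumes a ggml tree recent enough to ship the gguf API in gguf.h.

    // Hedged sketch: print the NVFP4 per-tensor scales recorded for layer 0 of a
    // GGUF, trying the "blk.0.<base>.scale" (LibertAI) and
    // "blk.0.<base>.weight.scale" (Heretic) conventions named in the patch.
    #include <cstdio>
    #include "gguf.h"

    // Read one F32 scalar tensor by seeking to its offset in the data section.
    static bool read_scale_f32(const char * path, gguf_context * gctx,
                               const char * tname, float * out) {
        const int64_t tid = gguf_find_tensor(gctx, tname);
        if (tid < 0) return false;
        const size_t off = gguf_get_data_offset(gctx) + gguf_get_tensor_offset(gctx, tid);
        FILE * f = std::fopen(path, "rb");
        if (!f) return false;
        const bool ok = std::fseek(f, (long) off, SEEK_SET) == 0 &&
                        std::fread(out, sizeof(float), 1, f) == 1;
        std::fclose(f);
        return ok;
    }

    int main(int argc, char ** argv) {
        if (argc < 2) { std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }

        gguf_init_params ip = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        gguf_context * gctx = gguf_init_from_file(argv[1], ip);
        if (!gctx) { std::fprintf(stderr, "failed to parse %s\n", argv[1]); return 1; }

        const char * bases[] = { "ffn_gate", "ffn_up", "ffn_down",
                                 "attn_q", "attn_k", "attn_v", "attn_output",
                                 "attn_qkv", "attn_gate",
                                 "ssm_beta", "ssm_alpha", "ssm_out" };
        for (const char * base : bases) {
            char name[128];
            float s = 1.0f; // default when no scale tensor is present
            std::snprintf(name, sizeof(name), "blk.0.%s.scale", base);
            bool found = read_scale_f32(argv[1], gctx, name, &s);
            if (!found) {
                std::snprintf(name, sizeof(name), "blk.0.%s.weight.scale", base);
                found = read_scale_f32(argv[1], gctx, name, &s);
            }
            if (found) std::printf("%-36s = %g\n", name, s);
            else       std::printf("%-36s   (absent, default 1.0)\n", name);
        }

        gguf_free(gctx);
        return 0;
    }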