Luce-Org
diff --git a/‎server/deps/llama.cpp‎ b/‎server/deps/llama.cpp‎
diff --git a/‎server/src/common/dflash_draft_graph.cpp‎
Lines changed: 103 additions & 12 deletions b/‎server/src/common/dflash_draft_graph.cpp‎
Lines changed: 103 additions & 12 deletions
diff --git a/‎server/src/common/dflash_draft_graph.h‎
Lines changed: 2 additions & 1 deletion b/‎server/src/common/dflash_draft_graph.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎server/src/common/dflash_target.h‎
Lines changed: 9 additions & 0 deletions b/‎server/src/common/dflash_target.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎server/src/common/domino_head.cpp‎
Lines changed: 140 additions & 0 deletions b/‎server/src/common/domino_head.cpp‎
Lines changed: 140 additions & 0 deletions
diff --git a/‎server/src/common/domino_head.h‎
Lines changed: 14 additions & 0 deletions b/‎server/src/common/domino_head.h‎
Lines changed: 14 additions & 0 deletions
@@ -24,18 +24,25 @@ static bool draft_has_swa_layers(const DraftWeights & dw) {
 
 // Build draft graph at a given ctx_len into sg. Does NOT touch sg.alloc.
 // mirror_view: if true, uses a view into mirror->target_feat at slot0.
+// pad_masked: true when ctx_len is the padded allocation/topology size of the
+// ctx dimension (>= the real context length). If the draft has any non-SWA
+// layer, a full-layer pad mask input is then created so the graph topology
+// stays stable while the real ctx grows (CUDA-graph replay for the draft forward).
 static bool build_draft_graph_internal(
     StepGraph & sg,
     const DraftWeights & dw,
     ggml_tensor * lm_head,
     int ctx_len,
     const DraftFeatureMirror * mirror,
     int mirror_slot0,
-    bool mirror_view) {
+    bool mirror_view,
+    bool pad_masked = false) {
 
+    const size_t arena_sz = 32u * 1024 * 1024;
+    if (sg.meta_arena.size() < arena_sz) sg.meta_arena.resize(arena_sz);
     ggml_init_params ip{};
-    ip.mem_size   = 256 * 1024 * 1024;
-    ip.mem_buffer = nullptr;
+    ip.mem_size   = sg.meta_arena.size();
+    ip.mem_buffer = sg.meta_arena.data();
     ip.no_alloc   = true;
     sg.ctx = ggml_init(ip);
     if (!sg.ctx) return false;
@@ -86,6 +93,18 @@ static bool build_draft_graph_internal(
         ggml_set_input(sg.attn_mask);
     }
 
+    bool any_full_layer = false;
+    for (int i = 0; i < dw.n_layer; i++)
+        if (!dw.layers[i].is_swa) { any_full_layer = true; break; }
+    sg.pad_mask_full = nullptr;
+    if (pad_masked && any_full_layer) {
+        const int total_k = ctx_len + q_len;
+        const int kv_pad = mask_align_up(total_k, MASK_KV_PAD);
+        sg.pad_mask_full = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_len);
+        ggml_set_name(sg.pad_mask_full, "pad_mask_full");
+        ggml_set_input(sg.pad_mask_full);
+    }
+
     sg.gf = ggml_new_graph_custom(sg.ctx, 4096, false);
 
     DraftGraphInputs gi{};
@@ -96,6 +115,7 @@ static bool build_draft_graph_internal(
     gi.positions_k       = sg.positions_k;
     gi.lm_head           = lm_head;
     gi.causal_mask_swa   = sg.attn_mask;
+    gi.pad_mask_full     = sg.pad_mask_full;
     DraftGraphOutputs go = build_draft_graph(sg.ctx, dw, gi);
     sg.hidden_states = go.hidden_states;
     sg.logits = go.logits;
@@ -123,16 +143,42 @@ bool build_draft_step(
     int ctx_len,
     const DraftFeatureMirror * mirror,
     int committed,
-    int /*ctx_len_max*/) {
+    int /*ctx_len_max*/,
+    bool pad_ctx) {
     step_graph_free(sg);
 
     if (!sg.alloc) {
         sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
     }
 
+    // Padded-ctx mode: build the graph at the 64-aligned ctx size and mask the
+    // pad keys, so the topology (and gallocr layout) stays IDENTICAL across
+    // ~64 tokens of context growth and ggml-cuda can replay the draft forward
+    // as a CUDA graph. Requires masking the pad keys; whether padding is
+    // actually enabled for this call is decided by do_pad below.
+    // Padding is safe as long as no layer does actual SWA WINDOWING at this
+    // context size (windowing would slide the K view into the pad region).
+    // Layers flagged is_swa below the window size just mean "causal noise
+    // mask" and pad fine with the pad rows masked out.
+    const int ctx_pad_cand = (ctx_len + 63) & ~63;
+    const bool swa_windowing = draft_has_swa_layers(dw) && dw.swa_window > 0 &&
+                               ctx_pad_cand > dw.swa_window;
+    const bool do_pad = pad_ctx && !swa_windowing;
+    const int ctx_alloc = do_pad ? ctx_pad_cand : ctx_len;
+    static bool s_pad_logged = false;
+    if (!s_pad_logged) {
+        s_pad_logged = true;
+        std::fprintf(stderr, "[draft-pad] pad_ctx=%d has_swa=%d do_pad=%d ctx_len=%d ctx_alloc=%d\n",
+                     (int)pad_ctx, (int)draft_has_swa_layers(dw), (int)do_pad, ctx_len, ctx_alloc);
+    }
+
     int mirror_slot0 = 0;
-    const bool use_view = mirror &&
+    bool use_view = mirror &&
         draft_feature_mirror_can_view(*mirror, committed, ctx_len, mirror_slot0);
+    if (use_view && do_pad &&
+        mirror_slot0 + ctx_alloc > mirror->cap) {
+        use_view = false;  // padded view would run past the ring
+    }
 
     // If ctx_len exceeds our cached reserve, re-reserve at next 64 boundary.
     // This makes all subsequent alloc_graph calls within the 64-token window
@@ -142,29 +188,74 @@ bool build_draft_step(
         // Build a dummy graph at ctx_padded just for sizing.
         // Use non-view path for reserve (view tensors don't need allocation).
         if (!build_draft_graph_internal(sg, dw, lm_head, ctx_padded,
-                                        nullptr, 0, false)) {
+                                        nullptr, 0, false, do_pad)) {
             return false;
         }
         ggml_gallocr_reserve(sg.alloc, sg.gf);
         sg.alloc_reserved_ctx = ctx_padded;
         step_graph_free(sg);
     }
 
-    // Build real graph at ctx_len for actual computation.
-    if (!build_draft_graph_internal(sg, dw, lm_head, ctx_len,
-                                    mirror, mirror_slot0, use_view)) {
+    // Build real graph. Padded mode: topology at ctx_alloc, real rows = ctx_len.
+    if (!build_draft_graph_internal(sg, dw, lm_head,
+                                    do_pad ? ctx_alloc : ctx_len,
+                                    mirror, mirror_slot0, use_view, do_pad)) {
         return false;
     }
+    sg.ctx_alloc = do_pad ? ctx_alloc : 0;
 
     if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) {
         return false;
     }
 
+    if (do_pad) {
+        const int q_len = dw.block_size;
+        const int total_k = ctx_alloc + q_len;
+        const int kv_pad = mask_align_up(total_k, MASK_KV_PAD);
+        // Full-layer mask: real ctx keys + all noise keys visible (the DFlash
+        // block is non-causal on full layers), pad keys and alignment columns
+        // -inf.
+        static constexpr uint16_t ZERO = 0x0000;
+        static constexpr uint16_t NEG_INF = 0xFC00;
+        std::vector<uint16_t> mask_data((size_t)kv_pad * q_len, NEG_INF);
+        for (int q = 0; q < q_len; q++) {
+            for (int k = 0; k < ctx_len; k++)
+                mask_data[(size_t)q * kv_pad + k] = ZERO;
+            for (int j = 0; j < q_len; j++)
+                mask_data[(size_t)q * kv_pad + (ctx_alloc + j)] = ZERO;
+        }
+        if (sg.pad_mask_full) {
+            ggml_backend_tensor_set(sg.pad_mask_full, mask_data.data(), 0,
+                                    sizeof(uint16_t) * mask_data.size());
+        }
+
+        // The pad rows of the ctx features must be FINITE (they are masked
+        // out, but NaN/Inf would still poison flash-attn). Zero them.
+        if (ctx_alloc > ctx_len) {
+            if (use_view) {
+                const size_t row_bytes = (size_t)mirror->target_feat->nb[1];
+                std::vector<uint8_t> zeros((size_t)(ctx_alloc - ctx_len) * row_bytes, 0);
+                ggml_backend_tensor_set(mirror->target_feat, zeros.data(),
+                                        (size_t)(mirror_slot0 + ctx_len) * row_bytes,
+                                        zeros.size());
+            } else {
+                const size_t row_bytes = (size_t)sg.target_hidden_cat->nb[1];
+                std::vector<uint8_t> zeros((size_t)(ctx_alloc - ctx_len) * row_bytes, 0);
+                ggml_backend_tensor_set(sg.target_hidden_cat, zeros.data(),
+                                        (size_t)ctx_len * row_bytes,
+                                        zeros.size());
+            }
+        }
+    }
+
     // Fill causal mask data for SWA layers (after allocation gives memory to the tensor).
     if (sg.attn_mask) {
         const int q_len = dw.block_size;
-        const bool swa_active = dw.swa_window > 0 && ctx_len > dw.swa_window;
-        const int eff_ctx = swa_active ? dw.swa_window : ctx_len;
+        const bool swa_active = !do_pad && dw.swa_window > 0 && ctx_len > dw.swa_window;
+        // Padded mode: keys span ctx_alloc rows; only the first ctx_len are
+        // real (visible), the pad rows stay -inf. Noise keys sit at ctx_alloc.
+        const int eff_ctx = do_pad ? ctx_alloc : (swa_active ? dw.swa_window : ctx_len);
+        const int vis_ctx = do_pad ? ctx_len : eff_ctx;
         const int eff_total_k = eff_ctx + q_len;
         const int kv_pad = mask_align_up(eff_total_k, MASK_KV_PAD);
 
@@ -175,7 +266,7 @@ bool build_draft_step(
         static constexpr uint16_t NEG_INF = 0xFC00;
         std::vector<uint16_t> mask_data((size_t)kv_pad * q_len, NEG_INF);
         for (int q = 0; q < q_len; q++) {
-            for (int k = 0; k < eff_ctx; k++)
+            for (int k = 0; k < vis_ctx; k++)
                 mask_data[(size_t)q * kv_pad + k] = ZERO;
             for (int j = 0; j <= q; j++)
                 mask_data[(size_t)q * kv_pad + (eff_ctx + j)] = ZERO;
 
@@ -31,6 +31,7 @@ bool build_draft_step(
     int ctx_len,
     const DraftFeatureMirror * mirror = nullptr,
     int committed = 0,
-    int ctx_len_max = 0);
+    int ctx_len_max = 0,
+    bool pad_ctx = false);
 
 }  // namespace dflash::common
@@ -16,6 +16,9 @@
 
 #include "ddtree.h"
 
+struct ggml_tensor;
+struct ggml_backend;
+
 namespace dflash::common {
 
 struct DFlashTarget {
@@ -110,6 +113,12 @@ struct DFlashTarget {
 
+    // Optional GPU handles for the fused domino draft head. A target that
+    // returns non-null for all three enables the single-graph draft-side path.
+    virtual ggml_tensor *  lm_head_tensor()  { return nullptr; }
+    virtual ggml_tensor *  gpu_embd_table()  { return nullptr; }
+    virtual ggml_backend * fused_head_backend() { return nullptr; }
+
     // Embed token IDs using the target's embedding table.
     // Output: `out` must have space for `n * hidden_size()` floats.
     virtual bool embed_tokens(const int32_t * tokens, int n,
                               float * out) const = 0;
 
 
@@ -179,4 +179,144 @@ bool domino_correct_greedy_chain(const DraftWeights & dw,
     return true;
 }
 
+// Fused greedy correction: on success draft_tok = last_tok followed by the
+// q_len-1 corrected tokens; on false (fused path unusable) it is left untouched.
+bool domino_correct_greedy_chain_fused(const DraftWeights & dw,
+                                       ggml_backend_t backend,
+                                       ggml_tensor * lm_head,
+                                       ggml_tensor * embd_table,
+                                       const float * local_hidden,
+                                       int q_len,
+                                       int32_t last_tok,
+                                       std::vector<int32_t> & draft_tok) {
+    if (!dw.domino.enabled || q_len <= 1 || !local_hidden ||
+        !backend || !lm_head || !embd_table) {
+        return false;
+    }
+    const int hidden = dw.n_embd;
+    const int H      = dw.domino.gru_hidden_dim;
+    const int E      = dw.domino.emb_dim;
+    const int n_cand = q_len - 1;
+    const int vocab  = (int)lm_head->ne[1];
+    if (hidden <= 0 || H <= 0 || E <= 0 || vocab <= 0 ||
+        lm_head->ne[0] != hidden) return false;  // ne[0] must match inp_hidden
+    if (dw.domino.vocab_size > 0 && vocab != dw.domino.vocab_size) {
+        static bool s_vocab_warned = false;
+        if (!s_vocab_warned) {
+            s_vocab_warned = true;
+            std::fprintf(stderr,
+                "domino_fused: vocab mismatch lm_head=%d domino=%d; falling back\n",
+                vocab, dw.domino.vocab_size);
+        }
+        return false;
+    }
+
+    static const bool zero_start = std::getenv("DFLASH_DOMINO_ZERO_START") != nullptr;
+
+    const size_t arena_size = ggml_tensor_overhead() * (size_t)(96 + 48 * n_cand) +
+                              ggml_graph_overhead_custom(1024, false) + 4 * 1024 * 1024;
+    static thread_local std::vector<uint8_t> g_arena_fused;
+    if (g_arena_fused.size() < arena_size) g_arena_fused.resize(arena_size);
+
+    ggml_init_params ip{};
+    ip.mem_size   = g_arena_fused.size();
+    ip.mem_buffer = g_arena_fused.data();
+    ip.no_alloc   = true;
+    ggml_context * ctx = ggml_init(ip);
+    if (!ctx) return false;
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false);
+    // Graph inputs: the candidate hidden rows and the seed token id.
+    ggml_tensor * inp_hidden = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden, n_cand);
+    ggml_tensor * inp_seed   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+    ggml_set_input(inp_hidden);
+    ggml_set_input(inp_seed);
+    // Base logits for every candidate in one matmul: [vocab, n_cand].
+    ggml_tensor * base = ggml_mul_mat(ctx, lm_head, inp_hidden);
+    // GRU state starts at dw.domino.start (zeroed if DFLASH_DOMINO_ZERO_START is set).
+    ggml_tensor * state = ggml_reshape_2d(ctx, dw.domino.start, H, 1);
+    if (zero_start) state = ggml_scale(ctx, state, 0.0f);
+    ggml_tensor * prev_embed = ggml_get_rows(ctx, embd_table, inp_seed);  // [hidden,1] f32
+    ggml_set_name(prev_embed, "dom_embed_seed");
+    std::vector<ggml_tensor *> toks((size_t)n_cand, nullptr);
+    for (int i = 0; i < n_cand; ++i) {
+        ggml_tensor * gi = ggml_mul_mat(ctx, dw.domino.gru_w_ih, prev_embed);
+        gi = ggml_add(ctx, gi, ggml_reshape_2d(ctx, dw.domino.gru_b_ih, 3 * H, 1));
+        ggml_tensor * gh = ggml_mul_mat(ctx, dw.domino.gru_w_hh, state);
+        gh = ggml_add(ctx, gh, ggml_reshape_2d(ctx, dw.domino.gru_b_hh, 3 * H, 1));
+        // gi/gh each pack three H-sized gate slices: reset, update, candidate.
+        const size_t gate_bytes = (size_t)H * ggml_element_size(gi);
+        ggml_tensor * i_r = ggml_view_2d(ctx, gi, H, 1, gi->nb[1], 0);
+        ggml_tensor * i_z = ggml_view_2d(ctx, gi, H, 1, gi->nb[1], gate_bytes);
+        ggml_tensor * i_n = ggml_view_2d(ctx, gi, H, 1, gi->nb[1], 2 * gate_bytes);
+        ggml_tensor * h_r = ggml_view_2d(ctx, gh, H, 1, gh->nb[1], 0);
+        ggml_tensor * h_z = ggml_view_2d(ctx, gh, H, 1, gh->nb[1], gate_bytes);
+        ggml_tensor * h_n = ggml_view_2d(ctx, gh, H, 1, gh->nb[1], 2 * gate_bytes);
+        // GRU blend: h_new = cand + update * (state - cand).
+        ggml_tensor * reset  = ggml_sigmoid(ctx, ggml_add(ctx, i_r, h_r));
+        ggml_tensor * update = ggml_sigmoid(ctx, ggml_add(ctx, i_z, h_z));
+        ggml_tensor * cand   = ggml_tanh(ctx, ggml_add(ctx, i_n, ggml_mul(ctx, reset, h_n)));
+        ggml_tensor * h_new  = ggml_add(ctx, cand,
+                                        ggml_mul(ctx, update,
+                                                 ggml_sub(ctx, state, cand)));
+        // Correction head: two-layer MLP over concat(hidden_i, h_new) -> vocab bias.
+        ggml_tensor * hid_i = ggml_view_2d(ctx, inp_hidden, hidden, 1,
+                                           inp_hidden->nb[1],
+                                           (size_t)i * inp_hidden->nb[1]);
+        ggml_tensor * zcat = ggml_concat(ctx, hid_i, h_new, 0);
+        ggml_tensor * bias = ggml_mul_mat(ctx, dw.domino.head_w1, zcat);
+        bias = ggml_add(ctx, bias, ggml_reshape_2d(ctx, dw.domino.head_b1, E, 1));
+        bias = ggml_silu(ctx, bias);
+        bias = ggml_mul_mat(ctx, dw.domino.head_w2, bias);
+        bias = ggml_add(ctx, bias, ggml_reshape_2d(ctx, dw.domino.head_b2, vocab, 1));
+
+        ggml_tensor * base_i = ggml_view_2d(ctx, base, vocab, 1,
+                                            base->nb[1], (size_t)i * base->nb[1]);
+        ggml_tensor * corrected = ggml_add(ctx, base_i, bias);
+        ggml_tensor * tok = ggml_argmax(ctx, corrected);
+        ggml_set_output(tok);
+        ggml_build_forward_expand(gf, tok);
+        toks[(size_t)i] = tok;
+        // Feed the in-graph argmax back as the next step's GRU input embedding.
+        if (i + 1 < n_cand) {
+            prev_embed = ggml_get_rows(ctx, embd_table, tok);
+            ggml_set_name(prev_embed, "dom_embed_tok");
+        }
+        state = h_new;
+    }
+
+    static thread_local ggml_gallocr_t galloc_fused = nullptr;
+    if (!galloc_fused) {
+        galloc_fused = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+    if (!ggml_gallocr_alloc_graph(galloc_fused, gf)) {
+        std::fprintf(stderr, "domino_fused: gallocr_alloc_graph failed\n");
+        ggml_free(ctx);
+        return false;
+    }
+    // local_hidden row 0 is skipped: the n_cand candidate rows start at offset hidden.
+    ggml_backend_tensor_set(inp_hidden, local_hidden + (size_t)hidden, 0,
+                            sizeof(float) * (size_t)hidden * (size_t)n_cand);
+    ggml_backend_tensor_set(inp_seed, &last_tok, 0, sizeof(int32_t));
+    if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+        std::fprintf(stderr, "domino_fused: graph_compute failed\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    draft_tok.assign((size_t)q_len, 0);
+    draft_tok[0] = last_tok;
+    // One synchronize instead of n_cand blocking readbacks. The staging buffer is
+    // sized from n_cand so no candidate is silently left at token 0.
+    std::vector<int32_t> t_out((size_t)n_cand, 0);
+    for (int i = 0; i < n_cand; ++i) {
+        ggml_backend_tensor_get_async(backend, toks[(size_t)i], &t_out[(size_t)i], 0, sizeof(int32_t));
+    }
+    ggml_backend_synchronize(backend);
+    for (int i = 0; i < n_cand; ++i) {
+        draft_tok[(size_t)i + 1] = t_out[(size_t)i];
+    }
+    ggml_free(ctx);
+    return true;
+}
+
 }  // namespace dflash::common
@@ -8,6 +8,20 @@
 
 namespace dflash::common {
 
+// Fused variant: one GPU graph = lm_head projection of the candidate hidden
+// states + unrolled GRU correction chain with in-graph argmax -> get_rows
+// token feedback. Requires the target to expose its lm_head and a GPU (f16)
+// token-embedding table. Runs on a dedicated CUDA backend instance so the
+// ggml-cuda graph cache can replay it across steps.
+bool domino_correct_greedy_chain_fused(const DraftWeights & dw,
+                                       ggml_backend_t backend,
+                                       ggml_tensor * lm_head,
+                                       ggml_tensor * embd_table,
+                                       const float * local_hidden,
+                                       int q_len,
+                                       int32_t last_tok,
+                                       std::vector<int32_t> & draft_tok);
+
 bool domino_correct_greedy_chain(const DraftWeights & dw,
                                  ggml_backend_t backend,
                                  DFlashTarget & target,