
Commit 2b1aaba

[prf/dec] Fix KV-cache propagation bug from prefill to decode path and refactor task graph consumption logic
Introduce `LogitsFP16LayerDecode` with KV-cache pass-through, and switch the Llama FFN layers' cross-graph transfers to the named-source `consumeFromDevice` form (plus KV-cache `persistOnDevice`) so device-pointer propagation works in both CUDA-graph and interpreter modes.
1 parent 6128793 commit 2b1aaba

6 files changed

Lines changed: 187 additions & 31 deletions
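
For orientation, the fix reduces to one recurring pattern: a producer graph persists a buffer under its own name, and the consumer names that producer explicitly when consuming. Below is a minimal two-graph sketch using only TornadoVM API calls that appear in this diff; the graph names, the FloatArray payload, and the fill/scale kernels are illustrative, not from this repository.

import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;

public class NamedConsumeSketch {
    private static void fill(FloatArray a) {
        for (int i = 0; i < a.getSize(); i++) a.set(i, i);
    }

    private static void scale(FloatArray a) {
        for (int i = 0; i < a.getSize(); i++) a.set(i, a.get(i) * 2f);
    }

    public static void main(String[] args) {
        FloatArray buf = new FloatArray(1024);

        // Producer: writes buf on device, then persists it under its own name.
        TaskGraph producer = new TaskGraph("producer")
                .transferToDevice(DataTransferMode.FIRST_EXECUTION, buf)
                .task("fill", NamedConsumeSketch::fill, buf)
                .persistOnDevice(buf);

        // Consumer: the named-source form keys the device-pointer lookup on
        // "producer". The no-arg consumeFromDevice(buf) would key it on
        // "consumer" (this graph's own name), which misses in interpreter
        // mode -- the failure this commit fixes at every graph boundary.
        TaskGraph consumer = new TaskGraph("consumer")
                .consumeFromDevice("producer", buf)
                .task("scale", NamedConsumeSketch::scale, buf)
                .transferToHost(DataTransferMode.EVERY_EXECUTION, buf);

        TornadoExecutionPlan plan =
                new TornadoExecutionPlan(producer.snapshot(), consumer.snapshot());
        plan.withGraph(0).execute();
        plan.withGraph(1).execute();
    }
}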


src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java

Lines changed: 31 additions & 14 deletions
@@ -10,7 +10,7 @@
 import org.beehive.gpullama3.tornadovm.layerplanner.strategy.SchedulerType;
 import org.beehive.gpullama3.tornadovm.layers.type.fp16.decode.LlamaFP16FFNLayersDecode;
 import org.beehive.gpullama3.tornadovm.layers.type.fp16.prefill.LlamaFP16LayersBatchPrefill;
-import org.beehive.gpullama3.tornadovm.layers.type.fp16.LogitsFP16Layer;
+import org.beehive.gpullama3.tornadovm.layers.type.fp16.decode.LogitsFP16LayerDecode;
 import uk.ac.manchester.tornado.api.GridScheduler;
 import uk.ac.manchester.tornado.api.ImmutableTaskGraph;
 import uk.ac.manchester.tornado.api.KernelContext;
@@ -94,8 +94,8 @@ private TornadoVMMasterPlanWithBatchPrefillDecode(LlamaState state, Model model,
 
         // [N+1] Decode activation (with KV-cache pass-through) ────────────────
         KernelContext decodeActCtx = new KernelContext();
-        all.add(buildDecodeActivationGraph(decodeActCtx).snapshot());
-        scheduler.addWorkerGrid("activationUpdate.updateX",
+        all.add(buildDecodeActivationGraph(decodeActCtx, batchLayers.getLastLayerTaskGraphID()).snapshot());
+        scheduler.addWorkerGrid("decodeActivationUpdate.updateX",
                 WorkerGridFactory.genericWorker(config.dim(), 128));
 
         // [N+2..2N+1] Decode layer graphs ────────────────────────────────────
@@ -107,7 +107,10 @@ private TornadoVMMasterPlanWithBatchPrefillDecode(LlamaState state, Model model,
         decodeLayers.updateGridScheduler(scheduler);
 
         // [2N+2] Logits ───────────────────────────────────────────────────────
-        LogitsFP16Layer logitsLayer = new LogitsFP16Layer("logits", state, weights, config,
+        // LogitsFP16LayerDecode extends LogitsFP16Layer: adds consumeFromDevice(wrapKeyCache)
+        // at the start of the graph and persistOnDevice(wrapKeyCache) at the end, so the
+        // KV-cache pointer survives the logits → decode-activation boundary across tokens.
+        LogitsFP16LayerDecode logitsLayer = new LogitsFP16LayerDecode("logits", state, weights, config,
                 decodeLayers.getLastFFNLayerTaskGraphID(), schedulerType);
         all.add(logitsLayer.getImmutableTaskGraph());
         logitsLayer.updateGridScheduler(scheduler);
@@ -123,9 +126,7 @@ private TaskGraph buildBatchPrefillActivationGraph(KernelContext ctx) {
         return new TaskGraph("batchActivation")
                 .transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapXBatch)
                 .transferToDevice(DataTransferMode.EVERY_EXECUTION, state.embeddingXBatch)
-                .task("batchUpdateX",
-                        (KernelContext c, HalfFloatArray src, FloatArray dst) ->
-                                dst.set(c.globalIdx, src.get(c.globalIdx).getFloat32()),
+                .task("batchUpdateX", TransformerComputeKernels::convertFP16toFP32,
                         ctx, state.embeddingXBatch, state.wrapXBatch)
                 .persistOnDevice(state.wrapXBatch);
     }
@@ -139,17 +140,24 @@ private TaskGraph buildBatchPrefillActivationGraph(KernelContext ctx) {
      * Both halves of the chain are required; without the re-persist the pointer is
      * not forwarded in interpreter (non-CUDA-graph) mode.</p>
      */
-    private TaskGraph buildDecodeActivationGraph(KernelContext ctx) {
-        return new TaskGraph("activationUpdate")
-                .consumeFromDevice(state.wrapKeyCache, state.wrapValueCache) // KV pass-through
-                // .transferToDevice(DataTransferMode.EVERY_EXECUTION,
-                //         state.wrapKeyCache,
-                //         state.wrapValueCache)
-                .transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapX)
+    private TaskGraph buildDecodeActivationGraph(KernelContext ctx, String lastBatchLayerID) {
+        // System.out.println("lastBatchLayerID = " + lastBatchLayerID);
+        // System.out.println("[buildDecodeActivationGraph] state.wrapX = " + state.wrapX.toString());
+        // System.out.println("[buildDecodeActivationGraph] state.wrapKeyCache = " + state.wrapKeyCache.toString());
+        // System.out.println("[buildDecodeActivationGraph] state.wrapValueCache = " + state.wrapValueCache.toString());
+        return new TaskGraph("decodeActivationUpdate")
+                .consumeFromDevice(lastBatchLayerID, state.wrapKeyCache, state.wrapValueCache) // KV pass-through
+                //.transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapX, debugKV)
+                //.transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapX)
                 .transferToDevice(DataTransferMode.EVERY_EXECUTION, state.embeddingX)
                 .task("updateX",
                         TransformerComputeKernels::convertFP16toFP32,
                         ctx, (HalfFloatArray) state.embeddingX, state.wrapX)
+                // // DEBUG: snapshot first 8 elements of wrapKeyCache and wrapX for host-side probe
+                // .task("dbgKV",
+                //         TransformerComputeKernels::dbgCopyFirst8,
+                //         state.wrapKeyCache, debugKV)
+                // .transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapX, debugKV)
                 // wrapX persisted for decode layer 0; wrapKeyCache/wrapValueCache
                 // re-persisted so updatePersistedObjectState() propagates the device
                 // pointer to decode layer 0's consumeFromDevice without CUDA graphs.
@@ -197,6 +205,7 @@ private void forceCopyInReadOnlyData() {
         state.batchStartPosHolder.init(0);
 
         for (int i = 0; i <= logitsIdx(); i++) {
+            //System.out.println(i + " " + executionPlan.withGraph(i).toString());
             var g = executionPlan.withGraph(i).withGridScheduler(gridScheduler);
             if (CUDA_GRAPHS) g.withCUDAGraph();
             g.execute();
@@ -268,6 +277,14 @@ public FloatArray tornadoVMForwardDecode(int token, int position, Model model) {
         if (CUDA_GRAPHS) decodeAct.withCUDAGraph();
         //System.err.println("[DEBUG] about to execute decode activation (graph " + decodeActivationIdx() + "--)");
         decodeAct.execute();
+        // DEBUG: print first 4 of wrapX (should be non-zero FP32 embedding) and
+        // first 4 of debugKV (should be non-zero after batch prefill wrote the KV cache)
+        // if (position <= 290) {
+        //     System.err.printf("[DBG pos=%d] wrapX[0..3] = %.4f %.4f %.4f %.4f%n",
+        //             position, state.wrapX.get(0), state.wrapX.get(1), state.wrapX.get(2), state.wrapX.get(3));
+        //     System.err.printf("[DBG pos=%d] debugKV[0..3]= %.4f %.4f %.4f %.4f%n",
+        //             position, debugKV.get(0), debugKV.get(1), debugKV.get(2), debugKV.get(3));
+        // }
 
         // Graphs N+2..2N+1: decode transformer layers
         for (int l = 0; l < N; l++) {
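
The bracketed indices in the plan-construction comments above fix the layout of the unified graph list. A hypothetical helper makes the arithmetic explicit for N transformer layers; the actual class computes these inline (e.g. decodeActivationIdx() and logitsIdx() in the loops above):

// Graph layout of the unified prefill+decode plan, as built above (N = layer count):
//   [0]           "batchActivation"
//   [1..N]        "batchLayer_0" .. "batchLayer_{N-1}"
//   [N+1]         "decodeActivationUpdate"
//   [N+2..2N+1]   "layer_0" .. "layer_{N-1}"
//   [2N+2]        "logits"
final class GraphLayout {
    private final int n;

    GraphLayout(int n) { this.n = n; }

    int batchActivationIdx()  { return 0; }
    int batchLayerIdx(int l)  { return 1 + l; }
    int decodeActivationIdx() { return n + 1; }
    int decodeLayerIdx(int l) { return n + 2 + l; }
    int logitsIdx()           { return 2 * n + 2; }
}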

src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/LlamaFP16FFNLayers.java

Lines changed: 32 additions & 2 deletions
@@ -146,7 +146,17 @@ protected TaskGraph createFFNLayerTaskGraph(int layerIndex) {
         TaskGraph unifiedLayer = new TaskGraph(layerTaskGraphName);
 
         // === Data Setup ===
-        unifiedLayer.consumeFromDevice(state.wrapX);
+        // consumeFromDevice for wrapX: the no-arg form uses the current graph's own name as the
+        // source key, which works in CUDA-graph mode (pointers are frozen) but fails in interpreter
+        // mode (updatePersistedObjectState looks up the predecessor's name, not the current name).
+        // Subclasses that receive wrapX across a graph boundary override predecessorGraphName() to
+        // return the correct predecessor graph name so the XPUBuffer is propagated in both modes.
+        String wrapXSrc = predecessorGraphName(layerIndex);
+        if (wrapXSrc != null) {
+            unifiedLayer.consumeFromDevice(wrapXSrc, state.wrapX);
+        } else {
+            unifiedLayer.consumeFromDevice(state.wrapX);
+        }
         unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION,
                 weights.rms_att_weightLayered[layerIndex].asFloatArray(),
                 weights.wqLayered[layerIndex].asHalfFloatArray(),
@@ -248,11 +258,31 @@ protected TaskGraph createFFNLayerTaskGraph(int layerIndex) {
                 weights.w2Layered[layerIndex].asHalfFloatArray(),
                 config.hiddenDim(), config.dim(), LOCAL_WORK_GROUP_SIZE_ALLOC);
 
-        unifiedLayer.persistOnDevice(state.wrapX);
+        unifiedLayer.persistOnDevice(state.wrapX, state.wrapKeyCache,
+                state.wrapValueCache);
 
         return unifiedLayer;
     }
 
+    /**
+     * Returns the name of the predecessor task graph from which {@code wrapX} should be consumed,
+     * or {@code null} to fall back to the no-arg form (source key = own graph name).
+     *
+     * <p>The no-arg form is safe in CUDA-graph mode (device pointers are frozen at capture time)
+     * but fails in interpreter mode: {@code updatePersistedObjectState} looks up the predecessor's
+     * graph name, not the current graph's name, so the XPUBuffer is never propagated and
+     * {@code executeAlloc} NPEs on a null buffer.</p>
+     *
+     * <p>Override in subclasses that receive {@code wrapX} from a named predecessor graph:</p>
+     * <ul>
+     *   <li>layer 0: return the activation graph name (e.g. {@code "activationUpdate"})</li>
+     *   <li>layer k &gt; 0: return {@code "layer_" + (k-1)}</li>
+     * </ul>
+     */
+    protected String predecessorGraphName(int layerIndex) {
+        return null;
+    }
+
     protected TaskGraph configureLayerDataTransfers(TaskGraph unifiedLayer, int layerIndex) {
         if (layerIndex == 0) {
             // First layer: Transfer initial data to device (one-time transfer)
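
The comments and javadoc above both hinge on how updatePersistedObjectState resolves its source key. Below is a toy model of that resolution, an illustration of the failure mode only: PersistedStateModel, its fields, and the pointer values are hypothetical names, not TornadoVM internals.

import java.util.HashMap;
import java.util.Map;

// Toy model: each graph's persisted buffers are filed under that graph's name,
// so a consumer only finds a device pointer if it looks up the *producer's* name.
public class PersistedStateModel {
    private static final Map<String, Map<Object, Long>> persisted = new HashMap<>();

    static void persistOnDevice(String graphName, Object buffer, long devicePtr) {
        persisted.computeIfAbsent(graphName, k -> new HashMap<>()).put(buffer, devicePtr);
    }

    static Long consumeFromDevice(String sourceKey, Object buffer) {
        Map<Object, Long> m = persisted.get(sourceKey);
        return (m == null) ? null : m.get(buffer); // null models the later NPE in executeAlloc
    }

    public static void main(String[] args) {
        Object wrapX = new Object();
        persistOnDevice("layer_0", wrapX, 0xDEAD_0000L);

        // No-arg form: source key defaults to the consumer's own name -> miss.
        System.out.println(consumeFromDevice("layer_1", wrapX)); // null
        // Named-source form (this commit): key on the predecessor -> hit.
        System.out.println(consumeFromDevice("layer_0", wrapX)); // 3735879680
    }
}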

src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/LogitsFP16Layer.java

Lines changed: 15 additions & 0 deletions
@@ -22,11 +22,25 @@ public LogitsFP16Layer(String name, State state, Weights weights, Configuration
         super(name, state, weights, config, lastTaskGraphID, schedulerType);
     }
 
+    /**
+     * Hook called before any data transfers or tasks. Override to prepend
+     * {@code consumeFromDevice} declarations that must precede the bytecode
+     * (e.g. KV-cache pass-through in the Phase 4 unified plan).
+     */
+    protected void configureAdditionalConsumes(TaskGraph logits) {}
+
+    /**
+     * Hook called after {@code transferToHost}. Override to append
+     * {@code persistOnDevice} declarations (e.g. KV-cache pass-through in Phase 4).
+     */
+    protected void configureAdditionalPersists(TaskGraph logits) {}
+
     // @formatter:off
     @Override
     protected TaskGraph setupLogitsTaskGraph(TornadoWeights weights, Configuration config) {
         var logits = new TaskGraph("logits");
         // === Data Setup ===
+        configureAdditionalConsumes(logits);
         logits.consumeFromDevice(lastTaskGraphID, state.wrapX);
         logits.transferToDevice(DataTransferMode.EVERY_EXECUTION, state.tempLogits);
         logits.transferToDevice(DataTransferMode.FIRST_EXECUTION,
@@ -80,6 +94,7 @@ protected TaskGraph setupLogitsTaskGraph(TornadoWeights weights, Configuration c
 
         // === Transfer Results to Host ===
         logits.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapLogits);
+        configureAdditionalPersists(logits);
         return logits;
     }
     // @formatter:on
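
The two hooks are a small template-method seam: the base class pins the declaration order, and subclasses inject declarations only at the two points the bytecode allows. A standalone sketch of the same shape (all names hypothetical; strings stand in for the real TaskGraph calls):

import java.util.ArrayList;
import java.util.List;

class LogitsSketchBase {
    // Hook points, empty by default (mirrors LogitsFP16Layer above).
    protected void configureAdditionalConsumes(List<String> g) {}
    protected void configureAdditionalPersists(List<String> g) {}

    final List<String> setupLogitsGraph() {
        List<String> g = new ArrayList<>();
        configureAdditionalConsumes(g);            // before any transfers or tasks
        g.add("consumeFromDevice(lastGraph, wrapX)");
        g.add("task(logitsProjection)");
        g.add("transferToHost(wrapLogits)");
        configureAdditionalPersists(g);            // after transferToHost
        return g;
    }
}

class LogitsSketchDecode extends LogitsSketchBase {
    @Override protected void configureAdditionalConsumes(List<String> g) {
        g.add("consumeFromDevice(lastGraph, KV caches)");
    }
    @Override protected void configureAdditionalPersists(List<String> g) {
        g.add("persistOnDevice(KV caches)");
    }

    public static void main(String[] args) {
        new LogitsSketchDecode().setupLogitsGraph().forEach(System.out::println);
        // Prints the KV consume first and the KV persist last -- the ordering
        // the LogitsFP16LayerDecode javadoc below says the bytecode requires.
    }
}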

src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/decode/LlamaFP16FFNLayersDecode.java

Lines changed: 44 additions & 12 deletions
@@ -9,12 +9,22 @@
 import uk.ac.manchester.tornado.api.enums.DataTransferMode;
 
 /**
- * Identical to {@link LlamaFP16FFNLayers} except decode layer 0 uses
- * {@code consumeFromDevice} for the KV cache instead of {@code FIRST_EXECUTION}.
+ * Decode-path FFN layers for the Phase 4 unified plan.
  *
- * <p>This ensures decode layer 0 receives the KV-cache device pointer that was
- * persisted by the last batch prefill layer and passed through the decode
- * activation graph.</p>
+ * <p>Overrides data-transfer declarations so that all cross-graph boundaries use
+ * the explicit-source form of {@code consumeFromDevice}. The no-arg form (used by
+ * the base class) passes the <em>current</em> graph's own name as the source key.
+ * In CUDA-graph mode this is harmless (device pointers are frozen at capture time),
+ * but in interpreter mode {@code updatePersistedObjectState} looks up the
+ * <em>predecessor's</em> name, so the lookup always misses and the XPUBuffer is
+ * never propagated — causing either a null-pointer crash or a silent re-upload
+ * from host (zeros), corrupting the hidden state and KV cache.</p>
+ *
+ * <p>Two boundaries are fixed here:</p>
+ * <ul>
+ *   <li>{@code wrapX}: via {@link #predecessorGraphName} hook in the base class.</li>
+ *   <li>All other consumed objects: via the {@link #configureLayerDataTransfers} override.</li>
+ * </ul>
 */
 public class LlamaFP16FFNLayersDecode extends LlamaFP16FFNLayers {
     public LlamaFP16FFNLayersDecode(String taskGraph, LlamaState state,
@@ -23,24 +33,46 @@ public LlamaFP16FFNLayersDecode(String taskGraph, LlamaState state,
         super(taskGraph, state, weights, config, schedulerType);
     }
 
+    /**
+     * Supplies the correct predecessor graph name for {@code consumeFromDevice(wrapX)}.
+     *
+     * <p>Layer 0 receives {@code wrapX} from the decode activation graph;
+     * layers 1+ receive it from the previous decode layer.
+     * Must match the {@code TaskGraph} names used in
+     * {@code buildDecodeActivationGraph()} and {@code createFFNLayerTaskGraph()}.</p>
+     */
+    @Override
+    protected String predecessorGraphName(int layerIndex) {
+        return (layerIndex == 0) ? "decodeActivationUpdate" : "layer_" + (layerIndex - 1);
+    }
+
     @Override
     protected TaskGraph configureLayerDataTransfers(TaskGraph layer, int layerIndex) {
         if (layerIndex == 0) {
-            // Same as parent layer 0 BUT wrapKeyCache/wrapValueCache come
-            // from device (passed through by the decode activation graph).
+            // Same as parent layer 0, but wrapKeyCache/wrapValueCache come from device
+            // (passed through by the decode activation graph, which relays them from
+            // the last batch prefill layer). No FIRST_EXECUTION for KV cache here.
             layer.transferToDevice(DataTransferMode.EVERY_EXECUTION,
                     state.positionHolder, state.temp, state.tempFFN);
             layer.transferToDevice(DataTransferMode.FIRST_EXECUTION,
                     context,
                     state.wrapXb, state.wrapXb2,
                     state.wrapQ, state.wrapK, state.wrapV,
                     state.wrapAtt, state.wrapHb, state.wrapXbFP16);
-            // KV cache: consume from device (device pointer supplied by
-            // decode activation's pass-through from last batch layer).
-            layer.consumeFromDevice(state.wrapKeyCache, state.wrapValueCache);
+            // Explicit source — must match the TaskGraph name in buildDecodeActivationGraph().
+            layer.consumeFromDevice("decodeActivationUpdate", state.wrapKeyCache, state.wrapValueCache);
         } else {
-            // Identical to parent for layers 1+ (already uses consumeFromDevice).
-            return super.configureLayerDataTransfers(layer, layerIndex);
+            // Layers 1+: use explicit predecessor name for ALL consumed objects.
+            // Calling super here would use the no-arg form (source key = own graph name),
+            // which silently fails in interpreter mode and causes re-upload from host.
+            String pred = "layer_" + (layerIndex - 1);
+            layer.consumeFromDevice(pred,
                    context,
                    state.wrapXb, state.wrapXb2,
                    state.wrapQ, state.wrapK, state.wrapV,
                    state.wrapKeyCache, state.wrapValueCache,
                    state.wrapAtt, state.wrapHb,
                    state.positionHolder, state.wrapXbFP16);
         }
         return layer;
     }
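
The override's naming contract is easy to sanity-check in isolation. A standalone reproduction of predecessorGraphName() (the class and main are scaffolding; the mapping is verbatim from the diff above):

public class PredecessorNameCheck {
    // Verbatim logic from LlamaFP16FFNLayersDecode.predecessorGraphName().
    static String predecessorGraphName(int layerIndex) {
        return (layerIndex == 0) ? "decodeActivationUpdate" : "layer_" + (layerIndex - 1);
    }

    public static void main(String[] args) {
        for (int l = 0; l < 3; l++) {
            System.out.println("layer_" + l + " consumes wrapX from " + predecessorGraphName(l));
        }
        // layer_0 consumes wrapX from decodeActivationUpdate
        // layer_1 consumes wrapX from layer_0
        // layer_2 consumes wrapX from layer_1
    }
}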
src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/decode/LogitsFP16LayerDecode.java

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+package org.beehive.gpullama3.tornadovm.layers.type.fp16.decode;
+
+import org.beehive.gpullama3.inference.state.State;
+import org.beehive.gpullama3.inference.weights.Weights;
+import org.beehive.gpullama3.model.Configuration;
+import org.beehive.gpullama3.tornadovm.layerplanner.strategy.SchedulerType;
+import org.beehive.gpullama3.tornadovm.layers.type.fp16.LogitsFP16Layer;
+import uk.ac.manchester.tornado.api.TaskGraph;
+
+/**
+ * Logits layer for the unified prefill-decode plan (Phase 4).
+ *
+ * <p>Extends {@link LogitsFP16Layer} with KV-cache pass-through so the device
+ * pointers for {@code wrapKeyCache} and {@code wrapValueCache} survive the
+ * logits → decode-activation boundary across decode tokens.</p>
+ *
+ * <p>In interpreter (non-CUDA-graph) mode, {@code updatePersistedObjectState()}
+ * propagates device pointers from the predecessor graph's persisted set. After the
+ * last decode token the predecessor of the next decode-activation graph is the
+ * logits graph. Without the pass-through here, the KV-cache pointer is absent from
+ * the logits persisted set, cleared to null, and the first decode layer crashes with
+ * an NPE in {@code executeAlloc}.</p>
+ *
+ * <p>Bytecode order matters: {@code consumeFromDevice} must precede task declarations,
+ * and {@code persistOnDevice} must follow {@code transferToHost}. The hooks in
+ * {@link LogitsFP16Layer} guarantee this ordering.</p>
+ */
+public class LogitsFP16LayerDecode extends LogitsFP16Layer {
+
+    public LogitsFP16LayerDecode(String name, State state, Weights weights, Configuration config,
+                                 String lastTaskGraphID, SchedulerType schedulerType) {
+        super(name, state, weights, config, lastTaskGraphID, schedulerType);
+    }
+
+    /**
+     * Prepends {@code consumeFromDevice(lastTaskGraphID, wrapKeyCache, wrapValueCache)} before all tasks.
+     *
+     * <p>Must use the named-source form so that {@code updatePersistedObjectState()} adds the KV cache
+     * to the source-keyed map. Without the source name, the fallback in {@code updatePersistedObjectState}
+     * uses the current graph's general persisted list, which causes the XPUBuffer from the predecessor
+     * (last decode layer) to never be propagated into the logits graph's device state.</p>
+     */
+    @Override
+    protected void configureAdditionalConsumes(TaskGraph logits) {
+        logits.consumeFromDevice(lastTaskGraphID, state.wrapKeyCache, state.wrapValueCache);
+    }
+
+    /** Appends {@code persistOnDevice(wrapKeyCache, wrapValueCache)} after {@code transferToHost}. */
+    @Override
+    protected void configureAdditionalPersists(TaskGraph logits) {
+        logits.persistOnDevice(state.wrapKeyCache, state.wrapValueCache);
+    }
+}
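
Across consecutive decode tokens the graph chain wraps around: for token t+1 the predecessor of the decode-activation graph is the logits graph of token t. A toy walk-through of that boundary with and without the pass-through (illustrative names only; not TornadoVM code):

import java.util.List;
import java.util.Map;
import java.util.Set;

public class CrossTokenBoundarySim {
    public static void main(String[] args) {
        // Persisted sets per graph at the end of token t.
        Map<String, Set<String>> withoutFix = Map.of(
                "layer_last", Set.of("wrapX", "wrapKeyCache", "wrapValueCache"),
                "logits", Set.of());                                  // plain LogitsFP16Layer
        Map<String, Set<String>> withFix = Map.of(
                "layer_last", Set.of("wrapX", "wrapKeyCache", "wrapValueCache"),
                "logits", Set.of("wrapKeyCache", "wrapValueCache"));  // LogitsFP16LayerDecode

        for (Map<String, Set<String>> persisted : List.of(withoutFix, withFix)) {
            // Token t+1: decodeActivationUpdate consumes the KV cache from its
            // predecessor, which at this boundary is the logits graph of token t.
            boolean kvSurvives = persisted.get("logits").contains("wrapKeyCache");
            System.out.println(kvSurvives
                    ? "KV pointer propagated; decode layer 0 runs"
                    : "KV pointer missing; executeAlloc NPEs on decode layer 0");
        }
    }
}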

src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/prefill/LlamaFP16LayersBatchPrefill.java

Lines changed: 12 additions & 3 deletions
@@ -69,10 +69,19 @@ private TaskGraph createBatchPrefillLayerTaskGraph(int layerIndex) {
                     state.wrapXbBatch,
                     state.wrapHbBatch,
                     state.wrapKeyCache, state.wrapValueCache);
-            // wrapXBatch produced by the batch activation graph
-            layer.consumeFromDevice(state.wrapXBatch);
+            // wrapXBatch produced by the batch activation graph.
+            // Explicit source name required: the no-arg form uses the current graph's own
+            // name ("batchLayer_0") which never matches "batchActivation" in interpreter mode,
+            // causing wrapXBatch to be re-uploaded from host (zeros) instead of using the
+            // FP32 embeddings computed by the activation graph's convertFP16toFP32 kernel.
+            layer.consumeFromDevice("batchActivation", state.wrapXBatch);
         } else {
-            layer.consumeFromDevice(
+            // Explicit predecessor name for all objects.
+            // The no-arg form would use "batchLayer_k" as the source key, which never matches
+            // "batchLayer_{k-1}" in interpreter mode — every object would be re-uploaded from
+            // host (zeros or stale), corrupting the KV cache written by the previous layer.
+            String pred = "batchLayer_" + (layerIndex - 1);
+            layer.consumeFromDevice(pred,
                     context,
                     state.wrapXBatch,
                     state.wrapXbFP16Batch,
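
The prefill chain follows the same naming discipline as the decode chain. A hypothetical helper mirroring the inline logic above:

public class PrefillPredecessorCheck {
    // Mirrors the naming above: batchLayer_0 consumes from "batchActivation",
    // batchLayer_k (k > 0) consumes from "batchLayer_{k-1}".
    static String prefillPredecessor(int layerIndex) {
        return (layerIndex == 0) ? "batchActivation" : "batchLayer_" + (layerIndex - 1);
    }

    public static void main(String[] args) {
        for (int l = 0; l < 3; l++) {
            System.out.println("batchLayer_" + l + " consumes from " + prefillPredecessor(l));
        }
    }
}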
