[prf/dec] Refactor TornadoVM execution plans to unify GPU paths for standard, prefill-decode, and batched-prefill-decode setups.

orionpapadakis · orionpapadakis · commit 869c67d84730 · 2026-04-19T12:26:33.000+03:00
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
@@ -1,8 +1,8 @@
 package org.beehive.gpullama3.tornadovm;
 
-import org.beehive.gpullama3.inference.state.LlamaState;
 import org.beehive.gpullama3.inference.state.State;
 import org.beehive.gpullama3.model.Model;
+import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
 /**
@@ -35,14 +35,14 @@ public interface TornadoVMMasterPlan {
     int PREFILL_BATCH_SIZE = Integer.getInteger("llama.prefillBatchSize", 1);
 
     /**
-     * Factory: creates, JIT-compiles, and warms up the appropriate plan.
+     * Factory: creates, JIT-compiles, and warms up the appropriate TornadoVMMasterPlan.
      *
      * <p>When {@code llama.withPrefillDecode=true} and {@code llama.prefillBatchSize > 1},
      * a {@link TornadoVMMasterPlanWithBatchPrefillDecode} is returned.
      * Otherwise a {@link TornadoVMMasterPlanStandard} is returned (used for the baseline
      * path and the sequential prefill/decode path when batch size is 1).</p>
      *
-     * @param state the model state (must be {@link LlamaState} when batch size {@code > 1})
+     * @param state the model state
      * @param model the model instance
      * @return the initialized plan, also stored via {@link Model#setTornadoVMPlan}
      */
@@ -51,29 +51,26 @@ static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
 
         if (WITH_PREFILL_DECODE && PREFILL_BATCH_SIZE > 1) {
             // GPU path with batched prefill/decode
-            plan = TornadoVMMasterPlanWithBatchPrefillDecode.initializeUnifiedPlan(
-                    (LlamaState) state, model, PREFILL_BATCH_SIZE);
+            plan = new TornadoVMMasterPlanWithBatchPrefillDecode(state, model);
         } else if (WITH_PREFILL_DECODE) {
             // GPU path with simple prefill/decode
-            plan = TornadoVMMasterPlanWithPrefillDecode.initialize(state, model);
+            plan = new TornadoVMMasterPlanWithPrefillDecode(state, model);
         } else {
             // GPU path with no prefill/decode
-            plan = TornadoVMMasterPlanStandard.initialize(state, model);
+            plan = new TornadoVMMasterPlanStandard(state, model);
         }
         model.setTornadoVMPlan(plan);
         return plan;
     }
 
     /**
-     * Single-token forward pass returning output logits.
-     *
-     * <p>Used by the standard GPU path ({@link org.beehive.gpullama3.inference.InferenceCore#forwardTornadoVM})
-     * and the Phase 2 sequential decode path. Not applicable to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode} — that plan uses its own typed methods.</p>
-     *
-     * @param position sequence position of the current token
-     * @return logits array for token sampling
+     * Builds the {@link TornadoExecutionPlan} for this master plan from the
+     * {@link Model} and {@link State} it was constructed with.
      */
+    TornadoExecutionPlan createExecutionPlan();
+
+    void forceCopyInReadOnlyData();
+
     FloatArray tornadoVMForwardExecuteLayered(int position);
 
     /** Releases all device memory held by this plan. */
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java
@@ -19,28 +19,14 @@
  */
 public class TornadoVMMasterPlanStandard implements TornadoVMMasterPlan {
 
-    public static final boolean ENABLE_TORNADOVM_INIT_TIME = Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "False"));
-
     private final State state;
+    private final Model model;
     private final Configuration config;
-    public TornadoExecutionPlan executionPlan;
+
     GenericLayerPlanner tornadoVMLayerPlanner;
+    public TornadoExecutionPlan executionPlan;
 
     public TornadoVMMasterPlanStandard(State state, Model model) {
-        this.tornadoVMLayerPlanner = createPlanner(state, model);
-        this.executionPlan = createExecutionPlan();
-        this.state = state;
-        this.config = model.configuration();
-    }
-
-    /**
-     * Initializes and warms up the standard TornadoVM plan.
-     *
-     * @param state the model state containing KV cache
-     * @param model the model instance
-     * @return the initialized plan ready for inference
-     */
-    static TornadoVMMasterPlanStandard initialize(State state, Model model) {
         long startTime = System.nanoTime();
         long planCreationTime = 0;
         long warmupTime = 0;
@@ -49,43 +35,52 @@ static TornadoVMMasterPlanStandard initialize(State state, Model model) {
             System.err.println("\nStarting TornadoVM initialization...");
         }
 
-        TornadoVMMasterPlanStandard tornadoVMPlan = new TornadoVMMasterPlanStandard(state, model);
+        this.state = state;
+        this.model = model;
+        this.config = model.configuration();
+
+        this.executionPlan = createExecutionPlan();
 
         if (ENABLE_TORNADOVM_INIT_TIME) {
             planCreationTime = System.nanoTime();
-            System.err.printf("TornadoVM GPU execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
+            System.err.printf("TornadoVM GPU standard execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
         }
 
-        if (CUDA_GRAPHS) tornadoVMPlan.executionPlan.withAllGraphs().withCUDAGraph();
-        tornadoVMPlan.executionPlan.withPreCompilation();
+        if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph();
+        executionPlan.withPreCompilation();
 
         if (ENABLE_TORNADOVM_INIT_TIME) {
             warmupTime = System.nanoTime();
             System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
         }
 
-        tornadoVMPlan.forceCopyInReadOnlyDataLayered();
+        forceCopyInReadOnlyData();
 
         if (ENABLE_TORNADOVM_INIT_TIME) {
             long copyTime = System.nanoTime();
             System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
             System.err.printf("Finished TornadoVM initialization...\n \n");
         }
-
-        return tornadoVMPlan;
     }
 
-    private TornadoExecutionPlan createExecutionPlan() {
+//    @Override
+//    public GenericLayerPlanner createPlanner() {
+//        GGMLType weightType = model.weights().getWeightType();
+//        return QuantizationPlannerFactory.create(weightType, state, model);
+//    }
+
+    /**
+     * Creates the layer planner and the {@link TornadoExecutionPlan} for the standard single-token forward pass.
+     */
+    @Override
+    public TornadoExecutionPlan createExecutionPlan() {
+        GGMLType weightType = model.weights().getWeightType();
+        this.tornadoVMLayerPlanner = QuantizationPlannerFactory.create(weightType, state, model);
         var taskGraphs = tornadoVMLayerPlanner.getImmutableTaskGraphs();
         var taskGraphArray = taskGraphs.toArray(new ImmutableTaskGraph[taskGraphs.size()]);
         return new TornadoExecutionPlan(taskGraphArray);
     }
 
-    private GenericLayerPlanner createPlanner(State state, Model model) {
-        GGMLType weightType = model.weights().getWeightType();
-        return QuantizationPlannerFactory.create(weightType, state, model);
-    }
-
     @Override
     public FloatArray tornadoVMForwardExecuteLayered(int position) {
         // @formatter:off
@@ -126,7 +121,8 @@ private int getFinalLogitsGraphIndex() {
         return tornadoVMLayerPlanner.getImmutableTaskGraphs().size() - 1;
     }
 
-    public void forceCopyInReadOnlyDataLayered() {
+    @Override
+    public void forceCopyInReadOnlyData() {
         state.wrapX.clear();
         state.positionHolder.init(0);
 
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java
@@ -1,6 +1,7 @@
 package org.beehive.gpullama3.tornadovm;
 
 import org.beehive.gpullama3.inference.state.LlamaState;
+import org.beehive.gpullama3.inference.state.State;
 import org.beehive.gpullama3.inference.weights.tornado.LlamaTornadoWeights;
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.model.llama.LlamaConfiguration;
@@ -25,7 +26,7 @@
 import java.util.List;
 
 /**
- * Unified GPU execution plan for Phase 4: batched prefill + single-token decode.
+ * GPU execution plan for batched prefill + single-token decode.
  *
  * <p>A single {@link TornadoExecutionPlan} holds all graphs so that the KV cache
  * ({@code wrapKeyCache}, {@code wrapValueCache}) is shared on device via
@@ -50,10 +51,8 @@
  */
 public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMasterPlan {
 
-    private static final boolean ENABLE_TIMING =
-            Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "False"));
-
     private final LlamaState         state;
+    private final Model              model;
     private final LlamaConfiguration config;
     private final int                batchSize;
     private final int                N;   // numberOfLayers
@@ -68,55 +67,44 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste
     private int logitsIdx()                { return 2 * N + 2; }
 
     // ── Construction ─────────────────────────────────────────────────────────
-    private TornadoVMMasterPlanWithBatchPrefillDecode(LlamaState state, Model model, int batchSize) {
-        this.state     = state;
+    TornadoVMMasterPlanWithBatchPrefillDecode(State initialState, Model model) {
+        long startTime = System.nanoTime();
+        long planCreationTime = 0;
+        long warmupTime = 0;
+
+        if (ENABLE_TORNADOVM_INIT_TIME) {
+            System.err.println("\nStarting TornadoVM initialization...");
+        }
+
+        this.state     = (LlamaState) initialState; // only LlamaFP16 supports batched prefill for now
+        this.model     = model;
         this.config    = (LlamaConfiguration) model.configuration();
-        this.batchSize = batchSize;
+        this.batchSize = PREFILL_BATCH_SIZE;
         this.N         = config.numberOfLayers();
 
-        LlamaTornadoWeights weights       = (LlamaTornadoWeights) model.weights();
-        SchedulerType       schedulerType = SchedulerDetectionService.determineSchedulerType(model);
-
-        List<ImmutableTaskGraph> all       = new ArrayList<>(2 * N + 3);
-        GridScheduler            scheduler = new GridScheduler();
+        this.gridScheduler  = new GridScheduler();
+        this.executionPlan  = createExecutionPlan();
 
-        // [0] Batch prefill activation ────────────────────────────────────────────────
-        KernelContext batchActCtx = new KernelContext();
-        all.add(buildBatchPrefillActivationGraph(batchActCtx).snapshot());
-        scheduler.addWorkerGrid("batchActivation.batchUpdateX",
-                WorkerGridFactory.genericWorker(batchSize * config.dim(), 128));
+        if (ENABLE_TORNADOVM_INIT_TIME) {
+            planCreationTime = System.nanoTime();
+            System.err.printf("TornadoVM GPU batched prefill/decode execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
+        }
 
-        // [1..N] Batch prefill layer graphs ───────────────────────────────────────────
-        LlamaFP16LayersBatchPrefill batchLayers =
-                new LlamaFP16LayersBatchPrefill(state, weights, config, batchSize);
-        all.addAll(batchLayers.getLayerImmutableTaskGraphs());
-        batchLayers.updateGridScheduler(scheduler);
+        if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph();
+        executionPlan.withPreCompilation();
 
-        // [N+1] Decode activation (with KV-cache pass-through) ────────────────
-        KernelContext decodeActCtx = new KernelContext();
-        all.add(buildDecodeActivationGraph(decodeActCtx, batchLayers.getLastLayerTaskGraphID()).snapshot());
-        scheduler.addWorkerGrid("decodeActivationUpdate.updateX",
-                WorkerGridFactory.genericWorker(config.dim(), 128));
+        if (ENABLE_TORNADOVM_INIT_TIME) {
+            warmupTime = System.nanoTime();
+            System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
+        }
 
-        // [N+2..2N+1] Decode layer graphs  ────────────────────────────────────
-        // Layer 0 uses consumeFromDevice for KV cache (no FIRST_EXECUTION upload).
-        LlamaFP16FFNLayersDecode decodeLayers =
-                new LlamaFP16FFNLayersDecode(
-                        "llamaFFNDecode", state, weights, config, schedulerType);
-        all.addAll(decodeLayers.getFFNLayerImmutableTaskGraphs());
-        decodeLayers.updateGridScheduler(scheduler);
+        forceCopyInReadOnlyData();
 
-        // [2N+2] Logits ───────────────────────────────────────────────────────
-        // LogitsFP16LayerDecode extends LogitsFP16Layer: adds consumeFromDevice(wrapKeyCache)
-        // at the start of the graph and persistOnDevice(wrapKeyCache) at the end, so the
-        // KV-cache pointer survives the logits → decode-activation boundary across tokens.
-        LogitsFP16LayerDecode logitsLayer = new LogitsFP16LayerDecode("logits", state, weights, config,
-                decodeLayers.getLastFFNLayerTaskGraphID(), schedulerType);
-        all.add(logitsLayer.getImmutableTaskGraph());
-        logitsLayer.updateGridScheduler(scheduler);
-
-        this.gridScheduler  = scheduler;
-        this.executionPlan  = new TornadoExecutionPlan(all.toArray(new ImmutableTaskGraph[0]));
+        if (ENABLE_TORNADOVM_INIT_TIME) {
+            long copyTime = System.nanoTime();
+            System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
+            System.err.printf("Finished TornadoVM initialization...\n \n");
+        }
     }
 
     // ── Batch Prefill Activation graphs ─────────────────────────────────────────────────────
@@ -164,41 +152,58 @@ private TaskGraph buildDecodeActivationGraph(KernelContext ctx, String lastBatch
                 .persistOnDevice(state.wrapX, state.wrapKeyCache, state.wrapValueCache);
     }
 
-    // ── Static factory ────────────────────────────────────────────────────────
-
     /**
-     * Creates, JIT-compiles, and warms up the unified plan.
-     * Mirrors {@link TornadoVMMasterPlan#initializeTornadoVMPlan}.
+     * Creates the {@link TornadoExecutionPlan} for the forward pass with batched prefill and a separate single-token decode.
      */
-    public static TornadoVMMasterPlanWithBatchPrefillDecode initializeUnifiedPlan(
-            LlamaState state, Model model, int batchSize) {
+    @Override
+    public TornadoExecutionPlan createExecutionPlan() {
+        LlamaTornadoWeights weights       = (LlamaTornadoWeights) model.weights();
+        SchedulerType       schedulerType = SchedulerDetectionService.determineSchedulerType(model);
 
-        long t0 = System.nanoTime();
-        TornadoVMMasterPlanWithBatchPrefillDecode plan =
-                new TornadoVMMasterPlanWithBatchPrefillDecode(state, model, batchSize);
+        List<ImmutableTaskGraph> all       = new ArrayList<>(2 * N + 3);
 
-        if (ENABLE_TIMING)
-            System.err.printf("[BatchPlan] Graph construction: %.2f ms%n",
-                    (System.nanoTime() - t0) / 1e6);
+        // [0] Batch prefill activation ────────────────────────────────────────────────
+        KernelContext batchActCtx = new KernelContext();
+        all.add(buildBatchPrefillActivationGraph(batchActCtx).snapshot());
+        gridScheduler.addWorkerGrid("batchActivation.batchUpdateX",
+                WorkerGridFactory.genericWorker(batchSize * config.dim(), 128));
 
-        if (CUDA_GRAPHS) plan.executionPlan.withAllGraphs().withCUDAGraph();
-        plan.executionPlan.withPreCompilation();
+        // [1..N] Batch prefill layer graphs ───────────────────────────────────────────
+        LlamaFP16LayersBatchPrefill batchLayers =
+                new LlamaFP16LayersBatchPrefill(state, weights, config, batchSize);
+        all.addAll(batchLayers.getLayerImmutableTaskGraphs());
+        batchLayers.updateGridScheduler(gridScheduler);
 
-        if (ENABLE_TIMING)
-            System.err.printf("[BatchPlan] JIT compilation: %.2f ms%n",
-                    (System.nanoTime() - t0) / 1e6);
+        // [N+1] Decode activation (with KV-cache pass-through) ────────────────
+        KernelContext decodeActCtx = new KernelContext();
+        all.add(buildDecodeActivationGraph(decodeActCtx, batchLayers.getLastLayerTaskGraphID()).snapshot());
+        gridScheduler.addWorkerGrid("decodeActivationUpdate.updateX",
+                WorkerGridFactory.genericWorker(config.dim(), 128));
 
-        plan.forceCopyInReadOnlyData();
+        // [N+2..2N+1] Decode layer graphs  ────────────────────────────────────
+        // Layer 0 uses consumeFromDevice for KV cache (no FIRST_EXECUTION upload).
+        LlamaFP16FFNLayersDecode decodeLayers =
+                new LlamaFP16FFNLayersDecode(
+                        "llamaFFNDecode", state, weights, config, schedulerType);
+        all.addAll(decodeLayers.getFFNLayerImmutableTaskGraphs());
+        decodeLayers.updateGridScheduler(gridScheduler);
 
-        if (ENABLE_TIMING)
-            System.err.printf("[BatchPlan] Init complete: %.2f ms%n",
-                    (System.nanoTime() - t0) / 1e6);
+        // [2N+2] Logits ───────────────────────────────────────────────────────
+        // LogitsFP16LayerDecode extends LogitsFP16Layer: adds consumeFromDevice(wrapKeyCache)
+        // at the start of the graph and persistOnDevice(wrapKeyCache) at the end, so the
+        // KV-cache pointer survives the logits → decode-activation boundary across tokens.
+        LogitsFP16LayerDecode logitsLayer = new LogitsFP16LayerDecode("logits", state, weights, config,
+                decodeLayers.getLastFFNLayerTaskGraphID(), schedulerType);
+        all.add(logitsLayer.getImmutableTaskGraph());
+        logitsLayer.updateGridScheduler(gridScheduler);
 
-        return plan;
+        return new TornadoExecutionPlan(all.toArray(new ImmutableTaskGraph[0]));
     }
 
+
     /** Runs all graphs once to trigger FIRST_EXECUTION uploads and warm up CUDA graphs. */
-    private void forceCopyInReadOnlyData() {
+    @Override
+    public void forceCopyInReadOnlyData() {
         state.wrapXBatch.clear();
         state.wrapX.clear();
         state.positionHolder.init(0);
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java