[prf/dec] Provide distinct support for standard, prefill-decode and batched-prefill-decode execution paths for both CPU and GPU

orionpapadakis · orionpapadakis · commit 9aff199ccb0f · 2026-04-19T12:26:33.000+03:00
diff --git a/llama-tornado b/llama-tornado
@@ -87,11 +87,11 @@ class LlamaRunner:
         if args.verbose_init:
             cmd.append("-Dllama.EnableTimingForTornadoVMInit=true")
 
-        if args.batched_prefill:
-            cmd.append("-Dllama.batchedPrefill=true")
+        if args.with_prefill_decode or args.batch_prefill_size is not None:
+            cmd.append("-Dllama.withPrefillDecode=true")
 
-        if args.prefill_batch_size is not None:
-            cmd.append(f"-Dllama.prefillBatchSize={args.prefill_batch_size}")
+        if args.batch_prefill_size is not None:
+            cmd.append(f"-Dllama.prefillBatchSize={args.batch_prefill_size}")
 
         if args.no_cuda_graphs:
             cmd.append("-Dllama.cudaGraphs=false")
@@ -484,17 +484,26 @@ def create_parser() -> argparse.ArgumentParser:
     # Prefill/Decode optimization
     prefill_group = parser.add_argument_group("Prefill/Decode Optimization")
     prefill_group.add_argument(
-        "--batched-prefill",
-        dest="batched_prefill",
+        "--with-prefill-decode",
+        dest="with_prefill_decode",
         action="store_true",
-        help="Enable batched prefill/decode separation (llama.batchedPrefill=true)",
+        help=(
+            "Enable prefill/decode separation. "
+            "Alone: sequential prefill (skip logits) + standard decode. "
+            "With --batch-prefill-size N (N>1): batched GPU prefill via TornadoVMMasterPlanWithBatchPrefillDecode."
+        ),
     )
     prefill_group.add_argument(
-        "--prefill-batch-size",
-        dest="prefill_batch_size",
+        "--batch-prefill-size",
+        dest="batch_prefill_size",
         type=int,
         default=None,
-        help="Prefill chunk/batch size (llama.prefillBatchSize=N, default: 32)",
+        metavar="N",
+        help=(
+            "Prefill chunk size (requires --with-prefill-decode). "
+            "N=1: sequential prefill (same as --with-prefill-decode alone). "
+            "N>1: batched prefill processing N tokens per chunk (llama.prefillBatchSize=N)."
+        ),
     )
     prefill_group.add_argument(
         "--no-cuda-graphs",
diff --git a/src/main/java/org/beehive/gpullama3/Options.java b/src/main/java/org/beehive/gpullama3/Options.java
@@ -5,14 +5,20 @@
 import java.nio.file.Paths;
 
 public record Options(Path modelPath, String prompt, String systemPrompt, String suffix, boolean interactive, float temperature, float topp, long seed, int maxTokens, boolean stream, boolean echo,
-                      boolean useTornadovm) {
+                      boolean useTornadovm, boolean withPrefillDecode, int batchPrefillSize) {
 
     public static final int DEFAULT_MAX_TOKENS = 1024;
 
     public Options {
         require(interactive || prompt != null, "Missing argument: --prompt is required in --instruct mode e.g. --prompt \"Why is the sky blue?\"");
         require(0 <= temperature, "Invalid argument: --temperature must be non-negative");
         require(0 <= topp && topp <= 1, "Invalid argument: --top-p must be within [0, 1]");
+        require(batchPrefillSize >= 1, "Invalid argument: --batch-prefill-size must be >= 1");
+        require(batchPrefillSize == 1 || withPrefillDecode, "Invalid argument: --batch-prefill-size requires --with-prefill-decode");
+        // Publish to system properties so TornadoVMMasterPlan and Llama read the right values when the
+        // JAR is invoked directly; both cache them in static constants, so Options must be built first.
+        if (withPrefillDecode) System.setProperty("llama.withPrefillDecode", "true");
+        if (batchPrefillSize > 1) System.setProperty("llama.prefillBatchSize", String.valueOf(batchPrefillSize));
     }
 
     static void require(boolean condition, String messageFormat, Object... args) {
@@ -44,6 +50,8 @@ public static void printUsage(PrintStream out) {
         out.println("  --max-tokens, -n <int>        number of steps to run for < 0 = limited by context length, default " + DEFAULT_MAX_TOKENS);
         out.println("  --stream <boolean>            print tokens during generation; may cause encoding artifacts for non ASCII text, default true");
         out.println("  --echo <boolean>              print ALL tokens to stderr, if true, recommended to set --stream=false, default false");
+        out.println("  --with-prefill-decode         enable prefill/decode separation (skip logits during prefill)");
+        out.println("  --batch-prefill-size <int>    batched prefill chunk size; requires --with-prefill-decode, must be > 1, enables batched CPU/GPU prefill");
         out.println();
     }
 
@@ -61,7 +69,7 @@ public static Options getDefaultOptions() {
         boolean echo = false;
         boolean useTornadoVM = getDefaultTornadoVM();
 
-        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadoVM);
+        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadoVM, false, 1);
     }
 
     public static Options parseOptions(String[] args) {
@@ -77,13 +85,16 @@ public static Options parseOptions(String[] args) {
         boolean stream = false;
         boolean echo = false;
         Boolean useTornadovm = null; // null means not specified via command line
+        boolean withPrefillDecode = false;
+        int batchPrefillSize = 1;
 
         for (int i = 0; i < args.length; i++) {
             String optionName = args[i];
             require(optionName.startsWith("-"), "Invalid option %s", optionName);
             switch (optionName) {
                 case "--interactive", "--chat", "-i" -> interactive = true;
                 case "--instruct" -> interactive = false;
+                case "--with-prefill-decode" -> withPrefillDecode = true;
                 case "--help", "-h" -> {
                     printUsage(System.out);
                     System.exit(0);
@@ -111,6 +122,7 @@ public static Options parseOptions(String[] args) {
                         case "--stream" -> stream = Boolean.parseBoolean(nextArg);
                         case "--echo" -> echo = Boolean.parseBoolean(nextArg);
                         case "--use-tornadovm" -> useTornadovm = Boolean.parseBoolean(nextArg);
+                        case "--batch-prefill-size" -> batchPrefillSize = Integer.parseInt(nextArg);
                         default -> require(false, "Unknown option: %s", optionName);
                     }
                 }
@@ -123,6 +135,6 @@ public static Options parseOptions(String[] args) {
             useTornadovm = getDefaultTornadoVM();
         }
 
-        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadovm);
+        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadovm, withPrefillDecode, batchPrefillSize);
     }
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
@@ -11,7 +11,6 @@
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanStandard;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
 
 import java.util.ArrayList;
@@ -36,8 +35,8 @@
  *       Behaviour is identical to the baseline decode path.</li>
  * </ol>
  *
- * <p>Activated by {@code -Dllama.batchedPrefill=true} (set via
- * {@code --batched-prefill} in the Python launcher).</p>
+ * <p>Activated by {@code -Dllama.withPrefillDecode=true} (set via
+ * {@code --with-prefill-decode} in the Python launcher).</p>
  */
 public final class InferenceEngineWithPrefillDecode {
 
@@ -269,11 +268,10 @@ public static List<Integer> generateTokensGPULlama(
         } else {
         // ── Phase 2: Sequential GPU Prefill + Decode ─────────────────────────
 
-        // Thin wrapper: no new TornadoVM plan created, just holds the reference
-        // Plan is a TornadoVMMasterPlanStandard when PREFILL_BATCH_SIZE == 1.
+        // Plan was initialized by TornadoVMMasterPlan.initializeTornadoVMPlan as
+        // TornadoVMMasterPlanWithPrefillDecode when WITH_PREFILL_DECODE && PREFILL_BATCH_SIZE == 1.
         TornadoVMMasterPlanWithPrefillDecode prefillPlan =
-                new TornadoVMMasterPlanWithPrefillDecode(
-                        (TornadoVMMasterPlanStandard) tornadoVMPlan, state, model);
+                (TornadoVMMasterPlanWithPrefillDecode) tornadoVMPlan;
 
         // ── Phase 1: Prefill (GPU, no logits) ────────────────────────────────
         for (int promptIndex = 0; promptIndex < promptTokens.size() && pos < actualMaxTokens; promptIndex++) {
diff --git a/src/main/java/org/beehive/gpullama3/model/llama/Llama.java b/src/main/java/org/beehive/gpullama3/model/llama/Llama.java
@@ -20,7 +20,7 @@
 
 public class Llama extends AbstractModel {
 
-    static final boolean BATCHED_PREFILL = Boolean.getBoolean("llama.batchedPrefill");
+    static final boolean WITH_PREFILL_DECODE = Boolean.getBoolean("llama.withPrefillDecode");
 
     LlamaConfiguration configuration;
 
@@ -66,7 +66,7 @@ public void forward(State state, int token, int position) {
     @Override
     public List<Integer> generateTokens(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
             IntConsumer onTokenGenerated) {
-        if (BATCHED_PREFILL) {
+        if (WITH_PREFILL_DECODE) {
             return InferenceEngineWithPrefillDecode.generateTokensLlama(this, state, startPosition, promptTokens, stopTokens, maxTokens, sampler, echo, onTokenGenerated);
         }
         return InferenceEngine.generateTokensLlama(this, state, startPosition, promptTokens, stopTokens, maxTokens, sampler, echo, onTokenGenerated);
@@ -75,7 +75,7 @@ public List<Integer> generateTokens(State state, int startPosition, List<Integer
     @Override
     public List<Integer> generateTokensGPU(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
             IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
-        if (BATCHED_PREFILL) {
+        if (WITH_PREFILL_DECODE) {
             return InferenceEngineWithPrefillDecode.generateTokensGPULlama(this, state, startPosition, promptTokens, stopTokens, maxTokens, sampler, echo, onTokenGenerated, tornadoVMPlan);
         }
         return InferenceEngine.generateTokensGPULlama(this, state, startPosition, promptTokens, stopTokens, maxTokens, sampler, echo, onTokenGenerated, tornadoVMPlan);
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
@@ -30,41 +30,52 @@ public interface TornadoVMMasterPlan {
     boolean CUDA_GRAPHS = Boolean.parseBoolean(
             System.getProperty("llama.cudaGraphs", "true"));
 
-    /**
-     * Single-token forward pass returning output logits.
-     *
-     * <p>Used by the standard GPU path ({@link org.beehive.gpullama3.inference.InferenceCore#forwardTornadoVM})
-     * and the Phase 2 sequential decode path. Not applicable to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode} — that plan uses its own typed methods.</p>
-     *
-     * @param position sequence position of the current token
-     * @return logits array for token sampling
-     */
-    FloatArray tornadoVMForwardExecuteLayered(int position);
+    boolean WITH_PREFILL_DECODE = Boolean.getBoolean("llama.withPrefillDecode");
 
-    /** Releases all device memory held by this plan. */
-    void freeTornadoExecutionPlan();
+    int PREFILL_BATCH_SIZE = Integer.getInteger("llama.prefillBatchSize", 1);
 
     /**
      * Factory: creates, JIT-compiles, and warms up the appropriate plan.
      *
-     * <p>When {@code llama.prefillBatchSize > 1} a {@link TornadoVMMasterPlanWithBatchPrefillDecode}
-     * is returned; otherwise a {@link TornadoVMMasterPlanStandard} is returned.</p>
+     * <p>When {@code llama.withPrefillDecode=true} and {@code llama.prefillBatchSize > 1},
+     * a {@link TornadoVMMasterPlanWithBatchPrefillDecode} is returned; with prefill/decode
+     * enabled and batch size 1, a {@link TornadoVMMasterPlanWithPrefillDecode} is returned.
+     * Otherwise (baseline path) a {@link TornadoVMMasterPlanStandard} is returned.</p>
      *
      * @param state the model state (must be {@link LlamaState} when batch size {@code > 1})
      * @param model the model instance
      * @return the initialized plan, also stored via {@link Model#setTornadoVMPlan}
      */
     static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
-        int batchSize = Integer.getInteger("llama.prefillBatchSize", 1);
         TornadoVMMasterPlan plan;
-        if (batchSize > 1) {
+
+        if (WITH_PREFILL_DECODE && PREFILL_BATCH_SIZE > 1) {
+            // GPU path with batched prefill/decode
             plan = TornadoVMMasterPlanWithBatchPrefillDecode.initializeUnifiedPlan(
-                    (LlamaState) state, model, batchSize);
+                    (LlamaState) state, model, PREFILL_BATCH_SIZE);
+        } else if (WITH_PREFILL_DECODE) {
+            // GPU path with sequential (batch size 1) prefill/decode
+            plan = TornadoVMMasterPlanWithPrefillDecode.initialize(state, model);
         } else {
+            // Standard GPU path without prefill/decode separation
             plan = TornadoVMMasterPlanStandard.initialize(state, model);
         }
         model.setTornadoVMPlan(plan);
         return plan;
     }
+
+    /**
+     * Single-token forward pass returning output logits.
+     *
+     * <p>Used by the standard GPU path ({@link org.beehive.gpullama3.inference.InferenceCore#forwardTornadoVM})
+     * and the Phase 2 sequential decode path. Not applicable to
+     * {@link TornadoVMMasterPlanWithBatchPrefillDecode} — that plan uses its own typed methods.</p>
+     *
+     * @param position sequence position of the current token
+     * @return logits array for token sampling
+     */
+    FloatArray tornadoVMForwardExecuteLayered(int position);
+
+    /** Releases all device memory held by this plan. */
+    void freeTornadoExecutionPlan();
 }
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java
@@ -25,8 +25,12 @@
  * <p>For decode, {@link #tornadoVMForwardDecode} delegates to the wrapped
  * plan's {@code tornadoVMForwardExecuteLayered}, preserving identical behaviour
  * to the baseline GPU path.</p>
+ *
+ * <p>Implements {@link TornadoVMMasterPlan} so it can be returned by the factory
+ * and stored in the model; {@link #tornadoVMForwardExecuteLayered} delegates to
+ * {@link #tornadoVMForwardDecode}.</p>
  */
-public class TornadoVMMasterPlanWithPrefillDecode {
+public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan {
 
     private final TornadoVMMasterPlanStandard plan;
     private final State state;
@@ -38,6 +42,12 @@ public TornadoVMMasterPlanWithPrefillDecode(TornadoVMMasterPlanStandard plan, St
         this.config = model.configuration();
     }
 
+    /** Factory: initializes the inner standard plan then wraps it. */
+    public static TornadoVMMasterPlanWithPrefillDecode initialize(State state, Model model) {
+        TornadoVMMasterPlanStandard inner = TornadoVMMasterPlanStandard.initialize(state, model);
+        return new TornadoVMMasterPlanWithPrefillDecode(inner, state, model);
+    }
+
     /**
      * GPU prefill forward: runs preprocessing + all transformer layers, skips logits.
      *
@@ -76,4 +86,15 @@ public void tornadoVMForwardPrefill(int position) {
     public FloatArray tornadoVMForwardDecode(int position) {
         return plan.tornadoVMForwardExecuteLayered(position);
     }
+
+    /** Delegates to the wrapped plan's full forward pass (used by the standard decode path). */
+    @Override
+    public FloatArray tornadoVMForwardExecuteLayered(int position) {
+        return tornadoVMForwardDecode(position);
+    }
+
+    @Override
+    public void freeTornadoExecutionPlan() {
+        plan.freeTornadoExecutionPlan();
+    }
 }