@@ -81,7 +81,6 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste
8181 this .config = (LlamaConfiguration ) model .configuration ();
8282 this .batchSize = PREFILL_BATCH_SIZE ;
8383 this .N = config .numberOfLayers ();
84-
8584 this .gridScheduler = new GridScheduler ();
8685 this .executionPlan = createExecutionPlan ();
8786
@@ -129,23 +128,12 @@ private TaskGraph buildBatchPrefillActivationGraph(KernelContext ctx) {
129128 * not forwarded in interpreter (non-CUDA-graph) mode.</p>
130129 */
131130 private TaskGraph buildDecodeActivationGraph (KernelContext ctx , String lastBatchLayerID ) {
132- // System.out.println("lastBatchLayerID = " + lastBatchLayerID);
133- // System.out.println("[buildDecodeActivationGraph] state.wrapX = " + state.wrapX.toString());
134- // System.out.println("[buildDecodeActivationGraph] state.wrapKeyCache = " + state.wrapKeyCache.toString());
135- // System.out.println("[buildDecodeActivationGraph] state.wrapValueCache = " + state.wrapValueCache.toString());
136131 return new TaskGraph ("decodeActivationUpdate" )
137132 .consumeFromDevice (lastBatchLayerID , state .wrapKeyCache , state .wrapValueCache ) // KV pass-through
138- //.transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapX, debugKV)
139- //.transferToDevice(DataTransferMode.FIRST_EXECUTION, ctx, state.wrapX)
140133 .transferToDevice (DataTransferMode .EVERY_EXECUTION , state .embeddingX )
141134 .task ("updateX" ,
142135 TransformerComputeKernels ::convertFP16toFP32 ,
143136 ctx , (HalfFloatArray ) state .embeddingX , state .wrapX )
144- // // DEBUG: snapshot first 8 elements of wrapKeyCache and wrapX for host-side probe
145- // .task("dbgKV",
146- // TransformerComputeKernels::dbgCopyFirst8,
147- // state.wrapKeyCache, debugKV)
148- // .transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapX, debugKV)
149137 // wrapX persisted for decode layer 0; wrapKeyCache/wrapValueCache
150138 // re-persisted so updatePersistedObjectState() propagates the device
151139 // pointer to decode layer 0's consumeFromDevice without CUDA graphs.
@@ -210,7 +198,6 @@ public void forceCopyInReadOnlyData() {
210198 state .batchStartPosHolder .init (0 );
211199
212200 for (int i = 0 ; i <= logitsIdx (); i ++) {
213- //System.out.println(i + " " + executionPlan.withGraph(i).toString());
214201 var g = executionPlan .withGraph (i ).withGridScheduler (gridScheduler );
215202 if (CUDA_GRAPHS ) g .withCUDAGraph ();
216203 g .execute ();
@@ -252,7 +239,6 @@ public void tornadoVMForwardBatchPrefill(int[] tokenIds, int startPos, Model mod
252239 if (CUDA_GRAPHS ) batchLayer .withCUDAGraph ();
253240 batchLayer .execute ();
254241 }
255- //System.err.println("[DEBUG] last batch layer done, about to return from prefill");
256242 // Logits skipped — not needed for prefill positions.
257243 }
258244
@@ -280,16 +266,7 @@ public FloatArray tornadoVMForwardDecode(int token, int position, Model model) {
280266 // Graph N+1: decode activation
281267 var decodeAct = executionPlan .withGraph (decodeActivationIdx ()).withGridScheduler (gridScheduler );
282268 if (CUDA_GRAPHS ) decodeAct .withCUDAGraph ();
283- //System.err.println("[DEBUG] about to execute decode activation (graph " + decodeActivationIdx() + "--)");
284269 decodeAct .execute ();
285- // DEBUG: print first 4 of wrapX (should be non-zero FP32 embedding) and
286- // first 4 of debugKV (should be non-zero after batch prefill wrote the KV cache)
287- // if (position <= 290) {
288- // System.err.printf("[DBG pos=%d] wrapX[0..3] = %.4f %.4f %.4f %.4f%n",
289- // position, state.wrapX.get(0), state.wrapX.get(1), state.wrapX.get(2), state.wrapX.get(3));
290- // System.err.printf("[DBG pos=%d] debugKV[0..3]= %.4f %.4f %.4f %.4f%n",
291- // position, debugKV.get(0), debugKV.get(1), debugKV.get(2), debugKV.get(3));
292- // }
293270
294271 // Graphs N+2..2N+1: decode transformer layers
295272 for (int l = 0 ; l < N ; l ++) {
@@ -321,11 +298,4 @@ public void freeTornadoExecutionPlan() {
321298 executionPlan .freeDeviceMemory ();
322299 }
323300
324- // ── Inner class: decode layer 0 with consumeFromDevice for KV cache ───────
325- // moved to package
326- //
327- // private static final class LlamaFP16FFNLayersForUnifiedDecode extends LlamaFP16FFNLayers {
328- //
329- //
330- // }
331301}
0 commit comments