Commit 11ecb11

[ci][prf/dec] Simplify CI workflow by unifying backend-specific prefill-decode and batch-prefill-decode steps
1 parent 1569343 commit 11ecb11

1 file changed: 10 additions & 12 deletions

File tree

.github/workflows/build-and-run.yml

@@ -101,26 +101,22 @@ jobs:
           ./llama-tornado --gpu --${{ matrix.backend.name }} \
             --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
             --prompt "Say hello"
-      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode - PTX
-        if: matrix.backend.name == 'ptx'
+      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
         run: |
           cd ${{ github.workspace }}
           export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --ptx \
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
             --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
             --prompt "Say hello" \
-            --with-prefill-decode \
-            --no-cuda-graphs
-      - name: PTX- FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
-        if: matrix.backend.name == 'ptx'
+            --with-prefill-decode
+      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
         run: |
           cd ${{ github.workspace }}
           export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --ptx \
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
             --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
             --prompt "Say hello" \
-            --with-prefill-decode --batch-prefill-size 32 \
-            --no-cuda-graphs
+            --with-prefill-decode --batch-prefill-size 32
       - name: PTX- FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
         if: matrix.backend.name == 'ptx'
         run: |
@@ -129,7 +125,8 @@ jobs:
           ./llama-tornado --gpu --ptx \
             --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
             --prompt "Say hello" \
-            --with-prefill-decode
+            --with-prefill-decode \
+            --cuda-graphs
       - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
         if: matrix.backend.name == 'ptx'
         run: |
@@ -138,7 +135,8 @@ jobs:
           ./llama-tornado --gpu --ptx \
             --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
             --prompt "Say hello" \
-            --with-prefill-decode --batch-prefill-size 32
+            --with-prefill-decode --batch-prefill-size 32 \
+            --cuda-graphs
       - name: FP16 - Run Qwen3-4B-f16.gguf
         run: |
           cd ${{ github.workspace }}
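
For reference, the unified Prefill-Decode step after this change reads roughly as follows (reassembled from the added lines of the first hunk; the exact indentation and the surrounding steps are assumptions, not shown in the diff):

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
            --prompt "Say hello" \
            --with-prefill-decode

Because the step no longer carries the if: matrix.backend.name == 'ptx' guard and takes the backend flag from the matrix, it runs once per backend leg rather than only on PTX; the --no-cuda-graphs flag is dropped here, while the dedicated CUDA-Graphs steps kept below now pass --cuda-graphs explicitly.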
