Skip to content

Commit 1569343

Browse files
[cuda-graphs] Add support for --cuda-graphs argument and update default behavior for CUDA Graphs
1 parent e8ceba7 commit 1569343

2 files changed

Lines changed: 12 additions & 4 deletions

File tree

llama-tornado

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,9 @@ class LlamaRunner:
9393
if args.batch_prefill_size is not None:
9494
cmd.append(f"-Dllama.prefillBatchSize={args.batch_prefill_size}")
9595

96-
if args.no_cuda_graphs:
96+
if args.cuda_graphs:
97+
cmd.append("-Dllama.cudaGraphs=true")
98+
elif args.no_cuda_graphs:
9799
cmd.append("-Dllama.cudaGraphs=false")
98100

99101
# Debug options
@@ -505,11 +507,17 @@ def create_parser() -> argparse.ArgumentParser:
505507
"N>1: batched prefill processing N tokens per chunk (llama.prefillBatchSize=N)."
506508
),
507509
)
510+
prefill_group.add_argument(
511+
"--cuda-graphs",
512+
dest="cuda_graphs",
513+
action="store_true",
514+
help="Enable CUDA graph capture/replay (llama.cudaGraphs=true); PTX backend only",
515+
)
508516
prefill_group.add_argument(
509517
"--no-cuda-graphs",
510518
dest="no_cuda_graphs",
511519
action="store_true",
512-
help="Disable CUDA graph capture/replay (llama.cudaGraphs=false); useful for debugging",
520+
help="Disable CUDA graph capture/replay (llama.cudaGraphs=false); no-op, disabled by default",
513521
)
514522

515523
# Advanced options

src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ public interface TornadoVMMasterPlan {
2626
boolean ENABLE_TORNADOVM_INIT_TIME = Boolean.parseBoolean(
2727
System.getProperty("llama.EnableTimingForTornadoVMInit", "False"));
2828

29-
/** When {@code false}, {@code withCUDAGraph()} is never called — useful for debugging. */
29+
/** When {@code true}, {@code withCUDAGraph()} is called — PTX/CUDA backend only. */
3030
boolean CUDA_GRAPHS = Boolean.parseBoolean(
31-
System.getProperty("llama.cudaGraphs", "true"));
31+
System.getProperty("llama.cudaGraphs", "false"));
3232

3333
boolean WITH_PREFILL_DECODE = Boolean.getBoolean("llama.withPrefillDecode");
3434

0 commit comments

Comments
 (0)