Skip to content

Commit 41f0ee5

Browse files
authored
Merge pull request #112 from beehive-lab/refactor/llama-tornado
llama-tornado script curation
2 parents 8ce9f82 + e0414a3 commit 41f0ee5

1 file changed

Lines changed: 9 additions & 18 deletions

File tree

llama-tornado

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -499,17 +499,13 @@ def create_parser() -> argparse.ArgumentParser:
499499
help="Execute the command after showing it (use with --show-command)",
500500
)
501501

502-
# Prefill/Decode optimization
503-
prefill_group = parser.add_argument_group("Prefill/Decode Optimization")
502+
# Prefill-Decode optimizations
503+
prefill_group = parser.add_argument_group("Prefill-Decode Optimizations")
504504
prefill_group.add_argument(
505505
"--with-prefill-decode",
506506
dest="with_prefill_decode",
507507
action="store_true",
508-
help=(
509-
"Enable prefill/decode separation. "
510-
"Alone: sequential prefill (skip logits) + standard decode. "
511-
"With --batch-prefill-size N (N>1): batched GPU prefill via TornadoVMMasterPlanWithBatchPrefillDecode."
512-
),
508+
help="Enable single-token prefill decode",
513509
)
514510
prefill_group.add_argument(
515511
"--batch-prefill-size",
@@ -518,22 +514,17 @@ def create_parser() -> argparse.ArgumentParser:
518514
default=None,
519515
metavar="N",
520516
help=(
521-
"Prefill chunk size (requires --with-prefill-decode). "
522-
"N=1: sequential prefill (same as --with-prefill-decode alone). "
523-
"N>1: batched prefill processing N tokens per chunk (llama.prefillBatchSize=N)."
517+
"Enable batching in prefill when --with-prefill-decode is active and N>1. "
524518
),
525519
)
526-
prefill_group.add_argument(
520+
521+
# Advanced CUDA features
522+
advanced_cuda_features_group = parser.add_argument_group("Advanced CUDA Features")
523+
advanced_cuda_features_group.add_argument(
527524
"--cuda-graphs",
528525
dest="cuda_graphs",
529526
action="store_true",
530-
help="Enable CUDA graph capture/replay (llama.cudaGraphs=true); PTX backend only",
531-
)
532-
prefill_group.add_argument(
533-
"--no-cuda-graphs",
534-
dest="no_cuda_graphs",
535-
action="store_true",
536-
help="Disable CUDA graph capture/replay (llama.cudaGraphs=false); no-op, disabled by default",
527+
help="Enable CUDA graph capture/replay (llama.cudaGraphs=true); PTX backend only.",
537528
)
538529

539530
# Advanced options

0 commit comments

Comments (0)