Commit 2b80f8d
[#9306][cleanup] Remove some fields with redefined defaults (#11671)
* Why? We would like to be able to use a TorchLlmArgs config in AutoDeploy's own version with minimal changes.

* What? This commit removes the redefinition of:
  - `model_kwargs`: existing usages guarded against `None` the same way as an empty dict.
  - `max_batch_size`: most unit tests set it explicitly; a few configs were updated to keep the old default.
  - `max_beam_width`: instead, a validator is added for it.
  - `attn_backend`: although the defaults of the base class ("TRTLLM") and AutoDeploy ("flashinfer") differ, the `update_transforms_with_shortcuts` validator in practice reads the default from `default.yaml`, which is "flashinfer".
  - `sampler`: the executor code already supported both; it is tweaked so that the "auto" value corresponds to the now-removed default.

It also removes `cuda_graph_batch_sizes` in favor of `cuda_graph_config.batch_sizes`, with the necessary adjustments to unit tests and existing configs.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent 889b81c commit 2b80f8d

34 files changed

Lines changed: 380 additions & 176 deletions
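To make the "validator instead of a redefined default" change concrete, here is a minimal sketch assuming Pydantic-style args classes; the class names, the beam-width constraint, and the helper method are illustrative, not the actual TensorRT-LLM code:

from typing import Optional

from pydantic import BaseModel, field_validator


class BaseLlmArgs(BaseModel):
    # The inherited defaults now stand as-is in the subclass.
    max_beam_width: int = 1
    model_kwargs: Optional[dict] = None


class AutoDeployLlmArgs(BaseLlmArgs):
    # Instead of redefining the `max_beam_width` default, a validator
    # rejects values AutoDeploy cannot serve (constraint illustrative).
    @field_validator("max_beam_width")
    @classmethod
    def _validate_max_beam_width(cls, value: int) -> int:
        if value != 1:
            raise ValueError("AutoDeploy supports max_beam_width=1 only.")
        return value

    def effective_model_kwargs(self) -> dict:
        # Guarding against None the same way as an empty dict, so the
        # `model_kwargs` default need not be redefined either.
        return self.model_kwargs or {}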

examples/auto_deploy/model_registry/configs/dashboard_default.yaml

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ transforms:
   fuse_rmsnorm_quant_fp8:
     stage: post_load_fusion
     enabled: true
+max_batch_size: 8
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 # Configuration for Gemma 3 1B model
 # Specific sequence length requirement due to small attention window
 max_seq_len: 511
+max_batch_size: 8

examples/auto_deploy/model_registry/configs/glm-4.7-flash.yaml

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,8 @@ compile_backend: torch-cudagraph
 max_batch_size: 64
 max_seq_len: 4096
 enable_chunked_prefill: true
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64]
 transforms:
   match_swiglu_pattern:
     enabled: true

examples/auto_deploy/model_registry/configs/kimi_k2.yaml

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,8 @@ max_num_tokens: 4096
 max_batch_size: 64
 world_size: 8
 enable_chunked_prefill: true
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64]
 kv_cache_config:
   dtype: bfloat16
   enable_block_reuse: false

examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@
 max_batch_size: 1024
 max_num_tokens: 2048
 trust_remote_code: true
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024]
 kv_cache_config:
   dtype: fp8
 # match_swiglu_pattern fuses gate+up projections before sharding, but the

examples/auto_deploy/model_registry/configs/llama4_scout.yaml

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@
 max_batch_size: 1024
 max_num_tokens: 2048
 trust_remote_code: true
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024]
 kv_cache_config:
   dtype: fp8
 transforms:

examples/auto_deploy/model_registry/configs/nano_v3.yaml

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@ enable_chunked_prefill: true
 attn_backend: trtllm
 model_factory: AutoModelForCausalLM
 skip_loading_weights: false
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
 kv_cache_config:
   free_gpu_memory_fraction: 0.88
   # tunable mamba cache dtype

examples/auto_deploy/model_registry/configs/nemotron-nano-9b-v2.yaml

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,8 @@ max_num_tokens: 8192

 skip_loading_weights: false

+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128]
 transforms:
   compile_model:
     backend: torch-cudagraph
-    cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128]

examples/auto_deploy/model_registry/configs/qwen3.5_moe_35b.yaml

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@ attn_backend: trtllm
 max_seq_len: 8192
 max_num_tokens: 4096
 max_batch_size: 512
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
 enable_chunked_prefill: true
 model_factory: Qwen3_5MoeForConditionalGeneration
 kv_cache_config:

examples/auto_deploy/model_registry/configs/qwen3.5_moe_400b.yaml

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@ attn_backend: trtllm
 max_seq_len: 262144
 max_num_tokens: 8192
 max_batch_size: 32
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+cuda_graph_config:
+  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
 enable_chunked_prefill: true
 model_factory: Qwen3_5MoeForConditionalGeneration
 kv_cache_config:
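For existing configs outside this repository, the key move shown in the diffs above is mechanical. A hypothetical migration helper, not part of this commit, sketching that transformation on a config dict:

def migrate_cuda_graph_key(config: dict) -> dict:
    # Move the removed flat key into the nested location the configs
    # above now use: cuda_graph_config.batch_sizes.
    batch_sizes = config.pop("cuda_graph_batch_sizes", None)
    if batch_sizes is not None:
        config.setdefault("cuda_graph_config", {})["batch_sizes"] = batch_sizes
    return config


assert migrate_cuda_graph_key({"cuda_graph_batch_sizes": [1, 2, 4]}) == {
    "cuda_graph_config": {"batch_sizes": [1, 2, 4]}
}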
