test(agentic): enable blocking CUDA offload diagnostics

cquil11 · cquil11 · commit 06a4ea7711ce · 2026-06-04T16:24:50.000-05:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9420,11 +9420,13 @@ dsv4-fp4-b300-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 4, offloading: none,  conc-list: [1, 4, 8, 16, 32] }
-      - { tp: 8, offloading: none,  conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [8, 16, 32, 64, 128] }
+      # TEMPORARY: run only native CPU-offload scenarios while diagnosing
+      # asynchronous CUDA failures; re-enable the disabled entries afterwards.
+      # - { tp: 4, offloading: none,  conc-list: [1, 4, 8, 16, 32] }
+      # - { tp: 8, offloading: none,  conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
+      # - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [8, 16, 32, 64, 128] }
       - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,   conc-list: [32, 48, 64, 96, 128, 192, 256] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
+      # - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
 
 gptoss-fp4-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -144,6 +144,9 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
+# Temporary diagnostic: make CUDA kernel launches synchronous so asynchronous
+# failures surface at the failing op, not a later sync point (slows runs).
+export CUDA_LAUNCH_BLOCKING=1
 
 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \