Skip to content

Commit eb885ff

Browse files
committed
fix(profile): enable vllm trace output for GB200
1 parent: 2f300a3 · commit: eb885ff

2 files changed

Lines changed: 12 additions & 2 deletions

File tree

.github/workflows/profile.yml

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -241,17 +241,19 @@ jobs:
241241
fi
242242
243243
trace_path="profile_${res_name}.trace.json.gz"
244-
if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then
244+
if [ ! -f "$trace_path" ] && [ -d LOGS ]; then
245245
trace_candidate="$(python3 - <<'PY'
246246
from pathlib import Path
247247
248-
root = Path("LOGS/profiles")
248+
root = Path("LOGS")
249249
candidates = [
250250
p for p in root.rglob("*")
251251
if p.is_file() and (
252252
p.name.endswith(".trace.json")
253253
or p.name.endswith(".trace.json.gz")
254254
or p.name.endswith(".pt.trace.json")
255+
or p.name.endswith(".pt.trace.json.gz")
256+
or p.name.endswith(".json.gz")
255257
or p.name.endswith(".json")
256258
)
257259
]
@@ -293,6 +295,11 @@ jobs:
293295
fi
294296
else
295297
echo "Profile trace not found: $trace_path" >&2
298+
if [ -d LOGS ]; then
299+
echo "LOGS profile candidates:" >&2
300+
find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true
301+
fi
302+
exit 1
296303
fi
297304
298305
- name: Process result (json -> agg)

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,8 @@ backend:
3434
connector: null
3535
aggregated_environment:
3636
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
37+
VLLM_RPC_TIMEOUT: "1800000"
38+
VLLM_TORCH_PROFILER_DIR: "/logs/profiles/agg"
3739
TILELANG_CLEANUP_TEMP_FILES: "1"
3840
VLLM_USE_NCCL_SYMM_MEM: "1"
3941
TORCH_SYMMMEM: "NVSHMEM"
@@ -67,6 +69,7 @@ backend:
6769
max-num-seqs: 256
6870
max-num-batched-tokens: 256
6971
max-cudagraph-capture-size: 256
72+
profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
7073
trust-remote-code: true
7174
no-enable-prefix-caching: true
7275
no-enable-flashinfer-autotune: true

0 commit comments

Comments (0)