Skip to content

Commit c7e4b1d

Browse files
authored
fix: various follow-up bug fixes (#302)
* various bug fixes
* various bug fixes pt 2
* various bug fixes pt 3
* various bug fixes pt 4 -- use bash expansion in cw
1 parent d9ddddd commit c7e4b1d

5 files changed

Lines changed: 10 additions & 8 deletions

File tree

.github/workflows/full-sweep-1k1k-scheduler.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ jobs:
5151
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5252
secrets: inherit
5353
with:
54+
exp-name: "dsr1_1k1k"
5455
isl: 1024
5556
osl: 1024
5657
max-model-len: 2248
@@ -60,7 +61,6 @@ jobs:
6061
model-prefix: ${{ matrix.config.model-prefix }}
6162
framework: ${{ matrix.config.framework }}
6263
precision: ${{ matrix.config.precision }}
63-
exp-name: "dsr1_1k1k"
6464
conc-list: ${{ toJson(matrix.config.conc) }}
6565
spec-decoding: ${{ matrix.config.spec-decoding }}
6666
disagg: ${{ matrix.config.disagg }}
@@ -116,6 +116,7 @@ jobs:
116116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
117117
secrets: inherit
118118
with:
119+
exp-name: "gptoss_1k1k"
119120
isl: 1024
120121
osl: 1024
121122
max-model-len: 2248
@@ -125,7 +126,6 @@ jobs:
125126
model-prefix: ${{ matrix.config.model-prefix }}
126127
framework: ${{ matrix.config.framework }}
127128
precision: ${{ matrix.config.precision }}
128-
exp-name: "dsr1_1k1k"
129129
conc-list: ${{ toJson(matrix.config.conc) }}
130130
spec-decoding: ${{ matrix.config.spec-decoding }}
131131
disagg: ${{ matrix.config.disagg }}

.github/workflows/full-sweep-1k8k-scheduler.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ jobs:
5151
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5252
secrets: inherit
5353
with:
54+
exp-name: "dsr1_1k8k"
5455
isl: 1024
5556
osl: 8192
5657
max-model-len: 9416
@@ -60,7 +61,6 @@ jobs:
6061
model-prefix: ${{ matrix.config.model-prefix }}
6162
framework: ${{ matrix.config.framework }}
6263
precision: ${{ matrix.config.precision }}
63-
exp-name: "dsr1_1k8k"
6464
conc-list: ${{ toJson(matrix.config.conc) }}
6565
spec-decoding: ${{ matrix.config.spec-decoding }}
6666
disagg: ${{ matrix.config.disagg }}
@@ -116,6 +116,7 @@ jobs:
116116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
117117
secrets: inherit
118118
with:
119+
exp-name: "gptoss_1k8k"
119120
isl: 1024
120121
osl: 8192
121122
max-model-len: 9416
@@ -125,7 +126,6 @@ jobs:
125126
model-prefix: ${{ matrix.config.model-prefix }}
126127
framework: ${{ matrix.config.framework }}
127128
precision: ${{ matrix.config.precision }}
128-
exp-name: "dsr1_1k8k"
129129
conc-list: ${{ toJson(matrix.config.conc) }}
130130
spec-decoding: ${{ matrix.config.spec-decoding }}
131131
disagg: ${{ matrix.config.disagg }}

.github/workflows/full-sweep-8k1k-scheduler.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ jobs:
5151
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5252
secrets: inherit
5353
with:
54+
exp-name: "dsr1_8k1k"
5455
isl: 8192
5556
osl: 1024
5657
max-model-len: 9416
@@ -60,7 +61,6 @@ jobs:
6061
model-prefix: ${{ matrix.config.model-prefix }}
6162
framework: ${{ matrix.config.framework }}
6263
precision: ${{ matrix.config.precision }}
63-
exp-name: "dsr1_8k1k"
6464
conc-list: ${{ toJson(matrix.config.conc) }}
6565
spec-decoding: ${{ matrix.config.spec-decoding }}
6666
disagg: ${{ matrix.config.disagg }}
@@ -116,6 +116,7 @@ jobs:
116116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
117117
secrets: inherit
118118
with:
119+
exp-name: "gptoss_8k1k"
119120
isl: 8192
120121
osl: 1024
121122
max-model-len: 9416
@@ -125,7 +126,6 @@ jobs:
125126
model-prefix: ${{ matrix.config.model-prefix }}
126127
framework: ${{ matrix.config.framework }}
127128
precision: ${{ matrix.config.precision }}
128-
exp-name: "dsr1_8k1k"
129129
conc-list: ${{ toJson(matrix.config.conc) }}
130130
spec-decoding: ${{ matrix.config.spec-decoding }}
131131
disagg: ${{ matrix.config.disagg }}

runners/launch_b200-nb.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/"
44
PARTITION="main"
55
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
66

7-
UCX_NET_DEVICES=mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1,mlx5_11:1
7+
UCX_NET_DEVICES=eth0
88

99
# Cleanup any stale enroot locks from previous runs
1010
find /var/cache/enroot-container-images/$UID -type f -name "*.lock" | xargs rm

runners/launch_h200-cw.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ else
2121
CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
2222
fi
2323

24+
# The trailing 'rm -rf /dev/shm/sagemaker_sessions' cleans up shared memory left behind by SageMaker sessions inside the container.
25+
# The leftover shared memory seems to have been introduced in vLLM 0.11.2, but the issue only shows up on CoreWeave runners.
2426
srun --jobid=$JOB_ID \
2527
--container-image=$CONTAINER_IMAGE \
2628
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
2729
--container-mount-home \
2830
--container-workdir=/workspace/ \
2931
--no-container-entrypoint --export=ALL \
30-
bash -c 'bash benchmarks/'"${EXP_NAME%%_*}_${PRECISION}"'_h200_slurm.sh; rm -rf /dev/shm/sagemaker_sessions'
32+
bash -c "bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions"
3133

3234
scancel $JOB_ID

0 commit comments

Comments
 (0)