Skip to content

Commit 0b29d34

Browse files
committed
[NV] llm-d: stage gpt-oss-120b via 'hf download' to match repo convention
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent fbcd31d commit 0b29d34

1 file changed

Lines changed: 29 additions & 28 deletions

File tree

runners/launch_h200-dgxc-slurm.sh

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,26 @@
66
export SLURM_PARTITION="main"
77
export SLURM_ACCOUNT="sa-shared"
88

9+
# Inline copies of agentic_pip_install / ensure_hf_cli from
10+
# benchmarks/benchmark_lib.sh. We can't safely source the whole lib here
11+
# (it ships container-only side effects), but we want the same hf-CLI
12+
# install path the rest of the repo uses so model staging on the runner
13+
# host matches `hf download "$MODEL"` in single-node scripts.
14+
#######################################
# Run `python3 -m pip install` with the given arguments, adding
# --break-system-packages when the local pip advertises that flag in
# its `install --help` output.
# Arguments: "$@" - forwarded verbatim to `pip install`
# Returns:   exit status of the `pip install` invocation
#######################################
agentic_pip_install() {
    local pip_install=(python3 -m pip install)
    local help_text

    # Capture the help text before inspecting it instead of piping it
    # into `grep -q`: grep exits on the first match, the producer can
    # then fail on a closed pipe, and under `set -o pipefail` that made
    # the probe report "flag unsupported" even though it is supported.
    # The probe's own exit status is irrelevant, hence `|| true`.
    help_text=$(python3 -m pip install --help 2>/dev/null) || true

    if [[ "$help_text" == *"--break-system-packages"* ]]; then
        pip_install+=(--break-system-packages)
    fi

    "${pip_install[@]}" "$@"
}
21+
#######################################
# Make sure the `hf` command is resolvable, installing huggingface_hub
# for the current user if it is not.
# Globals:   HOME (read), PATH (may be extended with $HOME/.local/bin
#            and exported)
# Arguments: none
# Returns:   0 if `hf` is resolvable on return, non-zero otherwise
#######################################
ensure_hf_cli() {
    local user_bin="$HOME/.local/bin"

    if command -v hf >/dev/null 2>&1; then
        return 0
    fi

    # pip --user puts the binary under ~/.local/bin. Expose that
    # directory (once) and look again before touching the network: an
    # earlier --user install may already have left `hf` there while
    # this shell's PATH simply does not include it.
    case ":$PATH:" in
        *":$user_bin:"*) ;;
        *) export PATH="$user_bin:$PATH" ;;
    esac
    if command -v hf >/dev/null 2>&1; then
        return 0
    fi

    # The `hf` entry point ships with huggingface_hub >= 0.34.0; older
    # releases only provide `huggingface-cli`. A lower floor lets pip
    # treat an already-installed old release as satisfying the
    # requirement, so `hf` would never appear.
    # NOTE(review): if this is meant to stay an exact copy of a shared
    # helper elsewhere in the repo, apply the same floor there.
    agentic_pip_install --quiet --user "huggingface_hub[cli]>=0.34.0"

    # The function's result is whether `hf` is now actually available,
    # regardless of what pip reported.
    command -v hf >/dev/null 2>&1
}
28+
929
set -x
1030

1131
if [[ "$IS_MULTINODE" == "true" ]]; then
@@ -24,45 +44,26 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2444
# the container, so MODEL_PATH must be an existing host
2545
# directory (an HF id alone will not work). Resolution order:
2646
# 1. /models/gpt-oss-120b (cluster-shared, staged by an admin)
27-
# 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, written
28-
# by this script - downloaded once, cached across dispatches)
29-
# The download happens inline on the runner host so users with
30-
# only gh-dispatch access can stage the model without ssh.
47+
# 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, staged
48+
# here via `hf download`; cached across dispatches)
3149
GPTOSS_LOCAL_DIR="$HOME/inferencex-models/gpt-oss-120b"
32-
export GPTOSS_LOCAL_DIR
3350
if [[ -d "/models/gpt-oss-120b" ]]; then
3451
export MODEL_PATH="/models/gpt-oss-120b"
3552
else
3653
mkdir -p "$HOME/inferencex-models"
37-
stage_gptoss_120b() {
54+
# flock serializes concurrent dispatches so a second
55+
# run waits instead of racing the first download.
56+
if ! (
3857
set -euo pipefail
39-
# flock serializes concurrent dispatches so a second
40-
# run waits instead of racing the first download.
4158
exec 200>"$HOME/inferencex-models/.gpt-oss-120b.download.lock"
4259
flock -x 200
4360
if [[ -d "$GPTOSS_LOCAL_DIR" && -n "$(ls -A "$GPTOSS_LOCAL_DIR" 2>/dev/null)" ]]; then
44-
return 0 # already staged
61+
exit 0
4562
fi
4663
echo "Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
47-
if ! command -v huggingface-cli >/dev/null 2>&1 \
48-
&& ! python3 -c "import huggingface_hub" 2>/dev/null; then
49-
echo "Installing huggingface_hub via pip --user"
50-
python3 -m pip install --user --quiet huggingface_hub
51-
export PATH="$HOME/.local/bin:$PATH"
52-
fi
53-
if command -v huggingface-cli >/dev/null 2>&1; then
54-
huggingface-cli download openai/gpt-oss-120b \
55-
--local-dir "$GPTOSS_LOCAL_DIR"
56-
else
57-
python3 - <<'PY'
58-
import os
59-
from huggingface_hub import snapshot_download
60-
snapshot_download(repo_id="openai/gpt-oss-120b",
61-
local_dir=os.environ["GPTOSS_LOCAL_DIR"])
62-
PY
63-
fi
64-
}
65-
if ! stage_gptoss_120b; then
64+
ensure_hf_cli
65+
hf download openai/gpt-oss-120b --local-dir "$GPTOSS_LOCAL_DIR"
66+
); then
6667
echo "Error: failed to stage gpt-oss-120b on this runner." >&2
6768
exit 1
6869
fi

0 commit comments

Comments
 (0)