Skip to content

Commit fbcd31d

Browse files
committed
[NV] llm-d: pip-install huggingface_hub on demand, abort runner if staging fails
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent 94345c5 commit fbcd31d

1 file changed

Lines changed: 30 additions & 16 deletions

File tree

runners/launch_h200-dgxc-slurm.sh

Lines changed: 30 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -29,33 +29,47 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2929
# The download happens inline on the runner host so users with
3030
# only gh-dispatch access can stage the model without ssh.
3131
GPTOSS_LOCAL_DIR="$HOME/inferencex-models/gpt-oss-120b"
32+
export GPTOSS_LOCAL_DIR
3233
if [[ -d "/models/gpt-oss-120b" ]]; then
3334
export MODEL_PATH="/models/gpt-oss-120b"
3435
else
3536
mkdir -p "$HOME/inferencex-models"
36-
# flock serializes concurrent dispatches so a second run
37-
# waits for the first download to finish instead of racing.
38-
(
37+
stage_gptoss_120b() {
38+
set -euo pipefail
39+
# flock serializes concurrent dispatches so a second
40+
# run waits instead of racing the first download.
41+
exec 200>"$HOME/inferencex-models/.gpt-oss-120b.download.lock"
3942
flock -x 200
40-
if [[ ! -d "$GPTOSS_LOCAL_DIR" ]]; then
41-
echo "Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
42-
if command -v huggingface-cli >/dev/null 2>&1; then
43-
huggingface-cli download openai/gpt-oss-120b \
44-
--local-dir "$GPTOSS_LOCAL_DIR"
45-
elif python3 -c "import huggingface_hub" 2>/dev/null; then
46-
python3 - <<'PY'
43+
if [[ -d "$GPTOSS_LOCAL_DIR" && -n "$(ls -A "$GPTOSS_LOCAL_DIR" 2>/dev/null)" ]]; then
44+
return 0 # already staged
45+
fi
46+
echo "Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
47+
if ! command -v huggingface-cli >/dev/null 2>&1 \
48+
&& ! python3 -c "import huggingface_hub" 2>/dev/null; then
49+
echo "Installing huggingface_hub via pip --user"
50+
python3 -m pip install --user --quiet huggingface_hub
51+
export PATH="$HOME/.local/bin:$PATH"
52+
fi
53+
if command -v huggingface-cli >/dev/null 2>&1; then
54+
huggingface-cli download openai/gpt-oss-120b \
55+
--local-dir "$GPTOSS_LOCAL_DIR"
56+
else
57+
python3 - <<'PY'
4758
import os
4859
from huggingface_hub import snapshot_download
4960
snapshot_download(repo_id="openai/gpt-oss-120b",
5061
local_dir=os.environ["GPTOSS_LOCAL_DIR"])
5162
PY
52-
else
53-
echo "Error: neither huggingface-cli nor python3 huggingface_hub available." >&2
54-
echo " Cannot auto-stage gpt-oss-120b on this runner." >&2
55-
exit 1
56-
fi
5763
fi
58-
) 200>"$HOME/inferencex-models/.gpt-oss-120b.download.lock"
64+
}
65+
if ! stage_gptoss_120b; then
66+
echo "Error: failed to stage gpt-oss-120b on this runner." >&2
67+
exit 1
68+
fi
69+
if [[ ! -d "$GPTOSS_LOCAL_DIR" ]] || [[ -z "$(ls -A "$GPTOSS_LOCAL_DIR" 2>/dev/null)" ]]; then
70+
echo "Error: $GPTOSS_LOCAL_DIR is empty after staging step." >&2
71+
exit 1
72+
fi
5973
export MODEL_PATH="$GPTOSS_LOCAL_DIR"
6074
fi
6175
export MODEL_NAME="gpt-oss-120b"

0 commit comments

Comments
 (0)