# Slurm scheduling defaults exported for the commands that follow.
export SLURM_PARTITION="main"
export SLURM_ACCOUNT="sa-shared"
88
# Inline copies of agentic_pip_install / ensure_hf_cli from
# benchmarks/benchmark_lib.sh. We can't safely source the whole lib here
# (it ships container-only side effects), but we want the same hf-CLI
# install path the rest of the repo uses so model staging on the runner
# host matches `hf download "$MODEL"` in single-node scripts.
#######################################
# Run `python3 -m pip install` with the given arguments.
# Adds --break-system-packages when the installed pip advertises that
# option in `pip install --help`; otherwise the flag is omitted.
# Arguments: "$@" - forwarded verbatim to `pip install`
# Returns:   exit status of the pip invocation
#######################################
agentic_pip_install() {
  local -a pip_install=(python3 -m pip install)
  # Probe the help text so a pip without the option is not handed an
  # unknown flag. `--` stops grep from parsing the pattern as an option.
  if python3 -m pip install --help 2>/dev/null \
    | grep -q -- "--break-system-packages"; then
    pip_install+=(--break-system-packages)
  fi
  # Expand the array and the caller's arguments as separate words so
  # arguments containing spaces survive intact.
  "${pip_install[@]}" "$@"
}
#######################################
# Make sure the `hf` command is available.
# If `hf` is not already on PATH, installs huggingface_hub[cli] with
# `pip --user` (via agentic_pip_install) and prepends ~/.local/bin to
# PATH.
# Globals:   HOME (read), PATH (read; rewritten and exported on install)
# Returns:   0 if `hf` can be resolved afterwards, non-zero otherwise
#######################################
ensure_hf_cli() {
  if command -v hf >/dev/null 2>&1; then
    return 0
  fi
  # A failed install is reported but not fatal by itself: the final
  # lookup below decides the return status.
  # NOTE(review): confirm that every huggingface_hub release matching
  # >=0.25.0 actually ships the `hf` entry point; an older release that
  # is already installed would satisfy pip without providing `hf`.
  if ! agentic_pip_install --quiet --user "huggingface_hub[cli]>=0.25.0"; then
    echo "Warning: pip install of huggingface_hub[cli] failed." >&2
  fi
  # pip --user puts the binary under ~/.local/bin; make it visible.
  export PATH="${HOME}/.local/bin:${PATH}"
  command -v hf >/dev/null 2>&1
}
28+
929set -x
1030
1131if [[ " $IS_MULTINODE " == " true" ]]; then
@@ -24,45 +44,26 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2444 # the container, so MODEL_PATH must be an existing host
2545 # directory (an HF id alone will not work). Resolution order:
2646 # 1. /models/gpt-oss-120b (cluster-shared, staged by an admin)
27- # 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, written
28- # by this script - downloaded once, cached across dispatches)
29- # The download happens inline on the runner host so users with
30- # only gh-dispatch access can stage the model without ssh.
47+ # 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, staged
48+ # here via `hf download`; cached across dispatches)
3149 GPTOSS_LOCAL_DIR=" $HOME /inferencex-models/gpt-oss-120b"
32- export GPTOSS_LOCAL_DIR
3350 if [[ -d " /models/gpt-oss-120b" ]]; then
3451 export MODEL_PATH=" /models/gpt-oss-120b"
3552 else
3653 mkdir -p " $HOME /inferencex-models"
37- stage_gptoss_120b () {
54+ # flock serializes concurrent dispatches so a second
55+ # run waits instead of racing the first download.
56+ if ! (
3857 set -euo pipefail
39- # flock serializes concurrent dispatches so a second
40- # run waits instead of racing the first download.
4158 exec 200> " $HOME /inferencex-models/.gpt-oss-120b.download.lock"
4259 flock -x 200
4360 if [[ -d " $GPTOSS_LOCAL_DIR " && -n " $( ls -A " $GPTOSS_LOCAL_DIR " 2> /dev/null) " ]]; then
44- return 0 # already staged
61+ exit 0
4562 fi
4663 echo " Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
47- if ! command -v huggingface-cli > /dev/null 2>&1 \
48- && ! python3 -c " import huggingface_hub" 2> /dev/null; then
49- echo " Installing huggingface_hub via pip --user"
50- python3 -m pip install --user --quiet huggingface_hub
51- export PATH=" $HOME /.local/bin:$PATH "
52- fi
53- if command -v huggingface-cli > /dev/null 2>&1 ; then
54- huggingface-cli download openai/gpt-oss-120b \
55- --local-dir " $GPTOSS_LOCAL_DIR "
56- else
57- python3 - << 'PY '
58- import os
59- from huggingface_hub import snapshot_download
60- snapshot_download(repo_id="openai/gpt-oss-120b",
61- local_dir=os.environ["GPTOSS_LOCAL_DIR"])
62- PY
63- fi
64- }
65- if ! stage_gptoss_120b; then
64+ ensure_hf_cli
65+ hf download openai/gpt-oss-120b --local-dir " $GPTOSS_LOCAL_DIR "
66+ ); then
6667 echo " Error: failed to stage gpt-oss-120b on this runner." >&2
6768 exit 1
6869 fi
0 commit comments