# benchmark-tmpl.yml
# Forked from SemiAnalysisAI/InferenceX.
name: Template - Benchmark

# Reusable workflow: it is triggered only through `workflow_call`, i.e. when
# another workflow references this file with `uses:`.
# Every input is a string except `dp-attn` (boolean); every input is required
# except `random-range-ratio`.
on:
  workflow_call:
    inputs:
      # Runner label passed to `runs-on`; also exported to the result-processing
      # step as RUNNER_TYPE and shown in the job name.
      runner:
        required: true
        type: string
      # Exported as IMAGE.
      image:
        required: true
        type: string
      # Exported as MODEL.
      model:
        required: true
        type: string
      # Exported as PRECISION; part of the job name and the result file name.
      precision:
        required: true
        type: string
      # Exported as FRAMEWORK; part of the result file name.
      framework:
        required: true
        type: string
      # Exported as EXP_NAME; prefix of the job name and the result file name.
      exp-name:
        required: true
        type: string
      # Exported as ISL (presumably input sequence length -- confirm against
      # the launch scripts).
      isl:
        required: true
        type: string
      # Exported as OSL (presumably output sequence length -- confirm against
      # the launch scripts).
      osl:
        required: true
        type: string
      # Exported as TP; part of the job name and the result file name.
      tp:
        required: true
        type: string
      # Exported as EP_SIZE; part of the job name and the result file name.
      ep:
        required: true
        type: string
      # Exported as DP_ATTENTION; part of the job name and the result file name.
      dp-attn:
        required: true
        type: boolean
      # Exported as MAX_MODEL_LEN.
      max-model-len:
        required: true
        type: string
      # Exported as CONC; part of the job name and the result file name.
      conc:
        required: true
        type: string
      # Exported as RANDOM_RANGE_RATIO; the only optional input.
      random-range-ratio:
        required: false
        type: string
        default: '0.8'
# Workflow-level environment, visible to every step of every job.
# Apart from HF_TOKEN and HF_HUB_CACHE, each variable mirrors one workflow input
# so the shell steps read plain environment variables instead of `${{ }}`
# expressions.
# NOTE(review): this workflow declares no `secrets:` under `workflow_call`, so
# HF_TOKEN is presumably populated by callers using `secrets: inherit` -- confirm.
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  EXP_NAME: ${{ inputs.exp-name }}
  MODEL: ${{ inputs.model }}
  ISL: ${{ inputs.isl }}
  OSL: ${{ inputs.osl }}
  MAX_MODEL_LEN: ${{ inputs.max-model-len }}
  RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
  IMAGE: ${{ inputs.image }}
  FRAMEWORK: ${{ inputs.framework }}
  PRECISION: ${{ inputs.precision }}
  TP: ${{ inputs.tp }}
  EP_SIZE: ${{ inputs.ep }}
  # `dp-attn` is a boolean input; as an environment variable it is a string.
  DP_ATTENTION: ${{ inputs.dp-attn }}
  CONC: ${{ inputs.conc }}
# Limit the automatic GITHUB_TOKEN to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Single job: clean the runner, check out the repository, run the
  # runner-specific launch script, post-process the result and upload it.
  benchmark:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 180
    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}'
    steps:
      # Remove leftovers from earlier runs before benchmarking.
      # Docker branch (only when the docker CLI exists and the daemon answers):
      #   - hosts b200-79/80/81: stop each container gracefully (90 s), wait for
      #     it to exit, force-remove it, then warn if GPUs still show compute
      #     processes;
      #   - any other host: force-remove all containers, prune networks and poll
      #     every 5 s until no container is listed.
      # Slurm branch (only when squeue exists): cancel all jobs of the current
      # user and poll every 5 s until the queue is empty for that user.
      # NOTE(review): why the three b200 hosts need the gentler path is not
      # recorded in this file.
      - name: Resource cleanup
        run: |
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            host=$(hostname)
            if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then
              echo "[INFO] Running container-by-container cleanup on $host"
              for cid in $(docker ps -aq); do
                echo "[INFO] Cleaning container $cid"
                # Try graceful first
                docker stop -t 90 "$cid" || true
                # Wait until it's really dead
                docker wait "$cid" >/dev/null 2>&1 || true
                # Force remove if anything lingers
                docker rm -f "$cid" >/dev/null 2>&1 || true
              done
              # Give a moment for GPU processes to fully terminate
              sleep 2
              # Verify GPUs are now idle
              if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
                echo "[WARN] After stop, GPU still busy:"
                nvidia-smi
                # Last resort if driver allows and GPUs appear idle otherwise:
                # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
              fi
            else
              echo "[Docker] Cleaning up resources ..."
              docker ps -aq | xargs -r docker rm -f
              docker network prune -f
              while [ -n "$(docker ps -aq)" ]; do
                docker ps -a
                sleep 5
              done
            fi
          fi
          if command -v squeue >/dev/null 2>&1; then
            echo "[Slurm] Cleaning up resources ..."
            scancel -u "$USER"
            while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
              squeue -u "$USER"
              sleep 5
            done
          fi

      # Check out the repository with the REPO_PAT secret; `fetch-depth: 0`
      # fetches the full history instead of a single commit.
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0

      # Run ./runners/launch_<prefix>.sh, where <prefix> is the runner name up
      # to (not including) its first underscore. The step fails unless the
      # script leaves "$RESULT_FILENAME.json" in the working directory; on
      # success RESULT_FILENAME is published to later steps via GITHUB_ENV.
      - name: Launch job script
        env:
          RUNNER_NAME: ${{ runner.name }}
          RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }}
        run: |
          bash "./runners/launch_${RUNNER_NAME%%_*}.sh"
          if [ -f "$RESULT_FILENAME.json" ]; then
            echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV"
          else
            echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
            exit 1
          fi

      # Post-process the raw result. The script receives the runner label as
      # RUNNER_TYPE in addition to the workflow-level environment.
      # NOTE(review): presumably this writes agg_<RESULT_FILENAME>.json, which
      # the upload step expects -- confirm in utils/process_result.py.
      - name: Process result
        env:
          RUNNER_TYPE: ${{ inputs.runner }}
        run: |
          python3 utils/process_result.py

      # Publish the aggregated result as an artifact named after the run.
      # A missing file fails the job, matching the hard failure in the launch
      # step, instead of only emitting a warning.
      - name: Upload result
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
        with:
          name: ${{ env.RESULT_FILENAME }}
          path: agg_${{ env.RESULT_FILENAME }}.json
          if-no-files-found: error