Skip to content

Commit daae14b

Browse files
committed
Refactor nvidia and amd into different methods
1 parent e361c8b commit daae14b

1 file changed

Lines changed: 102 additions & 75 deletions

File tree

init/eessi_archdetect.sh

Lines changed: 102 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -175,111 +175,138 @@ cpupath(){
175175
fi
176176
}
177177

178-
accelpath() {
179-
# If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
180-
log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' "
181-
if [ ! -z $EESSI_ACCELERATOR_TARGET_OVERRIDE ]; then
182-
if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/nvidia/cc[0-9]+$ ]]; then
183-
echo ${EESSI_ACCELERATOR_TARGET_OVERRIDE}
178+
# Detect an NVIDIA GPU through nvidia-smi and print the matching accelerator path.
#
# Arguments: none
# Outputs:   "accel/nvidia/cc<digits>" on stdout, built from the compute
#            capability on the first line of nvidia-smi output with the dot
#            removed (e.g. 8.0 -> cc80)
# Returns:   0 on success
#            2 if the nvidia-smi command is not found
#            3 if the temporary file cannot be created, nvidia-smi fails, or
#              its output does not yield a numeric compute capability (in the
#              last two cases the raw output is kept in the logged temp file)
nvidia_accelpath() {
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: could not create temporary file for nvidia-smi output"
        return 3
    fi

    # Redirect stdout to the file first and only then duplicate stderr onto it;
    # the reverse order ('2>&1 > file') would send stderr to this function's
    # stdout, i.e. into the caller's command substitution, and not to the file.
    if ! nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader > "$nvidia_smi_out" 2>&1; then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # Only the first line is inspected; compute_cap is the 4th CSV field.
    local nvidia_smi_info cuda_cc
    nvidia_smi_info=$(head -n 1 "$nvidia_smi_out")
    cuda_cc=$(printf '%s\n' "$nvidia_smi_info" | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')

    # Reject anything that is not purely numeric (empty field, "[N/A]", a
    # warning line, ...) instead of emitting a path that cannot exist.
    if [[ ! "$cuda_cc" =~ ^[0-9]+$ ]]; then
        log "DEBUG" "nvidia_accelpath: could not derive CUDA compute capability from nvidia-smi output '${nvidia_smi_info}', see output in $nvidia_smi_out"
        return 3
    fi
    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"

    echo "accel/nvidia/cc${cuda_cc}"
    rm -f -- "$nvidia_smi_out"
    return 0
}
190206

191-
# check for NVIDIA GPUs via nvidia-smi command
192-
# nvidia_smi=$(command -v nvidia-smi)
193-
# if [[ $? -eq 0 ]]; then
194-
# log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}"
195-
# nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX)
196-
# nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out
197-
# if [[ $? -eq 0 ]]; then
198-
# nvidia_smi_info=$(head -n 1 $nvidia_smi_out)
199-
# cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')
200-
# log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
201-
# res="accel/nvidia/cc${cuda_cc}"
202-
# log "DEBUG" "accelpath: result: ${res}"
203-
# echo $res
204-
# rm -f $nvidia_smi_out
205-
# else
206-
# log "DEBUG" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
207-
# exit 3
208-
# fi
209-
# else
210-
# log "DEBUG" "accelpath: nvidia-smi command not found"
211-
# exit 2
212-
# fi
213-
214-
# check for AMD GPUs via amd-smi command
215-
# amd_smi=$(command -v amd-smi)
216-
# if [[ $? -eq 0 ]]; then
217-
# log "DEBUG" "accelpath: amd-smi command found @ ${amd_smi}"
218-
# amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX)
219-
# amd-smi static --asic | grep TARGET_GRAPHICS_VERSION 2>&1 > $amd_smi_out
220-
# if [[ $? -eq 0 ]]; then
221-
# amd_smi_info=$(head -n 1 $amd_smi_out)
222-
# amdgcn_cc=$(echo $amd_smi_info | sed 's/.*: //')
223-
# log "DEBUG" "accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
224-
# res="accel/amd/${amdgcn_cc}"
225-
# echo $res
226-
# rm -f $amd_smi_out
227-
# else
228-
# log "DEBUG" "accelpath: ami-smi command failed, see output in $amd_smi_out"
229-
# exit 3
230-
# fi
231-
# else
232-
# log "DEBUG" "accelpath: amd-smi command not found"
233-
# exit 2
234-
# fi
235-
236-
# logic ported from https://github.com/llvm/llvm-project/blob/6e738e187055bbd33b6c3d203b6b55904dfcb624/clang/tools/offload-arch/AMDGPUArchByKFD.cpp
237-
# check for AMD GPUs via KFD sysfs interface (No amd-smi or Python required)
238-
kfd_nodes="/sys/devices/virtual/kfd/kfd/topology/nodes"
207+
# Detect an AMD GPU and print the matching accelerator path.
#
# Two methods are tried in order:
#   1. The KFD sysfs topology (no amd-smi or Python required); logic ported
#      from LLVM's clang/tools/offload-arch/AMDGPUArchByKFD.cpp.
#   2. The amd-smi command, as a fallback.
#
# Arguments: $1 - (optional) KFD topology nodes directory; defaults to
#                 /sys/devices/virtual/kfd/kfd/topology/nodes
# Outputs:   "accel/amd/gfx<...>" on stdout
# Returns:   0 on success
#            2 if no GPU was found via KFD and amd-smi is not available
#            3 if the temporary file cannot be created, amd-smi fails, or its
#              output has no valid TARGET_GRAPHICS_VERSION (in the last two
#              cases the raw output is kept in the logged temp file)
amd_accelpath() {
    local kfd_nodes="${1:-/sys/devices/virtual/kfd/kfd/topology/nodes}"
    local amdgcn_cc=""

    # Method 1: Check for AMD GPUs via KFD sysfs interface
    if [[ -d "$kfd_nodes" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"
        local node prop_file gfx_ver major minor step

        # ls -1v ensures numeric/version sorting (nodes/0, nodes/1, ..., nodes/10);
        # the first node with a usable GPU target version wins.
        while IFS= read -r node; do
            prop_file="$kfd_nodes/$node/properties"
            [[ -f "$prop_file" ]] || continue

            # Extract the integer value. 2>/dev/null suppresses read errors.
            gfx_ver=$(awk '$1 == "gfx_target_version" {print $2; exit}' "$prop_file" 2>/dev/null)

            # Skip missing or non-numeric values instead of feeding them to
            # an arithmetic comparison; force base 10 so leading zeros are
            # not read as octal.
            [[ "$gfx_ver" =~ ^[0-9]+$ ]] || continue
            gfx_ver=$((10#$gfx_ver))

            # 0 means it's a CPU node
            (( gfx_ver > 0 )) || continue

            # gfx_target_version is major*10000 + minor*100 + step;
            # the step is printed in hex (e.g. 90010 -> gfx90a).
            major=$(( (gfx_ver / 10000) % 100 ))
            minor=$(( (gfx_ver / 100) % 100 ))
            step=$(( gfx_ver % 100 ))
            amdgcn_cc=$(printf "gfx%d%d%x" "$major" "$minor" "$step")

            log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
            break
        done < <(ls -1v "$kfd_nodes" 2>/dev/null)

        if [[ -n "$amdgcn_cc" ]]; then
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: Fallback to AMD GPUs via amd-smi command using /tmp files
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: could not create temporary file for amd-smi output"
        return 3
    fi

    # Capture amd-smi's own stdout+stderr and test amd-smi's own exit status;
    # piping straight into grep would report grep's status and lose the
    # diagnostics the log message points to.
    if ! amd-smi static --asic > "$amd_smi_out" 2>&1; then
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi

    local amd_smi_info
    amd_smi_info=$(grep -m 1 TARGET_GRAPHICS_VERSION "$amd_smi_out")
    # Keep what follows the last ': ' and drop trailing whitespace.
    amdgcn_cc=$(printf '%s\n' "$amd_smi_info" | sed 's/.*: //; s/[[:space:]]*$//')

    if [[ ! "$amdgcn_cc" =~ ^gfx[0-9a-z]+$ ]]; then
        log "DEBUG" "amd_accelpath: no valid TARGET_GRAPHICS_VERSION in amd-smi output, see output in $amd_smi_out"
        return 3
    fi
    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"

    echo "accel/amd/${amdgcn_cc}"
    rm -f -- "$amd_smi_out"
    return 0
}
282272

273+
# Print the accelerator path for this system.
#
# Globals:   EESSI_ACCELERATOR_TARGET_OVERRIDE (read) - if non-empty, it is
#            validated and printed instead of probing the hardware
# Arguments: none
# Outputs:   "accel/nvidia/cc<digits>" or "accel/amd/gfx<...>" on stdout
# Returns:   0 on success
#            1 if the override is set but malformed
# Exits:     2 if no supported accelerator is found (exit, not return)
accelpath() {
    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
    local override="${EESSI_ACCELERATOR_TARGET_OVERRIDE:-}"
    # Kept in a variable so the regex is used unquoted on the right of =~.
    local override_re='^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-z]+)$'
    log "DEBUG" "accelpath: Override variable set as '${override}' "

    # [[ -n "..." ]] instead of an unquoted [ ! -z ... ]: a value containing
    # whitespace would make '[' fail with a syntax error, so the bad override
    # would be silently ignored rather than rejected below.
    if [[ -n "$override" ]]; then
        if [[ "$override" =~ $override_re ]]; then
            echo "$override"
            return 0
        fi
        log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-z]+', but it does not: '$override'"
        return 1
    fi

    # Probe vendors in order (NVIDIA first, then AMD); the first probe that
    # succeeds provides the result.
    local probe res
    for probe in nvidia_accelpath amd_accelpath; do
        if res=$("$probe"); then
            log "DEBUG" "accelpath: result: ${res}"
            echo "$res"
            return 0
        fi
    done

    # Neither vendor was detected.
    # NOTE(review): this exits rather than returns, unlike the probe
    # functions; kept as-is because callers are not visible here - confirm.
    log "DEBUG" "accelpath: No supported accelerators found on this system."
    exit 2
}
309+
283310
# Parse command line arguments
284311
USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] <action: cpupath or accelpath>"
285312

0 commit comments

Comments
 (0)