Skip to content

Commit df5b271

Browse files
committed
Add AMD accelerator support and refactor accelpath method
1 parent 28af25e commit df5b271

1 file changed

Lines changed: 115 additions & 21 deletions

File tree

init/eessi_archdetect.sh

Lines changed: 115 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,106 @@ cpupath(){
175175
fi
176176
}
177177

178+
# Detect an NVIDIA GPU via the nvidia-smi command and print the matching
# accelerator path.
#
# Globals:   none (uses the log helper defined elsewhere in this script)
# Arguments: none
# Outputs:   "accel/nvidia/cc<digits>" on stdout, derived from the compute
#            capability on the first line of nvidia-smi output (dot removed,
#            e.g. 8.0 -> cc80)
# Returns:   0 on success
#            2 if the nvidia-smi command is not found
#            3 if nvidia-smi fails, or no compute capability can be derived
#              from its output (the temp file is kept for inspection)
nvidia_accelpath() {
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: failed to create temporary file for nvidia-smi output"
        return 3
    fi

    # Redirect stdout to the file FIRST and then duplicate stderr onto it, so
    # that diagnostics end up in the file (as the log message below promises)
    # rather than in this function's stdout, which callers capture.
    if ! nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader > "$nvidia_smi_out" 2>&1; then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # Only the first GPU reported is considered.
    local nvidia_smi_info cuda_cc
    nvidia_smi_info=$(head -n 1 "$nvidia_smi_out")
    # Fourth CSV field is compute_cap; drop the dot and any stray whitespace.
    cuda_cc=$(printf '%s\n' "$nvidia_smi_info" | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')
    cuda_cc=${cuda_cc//[[:space:]]/}

    # Refuse to emit a bogus path such as "accel/nvidia/cc" when the output
    # did not contain a numeric compute capability.
    if [[ ! "$cuda_cc" =~ ^[0-9]+$ ]]; then
        log "DEBUG" "nvidia_accelpath: could not derive CUDA compute capability from nvidia-smi output, see output in $nvidia_smi_out"
        return 3
    fi
    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"

    echo "accel/nvidia/cc${cuda_cc}"
    rm -f -- "$nvidia_smi_out"
    return 0
}
206+
207+
# Detect an AMD GPU and print the matching accelerator path.
#
# Method 1 reads the KFD sysfs topology (no amd-smi or Python required) and
# uses the first node whose gfx_target_version is a positive integer.
# Method 2 falls back to parsing TARGET_GRAPHICS_VERSION from
# `amd-smi static --asic`.
#
# Globals:   none (uses the log helper defined elsewhere in this script)
# Arguments: none
# Outputs:   "accel/amd/gfx<hex digits>" on stdout
# Returns:   0 on success
#            2 if no GPU was found via KFD and amd-smi is not available
#            3 if amd-smi fails, or no valid gfx target can be derived from
#              its output (the temp file is kept for inspection)
amd_accelpath() {
    local kfd_nodes="/sys/devices/virtual/kfd/kfd/topology/nodes"
    local amdgcn_cc=""
    # Declared up front so neither the loop variable nor the per-node values
    # leak into the caller's scope.
    local node prop_file gfx_ver major minor step

    # Method 1: Check for AMD GPUs via KFD sysfs interface
    if [[ -d "$kfd_nodes" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"

        # ls -1v gives numeric/version ordering (nodes/0, nodes/1, ..., nodes/10)
        while IFS= read -r node; do
            prop_file="$kfd_nodes/$node/properties"
            [[ -f "$prop_file" ]] || continue

            # Second column of the gfx_target_version line; read errors are
            # suppressed because an unreadable node is simply skipped.
            gfx_ver=$(awk '$1 == "gfx_target_version" {print $2; exit}' "$prop_file" 2>/dev/null)

            # Skip nodes without a plain decimal value; force base 10 so a
            # leading zero is never interpreted as octal.
            [[ "$gfx_ver" =~ ^[0-9]+$ ]] || continue
            gfx_ver=$(( 10#$gfx_ver ))
            # 0 means it's a CPU node
            (( gfx_ver > 0 )) || continue

            # Decode as major*10000 + minor*100 + step; the step is printed
            # in hexadecimal (e.g. 90010 -> gfx90a).
            major=$(( (gfx_ver / 10000) % 100 ))
            minor=$(( (gfx_ver / 100) % 100 ))
            step=$(( gfx_ver % 100 ))
            printf -v amdgcn_cc 'gfx%d%d%x' "$major" "$minor" "$step"
            log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
            break
        done < <(ls -1v "$kfd_nodes" 2>/dev/null)

        if [[ -n "$amdgcn_cc" ]]; then
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: Fallback to AMD GPUs via amd-smi command using /tmp files
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: failed to create temporary file for amd-smi output"
        return 3
    fi

    # Capture the full amd-smi output (stdout and stderr) and test amd-smi's
    # own exit status; piping straight into grep would only report grep's
    # status and leave the file without amd-smi's diagnostics.
    if ! amd-smi static --asic > "$amd_smi_out" 2>&1; then
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi

    # Only the first GPU reported is considered.
    local amd_smi_info
    amd_smi_info=$(grep -m 1 'TARGET_GRAPHICS_VERSION' "$amd_smi_out")
    # Keep what follows the last ": " and drop any stray whitespace.
    amdgcn_cc=${amd_smi_info##*: }
    amdgcn_cc=${amdgcn_cc//[[:space:]]/}

    # Accept only the gfx<hex> form (the same shape accelpath accepts for
    # overrides) so values such as "N/A" or an empty match are rejected.
    if [[ ! "$amdgcn_cc" =~ ^gfx[0-9a-f]+$ ]]; then
        log "DEBUG" "amd_accelpath: could not derive AMDGCN compute capability from amd-smi output, see output in $amd_smi_out"
        return 3
    fi
    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"

    echo "accel/amd/${amdgcn_cc}"
    rm -f -- "$amd_smi_out"
    return 0
}
272+
178273
accelpath() {
179274
# If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
180275
log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' "
181276
if [ ! -z $EESSI_ACCELERATOR_TARGET_OVERRIDE ]; then
182-
# Regex that allows both NVIDIA and AMD overrides
277+
# Updated regex to allow both NVIDIA and AMD overrides
183278
if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-f]+)$ ]]; then
184279
echo "$EESSI_ACCELERATOR_TARGET_OVERRIDE"
185280
return 0
@@ -189,28 +284,27 @@ accelpath() {
189284
fi
190285
fi
191286

192-
# check for NVIDIA GPUs via nvidia-smi command
193-
nvidia_smi=$(command -v nvidia-smi)
287+
# 1. Check for NVIDIA GPUs
288+
local nv_res
289+
nv_res=$(nvidia_accelpath)
194290
if [[ $? -eq 0 ]]; then
195-
log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}"
196-
nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX)
197-
nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out
198-
if [[ $? -eq 0 ]]; then
199-
nvidia_smi_info=$(head -n 1 $nvidia_smi_out)
200-
cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')
201-
log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
202-
res="accel/nvidia/cc${cuda_cc}"
203-
log "DEBUG" "accelpath: result: ${res}"
204-
echo $res
205-
rm -f $nvidia_smi_out
206-
else
207-
log "DEBUG" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
208-
exit 3
209-
fi
210-
else
211-
log "DEBUG" "accelpath: nvidia-smi command not found"
212-
exit 2
291+
log "DEBUG" "accelpath: result: ${nv_res}"
292+
echo "$nv_res"
293+
return 0
213294
fi
295+
296+
# 2. Check for AMD GPUs
297+
local amd_res
298+
amd_res=$(amd_accelpath)
299+
if [[ $? -eq 0 ]]; then
300+
log "DEBUG" "accelpath: result: ${amd_res}"
301+
echo "$amd_res"
302+
return 0
303+
fi
304+
305+
# 3. Fail gracefully if neither is found
306+
log "DEBUG" "accelpath: No supported accelerators found on this system."
307+
exit 2
214308
}
215309

216310
# Parse command line arguments

0 commit comments

Comments
 (0)