Skip to content

Commit ef52cb8

Browse files
committed
Add support for AMD GPU
1 parent 4004732 commit ef52cb8

1 file changed

Lines changed: 118 additions & 23 deletions

File tree

init/eessi_archdetect.sh

Lines changed: 118 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -175,41 +175,136 @@ cpupath(){
175175
fi
176176
}
177177

178+
# Detect an NVIDIA GPU via nvidia-smi and print the matching accelerator path.
#
# Outputs: "accel/nvidia/cc<NN>" on stdout, where <NN> is the CUDA compute
#          capability of the first GPU reported, with the dot removed
#          (e.g. 8.0 -> cc80). Nothing else is written to stdout.
# Returns: 0 on success
#          2 if the nvidia-smi command is not found
#          3 if nvidia-smi fails, or no compute capability can be parsed
#            from its output (the temporary output file is kept for debugging)
nvidia_accelpath() {
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: could not create temporary file for nvidia-smi output"
        return 3
    fi

    # Redirect stdout to the file first and only then duplicate stderr onto it.
    # With the reverse order ('2>&1 > file') stderr would end up on this
    # function's stdout and pollute the path captured by the caller.
    if ! nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap \
            --format=csv,noheader > "$nvidia_smi_out" 2>&1; then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # Use the first line that is a 4-field CSV record ending in '<major>.<minor>';
    # this skips any warning lines that were captured alongside the GPU records.
    local cc_regex='^[^,]*,[^,]*,[^,]*,[[:space:]]*([0-9]+)\.([0-9]+)[[:space:]]*$'
    local nvidia_smi_info="" cuda_cc="" line
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ $cc_regex ]]; then
            nvidia_smi_info=$line
            cuda_cc="${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
            break
        fi
    done < "$nvidia_smi_out"

    if [[ -z "$cuda_cc" ]]; then
        log "DEBUG" "nvidia_accelpath: no CUDA compute capability found in nvidia-smi output, see output in $nvidia_smi_out"
        return 3
    fi

    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
    rm -f -- "$nvidia_smi_out"
    echo "accel/nvidia/cc${cuda_cc}"
    return 0
}
206+
207+
# Detect an AMD GPU and print the matching accelerator path.
#
# Method 1 reads the KFD topology in sysfs (needs neither amd-smi nor Python):
# every node's 'properties' file is inspected in numeric node order and the
# first node with a gfx_target_version greater than 0 is used (0 means the
# node is a CPU). The version is decoded as major*10000 + minor*100 + step and
# formatted as gfx<major><minor><step in hex>, e.g. 90010 -> gfx90a.
# Method 2 falls back to the TARGET_GRAPHICS_VERSION reported by
# 'amd-smi static --asic'.
#
# Arguments: $1 - (optional) KFD topology nodes directory; defaults to
#                 /sys/devices/virtual/kfd/kfd/topology/nodes
# Outputs:   "accel/amd/gfx<...>" on stdout
# Returns:   0 on success
#            2 if no GPU was found via KFD and amd-smi is not available
#            3 if amd-smi did not yield a valid gfx target (the temporary
#              output file is kept for debugging)
amd_accelpath() {
    local kfd_nodes="${1:-/sys/devices/virtual/kfd/kfd/topology/nodes}"
    local amdgcn_cc=""

    # Method 1: KFD sysfs interface
    if [[ -d "$kfd_nodes" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"
        local entry node prop_file gfx_ver major minor step

        while IFS= read -r node; do
            prop_file="$kfd_nodes/$node/properties"
            [[ -f "$prop_file" ]] || continue

            # Read errors are deliberately silenced; an unreadable node is skipped.
            gfx_ver=$(awk '$1 == "gfx_target_version" { print $2; exit }' "$prop_file" 2>/dev/null)

            # Ignore missing or non-numeric values; force base 10 so that a
            # value with leading zeros is not interpreted as octal.
            [[ "$gfx_ver" =~ ^[0-9]+$ ]] || continue
            gfx_ver=$(( 10#$gfx_ver ))
            # 0 means the node is a CPU, not a GPU
            (( gfx_ver > 0 )) || continue

            major=$(( (gfx_ver / 10000) % 100 ))
            minor=$(( (gfx_ver / 100) % 100 ))
            step=$(( gfx_ver % 100 ))

            printf -v amdgcn_cc 'gfx%d%d%x' "$major" "$minor" "$step"
            log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
            break
        done < <(
            # Version sort gives numeric node order (0, 1, 2, ..., 10)
            for entry in "$kfd_nodes"/*; do
                [[ -e "$entry" ]] && printf '%s\n' "${entry##*/}"
            done | sort -V
        )

        if [[ -n "$amdgcn_cc" ]]; then
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: amd-smi command
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: could not create temporary file for amd-smi output"
        return 3
    fi

    # Keep the complete amd-smi output (stdout and stderr) so that the file
    # referenced in the failure message really contains the diagnostics.
    # The exit status is intentionally ignored here: success is decided by
    # whether a valid TARGET_GRAPHICS_VERSION can be extracted below.
    amd-smi static --asic > "$amd_smi_out" 2>&1 || true

    local amd_smi_info
    amd_smi_info=$(grep -m 1 'TARGET_GRAPHICS_VERSION' "$amd_smi_out") || amd_smi_info=""

    # Value is whatever follows the last ': ', minus any trailing whitespace
    amdgcn_cc=${amd_smi_info##*: }
    amdgcn_cc=${amdgcn_cc%%[[:space:]]*}

    if [[ ! "$amdgcn_cc" =~ ^gfx[0-9a-f]+$ ]]; then
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi

    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
    rm -f -- "$amd_smi_out"
    echo "accel/amd/${amdgcn_cc}"
    return 0
}
272+
178273
# Print the accelerator path for this system.
#
# Globals:  EESSI_ACCELERATOR_TARGET_OVERRIDE (read) - if non-empty it is used
#           instead of auto-detection and must match
#           'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-f]+'.
# Outputs:  the accelerator path on stdout
# Returns:  0 on success
#           1 if the override is set but malformed
# Exits:    with status 2 (terminating the current shell, not just this
#           function) when no supported accelerator is detected
accelpath() {
    local override="${EESSI_ACCELERATOR_TARGET_OVERRIDE:-}"

    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
    log "DEBUG" "accelpath: Override variable set as '${override}' "
    if [[ -n "$override" ]]; then
        # Accept both NVIDIA and AMD overrides. The value is tested with [[ ]]
        # so that an override containing whitespace is rejected here instead
        # of breaking the test and silently falling through to auto-detection.
        local override_regex='^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-f]+)$'
        if [[ "$override" =~ $override_regex ]]; then
            echo "$override"
            return 0
        fi
        log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-f]+', but it does not: '$override'"
        return 1
    fi

    # 1. Check for NVIDIA GPUs
    local nv_res
    if nv_res=$(nvidia_accelpath); then
        log "DEBUG" "accelpath: result: ${nv_res}"
        echo "$nv_res"
        return 0
    fi

    # 2. Check for AMD GPUs
    local amd_res
    if amd_res=$(amd_accelpath); then
        log "DEBUG" "accelpath: result: ${amd_res}"
        echo "$amd_res"
        return 0
    fi

    # 3. Neither vendor was detected
    log "DEBUG" "accelpath: No supported accelerators found on this system."
    exit 2
}
214309

215310
# Parse command line arguments

0 commit comments

Comments
 (0)