@@ -175,111 +175,138 @@ cpupath(){
175175 fi
176176}
177177
178- accelpath () {
179- # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
180- log " DEBUG" " accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE ' "
181- if [ ! -z $EESSI_ACCELERATOR_TARGET_OVERRIDE ]; then
182- if [[ " $EESSI_ACCELERATOR_TARGET_OVERRIDE " =~ ^accel/nvidia/cc[0-9]+$ ]]; then
183- echo ${EESSI_ACCELERATOR_TARGET_OVERRIDE}
#######################################
# Detect an NVIDIA GPU via nvidia-smi and print the matching accelerator path.
# Globals:   none (calls the file's log helper for diagnostics)
# Arguments: none
# Outputs:   "accel/nvidia/cc<NN>" on stdout, where <NN> is the compute
#            capability of the first GPU listed by nvidia-smi with the dot
#            removed (e.g. 8.0 -> 80)
# Returns:   0 on success
#            2 if the nvidia-smi command is not available
#            3 if nvidia-smi fails or reports no usable compute capability;
#              its output is kept in a temporary file named in the debug log
#######################################
nvidia_accelpath() {
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: could not create temporary file for nvidia-smi output"
        return 3
    fi

    # Redirect stdout to the file first and only then duplicate stderr onto it.
    # With '2>&1 > file' the error text goes to the caller's stdout instead of
    # the file that the log message below points to.
    if ! nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap \
            --format=csv,noheader > "${nvidia_smi_out}" 2>&1; then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # Only the first GPU is considered.
    local nvidia_smi_info cuda_cc
    nvidia_smi_info=$(head -n 1 "${nvidia_smi_out}")

    # compute_cap is the last field requested above; taking the last
    # comma-separated field, then dropping whitespace and the dot.
    cuda_cc=${nvidia_smi_info##*,}
    cuda_cc=${cuda_cc//[[:space:]]/}
    cuda_cc=${cuda_cc//./}

    # Refuse to emit a path when no numeric compute capability was reported
    # (empty output, "[N/A]", ...); keep the output file for inspection.
    if [[ ! "${cuda_cc}" =~ ^[0-9]+$ ]]; then
        log "DEBUG" "nvidia_accelpath: no valid compute capability in nvidia-smi output '${nvidia_smi_info}', see output in $nvidia_smi_out"
        return 3
    fi
    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"

    echo "accel/nvidia/cc${cuda_cc}"
    rm -f -- "${nvidia_smi_out}"
    return 0
}
190206
191- # check for NVIDIA GPUs via nvidia-smi command
192- # nvidia_smi=$(command -v nvidia-smi)
193- # if [[ $? -eq 0 ]]; then
194- # log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}"
195- # nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX)
196- # nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out
197- # if [[ $? -eq 0 ]]; then
198- # nvidia_smi_info=$(head -n 1 $nvidia_smi_out)
199- # cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')
200- # log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
201- # res="accel/nvidia/cc${cuda_cc}"
202- # log "DEBUG" "accelpath: result: ${res}"
203- # echo $res
204- # rm -f $nvidia_smi_out
205- # else
206- # log "DEBUG" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
207- # exit 3
208- # fi
209- # else
210- # log "DEBUG" "accelpath: nvidia-smi command not found"
211- # exit 2
212- # fi
213-
214- # check for AMD GPUs via amd-smi command
215- # amd_smi=$(command -v amd-smi)
216- # if [[ $? -eq 0 ]]; then
217- # log "DEBUG" "accelpath: amd-smi command found @ ${amd_smi}"
218- # amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX)
219- # amd-smi static --asic | grep TARGET_GRAPHICS_VERSION 2>&1 > $amd_smi_out
220- # if [[ $? -eq 0 ]]; then
221- # amd_smi_info=$(head -n 1 $amd_smi_out)
222- # amdgcn_cc=$(echo $amd_smi_info | sed 's/.*: //')
223- # log "DEBUG" "accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
224- # res="accel/amd/${amdgcn_cc}"
225- # echo $res
226- # rm -f $amd_smi_out
227- # else
228- # log "DEBUG" "accelpath: ami-smi command failed, see output in $amd_smi_out"
229- # exit 3
230- # fi
231- # else
232- # log "DEBUG" "accelpath: amd-smi command not found"
233- # exit 2
234- # fi
235-
236- # logic ported from https://github.com/llvm/llvm-project/blob/6e738e187055bbd33b6c3d203b6b55904dfcb624/clang/tools/offload-arch/AMDGPUArchByKFD.cpp
237- # check for AMD GPUs via KFD sysfs interface (No amd-smi or Python required)
238- kfd_nodes=" /sys/devices/virtual/kfd/kfd/topology/nodes"
#######################################
# Detect an AMD GPU and print the matching accelerator path.
#
# Method 1 reads the KFD topology exposed through sysfs (no amd-smi or Python
# required); method 2 falls back to parsing 'amd-smi static --asic'.
#
# Globals:   none (calls the file's log helper for diagnostics)
# Arguments: $1 - optional KFD topology nodes directory; defaults to
#                 /sys/devices/virtual/kfd/kfd/topology/nodes
# Outputs:   "accel/amd/<gfx target>" on stdout, e.g. accel/amd/gfx90a
# Returns:   0 on success
#            2 if no GPU was found via KFD and amd-smi is not available
#            3 if amd-smi did not yield a usable target; its output is kept
#              in a temporary file named in the debug log
#######################################
amd_accelpath() {
    local kfd_nodes="${1:-/sys/devices/virtual/kfd/kfd/topology/nodes}"
    local amdgcn_cc=""

    # Method 1: Check for AMD GPUs via KFD sysfs interface
    if [[ -d "${kfd_nodes}" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"

        local node prop_file gfx_ver major minor step
        # ls -1v ensures numeric/version sorting (nodes/0, nodes/1, ..., nodes/10);
        # entries are read line by line so they are never word-split or globbed.
        while IFS= read -r node; do
            prop_file="${kfd_nodes}/${node}/properties"
            [[ -f "${prop_file}" ]] || continue

            # Extract the integer value. 2>/dev/null suppresses read errors.
            gfx_ver=$(awk '$1 == "gfx_target_version" { print $2; exit }' "${prop_file}" 2>/dev/null)

            # Skip anything that is not a plain decimal number before doing
            # arithmetic on it; 10# keeps leading zeros from being read as octal.
            [[ "${gfx_ver}" =~ ^[0-9]+$ ]] || continue
            gfx_ver=$(( 10#${gfx_ver} ))

            # 0 means it's a CPU node
            (( gfx_ver > 0 )) || continue

            # Version is encoded as major*10000 + minor*100 + step; the step is
            # printed in hex (e.g. 90010 -> 9, 0, 10 -> gfx90a).
            major=$(( (gfx_ver / 10000) % 100 ))
            minor=$(( (gfx_ver / 100) % 100 ))
            step=$(( gfx_ver % 100 ))
            printf -v amdgcn_cc 'gfx%d%d%x' "${major}" "${minor}" "${step}"

            log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
            break
        done < <(ls -1v "${kfd_nodes}" 2>/dev/null)

        if [[ -n "${amdgcn_cc}" ]]; then
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: Fallback to AMD GPUs via amd-smi command using /tmp files
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: could not create temporary file for amd-smi output"
        return 3
    fi

    # Keep the complete amd-smi output (stdout and stderr) in the file so the
    # failure message below points at something useful, then search it.
    # As before, success is decided by whether the target line is present,
    # not by amd-smi's own exit status.
    amd-smi static --asic > "${amd_smi_out}" 2>&1
    local amd_smi_info
    amd_smi_info=$(grep TARGET_GRAPHICS_VERSION "${amd_smi_out}" | head -n 1)

    # Keep what follows the last ': ' and drop stray whitespace.
    amdgcn_cc=${amd_smi_info##*: }
    amdgcn_cc=${amdgcn_cc//[[:space:]]/}

    if [[ -z "${amd_smi_info}" || ! "${amdgcn_cc}" =~ ^gfx[0-9a-z]+$ ]]; then
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi
    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"

    echo "accel/amd/${amdgcn_cc}"
    rm -f -- "${amd_smi_out}"
    return 0
}
282272
#######################################
# Print the accelerator path for this system.
#
# An explicit override takes precedence; otherwise NVIDIA detection is tried
# first, then AMD detection.
#
# Globals:   EESSI_ACCELERATOR_TARGET_OVERRIDE (read)
# Arguments: none
# Outputs:   "accel/nvidia/cc<NN>" or "accel/amd/gfx<...>" on stdout
# Returns:   0 when a path was printed
#            1 when the override is set but malformed
#            Exits the shell with status 2 when no accelerator is found.
#######################################
accelpath() {
    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
    local override="${EESSI_ACCELERATOR_TARGET_OVERRIDE:-}"
    log "DEBUG" "accelpath: Override variable set as '${override}' "

    # Quoted test: an unquoted '[ ! -z $VAR ]' errors out on values containing
    # whitespace or glob characters, which would silently skip the validation
    # below and fall through to auto-detection.
    if [[ -n "${override}" ]]; then
        # Accept both NVIDIA and AMD overrides
        local override_regex='^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-z]+)$'
        if [[ "${override}" =~ ${override_regex} ]]; then
            echo "${override}"
            return 0
        fi
        log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-z]+', but it does not: '${override}'"
        return 1
    fi

    # Probe vendors in priority order; the first detector that succeeds wins.
    local detector detected
    for detector in nvidia_accelpath amd_accelpath; do
        if detected=$("${detector}"); then
            log "DEBUG" "accelpath: result: ${detected}"
            echo "${detected}"
            return 0
        fi
    done

    log "DEBUG" "accelpath: No supported accelerators found on this system."
    # NOTE(review): this terminates the calling shell rather than returning,
    # unlike the other paths above; kept as-is because the caller is not
    # visible here - confirm whether 'return 2' would be preferable.
    exit 2
}
309+
283310# Parse command line arguments
284311USAGE=" Usage: eessi_archdetect.sh [-h][-d][-a] <action: cpupath or accelpath>"
285312
0 commit comments