Skip to content

Commit 35c7b93

Browse files
committed
Extract amdgcn_cc from kfd
1 parent ee1eff6 commit 35c7b93

1 file changed

Lines changed: 63 additions & 13 deletions

File tree

init/eessi_archdetect.sh

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -212,24 +212,74 @@ accelpath() {
212212
# fi
213213

214214
# check for AMD GPUs via amd-smi command
215-
amd_smi=$(command -v amd-smi)
216-
if [[ $? -eq 0 ]]; then
217-
log "DEBUG" "accelpath: amd-smi command found @ ${amd_smi}"
218-
amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX)
219-
amd-smi static --asic | grep TARGET_GRAPHICS_VERSION 2>&1 > $amd_smi_out
220-
if [[ $? -eq 0 ]]; then
221-
amd_smi_info=$(head -n 1 $amd_smi_out)
222-
amdgcn_cc=$(echo $amd_smi_info | sed 's/.*: //')
223-
log "DEBUG" "accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
215+
# amd_smi=$(command -v amd-smi)
216+
# if [[ $? -eq 0 ]]; then
217+
# log "DEBUG" "accelpath: amd-smi command found @ ${amd_smi}"
218+
# amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX)
219+
# amd-smi static --asic | grep TARGET_GRAPHICS_VERSION 2>&1 > $amd_smi_out
220+
# if [[ $? -eq 0 ]]; then
221+
# amd_smi_info=$(head -n 1 $amd_smi_out)
222+
# amdgcn_cc=$(echo $amd_smi_info | sed 's/.*: //')
223+
# log "DEBUG" "accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
224+
# res="accel/amd/${amdgcn_cc}"
225+
# echo $res
226+
# rm -f $amd_smi_out
227+
# else
228+
# log "DEBUG" "accelpath: ami-smi command failed, see output in $amd_smi_out"
229+
# exit 3
230+
# fi
231+
# else
232+
# log "DEBUG" "accelpath: amd-smi command not found"
233+
# exit 2
234+
# fi
235+
236+
# logic ported from https://github.com/llvm/llvm-project/blob/6e738e187055bbd33b6c3d203b6b55904dfcb624/clang/tools/offload-arch/AMDGPUArchByKFD.cpp
237+
# check for AMD GPUs via KFD sysfs interface (No amd-smi or Python required)
238+
kfd_nodes="/sys/devices/virtual/kfd/kfd/topology/nodes"
239+
240+
echo $kfd_nodes
241+
242+
if [[ -d "$kfd_nodes" ]]; then
243+
log "DEBUG" "accelpath: KFD sysfs path found @ ${kfd_nodes}"
244+
amdgcn_cc=""
245+
246+
# ls -1v ensures numeric/version sorting (nodes/0, nodes/1, ..., nodes/10)
247+
# just like LLVM's llvm::sort by node ID.
248+
for node in $(ls -1v "$kfd_nodes" 2>/dev/null); do
249+
prop_file="$kfd_nodes/$node/properties"
250+
251+
if [[ -f "$prop_file" ]]; then
252+
# Extract the integer value. 2>/dev/null suppresses read errors.
253+
gfx_ver=$(grep "^gfx_target_version" "$prop_file" 2>/dev/null | awk '{print $2}')
254+
255+
# If gfx_ver is non-empty and greater than 0 (0 means it's a CPU node)
256+
if [[ -n "$gfx_ver" && "$gfx_ver" -gt 0 ]]; then
257+
# Perform the exact math from AMDGPUArchByKFD.cpp
258+
major=$(( (gfx_ver / 10000) % 100 ))
259+
minor=$(( (gfx_ver / 100) % 100 ))
260+
step=$(( gfx_ver % 100 ))
261+
262+
# Format as gfx<major><minor><hex_step>, with step printed in hex (e.g., 90010 -> major 9, minor 0, step 10 -> gfx90a)
263+
amdgcn_cc=$(printf "gfx%d%d%x" $major $minor $step)
264+
265+
log "DEBUG" "accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
266+
267+
# Assuming homogeneous nodes for EESSI, grab the first valid GPU and break
268+
break
269+
fi
270+
fi
271+
done
272+
273+
if [[ -n "$amdgcn_cc" ]]; then
224274
res="accel/amd/${amdgcn_cc}"
225-
echo $res
226-
rm -f $amd_smi_out
275+
echo "$res"
276+
# Proceed with successful exit
227277
else
228-
log "DEBUG" "accelpath: ami-smi command failed, see output in $amd_smi_out"
278+
log "DEBUG" "accelpath: KFD topology found, but no AMD GPUs detected (only CPUs)"
229279
exit 3
230280
fi
231281
else
232-
log "DEBUG" "accelpath: amd-smi command not found"
282+
log "DEBUG" "accelpath: KFD sysfs path not found. AMD GPU driver not loaded?"
233283
exit 2
234284
fi
235285
}

0 commit comments

Comments
 (0)