-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy patheessi_archdetect.sh
More file actions
executable file
·330 lines (281 loc) · 12.2 KB
/
eessi_archdetect.sh
File metadata and controls
executable file
·330 lines (281 loc) · 12.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/env bash
# Confirm the current shell is Bash >= 4
# (works for sh, bash, dash, zsh, ksh, but not fish, tcsh, elvish)
# Only POSIX constructs are used in this guard so that other Bourne-like
# shells can parse it and reach the error message below.
if [ -n "$BASH_VERSION" ]; then
    # Extract the major version number: everything before the first dot.
    # A parameter expansion avoids depending on 'grep -P' (a GNU extension);
    # without it the version came out empty and the check was silently skipped.
    major_version="${BASH_VERSION%%.*}"
    # Check if the major version is 4 or higher
    if [ "$major_version" -lt 4 ]; then
        echo "Error: This script must be run with Bash >= 4, you have $BASH_VERSION." >&2
        exit 1
    fi
else
    echo "Error: This script must be run with Bash." >&2
    exit 1
fi
# Release identifier of this script (printed by the -v option)
VERSION='1.2.0'
# Minimum severity that log() prints: WARN hides DEBUG and INFO messages
LOG_LEVEL='WARN'
# What cpupath reports: 'best' prints only the best match
CPUPATH_RESULT='best'
timestamp () {
    # Print the current local date and time as "YYYY-MM-DD HH:MM:SS"
    local -r stamp_format="+%Y-%m-%d %H:%M:%S"
    date "$stamp_format"
}
log () {
    # Simple logger function: print a timestamped message to standard error
    # 1: message level (DEBUG, INFO, WARN or ERROR), defaults to INFO
    # 2: message text, defaults to the literal string 'null' (quotes included)
    # Messages with a level below $LOG_LEVEL are dropped.
    # An ERROR message (or an unknown level) terminates the shell with exit code 1.
    # Returns 0 in every other case, whether the message was printed or dropped.
    local -A levels=([DEBUG]=0 [INFO]=1 [WARN]=2 [ERROR]=3)
    local msg_type="${1:-INFO}"
    local msg_body="${2:-'null'}"
    # fall back to WARN when LOG_LEVEL is unset or not one of the known levels
    local threshold="${LOG_LEVEL:-WARN}"
    [[ -n "${levels[$threshold]}" ]] || threshold="WARN"

    # an unknown level is reported as an error, which exits
    if [[ -z "${levels[$msg_type]}" ]]; then
        log "ERROR" "Unknown log level $msg_type"
    fi

    # ignore messages below log level
    if [[ "${levels[$msg_type]}" -lt "${levels[$threshold]}" ]]; then
        return 0
    fi

    # print log message to standard error
    echo "$(timestamp) [$msg_type] $msg_body" >&2

    # exit after any error message
    if [[ "$msg_type" == "ERROR" ]]; then
        exit 1
    fi
    # explicit success: the result of the ERROR test above must not become the return status
    return 0
}
# Supported CPU specifications
update_arch_specs(){
    # Add contents of given spec file into the array cpu_arch_spec
    # 1: spec file with the additional specs
    # Every line that is neither blank nor a comment is appended as one element,
    # wrapped in parentheses so that it has the form of a Bash array literal.
    # Exits with code 1 if the spec file does not exist.
    if [ ! -f "$1" ]; then
        echo "[ERROR] update_arch_specs: spec file not found: $1" >&2
        exit 1
    fi
    local spec_file="$1"
    local spec_line
    # -r keeps backslashes untouched; the extra test keeps a final line that
    # has no trailing newline (read returns non-zero for it but still fills the variable)
    while read -r spec_line || [ -n "$spec_line" ]; do
        # format spec line as an array and append it to array with all CPU arch specs
        cpu_arch_spec+=("(${spec_line})")
    # remove comments from spec file: a '#' at line start or right after whitespace
    # starts a comment ([[:space:]] is used because \s is not a class inside brackets),
    # then lines left empty are deleted
    done < <(sed -E 's/(^|[[:space:]])#.*$//g;/^[[:space:]]*$/d' "$spec_file")
}
# CPU specification of host system
get_cpuinfo(){
    # Return the value from cpuinfo for the matching key
    # 1: string with key pattern (a regular expression, matched case-insensitively
    #    at the start of a line and followed by a colon)
    # Reads the file named by $EESSI_PROC_CPUINFO, or /proc/cpuinfo if that is not set.
    # Prints the value of the last matching line; prints nothing if no line matches.
    if [ -z "$1" ]; then
        log "ERROR" "get_cpuinfo: missing key pattern in argument list"
    fi
    local cpuinfo_pattern="^${1}\s*:\s*"
    # quoted so that a path containing whitespace is passed to grep as one argument
    local cpuinfo_file="${EESSI_PROC_CPUINFO:-/proc/cpuinfo}"
    # case insensitive match of key pattern and delete key pattern from result
    grep -i -- "$cpuinfo_pattern" "$cpuinfo_file" | tail -n 1 | sed "s/$cpuinfo_pattern//i"
}
check_allinfirst(){
    # Return true if all given arguments after the first are found in the first one
    # 1: reference string of space separated values
    # 2,3..: each additional argument is a single value to be found in the reference string
    # Returns 0 if every value is present as a whole word (also when no values
    # are given), 1 as soon as one value is missing.
    # An empty reference string is reported through log as an ERROR.
    if [ -z "$1" ]; then
        log "ERROR" "check_allinfirst: missing argument with reference string"
    fi
    # locals, so that same-named variables of the caller are not overwritten
    local reference="$1"
    shift
    local candidate
    for candidate in "$@"; do
        # padding both sides with spaces makes the match whole-word only
        [[ " $reference " == *" $candidate "* ]] || return 1
    done
    return 0
}
cpupath(){
    # Identify the best matching CPU architecture from a list of supported specifications for the host CPU
    # and print the corresponding path (for example "<machine type>/generic") on standard output.
    # Inputs read from the environment:
    #   EESSI_SOFTWARE_SUBDIR_OVERRIDE  if non-empty, printed as-is and the shell exits
    #   EESSI_MACHINE_TYPE              replaces the output of 'uname -m'
    #   CPUPATH_RESULT                  "all" prints every match (colon separated, best first),
    #                                   anything else prints only the best match
    # An unsupported machine type is reported through log as an ERROR.

    # If EESSI_SOFTWARE_SUBDIR_OVERRIDE is set, use it
    log "DEBUG" "cpupath: Override variable set as '$EESSI_SOFTWARE_SUBDIR_OVERRIDE' "
    # quoted test and echo: an override containing whitespace used to break the
    # test and was then silently ignored
    if [[ -n "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]]; then
        echo "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
        exit
    fi

    # Filled by update_arch_specs, which appends to this local through dynamic scoping
    local cpu_arch_spec=()

    # Identify the host CPU architecture
    local machine_type=${EESSI_MACHINE_TYPE:-$(uname -m)}
    log "DEBUG" "cpupath: Host CPU architecture identified as '$machine_type'"

    # Populate list of supported specs for this architecture
    local spec_file
    case "$machine_type" in
        "x86_64") spec_file="eessi_arch_x86.spec";;
        "aarch64") spec_file="eessi_arch_arm.spec";;
        "ppc64le") spec_file="eessi_arch_ppc.spec";;
        "riscv64") spec_file="eessi_arch_riscv.spec";;
        *) log "ERROR" "cpupath: Unsupported CPU architecture $machine_type";;
    esac
    # spec files are located in a subfolder with this script
    # (quoted so that an installation path with whitespace works)
    local base_dir
    base_dir=$(dirname "$(readlink -f "$0")")
    update_arch_specs "$base_dir/arch_specs/${spec_file}"

    # Identify the host CPU vendor
    local cpu_vendor
    cpu_vendor=$(get_cpuinfo "vendor[ _]id")
    if [[ -z "$cpu_vendor" ]]; then
        cpu_vendor=$(get_cpuinfo "cpu[ _]implementer")
    fi
    log "DEBUG" "cpupath: CPU vendor of host system: '$cpu_vendor'"

    # Construct a list of known cpu vendors
    local cpu_vendors=()
    local spec
    local -a cols
    for spec in "${cpu_arch_spec[@]}"; do
        # NOTE(review): eval turns the "( ... )" text of a spec line into an array;
        # this executes the spec file content as shell code, so spec files must be trusted
        eval "cols=$spec"
        cpu_vendors+=("${cols[1]}")
    done
    log "DEBUG" "cpupath: Known CPU vendors: ${cpu_vendors[*]}"

    # For ARM, if CPU vendor is as-yet-unknown fall back to a default ARM vendor 0x41
    if [[ "$machine_type" == "aarch64" ]]; then
        if [[ " ${cpu_vendors[*]} " != *" $cpu_vendor "* ]]; then
            log "DEBUG" "cpupath: Unknown ARM CPU vendor '$cpu_vendor', falling back to '0x41'"
            cpu_vendor="0x41"
        fi
    fi

    # Identify the host CPU flags or features
    # cpuinfo systems print different line identifiers, eg features, instead of flags
    local cpu_flag_tag
    if [[ "$cpu_vendor" == "ARM" ]]; then
        # if CPU vendor field is ARM, then we should be able to determine CPU microarchitecture based on 'flags' field
        cpu_flag_tag='flags'
    elif [[ "$machine_type" == "aarch64" ]]; then
        # if 64-bit Arm CPU without "ARM" as vendor ID, we need to take into account 'features' field
        cpu_flag_tag='features'
    elif [[ "$machine_type" == "ppc64le" ]]; then
        # on 64-bit POWER, we need to look at 'cpu' field
        cpu_flag_tag='cpu'
    else
        cpu_flag_tag='flags'
    fi
    local cpu_flags
    cpu_flags=$(get_cpuinfo "$cpu_flag_tag")
    log "DEBUG" "cpupath: CPU flags of host system: '$cpu_flags'"

    # Default to generic CPU
    local best_arch_match="$machine_type/generic"
    local all_arch_matches=$best_arch_match

    # Iterate over the supported CPU specifications to find the best match for host CPU
    # Order of the specifications matters, the last one to match will be selected
    local arch
    local -a arch_spec
    for arch in "${cpu_arch_spec[@]}"; do
        eval "arch_spec=$arch"
        if [[ "$cpu_vendor" == "${arch_spec[1]}" ]]; then
            # each flag in this CPU specification must be found in the list of flags of the host
            # (arch_spec[2] is left unquoted on purpose: it is split into one argument per flag)
            if check_allinfirst "${cpu_flags}" ${arch_spec[2]}; then
                best_arch_match=${arch_spec[0]}
                all_arch_matches="$best_arch_match:$all_arch_matches"
                log "DEBUG" "cpupath: host CPU best match updated to $best_arch_match"
            fi
        fi
    done

    if [[ "$CPUPATH_RESULT" == "all" ]]; then
        log "INFO" "cpupath: all matches for host CPU: $all_arch_matches"
        echo "$all_arch_matches"
    else
        log "INFO" "cpupath: best match for host CPU: $best_arch_match"
        echo "$best_arch_match"
    fi
}
nvidia_accelpath() {
    # Check for NVIDIA GPUs via nvidia-smi command
    # Prints "accel/nvidia/cc<digits>" where the digits are the compute capability
    # (dots removed) in the first line of output printed by nvidia-smi.
    # Returns: 0 on success
    #          2 if the nvidia-smi command is not found
    #          3 if nvidia-smi failed (its output is kept in a file under /tmp)
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: could not create temporary file in /tmp"
        return 3
    fi

    # Standard output is captured for parsing and standard error goes to the file.
    # The former '2>&1 > file' sent error text to the standard output of this
    # function, where it got mixed into the result, and never reached the file.
    local nvidia_smi_stdout
    if ! nvidia_smi_stdout=$(nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2> "$nvidia_smi_out"); then
        # keep whatever was printed on standard output next to the error text
        printf '%s\n' "$nvidia_smi_stdout" >> "$nvidia_smi_out"
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # only the first line of output is used
    local nvidia_smi_info="${nvidia_smi_stdout%%$'\n'*}"
    # 4th comma separated field is the compute capability, e.g. "8.0" becomes "80"
    local cuda_cc
    cuda_cc=$(printf '%s\n' "$nvidia_smi_info" | sed 's/,[[:space:]]*/,/g' | cut -f4 -d, | sed 's/[.[:space:]]//g')
    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
    echo "accel/nvidia/cc${cuda_cc}"
    rm -f -- "$nvidia_smi_out"
    return 0
}
amd_accelpath() {
    # Check for AMD GPUs and print "accel/amd/<target>" for the first one found
    # Method 1 reads the KFD topology in sysfs, method 2 asks the amd-smi command.
    # Returns: 0 on success
    #          2 if no GPU is found via sysfs and amd-smi is not available
    #          3 if amd-smi did not report a TARGET_GRAPHICS_VERSION
    #            (its output is kept in a file under /tmp)
    local amdgcn_cc=""

    # Method 1: Check for AMD GPUs via KFD sysfs interface (No amd-smi or Python required)
    local kfd_nodes="/sys/devices/virtual/kfd/kfd/topology/nodes"
    if [[ -d "$kfd_nodes" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"
        local node prop_file gfx_ver major minor step
        # ls -1v ensures numeric/version sorting (nodes/0, nodes/1, ..., nodes/10)
        while IFS= read -r node; do
            prop_file="$kfd_nodes/$node/properties"
            [[ -f "$prop_file" ]] || continue
            # Extract the integer value. 2>/dev/null suppresses read errors.
            gfx_ver=$(awk '/^gfx_target_version/ {print $2; exit}' "$prop_file" 2>/dev/null)
            # If gfx_ver is non-empty and greater than 0 (0 means it's a CPU node)
            if [[ -n "$gfx_ver" && "$gfx_ver" -gt 0 ]]; then
                # the value packs major, minor and stepping as two decimal digits each;
                # the stepping is printed in hexadecimal (e.g. 90010 gives gfx90a)
                major=$(( (gfx_ver / 10000) % 100 ))
                minor=$(( (gfx_ver / 100) % 100 ))
                step=$(( gfx_ver % 100 ))
                amdgcn_cc=$(printf "gfx%d%d%x" "$major" "$minor" "$step")
                log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${node}"
                break
            fi
        done < <(ls -1v "$kfd_nodes" 2>/dev/null)

        if [[ -n "$amdgcn_cc" ]]; then
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: Fallback to AMD GPUs via amd-smi command using /tmp files
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: could not create temporary file in /tmp"
        return 3
    fi

    # Keep the complete amd-smi output (stdout and stderr) in the file. Before, only
    # the matching grep output was stored, so the file was empty whenever the lookup
    # failed although the log message points to it.
    amd-smi static --asic > "$amd_smi_out" 2>&1
    local amd_smi_info
    amd_smi_info=$(grep TARGET_GRAPHICS_VERSION "$amd_smi_out" | head -n 1)
    if [[ -z "$amd_smi_info" ]]; then
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi

    # keep what follows the last ": " and drop trailing whitespace
    amdgcn_cc=$(printf '%s\n' "$amd_smi_info" | sed -e 's/.*: //' -e 's/[[:space:]]*$//')
    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
    echo "accel/amd/${amdgcn_cc}"
    rm -f -- "$amd_smi_out"
    return 0
}
accelpath() {
    # Print the accelerator path of this system ("accel/nvidia/cc..." or "accel/amd/gfx...")
    # Order of precedence: $EESSI_ACCELERATOR_TARGET_OVERRIDE, NVIDIA GPU, AMD GPU.
    # Returns 0 when a path was printed.
    # An override with an invalid value is reported through log as an ERROR.
    # Exits the shell with code 2 when no supported accelerator is found.

    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
    log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' "
    # quoted test: a value containing whitespace used to break the test, which
    # skipped the validation below and fell through to auto-detection
    if [[ -n "$EESSI_ACCELERATOR_TARGET_OVERRIDE" ]]; then
        # Updated regex to allow both NVIDIA and AMD overrides
        if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-f]+)$ ]]; then
            echo "$EESSI_ACCELERATOR_TARGET_OVERRIDE"
            return 0
        else
            log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-f]+', but it does not: '$EESSI_ACCELERATOR_TARGET_OVERRIDE'"
            return 1
        fi
    fi

    # 1. Check for NVIDIA GPUs
    local nv_res
    if nv_res=$(nvidia_accelpath); then
        log "DEBUG" "accelpath: result: ${nv_res}"
        echo "$nv_res"
        return 0
    fi

    # 2. Check for AMD GPUs
    local amd_res
    if amd_res=$(amd_accelpath); then
        log "DEBUG" "accelpath: result: ${amd_res}"
        echo "$amd_res"
        return 0
    fi

    # 3. Fail gracefully if neither is found
    log "DEBUG" "accelpath: No supported accelerators found on this system."
    exit 2
}
# Parse command line arguments
#   -h  print usage and exit
#   -d  enable debug messages
#   -v  print the version of this script and exit
#   -a  make cpupath print all matches instead of only the best one
# The usage text now lists -v as well, which the parser already accepted.
USAGE="Usage: eessi_archdetect.sh [-h][-d][-v][-a] <action: cpupath or accelpath>"

while getopts 'hdva' OPTION; do
    case "$OPTION" in
        h) echo "$USAGE"; exit 0;;
        d) LOG_LEVEL="DEBUG";;
        v) echo "eessi_archdetect.sh v$VERSION"; exit 0;;
        a) CPUPATH_RESULT="all";;
        # getopts sets OPTION to '?' for an option that is not in the list above
        *) echo "$USAGE"; exit 1;;
    esac
done
# drop the parsed options, the action is the first remaining argument
shift "$((OPTIND - 1))"

ARGUMENT=${1:-none}

# run the requested action and exit with its status
case "$ARGUMENT" in
    "cpupath") cpupath; exit;;
    "accelpath") accelpath; exit;;
    *) echo "$USAGE"; log "ERROR" "Missing <action> argument (possible actions: 'cpupath', 'accelpath')";;
esac