forked from EESSI/software-layer-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_nvidia_host_libraries.sh
More file actions
executable file
·646 lines (561 loc) · 27 KB
/
link_nvidia_host_libraries.sh
File metadata and controls
executable file
·646 lines (561 loc) · 27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
#!/bin/bash
# NVIDIA Host Libraries Linking Script for EESSI
# ============================================
# Overview:
# 1. Initialize environment and source utility functions
# All function definitions live in this section.
# 2. Check prerequisites:
# - EESSI environment initialization
# - nvidia-smi availability
# - Proper umask settings for global read permissions
# 3. Gather NVIDIA information:
# - Detect GPU driver version
# - Get CUDA version
# 4. Library detection and matching:
# - Download/use default NVIDIA library list
# - Find host libraries using ldconfig
# - Match required NVIDIA libraries
# 5. Handle two operation modes:
# a) Show LD_PRELOAD mode: Displays environment variables for preloading
# Suggest exports for following variables:
# EESSI_GPU_COMPAT_LD_PRELOAD (Minimal LD_PRELOAD)
# EESSI_GPU_LD_PRELOAD (Full LD_PRELOAD)
# EESSI_OVERRIDE_GPU_CHECK
# b) Symlink mode: Create directory structure and link libraries
# Create necessary symlinks in EESSI directory structure
#
# Error Handling:
# - nvidia-smi detection: Exits if NVIDIA drivers not found
# - Library matching: Reports missing libraries
# - Permission issues: Checks write access and umask settings
# - Symlink conflicts: Validates existing symlinks
# - Directory creation: Ensures proper structure exists
#
# Note: This script is part of EESSI (European Environment for Scientific
# Software Installations) and manages the linking of host NVIDIA libraries
# to make them accessible within the EESSI environment.
# ###################################################### #
# 1. Initialize environment and source utility functions #
# ###################################################### #
# Absolute directory that holds this script (realpath resolves symlinks first).
TOPDIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
# Load the shared helpers. fatal_error, echo_green, echo_yellow,
# check_eessi_initialised and create_directory_structure are called below but
# not defined in this file, so they are presumably provided by utils.sh.
. "${TOPDIR}/../../utils.sh"
# Print the command-line usage summary to stdout.
# "$0" is expanded so the first line shows how the script was invoked.
show_help() {
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "Options:" \
        " --help Display this help message" \
        " --show-ld-preload Enable recommendations for LD_PRELOAD mode" \
        " --no-download Don't download list of Nvidia libraries from URL," \
        " but use hardcoded list here in get_nvlib_list()" \
        " -v, --verbose Display debugging messages," \
        " actions taken, commands being run."
}
# Global state shared between the functions below (read and/or written there).
# -- settings changed by the command-line parser (0 = off, 1 = on) --
VERBOSE=0                    # log_verbose only prints when this is 1
LD_PRELOAD_MODE=0            # 1 selects the show-ld-preload mode (--show-ld-preload)
LIBS_LIST=""                 # argument handed to get_nvlib_list; "default" is set by --no-download
# -- values parsed from the nvidia-smi output by check_nvidia_smi_info --
HOST_GPU_DRIVER_VERSION=""
HOST_GPU_CUDA_VERSION=""
# -- results of find_cuda_libraries_on_host --
MATCHED_LIBRARIES=()         # host libraries that matched an entry of get_nvlib_list
MISSING_LIBRARIES=()         # entries of get_nvlib_list without any match on the host
# Locate the host system's ldconfig, ignoring any copy that lives under /cvmfs.
# Globals:   PATH (read)
# Outputs:   path of the first usable ldconfig on stdout. /sbin/ldconfig is
#            preferred, otherwise the first executable hit in PATH order.
# Returns:   0 when a path was printed. When nothing is found, fatal_error is
#            called (presumably terminating the shell); 1 is returned in case
#            it does not.
get_host_ldconfig() {
    local command_name="ldconfig"   # command to look for
    local exclude_prefix="/cvmfs"   # PATH entries starting with this are ignored
    local path_dirs=()              # PATH split on ':'
    local dir

    # /sbin always wins, whether or not it is listed in PATH
    if [ -x "/sbin/${command_name}" ]; then
        echo "/sbin/${command_name}"
        return 0
    fi

    # Only the first hit is ever used, so stop at the first match instead of
    # collecting every candidate.
    IFS=':' read -ra path_dirs <<< "$PATH"
    for dir in "${path_dirs[@]}"; do
        # /sbin was already checked above
        [ "$dir" = "/sbin" ] && continue
        # Skip everything whose path starts with the excluded prefix
        [[ "$dir" == "$exclude_prefix"* ]] && continue
        if [ -x "${dir}/${command_name}" ]; then
            echo "${dir}/${command_name}"
            return 0
        fi
    done

    fatal_error "$command_name not found in PATH or only found in paths starting with $exclude_prefix."
    return 1
}
# Provide the list of NVIDIA library names to look for on the host.
# The list is written to stdout, one name per line, and stdout is captured by
# the caller: do NOT print any other message to stdout inside this function.
# Arguments: $1 - "default" to skip the download and use the built-in list
# Returns:   0 if the downloaded list is used, 1 if the built-in list is used
get_nvlib_list() {
    local nvliblist_url="https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf"
    # see https://apptainer.org/docs/admin/1.0/configfiles.html#nvidia-gpus-cuda
    # https://github.com/apptainer/apptainer/commits/main/etc/nvliblist.conf
    # This default_nvlib_list is based on this commit on Oct 1, 2024:
    # https://github.com/apptainer/apptainer/commit/a19fa01527a8914839b8d1649688f83c61ba9ad2
    # TODO: driver version which corresponds to?
    local default_nvlib_list=(
        "libcuda.so"
        "libcudadebugger.so"
        "libEGL_installertest.so"
        "libEGL_nvidia.so"
        "libEGL.so"
        "libGLdispatch.so"
        "libGLESv1_CM_nvidia.so"
        "libGLESv1_CM.so"
        "libGLESv2_nvidia.so"
        "libGLESv2.so"
        "libGL.so"
        "libGLX_installertest.so"
        "libGLX_nvidia.so"
        "libglx.so"
        "libGLX.so"
        "libnvcuvid.so"
        "libnvidia-cbl.so"
        "libnvidia-cfg.so"
        "libnvidia-compiler.so"
        "libnvidia-eglcore.so"
        "libnvidia-egl-wayland.so"
        "libnvidia-encode.so"
        "libnvidia-fatbinaryloader.so"
        "libnvidia-fbc.so"
        "libnvidia-glcore.so"
        "libnvidia-glsi.so"
        "libnvidia-glvkspirv.so"
        "libnvidia-gpucomp.so"
        "libnvidia-gtk2.so"
        "libnvidia-gtk3.so"
        "libnvidia-ifr.so"
        "libnvidia-ml.so"
        "libnvidia-nvvm.so"
        "libnvidia-opencl.so"
        "libnvidia-opticalflow.so"
        "libnvidia-ptxjitcompiler.so"
        "libnvidia-rtcore.so"
        "libnvidia-tls.so"
        "libnvidia-wfb.so"
        "libnvoptix.so"
        "libOpenCL.so"
        "libOpenGL.so"
        "libvdpau_nvidia.so"
        "nvidia_drv.so"
        "tls_test_.so"
    )
    local nvliblist_content downloaded_libs

    # Caller explicitly asked for the built-in list
    if [[ "$1" == "default" ]]; then
        printf "%s\n" "${default_nvlib_list[@]}"
        return 1
    fi

    # Try to download nvliblist.conf. --fail makes curl exit non-zero on an
    # HTTP error instead of handing back the error page as if it were the list.
    if nvliblist_content=$(curl --silent --fail "$nvliblist_url") && [ -n "$nvliblist_content" ]; then
        # Keep only the library entries: lines ending in a literal ".so"
        downloaded_libs=$(grep '\.so$' <<< "$nvliblist_content")
        if [ -n "$downloaded_libs" ]; then
            printf '%s\n' "$downloaded_libs"
            return 0
        fi
    fi

    # Download failed or contained no usable entry: fall back to the built-in
    # list. No message here, stdout is reserved for the list itself.
    printf "%s\n" "${default_nvlib_list[@]}"
    return 1
}
# Verify that the current umask lets everyone read what this script creates.
# Only the write bits of group and others (octal 022) may be masked. Any other
# masked bit would make a newly created directory unreadable or untraversable
# for someone, and is reported through fatal_error.
check_global_read() {
    # Get the current umask value (bash prints it in octal, e.g. 0022)
    local current_umask
    current_umask=$(umask)
    log_verbose "current umask: ${current_umask}"
    # Test the permission bits themselves. Comparing the digits as a plain
    # number would accept masks such as 0006 or 0012, which hide read/execute
    # from others or group. 8#755 is every bit except group/other write.
    if (( (8#${current_umask} & 8#755) != 0 )); then
        fatal_error "The current umask ($current_umask) does not allow global read permissions, you'll want everyone to be able to read the created directory."
    fi
    # TODO: Option to set $UMASK here?
    # https://github.com/EESSI/software-layer/pull/754#discussion_r1950643598
}
# Check that nvidia-smi works and extract version information from it.
# Globals:   HOST_GPU_DRIVER_VERSION (written), HOST_GPU_CUDA_VERSION (written)
# Outputs:   the detected versions via echo_green
# Errors:    calls fatal_error when nvidia-smi is missing or its query fails;
#            in the latter case the temporary output file is kept for inspection.
check_nvidia_smi_info() {
    local nvidia_smi_out nvidia_smi_info
    local gpu_name gpu_count driver_version compute_cap extra_fields cuda_version

    # Silence "command -v": only its exit status matters here
    if ! command -v nvidia-smi > /dev/null 2>&1; then
        fatal_error "nvidia-smi command not found"
        exit 2
    fi
    log_verbose "Found nvidia-smi at: $(command -v nvidia-smi)"

    # Create temporary file for nvidia-smi output
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        fatal_error "Failed to create a temporary file for the nvidia-smi output"
        return 1
    fi
    log_verbose "Creating temporary output file: ${nvidia_smi_out}"

    # Query GPU information; stdout and stderr both go to the temporary file
    if ! nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader > "$nvidia_smi_out" 2>&1; then
        fatal_error "nvidia-smi command failed, see output in $nvidia_smi_out. Please remove the file afterwards."
        return 1
    fi

    # Only the first GPU line is used; fields are separated by ", "
    nvidia_smi_info=$(head -n 1 "${nvidia_smi_out}")
    rm -f "$nvidia_smi_out"
    IFS=',' read -r gpu_name gpu_count driver_version compute_cap extra_fields <<< "${nvidia_smi_info//, /,}"
    HOST_GPU_DRIVER_VERSION=$driver_version

    # The 4th query field (compute_cap) is the GPU compute capability, not the
    # CUDA version. The CUDA version supported by the driver is reported on the
    # "CUDA Version" line of "nvidia-smi -q".
    cuda_version=$(nvidia-smi -q 2> /dev/null \
        | sed -n 's/^[[:space:]]*CUDA Version[[:space:]]*:[[:space:]]*//p' \
        | head -n 1)
    if [ -n "$cuda_version" ]; then
        HOST_GPU_CUDA_VERSION=$cuda_version
    else
        # Keep the previous behaviour if the line cannot be found
        echo_yellow "Could not read the CUDA version from 'nvidia-smi -q', falling back to the compute capability field"
        HOST_GPU_CUDA_VERSION=$compute_cap
    fi

    echo_green "Found host CUDA version ${HOST_GPU_CUDA_VERSION}"
    echo_green "Found NVIDIA GPU driver version ${HOST_GPU_DRIVER_VERSION}"
}
# Helper for show_ld_preload: record one library in the preload lists.
# Resolves symlinks, appends the real path to filtered_libraries (once) and,
# if the path contains one of the names in cuda_compat_nvlib_list, also to
# compat_filtered_libraries. All three arrays are locals of show_ld_preload,
# visible here through bash's dynamic scoping.
# Arguments: $1 - path of the library to record
_ld_preload_add_library() {
    local resolved_path compat_name
    # Resolve any symlink
    resolved_path=$(realpath "$1")
    # Nothing to do if this real path was already recorded
    if [[ " ${filtered_libraries[*]} " == *" ${resolved_path} "* ]]; then
        return 0
    fi
    filtered_libraries+=("${resolved_path}")
    # Also prepare compat only libraries for the short list
    for compat_name in "${cuda_compat_nvlib_list[@]}"; do
        # Check if the compat name is a substring of the resolved path
        if [[ "$resolved_path" == *"$compat_name"* ]]; then
            echo "Match found for $compat_name for CUDA compat libraries"
            if [[ " ${compat_filtered_libraries[*]} " != *" ${resolved_path} "* ]]; then
                compat_filtered_libraries+=("${resolved_path}")
            fi
            break
        fi
    done
}

# Suggest LD_PRELOAD settings built from the matched host libraries.
# A library is kept when ldd reports no unresolved dependency for it, or when
# every unresolved dependency is itself one of the matched libraries.
# Globals:   MATCHED_LIBRARIES (read); EESSI_GPU_COMPAT_LD_PRELOAD,
#            EESSI_GPU_LD_PRELOAD and EESSI_OVERRIDE_GPU_CHECK (set and
#            exported when at least one library is kept)
# Outputs:   the recommended export statements and usage hints
# Returns:   always 1, see the note on the last line.
show_ld_preload() {
    local -a cuda_compat_nvlib_list filtered_libraries compat_filtered_libraries
    local library not_found_libs lib listed_lib all_found found

    echo
    echo_yellow "When attempting to use LD_PRELOAD we exclude anything related to graphics"
    # Define core CUDA libraries needed for compute
    cuda_compat_nvlib_list=(
        "libcuda.so"
        "libcudadebugger.so"
        "libnvidia-nvvm.so"
        "libnvidia-ptxjitcompiler.so"
    )
    # Filter out all symlinks and libraries that have missing library dependencies under EESSI
    filtered_libraries=()
    compat_filtered_libraries=()
    for library in "${MATCHED_LIBRARIES[@]}"; do
        # Names of the dependencies ldd cannot resolve (first column of the
        # "not found" lines); squeeze spaces, then use cut
        not_found_libs=$(ldd "${library}" 2>/dev/null | grep "not found" | tr -s ' ' | cut -d' ' -f1)
        # Every unresolved dependency must be covered by the preload list
        # itself; with no unresolved dependency the loop body never runs.
        all_found=true
        # Intentionally unquoted: split on whitespace, which also drops the
        # leading tab of the ldd output lines.
        for lib in $not_found_libs; do
            found=false
            for listed_lib in "${MATCHED_LIBRARIES[@]}"; do
                # Matching to the .so or a symlink target is enough.
                # MATCHED_LIBRARIES holds full paths while ldd prints bare
                # sonames, so the file name part is compared as well.
                if [[ "$lib" == "$listed_lib"* || "$lib" == "${listed_lib##*/}"* \
                    || "$(realpath "${listed_lib}")" == *"$lib" ]]; then
                    found=true
                    break
                fi
            done
            if [[ "$found" == false ]]; then
                echo "$lib is NOT in the provided preload list, filtering $library"
                all_found=false
                break
            fi
        done
        if [[ "$all_found" == true ]]; then
            _ld_preload_add_library "$library"
        fi
    done

    # Set EESSI_GPU_LD_PRELOAD with the matched libraries
    if [ ${#filtered_libraries[@]} -gt 0 ]; then
        echo
        echo_yellow "The recommended way to use LD_PRELOAD is to only use it when you need to."
        echo
        # Set up MINIMAL preload for common cases (colon separated)
        EESSI_GPU_COMPAT_LD_PRELOAD=$(IFS=:; printf '%s' "${compat_filtered_libraries[*]}")
        export EESSI_GPU_COMPAT_LD_PRELOAD
        echo_yellow "A minimal preload which should work in most cases:"
        echo_green "export EESSI_GPU_COMPAT_LD_PRELOAD=\"$EESSI_GPU_COMPAT_LD_PRELOAD\""
        echo
        # Set up FULL preload for corner cases (colon separated)
        EESSI_GPU_LD_PRELOAD=$(IFS=:; printf '%s' "${filtered_libraries[*]}")
        export EESSI_GPU_LD_PRELOAD
        export EESSI_OVERRIDE_GPU_CHECK=1
        echo_yellow "A corner-case full preload (which is hard on memory) for exceptional use:"
        # Display usage instructions
        echo_green "export EESSI_GPU_LD_PRELOAD=\"$EESSI_GPU_LD_PRELOAD\""
        echo_green "export EESSI_OVERRIDE_GPU_CHECK=\"$EESSI_OVERRIDE_GPU_CHECK\""
        echo
        echo_yellow "Then you can set LD_PRELOAD only when you want to run a GPU application,"
        echo_yellow "e.g. deviceQuery command from CUDA-Samples module:"
        echo_yellow " LD_PRELOAD=\"\$EESSI_GPU_COMPAT_LD_PRELOAD\" deviceQuery"
        echo_yellow "or LD_PRELOAD=\"\$EESSI_GPU_LD_PRELOAD\" deviceQuery"
    else
        echo "No libraries matched, LD_PRELOAD not set."
    fi
    # NOTE(review): kept as-is for compatibility. When the script is sourced
    # this returns 1 explicitly; when it is executed the failing [[ ]] test is
    # the last command, so the function status is 1 as well. Confirm whether a
    # status of 0 was intended before changing it.
    [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return 1
}
# Collect the host libraries that match the NVIDIA library list.
# Uses the host ldconfig cache (plus /.singularity.d/libs when present) and
# matches every name from get_nvlib_list against the library paths.
# Globals:   LIBS_LIST (read), MATCHED_LIBRARIES (appended to),
#            MISSING_LIBRARIES (appended to)
# Outputs:   progress messages and a summary of the matching
find_cuda_libraries_on_host() {
    local host_ldconfig host_libraries singularity_libs singularity_lib
    local nvlib_list_output nvlib_list_status
    local -a cuda_candidate_libraries=()
    local library matched lib_path lib_name existing_lib duplicate_found

    # Find the host ldconfig. It runs in a command substitution, so a failure
    # inside it cannot stop this shell and has to be checked here.
    if ! host_ldconfig=$(get_host_ldconfig) || [ -z "$host_ldconfig" ]; then
        fatal_error "Unable to locate the host ldconfig"
        return 1
    fi
    log_verbose "Found host ldconfig: ${host_ldconfig}"

    # Gather all libraries on the host (_must_ be host ldconfig).
    # Take everything after " => ": counting space separated columns breaks on
    # entries such as "(libc6,x86-64, OS ABI: Linux 3.2.0)".
    host_libraries=$("${host_ldconfig}" -p | sed -n 's/^.* => //p')

    # This is only for the scenario where the script is being run inside a
    # container; if the directory does not exist the list stays empty.
    singularity_libs=""
    for singularity_lib in /.singularity.d/libs/*; do
        # Skips the literal pattern left behind by a glob without matches
        [ -e "$singularity_lib" ] || [ -L "$singularity_lib" ] || continue
        singularity_libs+="${singularity_lib}"$'\n'
    done

    # Gather the list of possible CUDA libraries. stdout of get_nvlib_list is
    # the list itself, its exit status tells which list was used.
    nvlib_list_output=$(get_nvlib_list "${LIBS_LIST}")
    nvlib_list_status=$?
    while read -r library; do
        [ -n "$library" ] && cuda_candidate_libraries+=("$library")
    done <<< "$nvlib_list_output"
    if [ "$nvlib_list_status" -ne 0 ]; then
        echo "Using default list of libraries"
    else
        echo "Using downloaded list of libraries"
    fi

    # Search for CUDA Libraries in system paths
    echo "Searching for CUDA Libraries"
    for library in "${cuda_candidate_libraries[@]}"; do
        # "contains" matching - (eg. 'libcuda.so' matches both 'libcuda.so' and 'libcuda.so.1')
        # The name is matched literally (-F), one path per line.
        # The `grep -v "i386"` is done to exclude i386 libraries, which could be installed in parallel with 64 libs.
        matched=$(printf '%s\n%s\n' "$host_libraries" "$singularity_libs" | grep -v "i386" | grep -F -- "$library")
        if [ -n "$matched" ]; then
            log_verbose "Found matches for ${library}: $matched"
            # Process each matched path and avoid duplicates by filename
            while IFS= read -r lib_path; do
                # Skip empty lines
                [ -z "$lib_path" ] && continue
                # Extract just the filename from the path
                lib_name=${lib_path##*/}
                echo "Checking library $lib_name for duplicates"
                # Check if we already have this library filename in our matched libraries
                duplicate_found=0
                for existing_lib in "${MATCHED_LIBRARIES[@]}"; do
                    if [ "${existing_lib##*/}" = "$lib_name" ]; then
                        log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, currently processed: $lib_path)"
                        log_verbose "Discarting $lib_path"
                        duplicate_found=1
                        break
                    fi
                done
                # If no duplicate found, add this library
                if [ "$duplicate_found" -eq 0 ]; then
                    MATCHED_LIBRARIES+=("$lib_path")
                fi
            done <<< "$matched"
        else
            # This library was not found anywhere on the system
            log_verbose "No matches found for ${library}"
            MISSING_LIBRARIES+=("$library")
        fi
    done

    # Report matching results
    echo_green "Matched ${#MATCHED_LIBRARIES[@]} CUDA Libraries"
    if [ ${#MISSING_LIBRARIES[@]} -gt 0 ]; then
        echo_yellow "The following libraries were not found (based on 'get_nvlib_list')"
        printf '%s\n' "${MISSING_LIBRARIES[@]}"
    fi
}
# Symlink the matched host libraries into the directory EESSI expects them in.
# The install directory is the final target of ${EESSI_EPREFIX}/lib/nvidia.
# The linked driver and CUDA versions are recorded there in driver_version.txt
# and cuda_version.txt; links are only (re)created when the recorded driver
# version differs from the current one.
# Globals:   EESSI_EPREFIX, EESSI_CVMFS_REPO, EESSI_VERSION, MATCHED_LIBRARIES,
#            HOST_GPU_DRIVER_VERSION, HOST_GPU_CUDA_VERSION (read);
#            drivers_linked (written: 1 if links were created in this run)
# Side effects: changes the working directory to the install directory when
#            links are created.
symlink_mode () {
    local nvidia_trusted_dir target1 target2 msg install_target
    local host_injection_driver_version_file library lib_name target
    # Name of the version specific CVMFS variant symlink, e.g. EESSI_202306_NVIDIA_OVERRIDE
    local override_var="EESSI_${EESSI_VERSION//./}_NVIDIA_OVERRIDE"
    # Have to link drivers = True
    local link_drivers=1

    # Do some checks on existence of links and that we don't end up at /dev/null (the default), so we can print
    # some informative information. One downside is that we can't explicitly check if something is a variant
    # symlink, so we'll just assume that if it's a link AND it lives in our CVMFS repository, it must be one.
    nvidia_trusted_dir="${EESSI_EPREFIX}/lib/nvidia"
    if [[ -L "$nvidia_trusted_dir" ]]; then
        target1=$(readlink "$nvidia_trusted_dir")
        log_verbose "$nvidia_trusted_dir is a CVMFS variant symlink (${override_var}) currently pointing to $target1"
        # If this is a link, and if it lives in the EESSI_CVMFS_REPO, we assume this is a variant symlink
        if [[ -L "$target1" && "$target1" == "$EESSI_CVMFS_REPO"/* ]]; then
            target2=$(readlink "$target1")
            msg="${target1} appears to be a CVMFS variant symlink (EESSI_NVIDIA_OVERRIDE_DEFAULT) currently pointing to ${target2}."
            msg="${msg} Proceeding to install host symlinks in ${target2}."
            log_verbose "${msg}"
            # Check if target2 isn't /dev/null (the default target of the EESSI_NVIDIA_OVERRIDE_DEFAULT variant symlink)
            # If it is, suggest setting EESSI_NVIDIA_OVERRIDE_DEFAULT or the version specific override
            if [[ $target2 == /dev/null ]]; then
                msg="${nvidia_trusted_dir} is a symlink pointing to ${target1}, which is a symlink pointing to ${target2}\n"
                msg="${msg}If you want to symlink the drivers in a single location for all EESSI versions, please define"
                msg="${msg} the EESSI_NVIDIA_OVERRIDE_DEFAULT variant symlink in your local CVMFS configuration to point to"
                msg="${msg} writeable location. This will change the target of symlink ${target1}.\n"
                msg="${msg}If you want to symlink the drivers only for this version of EESSI (${EESSI_VERSION}), please define"
                msg="${msg} the ${override_var} variant symlink in your local CVMFS configuration to point to"
                msg="${msg} writeable location. This will change the target of symlink ${nvidia_trusted_dir}.\n"
                fatal_error "${msg}"
            fi
        else
            msg="$target1 does not seem to be a CVMFS variant symlink, suggesting that ${override_var}"
            msg="${msg} was set in the CVMFS config. Proceeding to install host symlinks in $target1."
            log_verbose "${msg}"
        fi
    else
        msg="$nvidia_trusted_dir is expected to be a symlink, but it's not. This will likely fail"
        msg="${msg} as CVMFS repositories are read-only. Proceeding anyway, but expect this to fail."
        echo_yellow "${msg}"
    fi

    # Make sure that target of nvidia_trusted_dir variant symlink is an existing directory
    install_target=$(readlink -f "$nvidia_trusted_dir")
    # An empty result would turn the paths below into "/driver_version.txt" and "/*"
    if [ -z "$install_target" ]; then
        fatal_error "Unable to resolve the final target of ${nvidia_trusted_dir}"
        return 1
    fi
    echo "Ensure the final target of ${nvidia_trusted_dir} (${install_target}) exists"
    log_verbose "Target directory in which driver symlinks will be installed: ${install_target}"
    if [ ! -d "$install_target" ]; then
        check_global_read
        if ! create_directory_structure "$install_target"; then
            fatal_error "No write permissions to directory ${install_target}"
        fi
    fi

    # Define file to store driver version that was symlinked
    host_injection_driver_version_file="${install_target}/driver_version.txt"
    log_verbose "host_injection_driver_version_file: ${host_injection_driver_version_file}"

    # Check if drivers are already linked with correct version, by comparing
    # against the recorded version. This is needed when updating GPU drivers.
    if [ -e "$host_injection_driver_version_file" ]; then
        # -F/-x: the version must equal a whole line, literally. A pattern
        # match would treat 535.54 as matching a recorded 535.54.03.
        if grep -Fxq -- "$HOST_GPU_DRIVER_VERSION" "$host_injection_driver_version_file"; then
            echo_green "The host GPU driver libraries (v${HOST_GPU_DRIVER_VERSION}) have already been linked! (based on ${host_injection_driver_version_file})"
            # The GPU libraries were already linked for this version of CUDA driver
            # Have to link drivers = False
            link_drivers=0
        else
            # There's something there but it is out of date
            echo_yellow "The host GPU driver libraries version have changed. Now its: (v${HOST_GPU_DRIVER_VERSION})"
            echo_yellow "Cleaning out outdated symlinks."
            # ":?" aborts instead of expanding to "/*" should the variable ever be empty
            rm "${install_target:?}"/* || fatal_error "Unable to remove files under '${install_target}'."
        fi
    fi

    # Link all matched libraries to the install directory. This step is only
    # run when linking of drivers is needed (link_drivers == 1).
    # Track if some drivers were actually linked this run.
    drivers_linked=0
    if [ "$link_drivers" -eq 1 ]; then
        cd "${install_target}" || fatal_error "Failed to cd to ${install_target}"
        log_verbose "Changed directory to: $PWD"
        # Make symlinks to all the interesting libraries
        for library in "${MATCHED_LIBRARIES[@]}"; do
            log_verbose "Linking library: ${library}"
            # Get just the library filename
            lib_name=$(basename "$library")
            # Check if the symlink already exists
            if [ -L "$lib_name" ]; then
                # Check if it's pointing to the same target
                target=$(readlink "$lib_name")
                if [ "$target" = "$library" ]; then
                    log_verbose "Symlink for $lib_name already exists and points to correct target"
                    continue
                else
                    log_verbose "Symlink for $lib_name exists but points to wrong target: $target, updating..."
                    rm "$lib_name"
                fi
            fi
            # Create a symlink in the current directory
            # and check if the symlink was created successfully
            if ! ln -s "$library" .; then
                fatal_error "Error: Failed to create symlink for library $library in $PWD"
            fi
        done
        # Record driver and CUDA versions in the directory; a failed write
        # would make the next run misjudge what is linked.
        echo "$HOST_GPU_DRIVER_VERSION" > driver_version.txt \
            || fatal_error "Failed to write driver_version.txt in $PWD"
        echo "$HOST_GPU_CUDA_VERSION" > cuda_version.txt \
            || fatal_error "Failed to write cuda_version.txt in $PWD"
        drivers_linked=1
    fi
}
# Print a debugging message, prefixed with "[VERBOSE]", when VERBOSE is 1.
# All arguments are joined into one message; nothing is printed otherwise.
# TODO: move to utils?
log_verbose() {
    [ "$VERBOSE" -eq 1 ] || return 0
    echo "[VERBOSE] $*"
}
# ###############################################
# 2. Check prerequisites                        #
# ###############################################
# Parse command line arguments first, so that --help works without an
# initialised EESSI environment or nvidia-smi, and so that --verbose is
# already in effect for the checks below.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        --help)
            show_help
            # Leave without killing the caller's shell if the script was sourced
            return 0 2>/dev/null || exit 0
            ;;
        --show-ld-preload) LD_PRELOAD_MODE=1 ;; # Enable LD_PRELOAD mode
        --no-download) LIBS_LIST="default" ;;   # Use the built-in list instead of downloading it
        --verbose|-v) VERBOSE=1 ;;              # Enable verbose output
        *)
            show_help
            fatal_error "Unknown option: $1"
            ;;
    esac
    shift
done

# Make sure EESSI is initialised (doesn't matter what version)
check_eessi_initialised

# Verify nvidia-smi availability
log_verbose "Checking for nvidia-smi command..."
command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit 1; }

# ###############################################
# 3. Gather NVIDIA information                  #
# ###############################################
# Gather information about NVIDIA drivers (even if we are inside a Gentoo Prefix in a container).
# Only add the ':' separator when LD_LIBRARY_PATH already has a value: a
# trailing ':' is an empty entry, which the dynamic loader treats as the
# current directory.
export LD_LIBRARY_PATH="/.singularity.d/libs${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Check for NVIDIA GPUs via nvidia-smi command
check_nvidia_smi_info

# ###############################################
# 4. Library detection and matching             #
# ###############################################
# Gather any CUDA related driver libraries from the host
# Sets MATCHED_LIBRARIES and MISSING_LIBRARIES array variables
find_cuda_libraries_on_host

# ###############################################
# 5. Handle operation modes                     #
# ###############################################
# === 5a. LD_PRELOAD Mode ===
if [ "$LD_PRELOAD_MODE" -eq 1 ]; then
    show_ld_preload
    # "return" succeeds only when the script is sourced, which keeps the
    # exported variables without closing the caller's shell; a normal run
    # falls through to exit.
    return 0 2>/dev/null || exit 0
fi

# === 5b. Symlink Mode ===
# If we haven't already exited, we may need to create the symlinks
symlink_mode

# If everything went OK, show success message
echo_green "Host NVIDIA GPU drivers linked successfully for EESSI"