Skip to content

Commit f438313

Browse files
committed
Updated link_nvidia_host_libraries.sh to handle duplicate libraries.
1 parent d4b716a commit f438313

3 files changed

Lines changed: 117 additions & 8 deletions

File tree

.github/workflows/tests_link_nvidia_host_libraries.yml

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ jobs:
120120
# List dir with libraries
121121
echo "Showing content of /tmp/nvidia_libs"
122122
echo "$(ls -l /tmp/nvidia_libs)"
123+
echo "Showing content of /tmp/nvidia_libs_duplicate"
124+
echo "$(ls -l /tmp/nvidia_libs_duplicate)"
123125
echo "Showing content of /opt/eessi/nvidia/x86_64/host"
124126
echo "$(ls -l /opt/eessi/nvidia/x86_64/host)"
125127
@@ -152,9 +154,9 @@ jobs:
152154
exit 1
153155
fi
154156
155-
# Verify it points to our mock library in /tmp/nvidia_libs
156-
if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != *"/tmp/nvidia_libs/"* ]]; then
157-
echo "Error: Symlink $lib_path points to $target, which is not in our mock directory"
157+
# Verify it points to our mock library in /tmp/nvidia_libs or /tmp/nvidia_libs_duplicate
158+
if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != "/tmp/nvidia_libs_duplicate/$lib"* ]]; then
159+
echo "Error: Symlink $lib_path points to $target, which is not in our mock directories"
158160
exit 1
159161
fi
160162
@@ -194,3 +196,44 @@ jobs:
194196
echo "$output" | grep "have already been linked" || { echo "Missing 'already linked' message"; exit 1; }
195197
196198
echo "Second normal run test passed"
199+
200+
- name: Test handling of duplicate libraries
  run: |
    echo ">>> Testing handling of duplicate libraries"

    # Remove existing symlinks to prepare for this test
    rm -rf /opt/eessi/nvidia/x86_64/host

    # Run the script with verbose mode.
    # The exit status is captured outside the command substitution: an
    # `exit` placed inside $(...) would only leave the subshell, and $output
    # would still be empty at that point.
    rc=0
    output=$(./scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh --verbose) || rc=$?

    echo "$output"

    if [ "$rc" -ne 0 ]; then
      echo "Script returned non-zero: $rc"
      exit 1
    fi

    host_dir=/opt/eessi/nvidia/x86_64/host

    # List all symlinks created in the host directory
    echo ">>> Listing symlinks in host directory"
    ls -la "$host_dir/"

    # libcuda.so and libnvidia-ml.so are offered twice by the mock ldconfig
    # (once per mock directory). Each must end up as exactly one symlink that
    # points into one of the two mock directories.
    for lib in libcuda.so libnvidia-ml.so; do
      # Require exactly one entry: zero means the library was dropped,
      # more than one means duplicates were linked.
      count=$(find "$host_dir/" -name "$lib" | wc -l)
      if [ "$count" -ne 1 ]; then
        echo "Error: Expected exactly one $lib symlink but found $count, which indicates a failure in handling duplicates"
        exit 1
      fi

      if [ ! -L "$host_dir/$lib" ]; then
        echo "Error: $host_dir/$lib is not a symlink"
        exit 1
      fi

      # Verify that the script picked one of the duplicates
      target=$(readlink "$host_dir/$lib")
      echo "$lib points to: $target"
      case "$target" in
        /tmp/nvidia_libs/*|/tmp/nvidia_libs_duplicate/*)
          ;;
        *)
          echo "Error: $lib points to $target, which is not in our mock directories"
          exit 1
          ;;
      esac
    done

    echo "Duplicate libraries test passed"

scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,41 @@ find_cuda_libraries_on_host() {
402402

403403
if [ -n "$matched" ]; then
404404
log_verbose "Found matches for ${library}: $matched"
405-
# Do not quote $matched, since it can contain multiple libraries split by \n!
406-
MATCHED_LIBRARIES+=($matched)
405+
406+
# Add every path in $matched to MATCHED_LIBRARIES, keeping at most one entry
# per library filename.
# $matched holds one path per line; reading it with `while IFS= read -r`
# keeps paths that contain whitespace or glob characters intact.
while IFS= read -r lib_path; do
    # Skip empty lines
    [ -z "$lib_path" ] && continue

    # Filename component of the path (parameter expansion avoids forking
    # `basename` for every comparison in the inner loop)
    lib_name=${lib_path##*/}
    log_verbose "Checking library $lib_name for duplicates"

    # Look for an already-recorded library with the same filename
    duplicate_found=0
    for idx in "${!MATCHED_LIBRARIES[@]}"; do
        existing_lib=${MATCHED_LIBRARIES[$idx]}
        if [ "${existing_lib##*/}" = "$lib_name" ]; then
            log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, new: $lib_path)"
            # Prioritize libraries in standard locations if possible
            if [[ "$lib_path" == "/usr/lib"* || "$lib_path" == "/lib"* ]]; then
                log_verbose "Prioritizing system library: $lib_path"
                # Overwrite the slot in place. A pattern substitution over the
                # whole array ("${MATCHED_LIBRARIES[@]/$existing_lib}") would
                # not drop the element: it leaves an empty string behind and
                # also truncates every other entry that contains the same
                # path as a substring (e.g. ".../libcuda.so.1" -> ".1").
                MATCHED_LIBRARIES[$idx]="$lib_path"
            fi
            duplicate_found=1
            break
        fi
    done

    # If no duplicate found, add this library
    if [ "$duplicate_found" -eq 0 ]; then
        MATCHED_LIBRARIES+=("$lib_path")
    fi
done <<< "$matched"
407440
else
408441
# There are some libraries, that weren't matched/found on the system
409442
log_verbose "No matches found for ${library}"
@@ -484,12 +517,30 @@ symlink_mode () {
484517
# Loop over each matched library
485518
for library in "${MATCHED_LIBRARIES[@]}"; do
486519
log_verbose "Linking library: ${library}"
520+
521+
# Name the link will get in the current directory
lib_name=$(basename "$library")

# A symlink of that name may be left over from an earlier run: keep it when
# it already points at this library, otherwise remove it so that it can be
# recreated below.
if [ -L "$lib_name" ]; then
    target=$(readlink "$lib_name")
    if [ "$target" != "$library" ]; then
        log_verbose "Symlink for $lib_name exists but points to wrong target: $target, updating..."
        rm "$lib_name"
    else
        log_verbose "Symlink for $lib_name already exists and points to correct target"
        continue
    fi
fi
487536

488537
# Create a symlink in the current directory
489538
# and check if the symlink was created successfully
490539
if ! ln -s "$library" .
491540
then
492-
fatal_error "Error: Failed to create symlink for library $library in $PWD"
541+
echo_yellow "Warning: Failed to create symlink for library $library in $PWD"
542+
# Continue instead of fatal_error to make the script more robust
543+
continue
493544
fi
494545
done
495546

tests/nvidia-libs/mock-nvidia-libs.sh

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#!/bin/bash
22
# Setup script to create fake NVIDIA libraries for testing
33

4-
# Create directory for fake NVIDIA libraries
4+
# Create directories for fake NVIDIA libraries
55
mkdir -p /tmp/nvidia_libs
6+
mkdir -p /tmp/nvidia_libs_duplicate
67

78
# Create common NVIDIA libraries with minimal content
89
libraries=(
@@ -27,6 +28,16 @@ for lib in "${libraries[@]}"; do
2728
base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/')
2829
ln -sf "/tmp/nvidia_libs/$lib" "/tmp/nvidia_libs/$base_lib"
2930
fi
31+
32+
# Create duplicate libraries in a different location
33+
if [[ "$lib" == "libcuda.so.1" || "$lib" == "libnvidia-ml.so.1" ]]; then
34+
echo "This is a duplicate $lib for testing purposes" > "/tmp/nvidia_libs_duplicate/$lib"
35+
chmod +x "/tmp/nvidia_libs_duplicate/$lib"
36+
if [[ "$lib" == *".so."* ]]; then
37+
base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/')
38+
ln -sf "/tmp/nvidia_libs_duplicate/$lib" "/tmp/nvidia_libs_duplicate/$base_lib"
39+
fi
40+
fi
3041
done
3142

3243
# Create a fake ldconfig cache that points to our fake libraries
@@ -38,11 +49,15 @@ cat > /tmp/ldconfig/ldconfig << 'EOF'
3849
# Fake ldconfig command that returns our fake libraries
3950
4051
if [ "$1" = "-p" ]; then
41-
# Simulate ldconfig -p output
52+
# Simulate ldconfig -p output with duplicate entries
4253
echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so.1"
54+
echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so.1"
4355
echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so"
56+
echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so"
4457
echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so.1"
58+
echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so.1"
4559
echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so"
60+
echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so"
4661
echo "libnvidia-ptxjitcompiler.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so.1"
4762
echo "libnvidia-ptxjitcompiler.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so"
4863
echo "libOpenCL.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libOpenCL.so.1"

0 commit comments

Comments
 (0)