diff --git a/.github/workflows/tests_link_nvidia_host_libraries.yml b/.github/workflows/tests_link_nvidia_host_libraries.yml index b602427eed..22ebc21084 100644 --- a/.github/workflows/tests_link_nvidia_host_libraries.yml +++ b/.github/workflows/tests_link_nvidia_host_libraries.yml @@ -120,6 +120,8 @@ jobs: # List dir with libraries echo "Showing content of /tmp/nvidia_libs" echo "$(ls -l /tmp/nvidia_libs)" + echo "Showing content of /tmp/nvidia_libs_duplicate" + echo "$(ls -l /tmp/nvidia_libs_duplicate)" echo "Showing content of /opt/eessi/nvidia/x86_64/host" echo "$(ls -l /opt/eessi/nvidia/x86_64/host)" @@ -152,9 +154,9 @@ jobs: exit 1 fi - # Verify it points to our mock library in /tmp/nvidia_libs - if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != *"/tmp/nvidia_libs/"* ]]; then - echo "Error: Symlink $lib_path points to $target, which is not in our mock directory" + # Verify it points to our mock library in /tmp/nvidia_libs or /tmp/nvidia_libs_duplicate + if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != "/tmp/nvidia_libs_duplicate/$lib"* ]]; then + echo "Error: Symlink $lib_path points to $target, which is not in our mock directories" exit 1 fi @@ -194,3 +196,4 @@ jobs: echo "$output" | grep "have already been linked" || { echo "Missing 'already linked' message"; exit 1; } echo "Second normal run test passed" + \ No newline at end of file diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index dbc672b827..27447341fd 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -402,8 +402,34 @@ find_cuda_libraries_on_host() { if [ -n "$matched" ]; then log_verbose "Found matches for ${library}: $matched" - # Do not quote $matched, since it can contain multiple libraries split by \n! 
- MATCHED_LIBRARIES+=($matched) + + # Process each matched library and avoid duplicates by filename + # Uses `while read <<< "$matched"` to handle whitespace and special characters. + while IFS= read -r lib_path; do + # Skip empty lines + [ -z "$lib_path" ] && continue + + # Extract just the filename from the path + lib_name=$(basename "$lib_path") + echo "Checking library $lib_name for duplicates" + + # Check if we already have this library filename in our matched libraries + duplicate_found=0 + for existing_lib in "${MATCHED_LIBRARIES[@]}"; do + existing_name=$(basename "$existing_lib") + if [ "$existing_name" = "$lib_name" ]; then + log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, currently processed: $lib_path)" + log_verbose "Discarting $lib_path" + duplicate_found=1 + break + fi + done + + # If no duplicate found, add this library + if [ "$duplicate_found" -eq 0 ]; then + MATCHED_LIBRARIES+=("$lib_path") + fi + done <<< "$matched" else # There are some libraries, that weren't matched/found on the system log_verbose "No matches found for ${library}" @@ -484,6 +510,22 @@ symlink_mode () { # Loop over each matched library for library in "${MATCHED_LIBRARIES[@]}"; do log_verbose "Linking library: ${library}" + + # Get just the library filename + lib_name=$(basename "$library") + + # Check if the symlink already exists + if [ -L "$lib_name" ]; then + # Check if it's pointing to the same target + target=$(readlink "$lib_name") + if [ "$target" = "$library" ]; then + log_verbose "Symlink for $lib_name already exists and points to correct target" + continue + else + log_verbose "Symlink for $lib_name exists but points to wrong target: $target, updating..." 
+ rm "$lib_name" + fi + fi # Create a symlink in the current directory # and check if the symlink was created successfully diff --git a/tests/nvidia-libs/mock-nvidia-libs.sh b/tests/nvidia-libs/mock-nvidia-libs.sh index f18c988918..cdf5f5ca9a 100644 --- a/tests/nvidia-libs/mock-nvidia-libs.sh +++ b/tests/nvidia-libs/mock-nvidia-libs.sh @@ -1,8 +1,9 @@ #!/bin/bash # Setup script to create fake NVIDIA libraries for testing -# Create directory for fake NVIDIA libraries +# Create directories for fake NVIDIA libraries mkdir -p /tmp/nvidia_libs +mkdir -p /tmp/nvidia_libs_duplicate # Create common NVIDIA libraries with minimal content libraries=( @@ -27,6 +28,16 @@ for lib in "${libraries[@]}"; do base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/') ln -sf "/tmp/nvidia_libs/$lib" "/tmp/nvidia_libs/$base_lib" fi + + # Create duplicate libraries in a different location + if [[ "$lib" == "libcuda.so.1" || "$lib" == "libnvidia-ml.so.1" ]]; then + echo "This is a duplicate $lib for testing purposes" > "/tmp/nvidia_libs_duplicate/$lib" + chmod +x "/tmp/nvidia_libs_duplicate/$lib" + if [[ "$lib" == *".so."* ]]; then + base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/') + ln -sf "/tmp/nvidia_libs_duplicate/$lib" "/tmp/nvidia_libs_duplicate/$base_lib" + fi + fi done # Create a fake ldconfig cache that points to our fake libraries @@ -38,11 +49,15 @@ cat > /tmp/ldconfig/ldconfig << 'EOF' # Fake ldconfig command that returns our fake libraries if [ "$1" = "-p" ]; then - # Simulate ldconfig -p output + # Simulate ldconfig -p output with duplicate entries echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so.1" + echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so.1" echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so" + echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so" echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so.1" + echo "libnvidia-ml.so.1 (libc6,x86-64) => 
/tmp/nvidia_libs_duplicate/libnvidia-ml.so.1" echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so" + echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so" echo "libnvidia-ptxjitcompiler.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so.1" echo "libnvidia-ptxjitcompiler.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so" echo "libOpenCL.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libOpenCL.so.1"