From f438313bd1fe64d95ebbaf0ba4a7ef6ad1654897 Mon Sep 17 00:00:00 2001 From: Pavel Grochal Date: Thu, 20 Mar 2025 09:26:53 +0100 Subject: [PATCH 1/3] Updated link_nvidia_host_libraries.sh to handle duplicate libraries. --- .../tests_link_nvidia_host_libraries.yml | 49 +++++++++++++++- .../nvidia/link_nvidia_host_libraries.sh | 57 ++++++++++++++++++- tests/nvidia-libs/mock-nvidia-libs.sh | 19 ++++++- 3 files changed, 117 insertions(+), 8 deletions(-) diff --git a/.github/workflows/tests_link_nvidia_host_libraries.yml b/.github/workflows/tests_link_nvidia_host_libraries.yml index b602427eed..cf46f546d7 100644 --- a/.github/workflows/tests_link_nvidia_host_libraries.yml +++ b/.github/workflows/tests_link_nvidia_host_libraries.yml @@ -120,6 +120,8 @@ jobs: # List dir with libraries echo "Showing content of /tmp/nvidia_libs" echo "$(ls -l /tmp/nvidia_libs)" + echo "Showing content of /tmp/nvidia_libs_duplicate" + echo "$(ls -l /tmp/nvidia_libs_duplicate)" echo "Showing content of /opt/eessi/nvidia/x86_64/host" echo "$(ls -l /opt/eessi/nvidia/x86_64/host)" @@ -152,9 +154,9 @@ jobs: exit 1 fi - # Verify it points to our mock library in /tmp/nvidia_libs - if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != *"/tmp/nvidia_libs/"* ]]; then - echo "Error: Symlink $lib_path points to $target, which is not in our mock directory" + # Verify it points to our mock library in /tmp/nvidia_libs or /tmp/nvidia_libs_duplicate + if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != "/tmp/nvidia_libs_duplicate/$lib"* ]]; then + echo "Error: Symlink $lib_path points to $target, which is not in our mock directories" exit 1 fi @@ -194,3 +196,44 @@ jobs: echo "$output" | grep "have already been linked" || { echo "Missing 'already linked' message"; exit 1; } echo "Second normal run test passed" + + - name: Test handling of duplicate libraries + run: | + echo ">>> Testing handling of duplicate libraries" + + # Remove existing symlinks to prepare for this test + rm -rf 
/opt/eessi/nvidia/x86_64/host + + # Run the script with verbose mode + output=$(./scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh --verbose || { echo "Script returned non-zero: $?"; echo $output; exit 1; }) + + echo "$output" + + # Check if libraries with duplicates were correctly handled + # For libcuda.so and libnvidia-ml.so, which have duplicates, there should be exactly one symlink + + # List all symlinks created in the host directory + echo ">>> Listing symlinks in host directory" + ls -la /opt/eessi/nvidia/x86_64/host/ + + # Check for duplicate entries - there should be only one libcuda.so and one libnvidia-ml.so + duplicates=$(find /opt/eessi/nvidia/x86_64/host/ -name "libcuda.so" | wc -l) + if [ "$duplicates" -gt 1 ]; then + echo "Error: Found multiple libcuda.so symlinks, which indicates a failure in handling duplicates" + exit 1 + fi + + duplicates=$(find /opt/eessi/nvidia/x86_64/host/ -name "libnvidia-ml.so" | wc -l) + if [ "$duplicates" -gt 1 ]; then + echo "Error: Found multiple libnvidia-ml.so symlinks, which indicates a failure in handling duplicates" + exit 1 + fi + + # Verify that the script correctly prioritized one of the duplicates + libcuda_target=$(readlink /opt/eessi/nvidia/x86_64/host/libcuda.so) + echo "libcuda.so points to: $libcuda_target" + + libnvidia_ml_target=$(readlink /opt/eessi/nvidia/x86_64/host/libnvidia-ml.so) + echo "libnvidia-ml.so points to: $libnvidia_ml_target" + + echo "Duplicate libraries test passed" \ No newline at end of file diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index dbc672b827..fb3a120961 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -402,8 +402,41 @@ find_cuda_libraries_on_host() { if [ -n "$matched" ]; then log_verbose "Found matches for ${library}: $matched" - # Do not quote $matched, since it can contain multiple libraries 
split by \n! - MATCHED_LIBRARIES+=($matched) + + # Process each matched library and avoid duplicates by filename + # Use a `while read` loop over `<<< "$matched"` to handle whitespace and special characters. + while IFS= read -r lib_path; do + # Skip empty lines + [ -z "$lib_path" ] && continue + + # Extract just the filename from the path + lib_name=$(basename "$lib_path") + echo "Checking library $lib_name for duplicates" + + # Check if we already have this library filename in our matched libraries + duplicate_found=0 + for existing_lib in "${MATCHED_LIBRARIES[@]}"; do + existing_name=$(basename "$existing_lib") + if [ "$existing_name" = "$lib_name" ]; then + log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, new: $lib_path)" + # Prioritize libraries in standard locations if possible + if [[ "$lib_path" == "/usr/lib"* || "$lib_path" == "/lib"* ]]; then + log_verbose "Prioritizing system library: $lib_path" + # Remove the existing entry + MATCHED_LIBRARIES=("${MATCHED_LIBRARIES[@]/$existing_lib}") + # Add the new one + MATCHED_LIBRARIES+=("$lib_path") + fi + duplicate_found=1 + break + fi + done + + # If no duplicate found, add this library + if [ "$duplicate_found" -eq 0 ]; then + MATCHED_LIBRARIES+=("$lib_path") + fi + done <<< "$matched" else # There are some libraries, that weren't matched/found on the system log_verbose "No matches found for ${library}" @@ -484,12 +517,30 @@ symlink_mode () { # Loop over each matched library for library in "${MATCHED_LIBRARIES[@]}"; do log_verbose "Linking library: ${library}" + + # Get just the library filename + lib_name=$(basename "$library") + + # Check if the symlink already exists + if [ -L "$lib_name" ]; then + # Check if it's pointing to the same target + target=$(readlink "$lib_name") + if [ "$target" = "$library" ]; then + log_verbose "Symlink for $lib_name already exists and points to correct target" + continue + else + log_verbose "Symlink for $lib_name exists but points to wrong target: $target, 
updating..." + rm "$lib_name" + fi + fi # Create a symlink in the current directory # and check if the symlink was created successfully if ! ln -s "$library" . then - fatal_error "Error: Failed to create symlink for library $library in $PWD" + echo_yellow "Warning: Failed to create symlink for library $library in $PWD" + # Continue instead of fatal_error to make the script more robust + continue fi done diff --git a/tests/nvidia-libs/mock-nvidia-libs.sh b/tests/nvidia-libs/mock-nvidia-libs.sh index f18c988918..cdf5f5ca9a 100644 --- a/tests/nvidia-libs/mock-nvidia-libs.sh +++ b/tests/nvidia-libs/mock-nvidia-libs.sh @@ -1,8 +1,9 @@ #!/bin/bash # Setup script to create fake NVIDIA libraries for testing -# Create directory for fake NVIDIA libraries +# Create directories for fake NVIDIA libraries mkdir -p /tmp/nvidia_libs +mkdir -p /tmp/nvidia_libs_duplicate # Create common NVIDIA libraries with minimal content libraries=( @@ -27,6 +28,16 @@ for lib in "${libraries[@]}"; do base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/') ln -sf "/tmp/nvidia_libs/$lib" "/tmp/nvidia_libs/$base_lib" fi + + # Create duplicate libraries in a different location + if [[ "$lib" == "libcuda.so.1" || "$lib" == "libnvidia-ml.so.1" ]]; then + echo "This is a duplicate $lib for testing purposes" > "/tmp/nvidia_libs_duplicate/$lib" + chmod +x "/tmp/nvidia_libs_duplicate/$lib" + if [[ "$lib" == *".so."* ]]; then + base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/') + ln -sf "/tmp/nvidia_libs_duplicate/$lib" "/tmp/nvidia_libs_duplicate/$base_lib" + fi + fi done # Create a fake ldconfig cache that points to our fake libraries @@ -38,11 +49,15 @@ cat > /tmp/ldconfig/ldconfig << 'EOF' # Fake ldconfig command that returns our fake libraries if [ "$1" = "-p" ]; then - # Simulate ldconfig -p output + # Simulate ldconfig -p output with duplicate entries echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so.1" + echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so.1" echo 
"libcuda.so (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so" + echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so" echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so.1" + echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so.1" echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so" + echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so" echo "libnvidia-ptxjitcompiler.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so.1" echo "libnvidia-ptxjitcompiler.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so" echo "libOpenCL.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libOpenCL.so.1" From 453fc93dbf956bc2174c2108930cca8b8dfa0d33 Mon Sep 17 00:00:00 2001 From: Pavel Grochal Date: Thu, 20 Mar 2025 10:40:18 +0100 Subject: [PATCH 2/3] Updated link_nvidia_host_libraries.sh - removed unnecessary tests. --- .../tests_link_nvidia_host_libraries.yml | 42 +------------------ 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/.github/workflows/tests_link_nvidia_host_libraries.yml b/.github/workflows/tests_link_nvidia_host_libraries.yml index cf46f546d7..22ebc21084 100644 --- a/.github/workflows/tests_link_nvidia_host_libraries.yml +++ b/.github/workflows/tests_link_nvidia_host_libraries.yml @@ -196,44 +196,4 @@ jobs: echo "$output" | grep "have already been linked" || { echo "Missing 'already linked' message"; exit 1; } echo "Second normal run test passed" - - - name: Test handling of duplicate libraries - run: | - echo ">>> Testing handling of duplicate libraries" - - # Remove existing symlinks to prepare for this test - rm -rf /opt/eessi/nvidia/x86_64/host - - # Run the script with verbose mode - output=$(./scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh --verbose || { echo "Script returned non-zero: $?"; echo $output; exit 1; }) - - echo "$output" - - # Check if libraries with duplicates were correctly handled - 
# For libcuda.so and libnvidia-ml.so, which have duplicates, there should be exactly one symlink - - # List all symlinks created in the host directory - echo ">>> Listing symlinks in host directory" - ls -la /opt/eessi/nvidia/x86_64/host/ - - # Check for duplicate entries - there should be only one libcuda.so and one libnvidia-ml.so - duplicates=$(find /opt/eessi/nvidia/x86_64/host/ -name "libcuda.so" | wc -l) - if [ "$duplicates" -gt 1 ]; then - echo "Error: Found multiple libcuda.so symlinks, which indicates a failure in handling duplicates" - exit 1 - fi - - duplicates=$(find /opt/eessi/nvidia/x86_64/host/ -name "libnvidia-ml.so" | wc -l) - if [ "$duplicates" -gt 1 ]; then - echo "Error: Found multiple libnvidia-ml.so symlinks, which indicates a failure in handling duplicates" - exit 1 - fi - - # Verify that the script correctly prioritized one of the duplicates - libcuda_target=$(readlink /opt/eessi/nvidia/x86_64/host/libcuda.so) - echo "libcuda.so points to: $libcuda_target" - - libnvidia_ml_target=$(readlink /opt/eessi/nvidia/x86_64/host/libnvidia-ml.so) - echo "libnvidia-ml.so points to: $libnvidia_ml_target" - - echo "Duplicate libraries test passed" \ No newline at end of file + \ No newline at end of file From c87f72da55cac7ae9ce9a0f98beaea071fe96ec1 Mon Sep 17 00:00:00 2001 From: Pavel Grochal Date: Wed, 26 Mar 2025 22:10:26 +0100 Subject: [PATCH 3/3] Updated link script to fix remarks. 
--- .../nvidia/link_nvidia_host_libraries.sh | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index fb3a120961..27447341fd 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -418,15 +418,8 @@ find_cuda_libraries_on_host() { for existing_lib in "${MATCHED_LIBRARIES[@]}"; do existing_name=$(basename "$existing_lib") if [ "$existing_name" = "$lib_name" ]; then - log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, new: $lib_path)" - # Prioritize libraries in standard locations if possible - if [[ "$lib_path" == "/usr/lib"* || "$lib_path" == "/lib"* ]]; then - log_verbose "Prioritizing system library: $lib_path" - # Remove the existing entry - MATCHED_LIBRARIES=("${MATCHED_LIBRARIES[@]/$existing_lib}") - # Add the new one - MATCHED_LIBRARIES+=("$lib_path") - fi + log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, currently processed: $lib_path)" + log_verbose "Discarting $lib_path" duplicate_found=1 break fi @@ -538,9 +531,7 @@ symlink_mode () { # and check if the symlink was created successfully if ! ln -s "$library" . then - echo_yellow "Warning: Failed to create symlink for library $library in $PWD" - # Continue instead of fatal_error to make the script more robust - continue + fatal_error "Error: Failed to create symlink for library $library in $PWD" fi done