Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/tests_link_nvidia_host_libraries.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ jobs:
# List dir with libraries
echo "Showing content of /tmp/nvidia_libs"
echo "$(ls -l /tmp/nvidia_libs)"
echo "Showing content of /tmp/nvidia_libs_duplicate"
echo "$(ls -l /tmp/nvidia_libs_duplicate)"
echo "Showing content of /opt/eessi/nvidia/x86_64/host"
echo "$(ls -l /opt/eessi/nvidia/x86_64/host)"

Expand Down Expand Up @@ -152,9 +154,9 @@ jobs:
exit 1
fi

# Verify it points to our mock library in /tmp/nvidia_libs
if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != *"/tmp/nvidia_libs/"* ]]; then
echo "Error: Symlink $lib_path points to $target, which is not in our mock directory"
# Verify it points to our mock library in /tmp/nvidia_libs or /tmp/nvidia_libs_duplicate
if [[ "$target" != "/tmp/nvidia_libs/$lib"* && "$target" != "/tmp/nvidia_libs_duplicate/$lib"* ]]; then
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we know exactly which one it should point to? And there should be something in the output that says the other one was filtered

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, since it is a duplicate.
The real-world example was that /lib was the same as /lib64 on Azure;
in this case you don't care which location the lib is linked from.

echo "Error: Symlink $lib_path points to $target, which is not in our mock directories"
exit 1
fi

Expand Down Expand Up @@ -194,3 +196,4 @@ jobs:
echo "$output" | grep "have already been linked" || { echo "Missing 'already linked' message"; exit 1; }

echo "Second normal run test passed"

46 changes: 44 additions & 2 deletions scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,34 @@ find_cuda_libraries_on_host() {

if [ -n "$matched" ]; then
log_verbose "Found matches for ${library}: $matched"
# Do not quote $matched, since it can contain multiple libraries split by \n!
MATCHED_LIBRARIES+=($matched)

# Process each matched library and avoid duplicates by filename
# Use `while IFS= read -r ...; done <<< "$matched"` to handle whitespace and special characters.
while IFS= read -r lib_path; do
# Skip empty lines
[ -z "$lib_path" ] && continue

# Extract just the filename from the path
lib_name=$(basename "$lib_path")
echo "Checking library $lib_name for duplicates"

# Check if we already have this library filename in our matched libraries
duplicate_found=0
for existing_lib in "${MATCHED_LIBRARIES[@]}"; do
existing_name=$(basename "$existing_lib")
if [ "$existing_name" = "$lib_name" ]; then
log_verbose "Duplicate library found: $lib_name (existing: $existing_lib, currently processed: $lib_path)"
log_verbose "Discarting $lib_path"
duplicate_found=1
break
fi
done

# If no duplicate found, add this library
if [ "$duplicate_found" -eq 0 ]; then
MATCHED_LIBRARIES+=("$lib_path")
fi
done <<< "$matched"
else
# There are some libraries, that weren't matched/found on the system
log_verbose "No matches found for ${library}"
Expand Down Expand Up @@ -484,6 +510,22 @@ symlink_mode () {
# Loop over each matched library
for library in "${MATCHED_LIBRARIES[@]}"; do
log_verbose "Linking library: ${library}"

# Get just the library filename
lib_name=$(basename "$library")

# Check if the symlink already exists
if [ -L "$lib_name" ]; then
# Check if it's pointing to the same target
target=$(readlink "$lib_name")
if [ "$target" = "$library" ]; then
log_verbose "Symlink for $lib_name already exists and points to correct target"
continue
else
log_verbose "Symlink for $lib_name exists but points to wrong target: $target, updating..."
rm "$lib_name"
fi
fi

# Create a symlink in the current directory
# and check if the symlink was created successfully
Expand Down
19 changes: 17 additions & 2 deletions tests/nvidia-libs/mock-nvidia-libs.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/bin/bash
# Setup script to create fake NVIDIA libraries for testing

# Create directory for fake NVIDIA libraries
# Create directories for fake NVIDIA libraries
mkdir -p /tmp/nvidia_libs
mkdir -p /tmp/nvidia_libs_duplicate

# Create common NVIDIA libraries with minimal content
libraries=(
Expand All @@ -27,6 +28,16 @@ for lib in "${libraries[@]}"; do
base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/')
ln -sf "/tmp/nvidia_libs/$lib" "/tmp/nvidia_libs/$base_lib"
fi

# Create duplicate libraries in a different location
if [[ "$lib" == "libcuda.so.1" || "$lib" == "libnvidia-ml.so.1" ]]; then
echo "This is a duplicate $lib for testing purposes" > "/tmp/nvidia_libs_duplicate/$lib"
chmod +x "/tmp/nvidia_libs_duplicate/$lib"
if [[ "$lib" == *".so."* ]]; then
base_lib=$(echo "$lib" | sed 's/\.so\.[0-9]*/.so/')
ln -sf "/tmp/nvidia_libs_duplicate/$lib" "/tmp/nvidia_libs_duplicate/$base_lib"
fi
fi
done

# Create a fake ldconfig cache that points to our fake libraries
Expand All @@ -38,11 +49,15 @@ cat > /tmp/ldconfig/ldconfig << 'EOF'
# Fake ldconfig command that returns our fake libraries

if [ "$1" = "-p" ]; then
# Simulate ldconfig -p output
# Simulate ldconfig -p output with duplicate entries
echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so.1"
echo "libcuda.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so.1"
echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs/libcuda.so"
echo "libcuda.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libcuda.so"
echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so.1"
echo "libnvidia-ml.so.1 (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so.1"
echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ml.so"
echo "libnvidia-ml.so (libc6,x86-64) => /tmp/nvidia_libs_duplicate/libnvidia-ml.so"
echo "libnvidia-ptxjitcompiler.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so.1"
echo "libnvidia-ptxjitcompiler.so (libc6,x86-64) => /tmp/nvidia_libs/libnvidia-ptxjitcompiler.so"
echo "libOpenCL.so.1 (libc6,x86-64) => /tmp/nvidia_libs/libOpenCL.so.1"
Expand Down
Loading