Skip to content

Commit 4d286cf

Browse files
authored
Merge pull request #1205 from ocaisa/rebuild_lightgbm
2 parents 12c3099 + 46975c0 commit 4d286cf

3 files changed

Lines changed: 58 additions & 5 deletions

File tree

.github/workflows/scripts/compare_to_generic.sh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,32 @@ esac
2323
source_of_truth_modules="$base_dir/$source_of_truth/$modules_subdir"
2424
arch_modules="$base_dir/$target_arch/$modules_subdir"
2525
echo "Comparing $arch_modules to $source_of_truth_modules"
26-
python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules
26+
27+
# Compare the CPU module tree of the target architecture ($arch_modules)
# against the source-of-truth tree ($source_of_truth_modules) using
# compare_stacks.py. If the script exits non-zero, report it on stderr and
# abort with exit status 1.
# All expansions are quoted so that a path containing whitespace or glob
# characters reaches the Python script as exactly one argument.
if ! python3 "$script_dir/compare_stacks.py" "$source_of_truth_modules" "$arch_modules"; then
  echo "Warning: Comparison failed for CPU stacks" >&2
  exit 1
fi
31+
32+
# Also compare NVIDIA GPU software stacks
33+
# Also compare the NVIDIA GPU software stacks: one comparison per compute
# capability listed (whitespace-separated) in CUDA_COMPUTE_CAPABILITIES.
# Every listed capability is checked even after an earlier one failed, so a
# single run reports all failing comparisons; the script then exits with
# status 1 if at least one comparison failed.
# ":-" keeps the emptiness test safe if the script is ever run under "set -u".
if [[ -n "${CUDA_COMPUTE_CAPABILITIES:-}" ]]; then
  # Split the whitespace-separated list into an array.
  read -ra compute_capabilities <<< "$CUDA_COMPUTE_CAPABILITIES"
  # "[*]" joins the elements into a single word for the message; "[@]" inside
  # a string would expand to several separate arguments (ShellCheck SC2145).
  echo "Also comparing CUDA-enabled software stacks (for compute capabilities: ${compute_capabilities[*]})"

  # The reference tree does not depend on the loop variable, so compute it once.
  # NOTE(review): the reference is pinned to cc80 for every target compute
  # capability - presumably intentional; confirm.
  source_of_truth_modules="$base_dir/$source_of_truth/accel/nvidia/cc80/$modules_subdir"

  # Tracks whether any per-capability comparison failed.
  any_failure=0
  for cc in "${compute_capabilities[@]}"; do
    arch_modules="$base_dir/$target_arch/accel/nvidia/$cc/$modules_subdir"
    echo "Comparing $arch_modules to $source_of_truth_modules"
    # Quoted so that each path is passed as exactly one argument.
    if ! python3 "$script_dir/compare_stacks.py" "$source_of_truth_modules" "$arch_modules"; then
      echo "Warning: Comparison failed for compute capability $cc" >&2
      any_failure=1
    fi
  done

  if (( any_failure )); then
    echo "One or more CUDA software stack comparisons failed." >&2
    exit 1
  fi
else
  echo "CUDA_COMPUTE_CAPABILITIES is not set or is empty, not checking NVIDIA software stacks"
fi

.github/workflows/test_compare_stacks.yml

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,17 @@ on:
88
permissions:
99
contents: read # to fetch code (actions/checkout)
1010
env:
11-
EESSI_ACCELERATOR_TARGETS: |
11+
CUDA_COMPUTE_CAPABILITIES_YAML: |
12+
# Provide a default set of compute capabilities
13+
default:
14+
- cc70
15+
- cc80
16+
- cc90
17+
# and then allow for special cases for specific architectures
1218
x86_64/amd/zen2:
13-
- nvidia/cc80
14-
x86_64/amd/zen3:
15-
- nvidia/cc80
19+
- cc70
20+
- cc80
21+
- cc90
1622
jobs:
1723
compare_stacks:
1824
runs-on: ubuntu-24.04
@@ -53,4 +59,11 @@ jobs:
5359
5460
# Compare the requested architecture to the generic stack
5561
# (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all)
62+
# and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES
63+
# (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all)
64+
65+
# Parse the yaml that makes the compute capabilities arch-dependent
66+
CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${{matrix.COMPARISON_ARCH}}\" // .default | .[]" | tr '\n' ' ')
67+
export CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space
68+
5669
.github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# We'll rebuild all CUDA software, for various reasons
2+
# 1. We now have a proper CUDA sanity check, and if anything was 'wrong' with our current CUDA installs, we'd like
3+
# to know about it
4+
# 2. The PR implementing a CI check for differences between officially supported CUDA Compute Capabilities shows
5+
# that there are a lot of missing installations (https://github.com/EESSI/software-layer/pull/1087). A rebuild PR like
6+
# this will have the convenient side effect of filling all those holes
7+
easyconfigs:
8+
- LightGBM-4.5.0-foss-2023a-CUDA-12.1.1.eb:
9+
options:
10+
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/24023
11+
from-commit: 853cdf7a8a3912aa0e55367b2b4451ebff00e13b
12+
cuda-sanity-check-accept-missing-ptx: True

0 commit comments

Comments
 (0)