Skip to content

Commit b640b0c

Browse files
authored
Merge branch 'main' into bot_script_ci
2 parents 44d8c90 + c4ed80c commit b640b0c

5 files changed

Lines changed: 110 additions & 23 deletions

File tree

create_lmodsitepackage.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@
172172
else
173173
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
174174
end
175-
local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt"
176175
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
177176
local cudaDriverExists = isFile(cudaDriverFile)
178177
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
@@ -189,28 +188,46 @@
189188
else
190189
-- CUDA driver exists, now we check its version to see if an update is needed
191190
if cudaDriverExists then
192-
local cudaVersion = read_file(cudaVersionFile)
193-
if not cudaVersion then
194-
LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs)
191+
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
192+
if not cudaVersion or cudaVersion == "" then
193+
local eessi_prefix = os.getenv("EESSI_PREFIX")
194+
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
195+
source_sh("bash", script)
195196
end
197+
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
196198
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
197-
-- driver CUDA versions don't give a patch version for CUDA
198-
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
199-
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
200-
local driver_libs_need_update = false
201-
if major < major_req then
202-
driver_libs_need_update = true
203-
elseif major == major_req then
204-
if minor < minor_req then
199+
-- Account for the fact that the script sourced above was designed to never return a non-zero exit code,
200+
-- even if it fails to set EESSI_CUDA_DRIVER_VERSION
201+
-- Essentially, we handle that case here by printing a warning, which can be suppressed
202+
if not cudaVersion or cudaVersion == "" then
203+
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
204+
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
205+
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
206+
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
207+
warn = warn .. "as expected. Export " .. suppress_var .. "=1"
208+
local suppress_warn = os.getenv(suppress_var)
209+
if not suppress_warn or suppress_warn == 1 then
210+
LmodWarning(warn)
211+
end
212+
else
213+
-- driver CUDA versions don't give a patch version for CUDA
214+
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
215+
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
216+
local driver_libs_need_update = false
217+
if tonumber(major) < tonumber(major_req) then
205218
driver_libs_need_update = true
219+
elseif tonumber(major) == tonumber(major_req) then
220+
if tonumber(minor) < tonumber(minor_req) then
221+
driver_libs_need_update = true
222+
end
223+
end
224+
if driver_libs_need_update == true then
225+
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
226+
advice = advice .. "Please update your CUDA driver libraries and then "
227+
advice = advice .. "let EESSI know about the update.\\n"
228+
advice = advice .. refer_to_docs
229+
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
206230
end
207-
end
208-
if driver_libs_need_update == true then
209-
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
210-
advice = advice .. "Please update your CUDA driver libraries and then "
211-
advice = advice .. "let EESSI know about the update.\\n"
212-
advice = advice .. refer_to_docs
213-
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
214231
end
215232
end
216233
end

eb_hooks.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,9 @@ def pre_prepare_hook(self, *args, **kwargs):
449449
# Always trigger this, regardless of ec.name
450450
pre_prepare_hook_unsupported_modules(self, *args, **kwargs)
451451

452+
# Always trigger this, regardless of ec.name
453+
pre_prepare_hook_cuda_dependant(self, *args, **kwargs)
454+
452455

453456
def post_prepare_hook_gcc_prefixed_ld_rpath_wrapper(self, *args, **kwargs):
454457
"""
@@ -910,6 +913,20 @@ def post_easyblock_hook_copy_easybuild_subdir(self, *args, **kwargs):
910913
copy_dir(app_easybuild_dir, app_reprod_dir)
911914

912915

916+
def pre_prepare_hook_cuda_dependant(self, *args, **kwargs):
    """
    Replace family-specific CUDA compute capabilities when CUDA 12.8.0 is a (build) dependency.

    CUDA 12.8.0 doesn't support the 10.0f and 12.0f targets, only 10.0 and 12.0. If the easyconfig
    depends on CUDA 12.8.0 and the 'cuda_compute_capabilities' build option contains '10.0f' and/or
    '12.0f', those entries are rewritten to '10.0' / '12.0'; every other entry is kept as is.
    In all other situations the build option is left untouched.
    """
    cuda_version = get_dependency_software_version("CUDA", ec=self.cfg, check_deps=True, check_builddeps=True)
    if not cuda_version or cuda_version != '12.8.0':
        return

    capabilities = build_option('cuda_compute_capabilities')
    if not capabilities:
        return

    family_suffixed = ['10.0f', '12.0f']
    # only touch the build option when there is actually something to rewrite
    if not any(target in capabilities for target in family_suffixed):
        return

    rewritten = []
    for capability in capabilities:
        if capability in family_suffixed:
            capability = capability.replace('.0f', '.0')
        rewritten.append(capability)
    update_build_option('cuda_compute_capabilities', rewritten)
928+
929+
913930
def pre_prepare_hook_cudnn(self, *args, **kwargs):
914931
"""
915932
cuDNN is a binary install, that doesn't always have the device code for the suffixed CUDA
@@ -1110,6 +1127,31 @@ def pre_configure_hook_score_p(self, *args, **kwargs):
11101127
raise EasyBuildError("Score-P-specific hook triggered for non-Score-P easyconfig?!")
11111128

11121129

1130+
def pre_configure_hook_dyninst(self, *args, **kwargs):
    """
    Pre-configure hook for Dyninst
    - adds -DLibIberty_ROOT_DIR, -DLibIberty_INCLUDE_DIRS and -DLibIberty_LIBRARIES to configopts,
      all pointing into the single binutils directory found below $EPREFIX (compat layer)
    - raises EasyBuildError when triggered for other software, or when the glob for the binutils
      directory does not yield exactly one match
    """
    # guard clause: this hook is only meant for Dyninst
    if self.name != 'Dyninst':
        raise EasyBuildError("Dyninst-specific hook triggered for non-Dyninst easyconfig?!")

    # determine path to Prefix installation in compat layer via $EPREFIX
    eprefix = get_eessi_envvar('EPREFIX')

    glob_pattern = os.path.join(eprefix, 'usr', 'lib*', 'binutils', '*-linux-gnu', '2.*')
    candidates = glob.glob(glob_pattern)
    if len(candidates) != 1:
        raise EasyBuildError("Failed to isolate path for binutils libraries using %s, got %s",
                             glob_pattern, candidates)

    binutils_dir = candidates[0]
    print_msg("Defining LibIberty variables for Dyninst...")
    libiberty_defines = (
        ('-DLibIberty_ROOT_DIR=', binutils_dir),
        ('-DLibIberty_INCLUDE_DIRS=', os.path.join(binutils_dir, 'include')),
        ('-DLibIberty_LIBRARIES=', os.path.join(binutils_dir, 'libiberty.a')),
    )
    for flag, value in libiberty_defines:
        self.cfg.update('configopts', flag + value)
1153+
1154+
11131155
def pre_configure_hook_extrae(self, *args, **kwargs):
11141156
"""
11151157
Pre-configure hook for Extrae
@@ -1987,6 +2029,7 @@ def pre_run_shell_cmd_hook(cmd, work_dir=None, **kwargs):
19872029
'WRF': pre_configure_hook_wrf_aarch64,
19882030
'LAMMPS': pre_configure_hook_LAMMPS_zen4_and_aarch64_cuda,
19892031
'Score-P': pre_configure_hook_score_p,
2032+
'Dyninst': pre_configure_hook_dyninst,
19902033
'CMake': pre_configure_hook_cmake_system,
19912034
}
19922035

eessi_container.sh

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ display_help() {
113113
echo " when a directory is provided, the format of the"
114114
echo " tarball's name will be {REPO_ID}-{TIMESTAMP}.tgz"
115115
echo " [default: not set]"
116+
echo " -S | --sandbox - use sandbox mode (i.e. convert .sif image to sandbox and then run"
117+
echo " it instead)"
118+
echo " [default: not set]"
116119
echo " -v | --verbose - display more information [default: false]"
117120
echo " -x | --http-proxy URL - provides URL for the env variable http_proxy"
118121
echo " [default: not set]; uses env var \$http_proxy if set"
@@ -275,6 +278,10 @@ while [[ $# -gt 0 ]]; do
275278
SAVE="$2"
276279
shift 2
277280
;;
281+
-S|--sandbox)
282+
SANDBOX=1
283+
shift 1
284+
;;
278285
-u|--resume)
279286
RESUME="$2"
280287
shift 2
@@ -1039,10 +1046,23 @@ for arg in "${PASS_THROUGH[@]}"; do
10391046
ADDITIONAL_CONTAINER_OPTIONS+=(${arg})
10401047
done
10411048

1042-
echo "Launching container with command (next line):"
1043-
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_CONTAINER_OPTIONS[@]} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
1044-
singularity ${RUN_QUIET} ${MODE} "${ADDITIONAL_CONTAINER_OPTIONS[@]}" "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
1045-
exit_code=$?
1049+
# EESSI_SINGULARITY_SANDBOX is an environment variable (typically set in site_config.sh, if needed)
1050+
if [[ -n "${EESSI_SINGULARITY_SANDBOX}" || ${SANDBOX} -eq 1 ]]; then
1051+
# running from a sandbox image is more robust during the cleanup phase at the end
1052+
CONTAINER_SANDBOX="${CONTAINER%.sif}.sandbox"
1053+
echo "Building a sandbox image with command (next line):"
1054+
echo "singularity build --sandbox --force ${CONTAINER_SANDBOX} ${CONTAINER}"
1055+
singularity build --sandbox --force ${CONTAINER_SANDBOX} ${CONTAINER}
1056+
echo "Launching sandbox container with command (next line):"
1057+
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_CONTAINER_OPTIONS[@]} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER_SANDBOX} $@"
1058+
singularity ${RUN_QUIET} ${MODE} "${ADDITIONAL_CONTAINER_OPTIONS[@]}" "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER_SANDBOX} "$@"
1059+
exit_code=$?
1060+
else
1061+
echo "Launching container with command (next line):"
1062+
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_CONTAINER_OPTIONS[@]} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
1063+
singularity ${RUN_QUIET} ${MODE} "${ADDITIONAL_CONTAINER_OPTIONS[@]}" "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
1064+
exit_code=$?
1065+
fi
10461066

10471067
# 6. save tmp if requested (arg -s|--save)
10481068
if [[ ! -z ${SAVE} ]]; then

install_scripts.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ nvidia_files=(
211211
install_cuda_and_libraries.sh
212212
install_cuda_host_injections.sh
213213
link_nvidia_host_libraries.sh
214+
get_cuda_driver_version.sh
214215
)
215216
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"
216217

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Intended to be sourced, e.g. through the source_sh() feature of Lmod.
# The caller must not get an immediate LmodError when detection fails, so every path
# out of this script returns 0; on failure EESSI_CUDA_DRIVER_VERSION is just not exported.
if ! EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+'); then
    # the pipeline reported failure (e.g. grep found no 'CUDA Version' line)
    return 0
fi
# The fallback on export shouldn't be needed; it only makes extra sure that 0 is returned
export EESSI_CUDA_DRIVER_VERSION || return 0

0 commit comments

Comments
 (0)