Skip to content

Commit d040d10

Browse files
author
Richard Top
committed
Merge branch 'main' of ssh://github.com/EESSI/software-layer-scripts into TensorFlow-CUDA
2 parents 3064a8b + 99c82b5 commit d040d10

9 files changed

Lines changed: 125 additions & 228 deletions

File tree

EESSI-extend-easybuild.eb

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ modextravars = {
7070
# EASYBUILD_INSTALLPATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR}
7171
# EASYBUILD_SOURCEPATH=${WORKDIR}/easybuild/sources:${EESSI_SOURCEPATH}
7272
#
73-
# And also some optional ones based on the kind of installation
73+
# And also some optional ones based on the installation mode
7474
# EASYBUILD_SET_GID_BIT
7575
# EASYBUILD_GROUP_WRITABLE_INSTALLDIR
7676
# EASYBUILD_UMASK
@@ -212,7 +212,11 @@ easybuild_version = os.getenv("EBVERSIONEASYBUILD") or easybuild_version
212212
eessi_version = os.getenv("EESSI_VERSION") or "2023.06"
213213
214214
-- Set environment variables that are EasyBuild version specific
215-
if convertToCanonical(easybuild_version) > convertToCanonical("4") then
215+
-- Do unload unconditionally, so that even if EB versions were switched in the meantime, this gets unset
216+
-- This avoids issues where EESSI-extend is first loaded with EB >= 5.1 (which sets these vars)
217+
-- but then EB is swapped for a version < 5.1 and then EESSI-extend is unloaded (which would not unset
218+
-- these vars if we did it conditional on the EB version)
219+
if convertToCanonical(easybuild_version) >= convertToCanonical("5.1") or mode() == "unload" then
216220
setenv ("EASYBUILD_STRICT_RPATH_SANITY_CHECK", "1")
217221
setenv ("EASYBUILD_CUDA_SANITY_CHECK_ERROR_ON_FAILED_CHECKS", "1")
218222
setenv ("EASYBUILD_FAIL_ON_MOD_FILES_GCCCORE", "1")

EESSI-install-software.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,29 @@ else
150150
# make sure the software and modules directories exist
151151
# (since it's expected by init/eessi_environment_variables when using archdetect and by the EESSI module)
152152
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/{modules,software}
153+
154+
# If EESSI_ACCELERATOR_TARGET_OVERRIDE is defined, we are building for an accelerator target
155+
# In that case, make sure the modulepath for the accelerator subdir exists, otherwise the EESSI module will not
156+
# set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET
157+
# is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail
158+
# See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882
159+
# NOTE: the expansions inside [ ... ] must be quoted. Unquoted, an empty or unset variable
# vanishes from the argument list and the test degrades to the one-argument form `[ -n ]`,
# which is always true; this branch would then also run for builds without an accelerator target.
if [ -n "${EESSI_ACCELERATOR_TARGET_OVERRIDE}" ]; then
    # Note that ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all
    # is only the correct path if EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE is not set
    if [ -z "${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE}" ]; then
        mkdir -p "${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all"
    else
        # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use
        # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree.
        # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different
        # from what the code will be optimized for, and we wouldn't want that
        # So this message _should_ never be printed...
        msg="When building the software subdirectory for the CPU should almost certainly be that of the host."
        msg="$msg If you think this is incorrect, please implement behaviour that makes sense in "
        msg="$msg EESSI-install-software.sh, essentially replacing this error."
        fatal_error "$msg"
    fi
fi
153176
)
154177
fi
155178

@@ -294,6 +317,7 @@ source $TOPDIR/load_eessi_extend_module.sh ${EESSI_VERSION}
294317
echo "DEBUG: after loading EESSI-extend // EASYBUILD_INSTALLPATH='${EASYBUILD_INSTALLPATH}'"
295318

296319
# Install full CUDA SDK and cu* libraries in host_injections
320+
# (This is done *before* configuring EasyBuild as it may rely on an older EB version)
297321
# Hardcode this for now, see if it works
298322
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
299323
# Allow skipping CUDA SDK install in e.g. CI environments
@@ -315,6 +339,7 @@ if nvidia_gpu_available; then
315339
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
316340
fi
317341

342+
318343
if [ ! -z "${shared_fs_path}" ]; then
319344
shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
320345
echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path"

bot/build.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,11 +267,15 @@ TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
267267
timestamp=$(date +%s)
268268
# to set EESSI_VERSION we need to source init/eessi_defaults now
269269
source $software_layer_dir/init/eessi_defaults
270-
# Note: iff ${EESSI_DEV_PROJECT} is defined (building for dev.eessi.io), then we
270+
# Note: if ${EESSI_DEV_PROJECT} is defined (building for dev.eessi.io), then we
271271
# append the project (subdirectory) name to the end tarball name. This is information
272272
# then used at the ingestion stage. If ${EESSI_DEV_PROJECT} is not defined, nothing is
273273
# appended
274-
export TGZ=$(printf "eessi-%s-software-%s-%s-%b%d.tar.gz" ${EESSI_VERSION} ${EESSI_OS_TYPE} ${EESSI_SOFTWARE_SUBDIR_OVERRIDE//\//-} ${EESSI_DEV_PROJECT:+$EESSI_DEV_PROJECT-} ${timestamp})
274+
if [[ -z ${EESSI_ACCELERATOR_TARGET_OVERRIDE} ]]; then
275+
export TGZ=$(printf "eessi-%s-software-%s-%s-%b%d.tar.gz" ${EESSI_VERSION} ${EESSI_OS_TYPE} ${EESSI_SOFTWARE_SUBDIR_OVERRIDE//\//-} ${EESSI_DEV_PROJECT:+$EESSI_DEV_PROJECT-} ${timestamp})
276+
else
277+
export TGZ=$(printf "eessi-%s-software-%s-%s-%s-%b%d.tar.gz" ${EESSI_VERSION} ${EESSI_OS_TYPE} ${EESSI_SOFTWARE_SUBDIR_OVERRIDE//\//-} ${EESSI_ACCELERATOR_TARGET_OVERRIDE//\//-} ${EESSI_DEV_PROJECT:+$EESSI_DEV_PROJECT-} ${timestamp})
278+
fi
275279

276280
# Export EESSI_DEV_PROJECT to use it (if needed) when making tarball
277281
echo "bot/build.sh: EESSI_DEV_PROJECT='${EESSI_DEV_PROJECT}'"

create_lmodsitepackage.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,31 @@
123123
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n"
124124
if packagesList[simpleName] then
125125
-- simpleName is a module in packagesList
126-
-- get the full host_injections path
127-
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
126+
-- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59
127+
-- If that exists, print a more targeted, explanatory error message
128+
local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
129+
local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. "/easybuild"
130+
local previousPackageDirExists = isDir(previousPackageEasyBuildDir)
131+
132+
-- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end
133+
local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/"
134+
strip_suffix = strip_suffix .. os.getenv('EESSI_SOFTWARE_SUBDIR')
135+
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY'))
128136
129137
-- build final path where the software should be installed
130138
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
131139
local packageDirExists = isDir(packageEasyBuildDir)
132-
if not packageDirExists then
140+
if previousPackageDirExists and not packageDirExists then
141+
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
142+
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
143+
advice = advice .. "can find it.\\n"
144+
advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". "
145+
advice = advice .. "However, EESSI expects it in a different location since Aug'25, namely at "
146+
advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". "
147+
advice = advice .. "Please re-install the package at the new location. "
148+
advice = advice .. refer_to_docs
149+
LmodError("\\nYou requested to load ", simpleName, " ", advice)
150+
elseif not packageDirExists then
133151
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
134152
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
135153
advice = advice .. "can find it.\\n"
@@ -293,7 +311,7 @@ def error(msg):
293311
# the install path (if it exists)
294312
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
295313
if accel_subdir:
296-
sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '')
314+
sitepackage_path = sitepackage_path.replace("/%s" % accel_subdir, '')
297315
try:
298316
os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True)
299317
with open(sitepackage_path, 'w') as fp:

easystacks/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
WARNING: in principle _all_ easystack files should go into EESSI/software-layer, not in EESSI/software-layer-scripts. Easystack files are only added in EESSI/software-layer-scripts by exception, for example when the (re)deployment of the software has to be done synchronously with a change in EESSI/software-layer-scripts.
2+
3+
Here, we list past deployments for which this was the case (and why):
4+
5+
[PR#59](https://github.com/EESSI/software-layer-scripts/pull/59): modified the prefix in which `install_cuda_and_libraries.sh` installs the CUDA toolkit within `host_injections`. Also, updated the Lmod SitePackage.lua to print an informative message in case the CUDA Toolkit is found in the old location. This requires synchronous deployment of new CUDA and cuDNN installations in the software layer, because the symlinks from these installations should be redirected to the new prefix in `host_injections`.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# In https://github.com/EESSI/software-layer-scripts/pull/59 we introduced a new location for
2+
# installing the CUDA toolkit within the host_injections directory. This requires reinstallation
3+
# of CUDA and cuDNN to make sure all symlinks point to these new locations
4+
easyconfigs:
5+
- CUDA-12.1.1.eb:
6+
options:
7+
accept-eula-for: CUDA
8+
- CUDA-12.4.0.eb:
9+
options:
10+
accept-eula-for: CUDA
11+
- cuDNN-8.9.2.26-CUDA-12.1.1.eb:
12+
options:
13+
accept-eula-for: cuDNN

eb_hooks.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def parse_list_of_dicts_env(var_name):
151151
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', var_name):
152152
raise ValueError(f"Invalid environment variable name: {var_name}")
153153
list_string = os.getenv(var_name, '[]')
154-
154+
155155
list_of_dicts = []
156156
try:
157157
# Try JSON format first
@@ -162,7 +162,7 @@ def parse_list_of_dicts_env(var_name):
162162
list_of_dicts = ast.literal_eval(list_string)
163163
except (ValueError, SyntaxError):
164164
raise ValueError(f"Environment variable '{var_name}' does not contain a valid list of dictionaries.")
165-
165+
166166
return list_of_dicts
167167

168168

@@ -211,7 +211,7 @@ def post_ready_hook(self, *args, **kwargs):
211211
parallel = self.parallel
212212
else:
213213
parallel = self.cfg['parallel']
214-
214+
215215
if parallel == 1:
216216
return # no need to limit if already using 1 core
217217

@@ -780,7 +780,7 @@ def pre_configure_hook_score_p(self, *args, **kwargs):
780780
def pre_configure_hook_vsearch(self, *args, **kwargs):
781781
"""
782782
Pre-configure hook for VSEARCH
783-
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
783+
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
784784
(solves "expected initializer before 'OF'" errors)
785785
"""
786786
if self.name == 'VSEARCH':
@@ -1246,7 +1246,7 @@ def post_postproc_cuda(self, *args, **kwargs):
12461246

12471247
# replace files that are not distributable with symlinks into
12481248
# host_injections
1249-
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
1249+
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
12501250
else:
12511251
print_msg(f"EESSI hook to respect CUDA license not triggered for installation path {self.installdir}")
12521252
else:
@@ -1296,16 +1296,19 @@ def post_postproc_cudnn(self, *args, **kwargs):
12961296

12971297
# replace files that are not distributable with symlinks into
12981298
# host_injections
1299-
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
1299+
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
13001300
else:
13011301
print_msg(f"EESSI hook to respect cuDDN license not triggered for installation path {self.installdir}")
13021302
else:
13031303
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")
13041304

13051305

1306-
def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
1306+
def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
13071307
"""
13081308
Replace files that cannot be distributed with symlinks into host_injections
1309+
Since these are binary files, only the CPU family will be included in the prefix,
1310+
no microarchitecture or accelerator architecture will be included. For example,
1311+
/cvmfs/software.eessi.io/host_injections/x86_64/suffix/to/actual/file
13091312
"""
13101313
# Different packages use different ways to specify which files or file
13111314
# 'types' may be redistributed. For CUDA, the 'EULA.txt' lists full file
@@ -1348,13 +1351,37 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al
13481351
log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
13491352
print_name, full_path)
13501353
# the host_injections path is under a fixed repo/location for CUDA or cuDNN
1354+
# full_path is something similar to
1355+
# /cvmfs/software.eessi.io/versions/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
1356+
# host_inj_path will then be
1357+
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
13511358
host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path)
13521359
# CUDA and cu* libraries themselves don't care about compute capability so remove this
13531360
# duplication from under host_injections (symlink to a single CUDA or cu* library
13541361
# installation for all compute capabilities)
13551362
accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET")
1363+
# If accel_subdir is defined, remove it from the full path
1364+
# After removal of accel_subdir, host_inj_path will be something like
1365+
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc
13561366
if accel_subdir:
1357-
host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '')
1367+
host_inj_path = host_inj_path.replace(accel_subdir, '')
1368+
software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR")
1369+
cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY")
1370+
os_type = get_eessi_envvar("EESSI_OS_TYPE")
1371+
eessi_version = get_eessi_envvar("EESSI_VERSION")
1372+
if software_subdir and cpu_family and os_type and eessi_version:
1373+
# Compose the string to be removed:
1374+
partial_path = f"{eessi_version}/software/{os_type}/{software_subdir}"
1375+
# After this, host_inj_path will be e.g.
1376+
# /cvmfs/software.eessi.io/host_injections/x86_64/software/CUDA/bin/nvcc
1377+
host_inj_path = host_inj_path.replace(partial_path, cpu_family)
1378+
else:
1379+
msg = "Failed to construct path to symlink for file (%s). All of the following values "
1380+
msg += "have to be defined: EESSI_SOFTWARE_SUBDIR='%s', EESSI_CPU_FAMILY='%s', "
1381+
msg += "EESSI_OS_TYPE='%s', EESSI_VERSION='%s'. Failed to replace non-redistributable file "
1382+
msg += "with symlink, aborting..."
1383+
raise EasyBuildError(msg, full_path, software_subdir, cpu_family, os_type, eessi_version)
1384+
13581385
# make sure source and target of symlink are not the same
13591386
if full_path == host_inj_path:
13601387
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "

scripts/gpu_support/nvidia/install_cuda_and_libraries.sh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,16 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
132132

133133
# If there is a GPU on the node, the installation path will by default have an
134134
# accelerator subdirectory. For CUDA and cu*, these are binary installations and
135-
# don't care about the target compute capability. Our hooks are aware of this and
136-
# therefore expect CUDA to be available under EESSI_SITE_SOFTWARE_PATH
137-
export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH
135+
# we don't care about the target compute capability nor the CPU microarchitecture.
136+
# Our hooks are aware of this and therefore expect CUDA to be available under
137+
# something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture
138+
# stripped
139+
# This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until
140+
# the EESSI_VERSION in a capture group. It will then replace that with the content
141+
# of the capture group and then have the EESSI_CPU_FAMILY appended
142+
# Thus EESSI_SITE_CPU_FAMILY_PATH is then something like /cvmfs/software.eessi.io/host_injections/x86_64
143+
EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_VERSION"/software/"$EESSI_OS_TYPE"/"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|')
144+
export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH
138145

139146
# Install modules in hidden .modules dir to keep track of what was installed before
140147
# (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild
@@ -258,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
258265
cp -a ${eb_last_log} .
259266
fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..."
260267
else
261-
echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!"
268+
echo_green "all installations at ${EASYBUILD_INSTALLPATH}/software/... succeeded!"
262269
fi
263270

264271
# clean up tmpdir content

0 commit comments

Comments
 (0)