Skip to content

Commit 16408d5

Browse files
committed
refactor code to limit parallelism
1 parent 077eb43 commit 16408d5

1 file changed

Lines changed: 64 additions & 18 deletions

File tree

eb_hooks.py

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -121,27 +121,38 @@ def parse_hook(ec, *args, **kwargs):
121121

122122
def post_ready_hook(self, *args, **kwargs):
123123
"""
124-
Post-ready hook: limit parallellism for selected builds, because they require a lot of memory per used core.
124+
Post-ready hook: limit parallelism for selected builds based on software name and CPU target.
125+
Parallelism needs to be limited because some builds require a lot of memory per used core.
125126
"""
126127
# 'parallel' easyconfig parameter is set via EasyBlock.set_parallel in ready step based on available cores.
127-
# here we reduce parallellism to only use half of that for selected software,
128-
# to avoid failing builds/tests due to out-of-memory problems;
129-
memory_hungry_build = self.name in ['libxc', 'MBX', 'TensorFlow']
130-
# on A64FX systems, (HBM) memory is typically scarce, so we need to use fewer cores for some builds
128+
# get current parallelism setting
129+
parallel = self.cfg['parallel']
130+
if parallel == 1:
131+
return # no need to limit if already using 1 core
132+
133+
# get CPU target
131134
cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR')
132-
memory_hungry_build_a64fx = cpu_target == CPU_TARGET_A64FX and self.name in ['Qt5', 'ROOT']
133-
if memory_hungry_build or memory_hungry_build_a64fx:
134-
parallel = self.cfg['parallel']
135-
if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']:
136-
# limit parallelism to 8, builds with 12 and 16 failed on Deucalion
137-
if parallel > 8:
138-
self.cfg['parallel'] = 8
139-
msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing"
140-
print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log)
141-
elif parallel > 1:
142-
self.cfg['parallel'] = parallel // 2
143-
msg = "limiting parallelism to %s (was %s) for %s to avoid out-of-memory failures during building/testing"
144-
print_msg(msg % (self.cfg['parallel'], parallel, self.name), log=self.log)
135+
136+
# check if we have limits defined for this software
137+
if self.name in PARALLELISM_LIMITS:
138+
limits = PARALLELISM_LIMITS[self.name]
139+
140+
# first check for CPU-specific limit
141+
if cpu_target in limits:
142+
operation_func, operation_args = limits[cpu_target]
143+
new_parallel = operation_func(parallel, operation_args)
144+
# then check for generic limit (applies to all CPU targets)
145+
elif '*' in limits:
146+
operation_func, operation_args = limits['*']
147+
new_parallel = operation_func(parallel, operation_args)
148+
else:
149+
return # no applicable limits found
150+
151+
# apply the limit only if it differs from the current setting
152+
if new_parallel != parallel:
153+
self.cfg['parallel'] = new_parallel
154+
msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing"
155+
print_msg(msg % (new_parallel, parallel, self.name, cpu_target), log=self.log)
145156

146157

147158
def pre_prepare_hook(self, *args, **kwargs):
@@ -1249,3 +1260,38 @@ def post_module_hook(self, *args, **kwargs):
12491260
}
12501261

12511262
POST_MODULE_HOOKS = {}
1263+
1264+
# Define parallelism limit operations
1265+
def divide_by_factor(parallel, factor):
    """
    Divide parallelism by given factor, never going below 1.

    :param parallel: current parallelism (number of cores used for the build)
    :param factor: integer divisor to apply to the current parallelism
    :return: result of the floor division parallel // factor, or 1 if that result is lower than 1
    """
    # floor division can yield 0 (e.g. 1 // 2), so enforce a lower bound of 1
    return max(1, parallel // factor)
1268+
1269+
def set_maximum(parallel, max_value):
    """
    Cap parallelism at given maximum value, never going below 1.

    :param parallel: current parallelism (number of cores used for the build)
    :param max_value: upper bound for the parallelism
    :return: parallel if it does not exceed max_value, otherwise max_value; always at least 1
    """
    # lower bound of 1 mirrors divide_by_factor, and guards against a limit of 0 (or less)
    # resulting in a parallelism value that is not a usable core count
    return max(1, min(parallel, max_value))
1272+
1273+
# Data structure defining parallelism limits for different software and CPU targets
# Format: {software_name: {cpu_target: (operation_function, operation_args)}}
# '*' for a CPU target means the operation applies to all CPU targets
# Information is processed in the post_ready_hook function. First it checks if the
# specific CPU target is defined in the data structure below. If not, it checks for
# the generic '*' entry.
# Each operation function is called as operation_function(current_parallel, operation_args)
# and returns the new value to use for the 'parallel' easyconfig parameter.
PARALLELISM_LIMITS = {
    'libxc': {
        '*': (divide_by_factor, 2),
        CPU_TARGET_A64FX: (set_maximum, 12),
    },
    'MBX': {
        '*': (divide_by_factor, 2),
    },
    'TensorFlow': {
        '*': (divide_by_factor, 2),
        # limit parallelism to 8, builds with 12 and 16 failed on Deucalion
        CPU_TARGET_A64FX: (set_maximum, 8),
    },
    # on A64FX systems, (HBM) memory is typically scarce, so we need to use fewer cores for some builds
    'Qt5': {
        CPU_TARGET_A64FX: (divide_by_factor, 2),
    },
    'ROOT': {
        CPU_TARGET_A64FX: (divide_by_factor, 2),
    },
}

0 commit comments

Comments
 (0)