Skip to content

Commit e0417e2

Browse files
Grutschusclaude
andauthored
Fix/slurm mutually exclusive options (#170)
* moved ssh-forward to optional dependencies * pinned setuptools version due to deprecation * switched from mp forkserver to Popen and socket-based IPC * moved worker logic in dedicated module * add REMOVE sentinel to unset inherited config keys Introduces a REMOVE singleton (and !remove YAML tag) that can be used as a value in any config dict to delete a key that would otherwise be inherited from a lower-priority config via merge_dicts. This allows e.g. removing the default cpus-per-task when switching to cpus-per-gpu in a user settings.py or experiment YAML without re-specifying the full default block. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * auto-remove conflicting sbatch options on config merge When a higher-priority slurm config (template or experiment yaml) sets an option that belongs to a mutually exclusive group, conflicting options inherited from the base are now automatically removed during assemble_slurm_config_dict. Covers all pairs documented in the sbatch man page: cpus-per-task/cpus-per-gpu, mem/mem-per-cpu/mem-per-gpu, exclusive/oversubscribe, core-spec/thread-spec, and the three ntasks-per-gpu incompatibilities. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * added tests and docs * added logging --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 7b443a0 commit e0417e2

13 files changed

Lines changed: 400 additions & 4 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ comment.cut
2626
# Code editors
2727
.idea/
2828
.vscode/
29+
.env

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ repos:
66
- id: check-toml
77
- id: check-xml
88
- id: check-yaml
9+
args: [--unsafe]
910
exclude: |
1011
(?x)^(
1112
test/resources/config/config_with_duplicate_parameters_3.yaml

examples/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,12 @@ This can be nested arbitrarily deeply (be aware of combinatorial explosion of th
158158

159159
If a parameter is defined in (at least) two **different blocks** in `[grid, random, fixed]` on the same level, `seml` will throw an error to avoid ambiguity.
160160
If a parameter is re-defined in a sub-configuration, the redefinition overrides any previous definitions of that parameter.
161+
To remove a key inherited from a lower-priority config instead of overriding it, set it to `!remove`:
162+
```yaml
163+
large_datasets:
164+
fixed:
165+
regularization: !remove # removes the key set in the root fixed block
166+
```
161167

162168
### Grid parameters
163169
In an experiment config, under `grid` you can define parameters that should be sampled from a regular grid. Currently supported

src/seml/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@
88
from seml.evaluation import * # noqa
99
from seml.experiment import Experiment # noqa
1010
from seml.experiment.observers import * # noqa
11+
from seml.utils import REMOVE # noqa
1112

1213
__version__ = importlib.metadata.version(__package__ or __name__)

src/seml/experiment/config.py

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,6 +1246,61 @@ def check_slurm_config(experiments_per_job: int, sbatch_options: SBatchOptions):
12461246
)
12471247

12481248

1249+
# Table of sbatch options that must not be combined.
#
# Each inner list describes one family of mutually exclusive option groups, and each
# frozenset is one group holding the long option name plus its short alias, if any.
# When the higher-priority config sets any key from one group, keys from all other
# groups of the same family are removed from the base.
# Keys are compared after stripping leading dashes to handle --cpus-per-task and
# cpus-per-task alike.
_SBATCH_MUTUALLY_EXCLUSIVE: list[list[frozenset[str]]] = [
    # --cpus-per-gpu is not compatible with --cpus-per-task (-c)
    [frozenset({'cpus-per-task', 'c'}), frozenset({'cpus-per-gpu'})],
    # --mem, --mem-per-cpu and --mem-per-gpu are mutually exclusive
    [frozenset({'mem'}), frozenset({'mem-per-cpu'}), frozenset({'mem-per-gpu'})],
    # --exclusive and --oversubscribe (-s) are mutually exclusive
    [frozenset({'exclusive'}), frozenset({'oversubscribe', 's'})],
    # --core-spec (-S) and --thread-spec are mutually exclusive
    [frozenset({'core-spec', 'S'}), frozenset({'thread-spec'})],
    # --ntasks-per-gpu is not compatible with --gpus-per-task, --gpus-per-socket, or
    # --ntasks-per-node. Modelled as separate pairs because those three are not
    # necessarily incompatible with each other.
    [frozenset({'ntasks-per-gpu'}), frozenset({'gpus-per-task'})],
    [frozenset({'ntasks-per-gpu'}), frozenset({'gpus-per-socket'})],
    [frozenset({'ntasks-per-gpu'}), frozenset({'ntasks-per-node'})],
]


def _merge_sbatch_options(
    base: dict[str, Any], override: dict[str, Any]
) -> SBatchOptions:
    """Merge sbatch options with automatic mutual-exclusion cleanup.

    The two dictionaries are combined with ``merge_dicts``. Afterwards, whenever
    ``override`` sets a key that belongs to a mutually exclusive group (e.g.
    ``cpus-per-gpu``), any keys from conflicting groups that were only inherited from
    ``base`` (e.g. ``cpus-per-task``) are removed from the result, mirroring normal
    override precedence.

    An override key whose value is the ``REMOVE`` sentinel does not count as setting
    that option: it only unsets the key itself and never evicts other inherited
    options.

    Parameters
    ----------
    base: dict
        Lower-priority sbatch options. Not modified.
    override: dict
        Higher-priority sbatch options. Not modified.

    Returns
    -------
    SBatchOptions
        The merged options without inherited keys that conflict with the override.
    """
    result: dict[str, Any] = dict(merge_dicts(base, override))

    def normalize(key: str) -> str:
        # Strip leading dashes so '--mem', '-c' and 'mem' style keys compare equal
        # to the bare names used in _SBATCH_MUTUALLY_EXCLUSIVE.
        return key.lstrip('-')

    # merge_dicts drops every override key whose value is the REMOVE sentinel and
    # keeps all others, so "still present in the result" identifies the options the
    # override actually sets. Without this filter, `mem: !remove` would activate the
    # `mem` group and wrongly delete an inherited `mem-per-cpu`.
    override_normalized = {normalize(k) for k in override if k in result}

    for exclusive_groups in _SBATCH_MUTUALLY_EXCLUSIVE:
        # Indices of the groups the override has an option in.
        activated = {
            i for i, g in enumerate(exclusive_groups) if override_normalized & g
        }
        if not activated:
            continue
        # Option names of every other group in this family.
        conflicting = {
            s for j, g in enumerate(exclusive_groups) if j not in activated for s in g
        }
        # Collect first, delete afterwards: the dict must not change size while it
        # is being iterated. Keys spelled out in the override itself are kept.
        to_remove = [
            k for k in result if normalize(k) in conflicting and k not in override
        ]
        for k in to_remove:
            logging.info(
                f"Removed inherited sbatch option '{k}' because it conflicts with an override."
            )
            del result[k]

    return cast(SBatchOptions, result)
1302+
1303+
12491304
def assemble_slurm_config_dict(experiment_slurm_config: SlurmConfig):
12501305
"""
12511306
Realize inheritance for the slurm configuration, with the following relationship:
@@ -1274,13 +1329,19 @@ def assemble_slurm_config_dict(experiment_slurm_config: SlurmConfig):
12741329
raise ConfigError(
12751330
f"sbatch options template '{sbatch_options_template}' not found in settings.py."
12761331
)
1277-
slurm_config_base['sbatch_options'] = merge_dicts(
1278-
slurm_config_base['sbatch_options'],
1279-
SETTINGS.SBATCH_OPTIONS_TEMPLATES[sbatch_options_template],
1332+
slurm_config_base['sbatch_options'] = _merge_sbatch_options(
1333+
dict(slurm_config_base['sbatch_options']),
1334+
dict(SETTINGS.SBATCH_OPTIONS_TEMPLATES[sbatch_options_template]),
12801335
)
12811336

12821337
# Integrate experiment specific config
1338+
exp_sbatch_options = dict(slurm_config.get('sbatch_options', {}))
12831339
slurm_config = merge_dicts(slurm_config_base, slurm_config)
1340+
if exp_sbatch_options:
1341+
slurm_config['sbatch_options'] = _merge_sbatch_options(
1342+
dict(slurm_config_base['sbatch_options']),
1343+
exp_sbatch_options,
1344+
)
12841345

12851346
slurm_config['sbatch_options'] = cast(
12861347
SBatchOptions,

src/seml/utils/__init__.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,24 @@ def chunker(seq: S, size: int) -> Generator[S]:
261261
yield from (cast(S, seq[pos : pos + size]) for pos in range(0, len(seq), size))
262262

263263

264+
class _RemoveSentinel:
    """Marker type whose single instance requests deletion of a key during dict merging.

    Assign ``REMOVE`` (``!remove`` in YAML) as the value of a config key to unset that
    key when it would otherwise be inherited from a lower-priority config.
    """

    # The one shared instance; stays None until the class is instantiated once.
    _instance: _RemoveSentinel | None = None

    def __new__(cls) -> _RemoveSentinel:
        # Hand back the cached object whenever one exists, so that every
        # construction yields the very same instance.
        existing = cls._instance
        if existing is not None:
            return existing
        cls._instance = super().__new__(cls)
        return cls._instance

    def __repr__(self) -> str:
        return 'REMOVE'


REMOVE = _RemoveSentinel()
280+
281+
264282
D = TypeVar('D', bound=Mapping)
265283

266284

@@ -283,6 +301,9 @@ def merge_dicts(dict1: Mapping, dict2: Mapping) -> Mapping:
283301
value, this will call itself recursively to merge these dictionaries.
284302
This does not modify the input dictionaries (creates an internal copy).
285303
304+
Setting a value to ``REMOVE`` (or ``!remove`` in YAML) in dict2 will remove that key
305+
from the result even if it was present in dict1.
306+
286307
Parameters
287308
----------
288309
dict1: dict
@@ -304,7 +325,9 @@ def merge_dicts(dict1: Mapping, dict2: Mapping) -> Mapping:
304325
return_dict = copy.deepcopy(dict1)
305326

306327
for k, v in dict2.items():
307-
if k not in dict1:
328+
if isinstance(v, _RemoveSentinel):
329+
return_dict.pop(k, None)
330+
elif k not in dict1:
308331
return_dict[k] = v
309332
else:
310333
if isinstance(v, dict) and isinstance(dict1[k], dict):

src/seml/utils/yaml.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import yaml
22

3+
from seml.utils import REMOVE
34
from seml.utils.errors import ConfigError
45

56

@@ -30,6 +31,11 @@ def construct_mapping(loader, node, deep=False):
3031
construct_mapping,
3132
)
3233

34+
# Resolve the custom ``!remove`` YAML tag to the REMOVE sentinel. The content of the
# tagged node is ignored; only the tag itself matters.
YamlUniqueLoader.add_constructor('!remove', lambda _loader, _node: REMOVE)
38+
3339

3440
class YamlDumper(yaml.Dumper):
3541
def represent_mapping(self, tag, mapping, flow_style=None):
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
seml:
2+
executable: test_config.py
3+
name: example_experiment
4+
output_dir: logs
5+
project_root_dir: ../..
6+
7+
slurm:
8+
- sbatch_options_template: GPU
9+
sbatch_options:
10+
cpus-per-gpu: 4
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
seml:
2+
executable: test_config.py
3+
name: example_experiment
4+
output_dir: logs
5+
project_root_dir: ../..
6+
7+
slurm:
8+
- sbatch_options:
9+
mem-per-cpu: 4G
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
seml:
2+
executable: test_config.py
3+
name: example_experiment
4+
output_dir: logs
5+
project_root_dir: ../..
6+
7+
slurm:
8+
- sbatch_options:
9+
mem: !remove

0 commit comments

Comments
 (0)