Skip to content

Commit e7a4a03

Browse files
fix: Flaky Slurm network issues (#445)
* fix: Flaky Slurm network issues

  Signed-off-by: oliver könig <okoenig@nvidia.com>

* Potential fix for code scanning alert no. 534: Illegal raise

  Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
  Signed-off-by: oliver könig <okoenig@nvidia.com>

* format

  Signed-off-by: oliver könig <okoenig@nvidia.com>

---------

Signed-off-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent bff5276 commit e7a4a03

2 files changed

Lines changed: 43 additions & 6 deletions

File tree

nemo_run/run/torchx_backend/schedulers/slurm.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -424,12 +424,24 @@ def _save_job_dir(
424424
)
425425

426426

427-
def _get_job_dirs() -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]:
428-
try:
429-
with open(SLURM_JOB_DIRS, "rt") as f:
430-
lines = f.readlines()
431-
except FileNotFoundError:
432-
return {}
427+
def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]:
428+
last_exc: OSError | None = None
429+
for _ in range(retries):
430+
try:
431+
with open(SLURM_JOB_DIRS, "rt") as f:
432+
lines = f.readlines()
433+
break
434+
except FileNotFoundError:
435+
return {}
436+
except OSError as e:
437+
last_exc = e
438+
time.sleep(1)
439+
else:
440+
if last_exc is not None:
441+
raise last_exc
442+
raise OSError(
443+
f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries"
444+
)
433445

434446
out = {}
435447
for line in lines:

test/run/torchx_backend/schedulers/test_slurm.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,31 @@ def test_get_job_dirs():
339339
assert result == {}
340340

341341

342+
def test_get_job_dirs_retries_on_permission_error(tmp_path, mocker):
    """Transient PermissionError is retried until the read succeeds.

    ``open`` fails twice and succeeds on the third call. The mocked file is
    empty, so the parsed result is an empty dict.
    """
    sleep_mock = mocker.patch("time.sleep")  # don't actually sleep in tests
    mock_open = mocker.mock_open(read_data="")
    # Two failures followed by the regular mock file handle.
    mock_open.side_effect = [
        PermissionError("[Errno 1] Operation not permitted"),
        PermissionError("[Errno 1] Operation not permitted"),
        mock_open.return_value,
    ]
    mocker.patch("builtins.open", mock_open)

    result = _get_job_dirs(retries=5)

    assert result == {}
    # The loop must stop at the first success instead of using all 5 retries.
    assert mock_open.call_count == 3
    # One back-off sleep per failed attempt, none after the success.
    assert sleep_mock.call_count == 2
356+
357+
358+
def test_get_job_dirs_raises_after_exhausting_retries(mocker):
    """PermissionError is re-raised once every retry has failed.

    ``open`` always fails, so the call must attempt the read ``retries``
    times and then propagate the PermissionError.
    """
    mocker.patch("time.sleep")  # don't actually sleep in tests
    failing_open = mocker.patch(
        "builtins.open",
        side_effect=PermissionError("[Errno 1] Operation not permitted"),
    )

    with pytest.raises(PermissionError):
        _get_job_dirs(retries=3)

    # Guards against an implementation that gives up on the first failure.
    assert failing_open.call_count == 3
365+
366+
342367
def test_schedule_with_dependencies(slurm_scheduler, slurm_executor):
343368
mock_request = mock.MagicMock()
344369
mock_request.cmd = ["sbatch", "--requeue", "--parsable"]

0 commit comments

Comments
 (0)