From 01b013be9268182857a3c133cb8b5ad21e8cf39d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 10 Mar 2026 10:25:10 +0000 Subject: [PATCH 1/3] fix: Flaky Slurm network issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../run/torchx_backend/schedulers/slurm.py | 20 ++++++++++----- .../torchx_backend/schedulers/test_slurm.py | 25 +++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index c959c968..74740d25 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -424,12 +424,20 @@ def _save_job_dir( ) -def _get_job_dirs() -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]: - try: - with open(SLURM_JOB_DIRS, "rt") as f: - lines = f.readlines() - except FileNotFoundError: - return {} +def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]: + last_exc: OSError | None = None + for _ in range(retries): + try: + with open(SLURM_JOB_DIRS, "rt") as f: + lines = f.readlines() + break + except FileNotFoundError: + return {} + except OSError as e: + last_exc = e + time.sleep(1) + else: + raise last_exc # type: ignore[misc] out = {} for line in lines: diff --git a/test/run/torchx_backend/schedulers/test_slurm.py b/test/run/torchx_backend/schedulers/test_slurm.py index c8851bcc..5e829381 100644 --- a/test/run/torchx_backend/schedulers/test_slurm.py +++ b/test/run/torchx_backend/schedulers/test_slurm.py @@ -339,6 +339,31 @@ def test_get_job_dirs(): assert result == {} +def test_get_job_dirs_retries_on_permission_error(tmp_path, mocker): + """Transient PermissionError should be retried; success on 3rd attempt returns data.""" + mocker.patch("time.sleep") # don't actually sleep in tests + mock_open = mocker.mock_open(read_data="") + mock_open.side_effect = [ + PermissionError("[Errno 1] Operation not permitted"), + PermissionError("[Errno 1] Operation not permitted"), + mock_open.return_value, + ] + mocker.patch("builtins.open", mock_open) + + result = _get_job_dirs(retries=5) + assert result == {} + assert mock_open.call_count == 3 + + +def test_get_job_dirs_raises_after_exhausting_retries(mocker): + """PermissionError should be re-raised after all retries are exhausted.""" + mocker.patch("time.sleep") + mocker.patch("builtins.open", side_effect=PermissionError("[Errno 1] Operation not permitted")) + + with pytest.raises(PermissionError): + _get_job_dirs(retries=3) + + def test_schedule_with_dependencies(slurm_scheduler, slurm_executor): mock_request = mock.MagicMock() mock_request.cmd = ["sbatch", "--requeue", "--parsable"] From 40a908f0b7ef9453e6b2b885696b86bf2944c440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 10 Mar 2026 11:28:23 +0100 Subject: [PATCH 2/3] Potential fix for code scanning alert no. 534: Illegal raise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: oliver könig --- nemo_run/run/torchx_backend/schedulers/slurm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index 74740d25..13a74bfc 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -437,7 +437,9 @@ def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTun last_exc = e time.sleep(1) else: - raise last_exc # type: ignore[misc] + if last_exc is not None: + raise last_exc + raise OSError(f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries") out = {} for line in lines: From 6d6fd8ebe91caec2700ebaa33a9c2fcf84bc58a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 10 Mar 2026 10:35:05 +0000 Subject: [PATCH 3/3] format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/run/torchx_backend/schedulers/slurm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index 13a74bfc..c3c08bf6 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -439,7 +439,9 @@ def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTun else: if last_exc is not None: raise last_exc - raise OSError(f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries") + raise OSError( + f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries" + ) out = {} for line in lines: