Retry reading SLURM_JOB_DIRS in _get_job_dirs on transient OSError.

_get_job_dirs gains a `retries` parameter (default 5). FileNotFoundError
still returns {} immediately; any other OSError is retried after a
one-second sleep, and the last error is re-raised once every attempt has
failed. Two new tests cover recovery on the third attempt and exhaustion
of all retries.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Leading whitespace was rebuilt from Python
syntax; the depth of the unchanged context lines `)` and
`assert result == {}` is an assumption -- confirm against the target
files, or apply with `git apply --ignore-whitespace`.
NOTE(review): time.sleep(1) also runs after the final failed attempt, and
the `tmp_path` fixture in the first new test is unused.

diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py
index c959c968..c3c08bf6 100644
--- a/nemo_run/run/torchx_backend/schedulers/slurm.py
+++ b/nemo_run/run/torchx_backend/schedulers/slurm.py
@@ -424,12 +424,24 @@ def _save_job_dir(
         )
 
 
-def _get_job_dirs() -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]:
-    try:
-        with open(SLURM_JOB_DIRS, "rt") as f:
-            lines = f.readlines()
-    except FileNotFoundError:
-        return {}
+def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]:
+    last_exc: OSError | None = None
+    for _ in range(retries):
+        try:
+            with open(SLURM_JOB_DIRS, "rt") as f:
+                lines = f.readlines()
+            break
+        except FileNotFoundError:
+            return {}
+        except OSError as e:
+            last_exc = e
+            time.sleep(1)
+    else:
+        if last_exc is not None:
+            raise last_exc
+        raise OSError(
+            f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries"
+        )
 
     out = {}
     for line in lines:
diff --git a/test/run/torchx_backend/schedulers/test_slurm.py b/test/run/torchx_backend/schedulers/test_slurm.py
index c8851bcc..5e829381 100644
--- a/test/run/torchx_backend/schedulers/test_slurm.py
+++ b/test/run/torchx_backend/schedulers/test_slurm.py
@@ -339,6 +339,31 @@ def test_get_job_dirs():
         assert result == {}
 
 
+def test_get_job_dirs_retries_on_permission_error(tmp_path, mocker):
+    """Transient PermissionError should be retried; success on 3rd attempt returns data."""
+    mocker.patch("time.sleep")  # don't actually sleep in tests
+    mock_open = mocker.mock_open(read_data="")
+    mock_open.side_effect = [
+        PermissionError("[Errno 1] Operation not permitted"),
+        PermissionError("[Errno 1] Operation not permitted"),
+        mock_open.return_value,
+    ]
+    mocker.patch("builtins.open", mock_open)
+
+    result = _get_job_dirs(retries=5)
+    assert result == {}
+    assert mock_open.call_count == 3
+
+
+def test_get_job_dirs_raises_after_exhausting_retries(mocker):
+    """PermissionError should be re-raised after all retries are exhausted."""
+    mocker.patch("time.sleep")
+    mocker.patch("builtins.open", side_effect=PermissionError("[Errno 1] Operation not permitted"))
+
+    with pytest.raises(PermissionError):
+        _get_job_dirs(retries=3)
+
+
 def test_schedule_with_dependencies(slurm_scheduler, slurm_executor):
     mock_request = mock.MagicMock()
     mock_request.cmd = ["sbatch", "--requeue", "--parsable"]