fix: catch transient sacct exceptions in SlurmTunnelScheduler.describe() (#460)

ko3n1g · claude · web-flow · commit 0c556ca7fac8 · 2026-03-13T10:04:50.000+01:00
* fix: treat DGXCloud UNKNOWN/transient status as PENDING to avoid false failures

When a job is submitted to DGXCloud, the API may transiently return a
non-200 response or an "Unknown" phase before the workload is fully
registered. Previously this was mapped to AppState.FAILED, causing
wait_and_exit() to treat the job as terminated immediately while the
pod was still starting up on the cluster.

- DGXCloudState.UNKNOWN now maps to AppState.PENDING in DGX_STATES
- executor.status() returns None (instead of DGXCloudState.UNKNOWN)
  on non-200 HTTP responses so transient API errors don't look like
  a real "Unknown" phase reported by the scheduler
- describe() fallback for unknown keys in DGX_STATES changed to PENDING
- Tests updated and added to cover all three code paths

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: use type-specific endpoint for DGXCloud workload status

The status() method was calling GET /workloads/{job_id} (generic endpoint)
which returns 403 for distributed and training workloads. The correct
endpoints match the create paths: /workloads/distributed/{job_id} for
multi-node jobs and /workloads/trainings/{job_id} for single-node jobs.
This is consistent with how cancel() already uses /workloads/distributed/.

Adds test_status_distributed to verify the correct URL is used for
multi-node executors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: read actualPhase from type-specific workload endpoints

The /workloads/distributed/{id} and /workloads/trainings/{id} endpoints
return actualPhase, not phase (which was the field on the generic
/workloads/{id} endpoint). This caused a KeyError crash immediately
after the 403 fix landed.

Now reads actualPhase first, falls back to phase for compatibility,
and returns None (PENDING) if neither field is present.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>

* add tests

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: store job_id explicitly to avoid separator collision in app_id parsing

When a role name ends with '_', the app_id string looks like:
  experiment___role_name____job_id
Splitting on '___' produces job_id = '_job_id' (spurious leading '_'),
causing the status/cancel/log_iter calls to use a wrong ID and get 404.

Fix: _save_job_dir now stores the actual job_id in the JSON record.
describe(), log_iter(), and _cancel_existing() all read job_id from
the stored record, falling back to app_id.split('___')[-1] for
backwards compatibility with existing saved jobs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>

* format

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: catch transient sacct exceptions in SlurmTunnelScheduler.describe()

After long-running jobs (hours), a transient sacct failure (daemon hiccup,
invoke.UnexpectedExit from non-zero exit code, etc.) would propagate
uncaught through describe() → runner.wait() → wait_and_exit(), killing
the wait loop and reporting EXIT_CODE_TRAINING=1 even though the Slurm
job was still running.

Wrap the sacct call in a try/except and return AppState.UNKNOWN on
failure. UNKNOWN is non-terminal in torchx so polling continues.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>

* ruff

Signed-off-by: oliver könig <okoenig@nvidia.com>

* implement a max-retry limit for consecutive sacct failures; cancel the job after the final attempt

Signed-off-by: oliver könig <okoenig@nvidia.com>

* add tests

Signed-off-by: oliver könig <okoenig@nvidia.com>

---------

Signed-off-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/nemo_run/exceptions.py b/nemo_run/exceptions.py
@@ -18,3 +18,6 @@ class SetValueError(ValueError): ...
 
 
 class UnknownStatusError(Exception): ...
+
+
+class PersistentSacctFailure(Exception): ...
diff --git a/nemo_run/run/torchx_backend/launcher.py b/nemo_run/run/torchx_backend/launcher.py
@@ -27,7 +27,7 @@
 
 from nemo_run.core.execution.base import Executor
 from nemo_run.core.frontend.console.api import CONSOLE
-from nemo_run.exceptions import UnknownStatusError
+from nemo_run.exceptions import PersistentSacctFailure, UnknownStatusError
 from nemo_run.run.logs import get_logs
 from nemo_run.run.torchx_backend.runner import Runner, get_runner
 
@@ -158,6 +158,12 @@ def wait_and_exit(
     while tries < timeout:
         try:
             status = runner.wait(app_handle, wait_interval=2)
+        except PersistentSacctFailure as e:
+            logger.error(
+                f"sacct has been unreachable for too long for job {app_id}, cancelling: {e}"
+            )
+            runner.cancel(app_handle)
+            raise UnknownStatusError(str(e)) from e
         except RuntimeError as e:
             if "can't start new thread" in str(e) and thread_retries < 5:
                 thread_retries += 1
diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py
@@ -59,10 +59,13 @@
 from nemo_run.core.execution.base import Executor
 from nemo_run.core.execution.slurm import SlurmBatchRequest, SlurmExecutor, SlurmJobDetails
 from nemo_run.core.tunnel.client import LocalTunnel, PackagingJob, SSHTunnel, Tunnel
+from nemo_run.exceptions import PersistentSacctFailure
 from nemo_run.run import experiment as run_experiment
 from nemo_run.run.ray.slurm import SlurmRayRequest
 from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin
 
+MAX_CONSECUTIVE_SACCT_FAILURES = 30
+
 log: logging.Logger = logging.getLogger(__name__)
 SLURM_JOB_DIRS = os.path.join(get_nemorun_home(), ".slurm_jobs")
 
@@ -74,6 +77,7 @@ def __init__(
         self.tunnel: Optional[Tunnel] = None
         super().__init__(session_name)
         self.experiment = experiment
+        self._consecutive_sacct_failures: dict[str, int] = {}
 
     # TODO: Move this into the SlurmExecutor
     def _initialize_tunnel(self, tunnel: SSHTunnel | LocalTunnel):
@@ -240,9 +244,23 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
             return None
 
         assert self.tunnel, "Tunnel is None."
-        p = self.tunnel.run(
-            f"sacct --parsable2 -j {app_id}",
-        )
+        try:
+            p = self.tunnel.run(
+                f"sacct --parsable2 -j {app_id}",
+            )
+        except Exception as e:
+            count = self._consecutive_sacct_failures.get(app_id, 0) + 1
+            self._consecutive_sacct_failures[app_id] = count
+            if count >= MAX_CONSECUTIVE_SACCT_FAILURES:
+                raise PersistentSacctFailure(
+                    f"sacct failed {count} consecutive times for job {app_id}: {e}"
+                ) from e
+            log.warning(
+                f"Failed to query sacct for job {app_id} ({count}/{MAX_CONSECUTIVE_SACCT_FAILURES}): "
+                f"{e}. Treating as transient."
+            )
+            return DescribeAppResponse(app_id=app_id, state=AppState.UNKNOWN)
+        self._consecutive_sacct_failures.pop(app_id, None)
         output = p.stdout.strip().split("\n")
 
         if len(output) <= 1:
diff --git a/test/run/torchx_backend/schedulers/test_slurm.py b/test/run/torchx_backend/schedulers/test_slurm.py
@@ -26,7 +26,9 @@
 
 from nemo_run.core.execution.slurm import SlurmBatchRequest, SlurmExecutor
 from nemo_run.core.tunnel.client import LocalTunnel
+from nemo_run.exceptions import PersistentSacctFailure
 from nemo_run.run.torchx_backend.schedulers.slurm import (
+    MAX_CONSECUTIVE_SACCT_FAILURES,
     SlurmTunnelScheduler,
     TunnelLogIterator,
     _get_job_dirs,
@@ -380,6 +382,83 @@ def test_describe_returns_unknown_on_persistent_permission_error(slurm_scheduler
     assert result.state == AppState.UNKNOWN
 
 
+def test_describe_returns_unknown_on_sacct_exception(slurm_scheduler, mocker):
+    """Regression: transient sacct failure (e.g. after hours of polling) must not
+    propagate an exception and kill the wait loop. describe() should return UNKNOWN
+    (non-terminal) so polling continues until the job completes."""
+    from torchx.specs import AppState
+
+    job_dirs = {"12345": ("/path/to/job", LocalTunnel(job_dir="/path/to/tunnel"), "log*")}
+    mocker.patch(
+        "nemo_run.run.torchx_backend.schedulers.slurm._get_job_dirs",
+        return_value=job_dirs,
+    )
+    mocker.patch.object(SlurmTunnelScheduler, "_initialize_tunnel")
+
+    slurm_scheduler.tunnel = mock.MagicMock()
+    slurm_scheduler.tunnel.run.side_effect = Exception("sacct: command failed")
+
+    result = slurm_scheduler.describe("12345")
+    assert result is not None
+    assert result.state == AppState.UNKNOWN
+
+
+def test_describe_raises_persistent_sacct_failure_after_threshold(slurm_scheduler, mocker):
+    """After MAX_CONSECUTIVE_SACCT_FAILURES consecutive sacct exceptions, describe() must
+    raise PersistentSacctFailure so the caller can cancel the job instead of spinning forever."""
+    job_dirs = {"12345": ("/path/to/job", LocalTunnel(job_dir="/path/to/tunnel"), "log*")}
+    mocker.patch(
+        "nemo_run.run.torchx_backend.schedulers.slurm._get_job_dirs",
+        return_value=job_dirs,
+    )
+    mocker.patch.object(SlurmTunnelScheduler, "_initialize_tunnel")
+
+    slurm_scheduler.tunnel = mock.MagicMock()
+    slurm_scheduler.tunnel.run.side_effect = Exception("sacct: command failed")
+
+    for _ in range(MAX_CONSECUTIVE_SACCT_FAILURES - 1):
+        result = slurm_scheduler.describe("12345")
+        assert result.state == AppState.UNKNOWN
+
+    with pytest.raises(PersistentSacctFailure, match="12345"):
+        slurm_scheduler.describe("12345")
+
+
+def test_describe_resets_sacct_failure_counter_on_success(slurm_scheduler, mocker):
+    """A successful sacct call must reset the consecutive failure counter so that
+    subsequent transient failures start fresh."""
+    job_dirs = {"12345": ("/path/to/job", LocalTunnel(job_dir="/path/to/tunnel"), "log*")}
+    mocker.patch(
+        "nemo_run.run.torchx_backend.schedulers.slurm._get_job_dirs",
+        return_value=job_dirs,
+    )
+    mocker.patch.object(SlurmTunnelScheduler, "_initialize_tunnel")
+
+    slurm_scheduler.tunnel = mock.MagicMock()
+
+    # Fail just below the threshold
+    slurm_scheduler.tunnel.run.side_effect = Exception("sacct: command failed")
+    for _ in range(MAX_CONSECUTIVE_SACCT_FAILURES - 1):
+        slurm_scheduler.describe("12345")
+
+    # Recover — sacct returns valid output
+    header = "JobID|JobName|State|ExitCode"
+    row = "12345|exp.master|RUNNING|0:0"
+    success_result = mock.MagicMock()
+    success_result.stdout = f"{header}\n{row}"
+    slurm_scheduler.tunnel.run.side_effect = None
+    slurm_scheduler.tunnel.run.return_value = success_result
+    slurm_scheduler.describe("12345")
+
+    assert slurm_scheduler._consecutive_sacct_failures.get("12345", 0) == 0
+
+    # Fail again — counter should restart from 1, not trigger threshold immediately
+    slurm_scheduler.tunnel.run.side_effect = Exception("sacct: command failed")
+    result = slurm_scheduler.describe("12345")
+    assert result.state == AppState.UNKNOWN
+    assert slurm_scheduler._consecutive_sacct_failures["12345"] == 1
+
+
 def test_schedule_with_dependencies(slurm_scheduler, slurm_executor):
     mock_request = mock.MagicMock()
     mock_request.cmd = ["sbatch", "--requeue", "--parsable"]
diff --git a/test/run/torchx_backend/test_launcher.py b/test/run/torchx_backend/test_launcher.py
@@ -23,7 +23,7 @@
 from torchx.specs import AppDef, AppStatus
 
 from nemo_run.core.execution.base import Executor
-from nemo_run.exceptions import UnknownStatusError
+from nemo_run.exceptions import PersistentSacctFailure, UnknownStatusError
 from nemo_run.run.logs import get_logs
 from nemo_run.run.torchx_backend.launcher import ContextThread, launch, wait_and_exit
 
@@ -231,6 +231,17 @@ def test_wait_and_exit_other_runtime_error_propagates(mock_runner):
         wait_and_exit(app_handle=mock_app_handle, log=False, runner=mock_runner)
 
 
+def test_wait_and_exit_cancels_job_on_persistent_sacct_failure(mock_runner):
+    """PersistentSacctFailure must cancel the job and raise UnknownStatusError."""
+    mock_app_handle = "dummy://nemo_run/my-test-run"
+    mock_runner.wait.side_effect = PersistentSacctFailure("sacct failed 30 times for 12345")
+
+    with pytest.raises(UnknownStatusError):
+        wait_and_exit(app_handle=mock_app_handle, log=False, runner=mock_runner)
+
+    mock_runner.cancel.assert_called_once_with(mock_app_handle)
+
+
 @patch("threading.Thread.run")
 def test_context_thread_run(mocked_run, setup_and_teardown):
     def test_function():