Skip to content

Commit 568a242

Browse files
ko3n1g and claude committed
fix: catch transient sacct exceptions in SlurmTunnelScheduler.describe()
After long-running jobs (hours), a transient sacct failure (daemon hiccup, invoke.UnexpectedExit from a non-zero exit code, etc.) would propagate uncaught through describe() → runner.wait() → wait_and_exit(), killing the wait loop and reporting EXIT_CODE_TRAINING=1 even though the Slurm job was still running.

Wrap the sacct call in a try/except and return AppState.UNKNOWN on failure. UNKNOWN is non-terminal in torchx, so polling continues.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
1 parent 739409a commit 568a242

2 files changed

Lines changed: 30 additions & 3 deletions

File tree

nemo_run/run/torchx_backend/schedulers/slurm.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,15 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
240240
return None
241241

242242
assert self.tunnel, "Tunnel is None."
243-
p = self.tunnel.run(
244-
f"sacct --parsable2 -j {app_id}",
245-
)
243+
try:
244+
p = self.tunnel.run(
245+
f"sacct --parsable2 -j {app_id}",
246+
)
247+
except Exception as e:
248+
log.warning(
249+
f"Failed to query sacct for job {app_id}: {e}. Treating as transient."
250+
)
251+
return DescribeAppResponse(app_id=app_id, state=AppState.UNKNOWN)
246252
output = p.stdout.strip().split("\n")
247253

248254
if len(output) <= 1:

test/run/torchx_backend/schedulers/test_slurm.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,27 @@ def test_describe_returns_unknown_on_persistent_permission_error(slurm_scheduler
380380
assert result.state == AppState.UNKNOWN
381381

382382

383+
def test_describe_returns_unknown_on_sacct_exception(slurm_scheduler, mocker):
384+
"""Regression: transient sacct failure (e.g. after hours of polling) must not
385+
propagate an exception and kill the wait loop. describe() should return UNKNOWN
386+
(non-terminal) so polling continues until the job completes."""
387+
from torchx.specs import AppState
388+
389+
job_dirs = {"12345": ("/path/to/job", LocalTunnel(job_dir="/path/to/tunnel"), "log*")}
390+
mocker.patch(
391+
"nemo_run.run.torchx_backend.schedulers.slurm._get_job_dirs",
392+
return_value=job_dirs,
393+
)
394+
mocker.patch.object(SlurmTunnelScheduler, "_initialize_tunnel")
395+
396+
slurm_scheduler.tunnel = mock.MagicMock()
397+
slurm_scheduler.tunnel.run.side_effect = Exception("sacct: command failed")
398+
399+
result = slurm_scheduler.describe("12345")
400+
assert result is not None
401+
assert result.state == AppState.UNKNOWN
402+
403+
383404
def test_schedule_with_dependencies(slurm_scheduler, slurm_executor):
384405
mock_request = mock.MagicMock()
385406
mock_request.cmd = ["sbatch", "--requeue", "--parsable"]

0 commit comments

Comments
 (0)