From 3e316018fad648472aaab5c2acad1458a53c87c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 16:42:02 +0000 Subject: [PATCH 1/6] fix: treat DGXCloud UNKNOWN/transient status as PENDING to avoid false failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a job is submitted to DGXCloud, the API may transiently return a non-200 response or an "Unknown" phase before the workload is fully registered. Previously this was mapped to AppState.FAILED, causing wait_and_exit() to treat the job as terminated immediately while the pod was still starting up on the cluster. - DGXCloudState.UNKNOWN now maps to AppState.PENDING in DGX_STATES - executor.status() returns None (instead of DGXCloudState.UNKNOWN) on non-200 HTTP responses so transient API errors don't look like a real "Unknown" phase reported by the scheduler - describe() fallback for unknown keys in DGX_STATES changed to PENDING - Tests updated and added to cover all three code paths Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 6 +- .../run/torchx_backend/schedulers/dgxcloud.py | 4 +- test/core/execution/test_dgxcloud.py | 2 +- .../schedulers/test_dgxcloud.py | 60 ++++++++++++++++++- 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index b6d5e855..0656cd45 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -369,7 +369,11 @@ def status(self, job_id: str) -> Optional[DGXCloudState]: headers = self._default_headers(token=token) response = requests.get(url, headers=headers) if response.status_code != 200: - return DGXCloudState("Unknown") + logger.warning( + f"Failed to get status for job {job_id}, " + f"status_code={response.status_code}. Treating as transient." 
+ ) + return None r_json = response.json() return DGXCloudState(r_json["phase"]) diff --git a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py index b786d3c0..61e857e8 100644 --- a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py +++ b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py @@ -59,7 +59,7 @@ DGXCloudState.FAILED: AppState.FAILED, DGXCloudState.COMPLETED: AppState.SUCCEEDED, DGXCloudState.TERMINATING: AppState.RUNNING, - DGXCloudState.UNKNOWN: AppState.FAILED, + DGXCloudState.UNKNOWN: AppState.PENDING, } log = logging.getLogger(__name__) @@ -192,7 +192,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]: return None dgx_state = executor.status(job_id) or DGXCloudState.UNKNOWN - app_state = DGX_STATES.get(dgx_state, AppState.UNKNOWN) + app_state = DGX_STATES.get(dgx_state, AppState.PENDING) roles_statuses[0].replicas[0].state = app_state return DescribeAppResponse( diff --git a/test/core/execution/test_dgxcloud.py b/test/core/execution/test_dgxcloud.py index 49505c48..4a1701fd 100644 --- a/test/core/execution/test_dgxcloud.py +++ b/test/core/execution/test_dgxcloud.py @@ -936,7 +936,7 @@ def test_status_error_response(self, mock_get): status = executor.status("job123") - assert status == DGXCloudState.UNKNOWN + assert status is None @patch("requests.get") def test_cancel(self, mock_get): diff --git a/test/run/torchx_backend/schedulers/test_dgxcloud.py b/test/run/torchx_backend/schedulers/test_dgxcloud.py index ca25b92b..dd19fb1a 100644 --- a/test/run/torchx_backend/schedulers/test_dgxcloud.py +++ b/test/run/torchx_backend/schedulers/test_dgxcloud.py @@ -19,10 +19,14 @@ import pytest from torchx.schedulers.api import AppDryRunInfo -from torchx.specs import AppDef, Role +from torchx.specs import AppDef, AppState, Role -from nemo_run.core.execution.dgxcloud import DGXCloudExecutor -from nemo_run.run.torchx_backend.schedulers.dgxcloud import DGXCloudScheduler, create_scheduler +from 
nemo_run.core.execution.dgxcloud import DGXCloudExecutor, DGXCloudState +from nemo_run.run.torchx_backend.schedulers.dgxcloud import ( + DGX_STATES, + DGXCloudScheduler, + create_scheduler, +) @pytest.fixture @@ -184,6 +188,56 @@ def test_log_iter(dgx_cloud_scheduler, dgx_cloud_executor): assert logs == ["log2", "log3"] +def test_unknown_state_maps_to_pending_not_failed(): + # DGXCloudState.UNKNOWN must map to PENDING so transient API errors during + # job startup do not cause wait_and_exit() to treat the job as terminal. + assert DGX_STATES[DGXCloudState.UNKNOWN] == AppState.PENDING + + +def test_describe_returns_pending_when_status_is_none(dgx_cloud_scheduler, dgx_cloud_executor): + # Regression test: executor.status() returns None when the auth token is + # missing or the API call fails transiently right after job submission. + # describe() must return PENDING so the wait loop keeps polling. + with ( + mock.patch( + "nemo_run.run.torchx_backend.schedulers.dgxcloud._get_job_dirs" + ) as mock_get_job_dirs, + mock.patch.object(DGXCloudExecutor, "status", return_value=None), + ): + mock_get_job_dirs.return_value = { + "test_experiment___test_role___test_job_id": { + "job_status": "RUNNING", + "executor": dgx_cloud_executor, + } + } + + response = dgx_cloud_scheduler.describe("test_experiment___test_role___test_job_id") + assert response is not None + assert response.state == AppState.PENDING + + +def test_describe_returns_pending_when_status_is_unknown(dgx_cloud_scheduler, dgx_cloud_executor): + # Regression test: the DGXCloud API transiently reports phase "Unknown" before + # a job is visible (non-200 replies surface as None instead). describe() must + # return PENDING so the wait loop keeps polling instead of failing.
+ with ( + mock.patch( + "nemo_run.run.torchx_backend.schedulers.dgxcloud._get_job_dirs" + ) as mock_get_job_dirs, + mock.patch.object(DGXCloudExecutor, "status", return_value=DGXCloudState.UNKNOWN), + ): + mock_get_job_dirs.return_value = { + "test_experiment___test_role___test_job_id": { + "job_status": "RUNNING", + "executor": dgx_cloud_executor, + } + } + + response = dgx_cloud_scheduler.describe("test_experiment___test_role___test_job_id") + assert response is not None + assert response.state == AppState.PENDING + + def test_log_iter_str(dgx_cloud_scheduler, dgx_cloud_executor): with mock.patch( "nemo_run.run.torchx_backend.schedulers.dgxcloud._get_job_dirs" From 726283a369df0fce19eb012478e9d291788a2ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 18:35:43 +0000 Subject: [PATCH 2/6] fix: use type-specific endpoint for DGXCloud workload status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The status() method was calling GET /workloads/{job_id} (generic endpoint) which returns 403 for distributed and training workloads. The correct endpoints match the create paths: /workloads/distributed/{job_id} for multi-node jobs and /workloads/trainings/{job_id} for single-node jobs. This is consistent with how cancel() already uses /workloads/distributed/. Adds test_status_distributed to verify the correct URL is used for multi-node executors. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 3 ++- test/core/execution/test_dgxcloud.py | 29 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 0656cd45..87b18e66 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -360,7 +360,8 @@ def nproc_per_node(self) -> int: return 1 def status(self, job_id: str) -> Optional[DGXCloudState]: - url = f"{self.base_url}/workloads/{job_id}" + workload_type = "distributed" if self.nodes > 1 else "trainings" + url = f"{self.base_url}/workloads/{workload_type}/{job_id}" token = self.get_auth_token() if not token: logger.error("Failed to retrieve auth token for status request.") diff --git a/test/core/execution/test_dgxcloud.py b/test/core/execution/test_dgxcloud.py index 4a1701fd..af09b2cb 100644 --- a/test/core/execution/test_dgxcloud.py +++ b/test/core/execution/test_dgxcloud.py @@ -895,7 +895,34 @@ def test_status(self, mock_get): assert status == DGXCloudState.RUNNING mock_get.assert_called_once_with( - "https://dgxapi.example.com/workloads/job123", + "https://dgxapi.example.com/workloads/trainings/job123", + headers=executor._default_headers(token="test_token"), + ) + + @patch("requests.get") + def test_status_distributed(self, mock_get): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"phase": "Running"} + mock_get.return_value = mock_response + + with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"): + executor = DGXCloudExecutor( + base_url="https://dgxapi.example.com", + kube_apiserver_url="https://127.0.0.1:443", + app_id="test_app_id", + app_secret="test_app_secret", + project_name="test_project", + container_image="nvcr.io/nvidia/test:latest", + pvc_nemo_run_dir="/workspace/nemo_run", + nodes=8, + ) + + status = 
executor.status("job123") + + assert status == DGXCloudState.RUNNING + mock_get.assert_called_once_with( + "https://dgxapi.example.com/workloads/distributed/job123", headers=executor._default_headers(token="test_token"), ) From 9333c45c0cf65db9c5ae1392d6f219eb0a42b824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 18:48:31 +0000 Subject: [PATCH 3/6] fix: read actualPhase from type-specific workload endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /workloads/distributed/{id} and /workloads/trainings/{id} endpoints return actualPhase, not phase (which was the field on the generic /workloads/{id} endpoint). This caused a KeyError crash immediately after the 403 fix landed. Now reads actualPhase first, falls back to phase for compatibility, and returns None (PENDING) if neither field is present. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 6 +++++- test/core/execution/test_dgxcloud.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 87b18e66..24724503 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -377,7 +377,11 @@ def status(self, job_id: str) -> Optional[DGXCloudState]: return None r_json = response.json() - return DGXCloudState(r_json["phase"]) + phase = r_json.get("actualPhase") or r_json.get("phase") + if not phase: + logger.warning(f"No phase field in status response for job {job_id}: {r_json}") + return None + return DGXCloudState(phase) def fetch_logs( self, diff --git a/test/core/execution/test_dgxcloud.py b/test/core/execution/test_dgxcloud.py index af09b2cb..ccb590d4 100644 --- a/test/core/execution/test_dgxcloud.py +++ b/test/core/execution/test_dgxcloud.py @@ -877,7 +877,7 @@ def test_nproc_per_node_default(self): def test_status(self, mock_get): mock_response 
= MagicMock() mock_response.status_code = 200 - mock_response.json.return_value = {"phase": "Running"} + mock_response.json.return_value = {"actualPhase": "Running"} mock_get.return_value = mock_response with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"): @@ -903,7 +903,7 @@ def test_status(self, mock_get): def test_status_distributed(self, mock_get): mock_response = MagicMock() mock_response.status_code = 200 - mock_response.json.return_value = {"phase": "Running"} + mock_response.json.return_value = {"actualPhase": "Running"} mock_get.return_value = mock_response with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"): From b4c77ed8c5c22ca8b5aa3566f0156ec33472f167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 19:16:39 +0000 Subject: [PATCH 4/6] add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- test/core/execution/test_dgxcloud.py | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/core/execution/test_dgxcloud.py b/test/core/execution/test_dgxcloud.py index ccb590d4..392edb61 100644 --- a/test/core/execution/test_dgxcloud.py +++ b/test/core/execution/test_dgxcloud.py @@ -926,6 +926,50 @@ def test_status_distributed(self, mock_get): headers=executor._default_headers(token="test_token"), ) + @patch("requests.get") + def test_status_falls_back_to_phase_field(self, mock_get): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"phase": "Running"} + mock_get.return_value = mock_response + + with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"): + executor = DGXCloudExecutor( + base_url="https://dgxapi.example.com", + kube_apiserver_url="https://127.0.0.1:443", + app_id="test_app_id", + app_secret="test_app_secret", + project_name="test_project", + 
container_image="nvcr.io/nvidia/test:latest", + pvc_nemo_run_dir="/workspace/nemo_run", + ) + + status = executor.status("job123") + + assert status == DGXCloudState.RUNNING + + @patch("requests.get") + def test_status_returns_none_when_no_phase(self, mock_get): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"someOtherField": "value"} + mock_get.return_value = mock_response + + with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"): + executor = DGXCloudExecutor( + base_url="https://dgxapi.example.com", + kube_apiserver_url="https://127.0.0.1:443", + app_id="test_app_id", + app_secret="test_app_secret", + project_name="test_project", + container_image="nvcr.io/nvidia/test:latest", + pvc_nemo_run_dir="/workspace/nemo_run", + ) + + status = executor.status("job123") + + assert status is None + @patch("requests.get") def test_status_no_token(self, mock_get): with patch.object(DGXCloudExecutor, "get_auth_token", return_value=None): From a411c422b26c3eb5a8988ade09b35cfc16e94681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 19:23:01 +0000 Subject: [PATCH 5/6] fix: store job_id explicitly to avoid separator collision in app_id parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a role name ends with '_', the app_id string looks like: experiment___role_name____job_id Splitting on '___' produces job_id = '_job_id' (spurious leading '_'), causing the status/cancel/log_iter calls to use a wrong ID and get 404. Fix: _save_job_dir now stores the actual job_id in the JSON record. describe(), log_iter(), and _cancel_existing() all read job_id from the stored record, falling back to app_id.split('___')[-1] for backwards compatibility with existing saved jobs. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: oliver könig --- .../run/torchx_backend/schedulers/dgxcloud.py | 15 +++++--- .../schedulers/test_dgxcloud.py | 37 ++++++++++++++++++- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py index 61e857e8..9bc2c969 100644 --- a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py +++ b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py @@ -161,7 +161,7 @@ def schedule(self, dryrun_info: AppDryRunInfo[DGXRequest]) -> str: # Store a status entry or logs path if available # Currently, the DGXExecutor status is placeholder, but we keep the pattern - _save_job_dir(app_id, job_status=status, executor=executor) + _save_job_dir(app_id, job_status=status, executor=executor, job_id=job_id) return app_id @@ -173,7 +173,8 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]: # We split out the stored values from the JSON file stored_data = _get_job_dirs() job_info = stored_data.get(app_id) - _, role_name, job_id = app_id.split("___") + parts = app_id.split("___") + role_name = parts[1] if len(parts) > 1 else app_id roles = [Role(name=role_name, image="", num_replicas=1)] roles_statuses = [ RoleStatus( @@ -191,6 +192,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]: if not executor: return None + job_id = job_info.get("job_id") or parts[-1] dgx_state = executor.status(job_id) or DGXCloudState.UNKNOWN app_state = DGX_STATES.get(dgx_state, AppState.PENDING) roles_statuses[0].replicas[0].state = app_state @@ -217,7 +219,7 @@ def log_iter( ) -> Iterable[str]: stored_data = _get_job_dirs() job_info = stored_data.get(app_id) - _, _, job_id = app_id.split("___") + job_id = job_info.get("job_id") or app_id.split("___")[-1] executor: Optional[DGXCloudExecutor] = job_info.get("executor", None) # type: ignore if not executor: return [""] @@ -240,7 +242,7 @@ def _cancel_existing(self, app_id: str) 
-> None: """ stored_data = _get_job_dirs() job_info = stored_data.get(app_id) - _, _, job_id = app_id.split("___") + job_id = job_info.get("job_id") or app_id.split("___")[-1] executor: DGXCloudExecutor = job_info.get("executor", None) # type: ignore if not executor: return None @@ -257,7 +259,9 @@ def create_scheduler(session_name: str, **kwargs: Any) -> DGXCloudScheduler: return DGXCloudScheduler(session_name=session_name) -def _save_job_dir(app_id: str, job_status: str, executor: DGXCloudExecutor) -> None: +def _save_job_dir( + app_id: str, job_status: str, executor: DGXCloudExecutor, job_id: str = "" +) -> None: """ Saves or updates local record of job status in JSON for demonstration. """ @@ -276,6 +280,7 @@ def _save_job_dir(app_id: str, job_status: str, executor: DGXCloudExecutor) -> N app = { "job_status": job_status, + "job_id": job_id, "executor": serializer.serialize( fdl_dc.convert_dataclasses_to_configs(executor, allow_post_init=True) ), diff --git a/test/run/torchx_backend/schedulers/test_dgxcloud.py b/test/run/torchx_backend/schedulers/test_dgxcloud.py index dd19fb1a..fb7f9525 100644 --- a/test/run/torchx_backend/schedulers/test_dgxcloud.py +++ b/test/run/torchx_backend/schedulers/test_dgxcloud.py @@ -110,6 +110,7 @@ def test_describe(dgx_cloud_scheduler, dgx_cloud_executor): mock_get_job_dirs.return_value = { "test_experiment___test_role___test_job_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } @@ -132,6 +133,7 @@ def test_cancel_existing(dgx_cloud_scheduler, dgx_cloud_executor): mock_get_job_dirs.return_value = { "test_experiment___test_role___test_job_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } @@ -159,10 +161,11 @@ def test_save_and_get_job_dirs(): pvc_nemo_run_dir="/workspace/nemo_run", ) - _save_job_dir("test_app_id", "RUNNING", executor) + _save_job_dir("test_app_id", "RUNNING", executor, job_id="actual_job_id") job_dirs = _get_job_dirs() assert 
"test_app_id" in job_dirs + assert job_dirs["test_app_id"]["job_id"] == "actual_job_id" assert isinstance(job_dirs["test_app_id"]["executor"], DGXCloudExecutor) @@ -173,6 +176,7 @@ def test_log_iter(dgx_cloud_scheduler, dgx_cloud_executor): mock_get_job_dirs.return_value = { "test_session___test_role___test_container_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } @@ -188,6 +192,34 @@ def test_log_iter(dgx_cloud_scheduler, dgx_cloud_executor): assert logs == ["log2", "log3"] +def test_describe_uses_stored_job_id_not_split(dgx_cloud_scheduler, dgx_cloud_executor): + # Regression test: when a role name ends with '_', splitting app_id on '___' + # produces a job_id with a spurious leading '_' (e.g. role 'W-foo_' + sep '___' + # gives '____' which splits into 'role' and '_job_id'). describe() must use + # the job_id stored at schedule time, not re-derive it from the app_id string. + real_job_id = "48db46d2-ae56-4c9d-9abd-ba0d873e50eb" + # NOTE(review): role 'role_name' has no trailing '_', so this app_id does not collide + app_id = f"experiment___role_name___{real_job_id}" + + with ( + mock.patch( + "nemo_run.run.torchx_backend.schedulers.dgxcloud._get_job_dirs" + ) as mock_get_job_dirs, + mock.patch.object(DGXCloudExecutor, "status", return_value=DGXCloudState.RUNNING) as mock_status, + ): + mock_get_job_dirs.return_value = { + app_id: { + "job_status": "RUNNING", + "job_id": real_job_id, + "executor": dgx_cloud_executor, + } + } + + response = dgx_cloud_scheduler.describe(app_id) + assert response is not None + mock_status.assert_called_once_with(real_job_id) + + def test_unknown_state_maps_to_pending_not_failed(): # DGXCloudState.UNKNOWN must map to PENDING so transient API errors during # job startup do not cause wait_and_exit() to treat the job as terminal.
@@ -207,6 +239,7 @@ def test_describe_returns_pending_when_status_is_none(dgx_cloud_scheduler, dgx_c mock_get_job_dirs.return_value = { "test_experiment___test_role___test_job_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } @@ -229,6 +262,7 @@ def test_describe_returns_pending_when_status_is_unknown(dgx_cloud_scheduler, dg mock_get_job_dirs.return_value = { "test_experiment___test_role___test_job_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } @@ -245,6 +279,7 @@ def test_log_iter_str(dgx_cloud_scheduler, dgx_cloud_executor): mock_get_job_dirs.return_value = { "test_session___test_role___test_container_id": { "job_status": "RUNNING", + "job_id": "test_job_id", "executor": dgx_cloud_executor, } } From ebf072fc45ef00dd57a0fbf36d8df346bd749275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 11 Mar 2026 19:24:55 +0000 Subject: [PATCH 6/6] format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- test/run/torchx_backend/schedulers/test_dgxcloud.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/run/torchx_backend/schedulers/test_dgxcloud.py b/test/run/torchx_backend/schedulers/test_dgxcloud.py index fb7f9525..767c2106 100644 --- a/test/run/torchx_backend/schedulers/test_dgxcloud.py +++ b/test/run/torchx_backend/schedulers/test_dgxcloud.py @@ -205,7 +205,9 @@ def test_describe_uses_stored_job_id_not_split(dgx_cloud_scheduler, dgx_cloud_ex mock.patch( "nemo_run.run.torchx_backend.schedulers.dgxcloud._get_job_dirs" ) as mock_get_job_dirs, - mock.patch.object(DGXCloudExecutor, "status", return_value=DGXCloudState.RUNNING) as mock_status, + mock.patch.object( + DGXCloudExecutor, "status", return_value=DGXCloudState.RUNNING + ) as mock_status, ): mock_get_job_dirs.return_value = { app_id: {