Skip to content

Commit 1e18af5

Browse files
ko3n1g and claude committed
fix: use type-specific endpoint for DGXCloud workload status
The status() method was calling GET /workloads/{job_id} (generic endpoint) which returns 403 for distributed and training workloads. The correct endpoints match the create paths: /workloads/distributed/{job_id} for multi-node jobs and /workloads/trainings/{job_id} for single-node jobs.

This is consistent with how cancel() already uses /workloads/distributed/.

Adds test_status_distributed to verify the correct URL is used for multi-node executors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 8d7c550 commit 1e18af5

2 files changed

Lines changed: 30 additions & 2 deletions

File tree

nemo_run/core/execution/dgxcloud.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,8 @@ def nproc_per_node(self) -> int:
360360
return 1
361361

362362
def status(self, job_id: str) -> Optional[DGXCloudState]:
363-
url = f"{self.base_url}/workloads/{job_id}"
363+
workload_type = "distributed" if self.nodes > 1 else "trainings"
364+
url = f"{self.base_url}/workloads/{workload_type}/{job_id}"
364365
token = self.get_auth_token()
365366
if not token:
366367
logger.error("Failed to retrieve auth token for status request.")

test/core/execution/test_dgxcloud.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,34 @@ def test_status(self, mock_get):
895895

896896
assert status == DGXCloudState.RUNNING
897897
mock_get.assert_called_once_with(
898-
"https://dgxapi.example.com/workloads/job123",
898+
"https://dgxapi.example.com/workloads/trainings/job123",
899+
headers=executor._default_headers(token="test_token"),
900+
)
901+
902+
@patch("requests.get")
903+
def test_status_distributed(self, mock_get):
904+
mock_response = MagicMock()
905+
mock_response.status_code = 200
906+
mock_response.json.return_value = {"phase": "Running"}
907+
mock_get.return_value = mock_response
908+
909+
with patch.object(DGXCloudExecutor, "get_auth_token", return_value="test_token"):
910+
executor = DGXCloudExecutor(
911+
base_url="https://dgxapi.example.com",
912+
kube_apiserver_url="https://127.0.0.1:443",
913+
app_id="test_app_id",
914+
app_secret="test_app_secret",
915+
project_name="test_project",
916+
container_image="nvcr.io/nvidia/test:latest",
917+
pvc_nemo_run_dir="/workspace/nemo_run",
918+
nodes=8,
919+
)
920+
921+
status = executor.status("job123")
922+
923+
assert status == DGXCloudState.RUNNING
924+
mock_get.assert_called_once_with(
925+
"https://dgxapi.example.com/workloads/distributed/job123",
899926
headers=executor._default_headers(token="test_token"),
900927
)
901928

0 commit comments

Comments
 (0)