From 11af577f2e81e9f9b67e2d0788868b97c7092608 Mon Sep 17 00:00:00 2001 From: Zoey Zhang Date: Tue, 3 Jun 2025 22:08:53 -0700 Subject: [PATCH 1/2] adding response reasons to errors and making name all lower case before launching jobs Signed-off-by: Zoey Zhang --- nemo_run/core/execution/dgxcloud.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index b11a97e9..7449ac44 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -163,7 +163,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float = resp = self.create_data_mover_workload(token, project_id, cluster_id) if resp.status_code not in [200, 202]: raise RuntimeError( - f"Failed to create data mover workload, status_code={resp.status_code}" + f"Failed to create data mover workload, status_code={resp.status_code}, response={resp.text}" ) resp_json = resp.json() @@ -240,7 +240,7 @@ def create_distributed_job(self, token: str, project_id: str, cluster_id: str, n return response def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: - name = name.replace("_", "-").replace(".", "-") # to meet K8s requirements + name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements token = self.get_auth_token() if not token: raise RuntimeError("Failed to get auth token") @@ -265,7 +265,9 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: logger.info("Creating distributed workload") resp = self.create_distributed_job(token, project_id, cluster_id, name) if resp.status_code not in [200, 202]: - raise RuntimeError(f"Failed to create job, status_code={resp.status_code}") + raise RuntimeError( + f"Failed to create job, status_code={resp.status_code}, reason={resp.text}" + ) r_json = resp.json() job_id = r_json["workloadId"] From 70cfd5ecebc8f26d3ddbfaf17d76c945683737ac Mon Sep 17 00:00:00 2001 From: Zoey Zhang Date: Tue, 3 
Jun 2025 22:12:06 -0700 Subject: [PATCH 2/2] standardizing to use reason Signed-off-by: Zoey Zhang --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 7449ac44..a4862ee4 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -163,7 +163,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float = resp = self.create_data_mover_workload(token, project_id, cluster_id) if resp.status_code not in [200, 202]: raise RuntimeError( - f"Failed to create data mover workload, status_code={resp.status_code}, response={resp.text}" + f"Failed to create data mover workload, status_code={resp.status_code}, reason={resp.text}" ) resp_json = resp.json()