Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions nemo_run/core/execution/dgxcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
resp = self.create_data_mover_workload(token, project_id, cluster_id)
if resp.status_code not in [200, 202]:
raise RuntimeError(
f"Failed to create data mover workload, status_code={resp.status_code}"
f"Failed to create data mover workload, status_code={resp.status_code}, reason={resp.text}"
)

resp_json = resp.json()
Expand Down Expand Up @@ -240,7 +240,7 @@ def create_distributed_job(self, token: str, project_id: str, cluster_id: str, n
return response

def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
name = name.replace("_", "-").replace(".", "-") # to meet K8s requirements
name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements
token = self.get_auth_token()
if not token:
raise RuntimeError("Failed to get auth token")
Expand All @@ -265,7 +265,9 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
logger.info("Creating distributed workload")
resp = self.create_distributed_job(token, project_id, cluster_id, name)
if resp.status_code not in [200, 202]:
raise RuntimeError(f"Failed to create job, status_code={resp.status_code}")
raise RuntimeError(
f"Failed to create job, status_code={resp.status_code}, reason={resp.text}"
)

r_json = resp.json()
job_id = r_json["workloadId"]
Expand Down
Loading