diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 927f6256..b8b64528 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -225,6 +225,12 @@ def create_training_job( if self.nodes < 1: raise ValueError("Node count must be at least 1") + if len(name) >= 35: + logger.warning( + "Training name can only be max 35 characters. Shortening name to 35 characters..." + ) + name = name[:34] + # Common payload elements common_payload = { "name": name, @@ -265,6 +271,7 @@ def create_training_job( headers = self._default_headers(token=token) response = requests.post(url, json=payload, headers=headers) + logger.info(json.dumps(payload)) logger.debug( "Created %s job; response code=%s, content=%s", "distributed" if self.nodes > 1 else "training", @@ -276,6 +283,7 @@ def create_training_job( def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements + logger.info(f"workload name:{name}") token = self.get_auth_token() if not token: raise RuntimeError("Failed to get auth token")