From 2132a155f73599a47141f6cb3fb048ee11684f0f Mon Sep 17 00:00:00 2001 From: Zoey Zhang Date: Tue, 17 Jun 2025 11:51:51 -0400 Subject: [PATCH 1/2] shortening names so it doesn't break runs Signed-off-by: Zoey Zhang --- nemo_run/core/execution/dgxcloud.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 927f6256..99115c14 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -225,6 +225,10 @@ def create_training_job( if self.nodes < 1: raise ValueError("Node count must be at least 1") + if len(name)>=35: + logger.warning("Training name can only be max 35 characters. Shortening name to 35 characters...") + name=name[:34] + # Common payload elements common_payload = { "name": name, @@ -265,6 +269,7 @@ def create_training_job( headers = self._default_headers(token=token) response = requests.post(url, json=payload, headers=headers) + logger.info(json.dumps(payload)) logger.debug( "Created %s job; response code=%s, content=%s", "distributed" if self.nodes > 1 else "training", @@ -276,6 +281,7 @@ def create_training_job( def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements + logger.info(f"workload name:{name}") token = self.get_auth_token() if not token: raise RuntimeError("Failed to get auth token") From 35932a404b530572cfc86adc1eda6e91828dbb42 Mon Sep 17 00:00:00 2001 From: Zoey Zhang Date: Mon, 30 Jun 2025 16:43:34 -0700 Subject: [PATCH 2/2] linting Signed-off-by: Zoey Zhang --- nemo_run/core/execution/dgxcloud.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 99115c14..b8b64528 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -225,9 +225,11 @@ def create_training_job( if self.nodes < 1: raise ValueError("Node count must be at least 1") - if len(name)>=35: - logger.warning("Training name can only be max 35 characters. Shortening name to 35 characters...") - name=name[:34] + if len(name) >= 35: + logger.warning( + "Training name can only be max 35 characters. Shortening name to 35 characters..." + ) + name = name[:34] # Common payload elements common_payload = {