Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions nemo_run/core/execution/dgxcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,12 @@
if self.nodes < 1:
raise ValueError("Node count must be at least 1")

if len(name) >= 35:
logger.warning(

Check warning on line 229 in nemo_run/core/execution/dgxcloud.py

View check run for this annotation

Codecov / codecov/patch

nemo_run/core/execution/dgxcloud.py#L229

Added line #L229 was not covered by tests
"Training name can only be max 35 characters. Shortening name to 35 characters..."
)
name = name[:34]

Check warning on line 232 in nemo_run/core/execution/dgxcloud.py

View check run for this annotation

Codecov / codecov/patch

nemo_run/core/execution/dgxcloud.py#L232

Added line #L232 was not covered by tests

# Common payload elements
common_payload = {
"name": name,
Expand Down Expand Up @@ -265,6 +271,7 @@
headers = self._default_headers(token=token)
response = requests.post(url, json=payload, headers=headers)

logger.info(json.dumps(payload))
logger.debug(
"Created %s job; response code=%s, content=%s",
"distributed" if self.nodes > 1 else "training",
Expand All @@ -276,6 +283,7 @@

def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements
logger.info(f"workload name:{name}")
token = self.get_auth_token()
if not token:
raise RuntimeError("Failed to get auth token")
Expand Down
Loading