Skip to content

Commit 2bd75e3

Browse files
committed
Add storage mount options to LeptonExecutor
Allow users to specify the source that storage is mounted from in DGX Cloud Lepton jobs, such as an NFS share attached to every node in the node group. That storage can then be mounted in jobs as shared storage. Signed-Off-By: Robert Clark <roclark@nvidia.com>
1 parent 78f54ee commit 2bd75e3

2 files changed

Lines changed: 60 additions & 3 deletions

File tree

nemo_run/core/execution/lepton.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ def _valid_node_ids(self, node_group_id: DedicatedNodeGroup, client: APIClient)
161161

162162
return valid_node_ids
163163

164+
def _validate_mounts(self):
    """
    Ensure the required arguments are specified for mounts.

    Every entry in ``self.mounts`` must be a mapping that contains both the
    'path' and 'mount_path' keys. Any additional keys are not checked here.

    Raises:
        RuntimeError: If an entry is not a mapping or lacks a required key.
    """
    from collections.abc import Mapping

    required_keys = ("path", "mount_path")
    for index, mount in enumerate(self.mounts):
        # A bare string would satisfy ``key in mount`` through substring
        # matching, so reject anything that is not a mapping up front.
        if not isinstance(mount, Mapping):
            raise RuntimeError(
                "Must specify a 'path' and 'mount_path' for all mounts "
                f"(mount #{index} is not a mapping: {mount!r})"
            )

        # Report exactly which required keys are absent so the caller can
        # fix the offending entry without guessing.
        missing = [key for key in required_keys if key not in mount]
        if missing:
            raise RuntimeError(
                "Must specify a 'path' and 'mount_path' for all mounts "
                f"(mount #{index} is missing: {', '.join(missing)})"
            )
172+
164173
def create_lepton_job(self, name: str):
165174
"""
166175
Creates a distributed PyTorch job using the provided project/cluster IDs.
@@ -192,9 +201,7 @@ def create_lepton_job(self, name: str):
192201
max_failure_retry=None,
193202
max_job_failure_retry=None,
194203
envs=envs,
195-
mounts=[
196-
Mount(path=mount["path"], mount_path=mount["mount_path"]) for mount in self.mounts
197-
],
204+
mounts=[Mount(**mount) for mount in self.mounts],
198205
image_pull_secrets=[],
199206
ttl_seconds_after_finished=None,
200207
intra_job_communication=True,
@@ -211,6 +218,7 @@ def create_lepton_job(self, name: str):
211218
return created_job
212219

213220
def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
221+
self._validate_mounts()
214222
name = name.replace("_", "-").replace(".", "-") # to meet K8s requirements
215223
launch_script = f"""
216224
wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh

test/core/execution/test_lepton.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,55 @@ def test_nproc_per_node_default(self):
396396

397397
assert executor.nproc_per_node() == 1
398398

399+
def test_valid_storage_mounts(self):
    """A mount that provides both 'path' and 'mount_path' passes validation."""
    executor = LeptonExecutor(
        container_image="nvcr.io/nvidia/test:latest",
        nemo_run_dir="/workspace/nemo_run",
        mounts=[{"path": "/workspace", "mount_path": "/workspace"}],
    )

    # None is a singleton, so compare by identity rather than equality.
    assert executor._validate_mounts() is None
407+
408+
def test_valid_storage_mounts_with_mount_from(self):
    """A mount carrying an extra 'from' key alongside the required keys passes validation."""
    executor = LeptonExecutor(
        container_image="nvcr.io/nvidia/test:latest",
        nemo_run_dir="/workspace/nemo_run",
        mounts=[
            {"path": "/workspace", "mount_path": "/workspace", "from": "local-storage:nfs"}
        ],
    )

    # None is a singleton, so compare by identity rather than equality.
    assert executor._validate_mounts() is None
418+
419+
def test_missing_storage_mount_options(self):
    """A mount that omits 'mount_path' is rejected with a RuntimeError."""
    mounts_without_mount_path = [{"path": "/workspace"}]
    lepton_executor = LeptonExecutor(
        container_image="nvcr.io/nvidia/test:latest",
        nemo_run_dir="/workspace/nemo_run",
        mounts=mounts_without_mount_path,
    )

    # Validation is invoked directly; no job is launched in this test.
    with pytest.raises(RuntimeError):
        lepton_executor._validate_mounts()
428+
429+
def test_missing_storage_mount_options_mount_path(self):
    """A mount that omits 'path' is rejected with a RuntimeError."""
    mounts_without_path = [{"mount_path": "/workspace"}]
    lepton_executor = LeptonExecutor(
        container_image="nvcr.io/nvidia/test:latest",
        nemo_run_dir="/workspace/nemo_run",
        mounts=mounts_without_path,
    )

    # Validation is invoked directly; no job is launched in this test.
    with pytest.raises(RuntimeError):
        lepton_executor._validate_mounts()
438+
439+
def test_valid_storage_mounts_with_random_args(self):
    """An unrecognized extra key in a mount does not cause validation to fail."""
    executor = LeptonExecutor(
        container_image="nvcr.io/nvidia/test:latest",
        nemo_run_dir="/workspace/nemo_run",
        mounts=[{"path": "/workspace", "mount_path": "/workspace", "random": True}],
    )

    # None is a singleton, so compare by identity rather than equality.
    assert executor._validate_mounts() is None
447+
399448
@patch("nemo_run.core.execution.lepton.APIClient")
400449
def test_status_running_and_ready(self, mock_APIClient):
401450
mock_instance = MagicMock()

0 commit comments

Comments
 (0)