Skip to content

Commit 379a070

Browse files
committed
rename dgx cloud executor to runai executor
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent 9512c3b commit 379a070

12 files changed

Lines changed: 236 additions & 239 deletions

File tree

docs/source/guides/execution.md

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ The packager support matrix is described below:
4141
| DockerExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
4242
| SlurmExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
4343
| SkypilotExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
44-
| DGXCloudExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
44+
| RunAIExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
4545
| LeptonExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
4646

4747
`run.Packager` is a passthrough base packager.
@@ -224,19 +224,17 @@ executor = your_skypilot_cluster(nodes=8, devices=8, container_image="your-nemo-
224224

225225
As demonstrated in the examples, defining executors in Python offers great flexibility. You can easily mix and match things like common environment variables, and the separation of tasks from executors enables you to run the same configured task on any supported executor.
226226

227-
#### DGXCloudExecutor
227+
#### RunAIExecutor
228228

229-
The `DGXCloudExecutor` integrates with a DGX Cloud cluster's Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification.
230-
231-
> **_WARNING:_** Currently, the `DGXCloudExecutor` is only supported when launching experiments *from* a pod running on the DGX Cloud cluster itself. Furthermore, this launching pod must have access to a Persistent Volume Claim (PVC) where the experiment/job directories will be created, and this same PVC must also be configured to be mounted by the job being launched.
229+
The `RunAIExecutor` integrates with the Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification.
232230

233231
Here's an example configuration:
234232

235233
```python
236-
def your_dgx_executor(nodes: int, gpus_per_node: int, container_image: str):
237-
# Ensure these are set correctly for your DGX Cloud environment
234+
def your_runai_executor(nodes: int, gpus_per_node: int, container_image: str):
235+
# Ensure these are set correctly for your Run:ai environment
238236
# You might fetch these from environment variables or a config file
239-
base_url = "YOUR_DGX_CLOUD_API_ENDPOINT" # e.g., https://<cluster-name>.<domain>/api/v1
237+
base_url = "YOUR_RUNAI_API_ENDPOINT" # e.g., https://<cluster-name>.<domain>/api/v1
240238
app_id = "YOUR_RUNAI_APP_ID"
241239
app_secret = "YOUR_RUNAI_APP_SECRET"
242240
project_name = "YOUR_RUNAI_PROJECT_NAME"
@@ -245,7 +243,7 @@ def your_dgx_executor(nodes: int, gpus_per_node: int, container_image: str):
245243
pvc_name = "your-pvc-k8s-name" # The Kubernetes name of the PVC
246244
pvc_mount_path = "/your_custom_path" # The path where the PVC will be mounted inside the container
247245

248-
executor = run.DGXCloudExecutor(
246+
executor = run.RunAIExecutor(
249247
base_url=base_url,
250248
app_id=app_id,
251249
app_secret=app_secret,
@@ -261,11 +259,11 @@ def your_dgx_executor(nodes: int, gpus_per_node: int, container_image: str):
261259
return executor
262260

263261
# Example usage:
264-
# executor = your_dgx_executor(nodes=4, gpus_per_node=8, container_image="your-nemo-image")
262+
# executor = your_runai_executor(nodes=4, gpus_per_node=8, container_image="your-nemo-image")
265263

266264
```
267265

268-
For a complete end-to-end example using DGX Cloud with NeMo, refer to the [NVIDIA DGX Cloud NeMo End-to-End Workflow Example](https://docs.nvidia.com/dgx-cloud/run-ai/latest/nemo-e2e-example.html).
266+
For a complete end-to-end example using Run:ai with NeMo on DGX Cloud, refer to the [NVIDIA DGX Cloud NeMo End-to-End Workflow Example](https://docs.nvidia.com/dgx-cloud/run-ai/latest/nemo-e2e-example.html).
269267

270268
#### LeptonExecutor
271269

nemo_run/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
from nemo_run.cli.lazy import LazyEntrypoint, lazy_imports
2323
from nemo_run.config import Config, ConfigurableMixin, Partial, Script
2424
from nemo_run.core.execution.base import Executor, ExecutorMacros, import_executor
25-
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
2625
from nemo_run.core.execution.docker import DockerExecutor
2726
from nemo_run.core.execution.launcher import FaultTolerance, SlurmRay, SlurmTemplate, Torchrun
2827
from nemo_run.core.execution.lepton import LeptonExecutor
2928
from nemo_run.core.execution.local import LocalExecutor
29+
from nemo_run.core.execution.runai import RunAIExecutor
3030
from nemo_run.core.execution.skypilot import SkypilotExecutor
3131
from nemo_run.core.execution.slurm import SlurmExecutor
3232
from nemo_run.core.packaging import GitArchivePackager, HybridPackager, Packager, PatternPackager
@@ -48,7 +48,7 @@
4848
"ConfigurableMixin",
4949
"DevSpace",
5050
"DockerExecutor",
51-
"DGXCloudExecutor",
51+
"RunAIExecutor",
5252
"dryrun_fn",
5353
"Executor",
5454
"import_executor",
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
logger = logging.getLogger(__name__)
2222

2323

24-
class DGXCloudState(Enum):
24+
class RunAIState(Enum):
2525
CREATING = "Creating"
2626
INITIALIZING = "Initializing"
2727
RESUMING = "Resuming"
@@ -39,14 +39,14 @@ class DGXCloudState(Enum):
3939

4040

4141
@dataclass(kw_only=True)
42-
class DGXCloudExecutor(Executor):
42+
class RunAIExecutor(Executor):
4343
"""
44-
Dataclass to configure a DGX Executor.
44+
Dataclass to configure a RunAI Executor.
4545
46-
This executor integrates with a DGX cloud endpoint for launching jobs
46+
This executor integrates with a RunAI cloud endpoint for launching jobs
4747
via a REST API. It acquires an auth token, identifies the project/cluster,
4848
and launches jobs with a specified command. It can be adapted to meet user
49-
authentication and job-submission requirements on DGX.
49+
authentication and job-submission requirements on RunAI.
5050
"""
5151

5252
base_url: str
@@ -168,29 +168,29 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
168168

169169
resp_json = resp.json()
170170
workload_id = resp_json["workloadId"]
171-
status = DGXCloudState(resp_json["actualPhase"])
171+
status = RunAIState(resp_json["actualPhase"])
172172

173-
logger.info(f"Successfully created data movement workload {workload_id} on DGXCloud")
173+
logger.info(f"Successfully created data movement workload {workload_id} on RunAI")
174174

175175
while status in [
176-
DGXCloudState.PENDING,
177-
DGXCloudState.CREATING,
178-
DGXCloudState.INITIALIZING,
179-
DGXCloudState.RUNNING,
176+
RunAIState.PENDING,
177+
RunAIState.CREATING,
178+
RunAIState.INITIALIZING,
179+
RunAIState.RUNNING,
180180
]:
181181
time.sleep(sleep)
182182
status = self.status(workload_id)
183183
logger.debug(
184184
f"Polling data movement workload {workload_id}'s status. Current status is: {status}"
185185
)
186186

187-
if status is not DGXCloudState.COMPLETED:
187+
if status is not RunAIState.COMPLETED:
188188
raise RuntimeError(f"Failed to move data to PVC. Workload status is {status}")
189189

190190
resp = self.delete_workload(token, workload_id)
191191
if resp.status_code >= 200 and resp.status_code < 300:
192192
logger.info(
193-
"Successfully deleted data movement workload %s on DGXCloud with response code %d",
193+
"Successfully deleted data movement workload %s on RunAI with response code %d",
194194
workload_id,
195195
resp.status_code,
196196
)
@@ -286,7 +286,7 @@ def nproc_per_node(self) -> int:
286286
return self.nprocs_per_node
287287
return 1
288288

289-
def status(self, job_id: str) -> Optional[DGXCloudState]:
289+
def status(self, job_id: str) -> Optional[RunAIState]:
290290
url = f"{self.base_url}/workloads/{job_id}"
291291
token = self.get_auth_token()
292292
if not token:
@@ -296,10 +296,10 @@ def status(self, job_id: str) -> Optional[DGXCloudState]:
296296
headers = self._default_headers(token=token)
297297
response = requests.get(url, headers=headers)
298298
if response.status_code != 200:
299-
return DGXCloudState("Unknown")
299+
return RunAIState("Unknown")
300300

301301
r_json = response.json()
302-
return DGXCloudState(r_json["phase"])
302+
return RunAIState(r_json["phase"])
303303

304304
def cancel(self, job_id: str):
305305
# Retrieve the authentication token for the REST calls
@@ -315,7 +315,7 @@ def cancel(self, job_id: str):
315315
response = requests.get(url, headers=headers)
316316
if response.status_code >= 200 and response.status_code < 300:
317317
logger.info(
318-
"Successfully cancelled job %s on DGX with response code %d",
318+
"Successfully cancelled job %s on RunAI with response code %d",
319319
job_id,
320320
response.status_code,
321321
)
@@ -328,9 +328,9 @@ def cancel(self, job_id: str):
328328
)
329329

330330
@classmethod
331-
def logs(cls: Type["DGXCloudExecutor"], app_id: str, fallback_path: Optional[str]):
331+
def logs(cls: Type["RunAIExecutor"], app_id: str, fallback_path: Optional[str]):
332332
logger.warning(
333-
"Logs not available for DGXCloudExecutor based jobs. Please visit the cluster UI to view the logs."
333+
"Logs not available for RunAIExecutor based jobs. Please visit the cluster UI to view the logs."
334334
)
335335

336336
def cleanup(self, handle: str): ...

nemo_run/run/experiment.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@
3535
from rich.console import Group
3636
from rich.live import Live
3737
from rich.panel import Panel
38-
from rich.progress import BarColumn, Progress, SpinnerColumn
38+
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskID, TimeElapsedColumn
3939
from rich.progress import Task as RichTask
40-
from rich.progress import TaskID, TimeElapsedColumn
4140
from rich.syntax import Syntax
4241
from torchx.specs.api import AppState
4342

@@ -51,10 +50,10 @@
5150
get_type_namespace,
5251
)
5352
from nemo_run.core.execution.base import Executor
54-
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
5553
from nemo_run.core.execution.docker import DockerExecutor
5654
from nemo_run.core.execution.lepton import LeptonExecutor
5755
from nemo_run.core.execution.local import LocalExecutor
56+
from nemo_run.core.execution.runai import RunAIExecutor
5857
from nemo_run.core.execution.skypilot import SkypilotExecutor
5958
from nemo_run.core.execution.slurm import SlurmExecutor
6059
from nemo_run.core.frontend.console.api import CONSOLE, configure_logging, deconfigure_logging
@@ -203,13 +202,13 @@ class Experiment(ConfigurableMixin):
203202
LocalExecutor,
204203
SkypilotExecutor,
205204
DockerExecutor,
206-
DGXCloudExecutor,
205+
RunAIExecutor,
207206
LeptonExecutor,
208207
)
209208
_DETACH_SUPPORTED_EXECUTORS = (
210209
SlurmExecutor,
211210
SkypilotExecutor,
212-
DGXCloudExecutor,
211+
RunAIExecutor,
213212
LeptonExecutor,
214213
)
215214
_DEPENDENCY_SUPPORTED_EXECUTORS = (SlurmExecutor,)

nemo_run/run/torchx_backend/components/ft_launcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def ft_launcher(
4949
rank_termination_signal: Optional[str] = None,
5050
log_level: Optional[str] = None,
5151
max_restarts: Optional[int] = None,
52-
dgxc: bool = False,
52+
runai: bool = False,
5353
use_env: bool = False,
5454
) -> specs.AppDef:
5555
torchrun_component = torchrun.torchrun(
@@ -71,7 +71,7 @@ def ft_launcher(
7171
mounts=mounts,
7272
debug=debug,
7373
max_retries=max_retries,
74-
dgxc=dgxc,
74+
runai=runai,
7575
use_env=use_env,
7676
)
7777

nemo_run/run/torchx_backend/components/torchrun.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def torchrun(
6060
rdzv_id: Optional[int] = None,
6161
mounts: Optional[list[str]] = None,
6262
debug: bool = False,
63-
dgxc: bool = False,
63+
runai: bool = False,
6464
lepton: bool = False,
6565
use_env: bool = False,
6666
) -> specs.AppDef:
@@ -96,7 +96,7 @@ def torchrun(
9696
mounts: mounts to mount into the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
9797
See scheduler documentation for more info.
9898
debug: whether to run with preset debug flags enabled
99-
dgxc: whether to use a subset of settings for DGX Cloud
99+
runai: whether to use a subset of settings for RunAI
100100
lepton: whether the experiment is running on Lepton AI
101101
"""
102102
if (script is None) == (m is None):
@@ -140,7 +140,7 @@ def torchrun(
140140
if debug:
141141
env.update(_TORCH_DEBUG_FLAGS)
142142

143-
if dgxc:
143+
if runai:
144144
cmd = ["--nnodes", nnodes_rep, "--nproc-per-node", nproc_per_node]
145145
elif lepton:
146146
cmd = [

nemo_run/run/torchx_backend/packaging.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@
2323

2424
from nemo_run.config import SCRIPTS_DIR, USE_WITH_RAY_CLUSTER_KEY, Partial, Script
2525
from nemo_run.core.execution.base import Executor
26-
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
2726
from nemo_run.core.execution.launcher import FaultTolerance, Torchrun
2827
from nemo_run.core.execution.lepton import LeptonExecutor
2928
from nemo_run.core.execution.local import LocalExecutor
29+
from nemo_run.core.execution.runai import RunAIExecutor
3030
from nemo_run.core.execution.slurm import SlurmExecutor
3131
from nemo_run.core.serialization.yaml import YamlSerializer
3232
from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer
@@ -170,7 +170,7 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
170170
mounts=mounts,
171171
debug=executor.packager.debug,
172172
max_retries=executor.retries,
173-
dgxc=isinstance(executor, DGXCloudExecutor),
173+
runai=isinstance(executor, RunAIExecutor),
174174
lepton=isinstance(executor, LeptonExecutor),
175175
use_env=use_env,
176176
)

nemo_run/run/torchx_backend/schedulers/api.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
from torchx.specs import AppDef, AppDryRunInfo
1919

2020
from nemo_run.core.execution.base import Executor
21-
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
2221
from nemo_run.core.execution.docker import DockerExecutor
2322
from nemo_run.core.execution.lepton import LeptonExecutor
2423
from nemo_run.core.execution.local import LocalExecutor
24+
from nemo_run.core.execution.runai import RunAIExecutor
2525
from nemo_run.core.execution.skypilot import SkypilotExecutor
2626
from nemo_run.core.execution.slurm import SlurmExecutor
2727

@@ -30,7 +30,7 @@
3030
SkypilotExecutor: "skypilot",
3131
LocalExecutor: "local_persistent",
3232
DockerExecutor: "docker_persistent",
33-
DGXCloudExecutor: "dgx_cloud",
33+
RunAIExecutor: "runai",
3434
LeptonExecutor: "lepton",
3535
}
3636

@@ -39,7 +39,7 @@
3939
"skypilot": SkypilotExecutor,
4040
"local_persistent": LocalExecutor,
4141
"docker_persistent": DockerExecutor,
42-
"dgx_cloud": DGXCloudExecutor,
42+
"runai": RunAIExecutor,
4343
"lepton": LeptonExecutor,
4444
}
4545

0 commit comments

Comments
 (0)