Skip to content

Commit 06b4bbf

Browse files
committed
pr feedback
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent 222c859 commit 06b4bbf

4 files changed

Lines changed: 19 additions & 19 deletions

File tree

docs/source/guides/execution.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,13 @@ As demonstrated in the examples, defining executors in Python offers great flexi
226226

227227
#### RunAIExecutor
228228

229-
The `RunAIExecutor` integrates with the Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification.
229+
The `RunAIExecutor` integrates with the NVIDIA Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification.
230230

231231
Here's an example configuration:
232232

233233
```python
234234
def your_runai_executor(nodes: int, gpus_per_node: int, container_image: str):
235-
# Ensure these are set correctly for your RunAI environment
235+
# Ensure these are set correctly for your NVIDIA Run:ai environment
236236
# You might fetch these from environment variables or a config file
237237
base_url = "YOUR_RUNAI_API_ENDPOINT" # e.g., https://<cluster-name>.<domain>/api/v1
238238
app_id = "YOUR_RUNAI_APP_ID"
@@ -263,7 +263,7 @@ def your_runai_executor(nodes: int, gpus_per_node: int, container_image: str):
263263

264264
```
265265

266-
For a complete end-to-end example using RunAI with NeMo, refer to the [NVIDIA RunAI NeMo End-to-End Workflow Example](https://docs.nvidia.com/dgx-cloud/run-ai/latest/nemo-e2e-example.html).
266+
For a complete end-to-end example using NVIDIA Run:ai with NeMo, refer to the [NVIDIA Run:ai NeMo End-to-End Workflow Example](https://docs.nvidia.com/dgx-cloud/run-ai/latest/nemo-e2e-example.html).
267267

268268
#### LeptonExecutor
269269

docs/source/guides/ray.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ executor = KubeRayExecutor(
6363
],
6464
# Optional tweaks ----------------------------------------------------
6565
reuse_volumes_in_worker_groups=True, # mount PVCs on workers too
66-
spec_kwargs={"schedulerName": "runai-scheduler"}, # e.g. Run:ai
66+
spec_kwargs={"schedulerName": "runai-scheduler"}, # e.g. NVIDIA Run:ai
6767
volume_mounts=[{"name": "workspace", "mountPath": "/workspace"}],
6868
volumes=[{
6969
"name": "workspace",

nemo_run/core/execution/runai.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ class RunAIState(Enum):
4141
@dataclass(kw_only=True)
4242
class RunAIExecutor(Executor):
4343
"""
44-
Dataclass to configure a RunAI Executor.
44+
Dataclass to configure an NVIDIA Run:ai Executor.
4545
46-
This executor integrates with a RunAI cloud endpoint for launching jobs
46+
This executor integrates with an NVIDIA Run:ai cloud endpoint for launching jobs
4747
via a REST API. It acquires an auth token, identifies the project/cluster,
4848
and launches jobs with a specified command. It can be adapted to meet user
49-
authentication and job-submission requirements on RunAI.
49+
authentication and job-submission requirements on NVIDIA Run:ai.
5050
"""
5151

5252
base_url: str
@@ -170,7 +170,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
170170
workload_id = resp_json["workloadId"]
171171
status = RunAIState(resp_json["actualPhase"])
172172

173-
logger.info(f"Successfully created data movement workload {workload_id} on RunAI")
173+
logger.info(f"Successfully created data movement workload {workload_id} on NVIDIA Run:ai")
174174

175175
while status in [
176176
RunAIState.PENDING,
@@ -190,7 +190,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
190190
resp = self.delete_workload(token, workload_id)
191191
if resp.status_code >= 200 and resp.status_code < 300:
192192
logger.info(
193-
"Successfully deleted data movement workload %s on RunAI with response code %d",
193+
"Successfully deleted data movement workload %s on NVIDIA Run:ai with response code %d",
194194
workload_id,
195195
resp.status_code,
196196
)
@@ -315,7 +315,7 @@ def cancel(self, job_id: str):
315315
response = requests.get(url, headers=headers)
316316
if response.status_code >= 200 and response.status_code < 300:
317317
logger.info(
318-
"Successfully cancelled job %s on RunAI with response code %d",
318+
"Successfully cancelled job %s on NVIDIA Run:ai with response code %d",
319319
job_id,
320320
response.status_code,
321321
)
@@ -330,7 +330,7 @@ def cancel(self, job_id: str):
330330
@classmethod
331331
def logs(cls: Type["RunAIExecutor"], app_id: str, fallback_path: Optional[str]):
332332
logger.warning(
333-
"Logs not available for RunAIExecutor based jobs. Please visit the cluster UI to view the logs."
333+
"Logs not available for NVIDIA Run:ai Executor based jobs. Please visit the cluster UI to view the logs."
334334
)
335335

336336
def cleanup(self, handle: str): ...

nemo_run/run/torchx_backend/schedulers/runai.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@
3030
from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer
3131
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin
3232

33-
# Local placeholder for storing RunAI job states
33+
# Local placeholder for storing NVIDIA Run:ai job states
3434
RUNAI_JOB_DIRS = os.path.join(get_nemorun_home(), ".runai_jobs.json")
3535

36-
# Example mapping from some RunAI statuses to the TorchX AppState
36+
# Example mapping from some NVIDIA Run:ai statuses to the TorchX AppState
3737
RUNAI_STATES: dict[RunAIState, AppState] = {
3838
RunAIState.CREATING: AppState.PENDING,
3939
RunAIState.INITIALIZING: AppState.SUBMITTED,
@@ -57,7 +57,7 @@
5757
@dataclass
5858
class RunAIRequest:
5959
"""
60-
Wrapper around the torchx AppDef and the RunAI executor.
60+
Wrapper around the torchx AppDef and the NVIDIA Run:ai executor.
6161
This object is used to store job submission info for the scheduler.
6262
"""
6363

@@ -87,7 +87,7 @@ def _submit_dryrun( # type: ignore
8787
cfg: Executor,
8888
) -> AppDryRunInfo[RunAIRequest]:
8989
assert isinstance(cfg, RunAIExecutor), (
90-
f"{cfg.__class__} not supported for RunAICloud scheduler."
90+
f"{cfg.__class__} not supported for NVIDIA Run:ai Cloud scheduler."
9191
)
9292
executor = cfg
9393

@@ -101,12 +101,12 @@ def _submit_dryrun( # type: ignore
101101
return AppDryRunInfo(
102102
RunAIRequest(app=app, executor=executor, cmd=cmd, name=role.name),
103103
# Minimal function to show the config, if any
104-
lambda req: f"RunAI job for app: {req.app.name}, cmd: {' '.join(cmd)}, executor: {executor}",
104+
lambda req: f"NVIDIA Run:ai job for app: {req.app.name}, cmd: {' '.join(cmd)}, executor: {executor}",
105105
)
106106

107107
def schedule(self, dryrun_info: AppDryRunInfo[RunAIRequest]) -> str:
108108
"""
109-
Launches a job on RunAI using the RunAIExecutor. Returns an app_id
109+
Launches a job on NVIDIA Run:ai using the RunAIExecutor. Returns an app_id
110110
used by TorchX for subsequent queries/cancellations.
111111
"""
112112
req = dryrun_info.request
@@ -119,7 +119,7 @@ def schedule(self, dryrun_info: AppDryRunInfo[RunAIRequest]) -> str:
119119
# We'll call it without additional parameters here.
120120
job_id, status = executor.launch(name=req.name, cmd=req.cmd)
121121
if not job_id:
122-
raise RuntimeError("Failed scheduling run on RunAI: no job_id returned")
122+
raise RuntimeError("Failed scheduling run on NVIDIA Run:ai: no job_id returned")
123123

124124
# Example app_id format:
125125
# <experiment_id>___<role-name>___<job_id>
@@ -176,7 +176,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
176176

177177
def _cancel_existing(self, app_id: str) -> None:
178178
"""
179-
Cancels the job by calling the RunAIExecutor's cancel method.
179+
Cancels the job by calling the NVIDIA Run:ai Executor's cancel method.
180180
"""
181181
stored_data = _get_job_dirs()
182182
job_info = stored_data.get(app_id)

0 commit comments

Comments
 (0)