Skip to content

Commit 87fa057

Browse files
Update NemoRL on GKE scripts. (#2070)
1 parent fb3c0d1 commit 87fa057

6 files changed

Lines changed: 14 additions & 12 deletions

File tree

ai-ml/nemo-rl-on-gke/nemoRL/gemma3-27b-it/gemma3-27b-gsm8k.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ export WANDB_API_KEY=$WANDB_API_KEY
4646
export HF_TOKEN=$HF_TOKEN
4747
export HF_HOME=/opt/nemo-rl/
4848
49-
###-----Example to launch Gemma3-27B on 3 nodes (24 GPUs)----------
49+
###-----Example to launch Gemma3-27B on 2 nodes (16 GPUs)----------
5050
uv run python examples/run_grpo_math.py \
5151
--config examples/configs/recipes/llm/grpo-gemma3-27b-it-8n4g-fsdp2tp4-actckpt-long.yaml \
5252
cluster.num_nodes=2 \
5353
cluster.gpus_per_node=8 \
54-
grpo.max_num_steps=300 \
54+
grpo.max_num_steps=10 \
5555
checkpointing.checkpoint_dir=/data/nemo_rl_gemma3_27b_3_17 \
5656
data.dataset_name=ResponseDataset \
5757
+data.train_data_path=openai/gsm8k \
@@ -65,7 +65,7 @@ uv run python examples/run_grpo_math.py \
6565
logger.wandb_enabled=True \
6666
logger.wandb.name='nemo_rl_gemma3_27b_3_17' \
6767
grpo.num_prompts_per_step=16 \
68-
grpo.num_generations_per_prompt=64 \
68+
grpo.num_generations_per_prompt=32 \
6969
policy.generation.colocated.enabled=False \
7070
policy.generation.colocated.resources.num_nodes=1 \
7171
policy.generation.colocated.resources.gpus_per_node=8 \

ai-ml/nemo-rl-on-gke/nemoRL/lustre/lustre-pv.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ metadata:
1919
spec:
2020
storageClassName: lustre-rwx-500mbps-per-tib
2121
capacity:
22-
storage: 9000Gi
22+
storage: 18000Gi
2323
accessModes:
2424
- ReadWriteMany
2525
persistentVolumeReclaimPolicy: Retain
@@ -29,7 +29,7 @@ spec:
2929
name: lustre-pvc
3030
csi:
3131
driver: lustre.csi.storage.gke.io
32-
volumeHandle: "northam-ce-mlai-tpu/asia-northeast1-b/pmotgi-lustre"
32+
volumeHandle: "${PROJECT_ID}/${NODE_ZONE}/${LUSTRE_NAME}"
3333
volumeAttributes:
34-
ip: 10.247.224.2
35-
filesystem: lustrefs
34+
ip: ${LUSTRE_IP}
35+
filesystem: lustrefs

ai-ml/nemo-rl-on-gke/nemoRL/lustre/lustre-pvc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@ spec:
2323
volumeName: lustre-pv
2424
resources:
2525
requests:
26-
storage: 1000Gi
26+
storage: 18000Gi

ai-ml/nemo-rl-on-gke/nemoRL/templates/fluent-bit-config.yaml.j2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{# Copyright 2026 Google LLC
1+
# Copyright 2026 Google LLC
22

33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -10,7 +10,7 @@
1010
# distributed under the License is distributed on an "AS IS" BASIS,
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
13-
# limitations under the License. #}
13+
# limitations under the License.
1414

1515
{{- if .Values.configMap.fluentbit }}
1616
apiVersion: v1

ai-ml/nemo-rl-on-gke/nemoRL/templates/raycluster-cluster.yaml.j2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{# Copyright 2026 Google LLC
1+
# Copyright 2026 Google LLC
22

33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -10,7 +10,7 @@
1010
# distributed under the License is distributed on an "AS IS" BASIS,
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
13-
# limitations under the License. #}
13+
# limitations under the License.
1414

1515
apiVersion: ray.io/v1
1616
kind: RayCluster

ai-ml/nemo-rl-on-gke/nemoRL/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ head:
6060
containerEnv:
6161
- name: RAY_GROUP
6262
value: "head"
63+
nodeSelector:
64+
cloud.google.com/gke-nodepool: default-pool
6365
resources:
6466
limits:
6567
cpu: "64"

0 commit comments

Comments
 (0)