File tree Expand file tree Collapse file tree
ai-ml/nemo-rl-on-gke/nemoRL Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -46,12 +46,12 @@ export WANDB_API_KEY=$WANDB_API_KEY
4646export HF_TOKEN=$HF_TOKEN
4747export HF_HOME=/opt/nemo-rl/
4848
49- ###-----Example to launch Gemma3-27B on 3 nodes (24 GPUs)----------
49+ ###-----Example to launch Gemma3-27B on 2 nodes (16 GPUs)----------
5050uv run python examples/run_grpo_math.py \
5151 --config examples/configs/recipes/llm/grpo-gemma3-27b-it-8n4g-fsdp2tp4-actckpt-long.yaml \
5252 cluster.num_nodes=2 \
5353 cluster.gpus_per_node=8 \
54- grpo.max_num_steps=300 \
54+ grpo.max_num_steps=10 \
5555 checkpointing.checkpoint_dir=/data/nemo_rl_gemma3_27b_3_17 \
5656 data.dataset_name=ResponseDataset \
5757 +data.train_data_path=openai/gsm8k \
@@ -65,7 +65,7 @@ uv run python examples/run_grpo_math.py \
6565 logger.wandb_enabled=True \
6666 logger.wandb.name='nemo_rl_gemma3_27b_3_17' \
6767 grpo.num_prompts_per_step=16 \
68- grpo.num_generations_per_prompt=64 \
68+ grpo.num_generations_per_prompt=32 \
6969 policy.generation.colocated.enabled=False \
7070 policy.generation.colocated.resources.num_nodes=1 \
7171 policy.generation.colocated.resources.gpus_per_node=8 \
Original file line number Diff line number Diff line change @@ -19,7 +19,7 @@ metadata:
1919spec :
2020 storageClassName : lustre-rwx-500mbps-per-tib
2121 capacity :
22- storage : 9000Gi
22+ storage : 18000Gi
2323 accessModes :
2424 - ReadWriteMany
2525 persistentVolumeReclaimPolicy : Retain
2929 name : lustre-pvc
3030 csi :
3131 driver : lustre.csi.storage.gke.io
32- volumeHandle : " northam-ce-mlai-tpu/asia-northeast1-b/pmotgi-lustre "
32+ volumeHandle : " ${PROJECT_ID}/${NODE_ZONE}/${LUSTRE_NAME} "
3333 volumeAttributes :
34- ip : 10.247.224.2
35- filesystem : lustrefs
34+ ip : ${LUSTRE_IP}
35+ filesystem : lustrefs
Original file line number Diff line number Diff line change 2323 volumeName : lustre-pv
2424 resources :
2525 requests :
26- storage : 1000Gi
26+ storage : 18000Gi
Original file line number Diff line number Diff line change 1- { # Copyright 2026 Google LLC
1+ # Copyright 2026 Google LLC
22
33# Licensed under the Apache License, Version 2.0 (the "License");
44# you may not use this file except in compliance with the License.
1010# distributed under the License is distributed on an "AS IS" BASIS,
1111# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212# See the License for the specific language governing permissions and
13- # limitations under the License. #}
13+ # limitations under the License.
1414
1515{{- if .Values.configMap.fluentbit }}
1616apiVersion: v1
Original file line number Diff line number Diff line change 1- { # Copyright 2026 Google LLC
1+ # Copyright 2026 Google LLC
22
33# Licensed under the Apache License, Version 2.0 (the "License");
44# you may not use this file except in compliance with the License.
1010# distributed under the License is distributed on an "AS IS" BASIS,
1111# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212# See the License for the specific language governing permissions and
13- # limitations under the License. #}
13+ # limitations under the License.
1414
1515apiVersion: ray.io/v1
1616kind: RayCluster
Original file line number Diff line number Diff line change 6060 containerEnv :
6161 - name : RAY_GROUP
6262 value : " head"
63+ nodeSelector :
64+ cloud.google.com/gke-nodepool : default-pool
6365 resources :
6466 limits :
6567 cpu : " 64"
You can’t perform that action at this time.
0 commit comments