Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions experimental/ssh/qwen3_1_7b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config experimental/ssh/qwen3_1_7b.yaml

# Global configuration
group_size: 8
local_batch_size: 16 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-1.7B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.generator.num_replicas} # Recommended to set equal to generator.num_replicas

provisioner:
launcher: ssh
monarch_port: 22222
ssh_hostfile: /job/hostfile
colocate: [ref_model, trainer]

# Observability configuration
metric_logging:
# wandb:
# project: torch_forge_grpo_gsm8k
#   group: qwen3_1.7b
# name: ssh_mesh
# logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
console:
logging_mode: global_reduce

# Dataset configuration
dataset:
path: "openai/gsm8k"
revision: "main"
data_split: "train"
streaming: true
model: ${model}

# Generator configuration
generator:
engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
model: ${model}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: true # ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
temperature: 1.0
top_p: 1.0

# Trainer configuration
trainer:
model:
name: qwen3
flavor: 1.7B
hf_assets_path: hf://${model}
optimizer:
name: AdamW
lr: 1e-5
eps: 1e-8
lr_scheduler:
warmup_steps: 1
training:
local_batch_size: ${local_batch_size}
seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
max_norm: 1.0
steps: 10 # 1000000
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 2
data_parallel_shard_degree: 1
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
disable_loss_parallel: true
checkpoint:
enable: true
folder: /data-fast/torchforge/${trainer.model.flavor}
initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
last_save_in_hf: true
interval: 500
async_mode: "disabled"
activation_checkpoint:
mode: selective
selective_ac_option: op

# Replay buffer configuration
replay_buffer:
batch_size: ${local_batch_size}
max_policy_age: ${off_by_n}
# This should match the dp_size of TorchTitan.
# Here it resolves to 2 via interpolation, because the trainer uses
# 2 GPUs with data_parallel_replicate_degree: 2 (pure replication, no sharding).
dp_size: ${trainer.parallelism.data_parallel_replicate_degree} # Must equal trainer DP degree

# Reference model configuration
ref_model:
model:
name: qwen3
flavor: 1.7B
hf_assets_path: hf://${model}
training:
seq_len: ${trainer.training.seq_len}
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
checkpoint:
initial_load_path: hf://${model}
initial_load_in_hf: true

# All resource allocations
services:
generator:
procs: ${generator.engine_args.tensor_parallel_size}
num_replicas: 2
mesh_name: generator
with_gpus: true
hosts: 1

ref_model:
procs: 1
num_replicas: 2
with_gpus: true
mesh_name: ref_model
hosts: 1
reward_actor:
procs: 1
num_replicas: 1
with_gpus: false
mesh_name: reward_actor

actors:
dataset:
procs: 1
with_gpus: false
mesh_name: dataset
trainer:
procs: 2
with_gpus: true
mesh_name: trainer
hosts: 1
replay_buffer:
procs: 1
with_gpus: false
mesh_name: replay_buffer
compute_advantages:
procs: 1
with_gpus: false
mesh_name: compute_advantages
163 changes: 163 additions & 0 deletions experimental/ssh/qwen3_8b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config experimental/ssh/qwen3_8b.yaml

# Global configuration
group_size: 16
local_batch_size: 4 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-8B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.generator.num_replicas} # Recommended to set equal to generator.num_replicas

provisioner:
launcher: ssh
monarch_port: 22222
ssh_hostfile: /job/hostfile
colocate: [ref_model, trainer]

# Observability configuration
metric_logging:
# wandb:
# project: torch_forge_grpo_gsm8k
# group: qwen3_8b
# name: ssh_mesh
# logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
console:
logging_mode: global_reduce

# Dataset configuration
dataset:
path: "openai/gsm8k"
revision: "main"
data_split: "train"
streaming: true
model: ${model}

# Generator configuration
generator:
engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
model: ${model}
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: true # ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
temperature: 1.0
top_p: 1.0

# Trainer configuration
trainer:
model:
name: qwen3
flavor: 8B
hf_assets_path: hf://${model}
optimizer:
name: AdamW
lr: 1e-5
eps: 1e-8
lr_scheduler:
warmup_steps: 1
training:
local_batch_size: ${local_batch_size}
seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
max_norm: 1.0
steps: 10
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 4
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
disable_loss_parallel: true
checkpoint:
enable: true
folder: /data-fast/torchforge/${trainer.model.flavor}
initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
last_save_in_hf: true
interval: 500
async_mode: "disabled"
activation_checkpoint:
mode: selective
selective_ac_option: op

# Replay buffer configuration
replay_buffer:
batch_size: ${local_batch_size}
max_policy_age: ${off_by_n}
# This should match the dp_size of TorchTitan.
# Here it resolves to 4 via interpolation, because we've set
# 4 GPUs for the trainer and we're using full FSDP (shard degree 4).
dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

# Reference model configuration
ref_model:
model:
name: qwen3
flavor: 8B
hf_assets_path: hf://${model}
training:
seq_len: ${trainer.training.seq_len}
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 2
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
checkpoint:
initial_load_path: hf://${model}
initial_load_in_hf: true

# All resource allocations
services:
generator:
procs: ${generator.engine_args.tensor_parallel_size}
num_replicas: 2
hosts: 1
with_gpus: true
mesh_name: generator
ref_model:
procs: 2
num_replicas: 2
hosts: 1
with_gpus: true
mesh_name: ref_model
reward_actor:
procs: 1
num_replicas: 1
with_gpus: false
mesh_name: reward_actor

actors:
dataset:
procs: 1
with_gpus: false
mesh_name: dataset
trainer:
procs: 4
hosts: 1
with_gpus: true
mesh_name: trainer
replay_buffer:
procs: 1
with_gpus: false
mesh_name: replay_buffer
compute_advantages:
procs: 1
with_gpus: false
mesh_name: compute_advantages
12 changes: 12 additions & 0 deletions src/forge/controller/launchers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Public API of the launcher subpackage.

Exposes the ``BaseLauncher`` interface and the ``get_launcher`` factory that
selects a concrete launcher implementation from a ``LauncherConfig``.
"""
from .base import BaseLauncher
from .launcher import get_launcher

__all__ = [
    "BaseLauncher",
    "get_launcher",
]
File renamed without changes.
39 changes: 39 additions & 0 deletions src/forge/controller/launchers/launcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Launcher factory"""


from forge.types import Launcher, LauncherConfig

from .base import BaseLauncher
from .slurm_launcher import Slurmlauncher
from .ssh_launcher import SSHLauncher

# Well-known keys for launcher-related config dictionaries.
# NOTE(review): neither constant is referenced in this module — presumably
# consumed by callers importing them; confirm before removing.
JOB_NAME_KEY = "job_name"
LAUNCHER_KEY = "launcher"


def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None:
    """Build the launcher implementation selected by ``cfg.launcher``.

    Args:
        cfg: Launcher configuration. If ``None`` (or omitted), no launcher
            is created.

    Returns:
        A concrete ``BaseLauncher`` for the configured backend, or ``None``
        when no configuration was provided.

    Raises:
        ValueError: If the SSH configuration fails validation, if MAST
            support is not installed, or if ``cfg.launcher`` names an
            unsupported backend.
    """
    # Explicit None check: a falsy-but-present config object should not be
    # silently ignored (the previous `if not cfg` would have dropped it).
    if cfg is None:
        return None
    if cfg.launcher == Launcher.SLURM:
        return Slurmlauncher(cfg)
    if cfg.launcher == Launcher.SSH:
        # SSH has extra prerequisites (hostfile, port, ...) — validate up
        # front so misconfiguration fails with a clear message.
        success, error_msg = SSHLauncher.validate_configuration(cfg)
        if not success:
            raise ValueError(error_msg)
        return SSHLauncher(cfg)
    if cfg.launcher == Launcher.MAST:
        # MAST support ships separately; import lazily so the other
        # launchers work without it. Keep the try body minimal so only a
        # genuinely missing package is reported as unavailable.
        try:
            from forge.fb.mast_launcher import MastLauncher
        except ImportError as err:
            raise ValueError("MAST is not available, cannot launch MAST jobs.") from err
        return MastLauncher(cfg, detached=False)
    raise ValueError(f"Unsupported config provided, got {cfg}")
Loading
Loading