Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions experimental/ssh/qwen3_1_7b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config experimental/ssh/qwen3_1_7b.yaml

# Global configuration
group_size: 8
local_batch_size: 16 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-1.7B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.generator.num_replicas} # Recommended to set equal to generator.num_replicas

provisioner:
launcher: ssh
monarch_port: 22222
ssh_hostfile: /job/hostfile
colocate: [ref_model, trainer]

# Observability configuration
metric_logging:
# wandb:
# project: torch_forge_grpo_gsm8k
#   group: qwen3_1.7b
# name: ssh_mesh
# logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
console:
logging_mode: global_reduce

# Dataset configuration
dataset:
path: "openai/gsm8k"
revision: "main"
data_split: "train"
streaming: true
model: ${model}

# Generator configuration
generator:
engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
model: ${model}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: true # ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
temperature: 1.0
top_p: 1.0

# Trainer configuration
trainer:
model:
name: qwen3
flavor: 1.7B
hf_assets_path: hf://${model}
optimizer:
name: AdamW
lr: 1e-5
eps: 1e-8
lr_scheduler:
warmup_steps: 1
training:
local_batch_size: ${local_batch_size}
seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
max_norm: 1.0
steps: 10 # 1000000
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 2
data_parallel_shard_degree: 1
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
disable_loss_parallel: true
checkpoint:
enable: true
folder: /data-fast/torchforge/${trainer.model.flavor}
initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
last_save_in_hf: true
interval: 500
async_mode: "disabled"
activation_checkpoint:
mode: selective
selective_ac_option: op

# Replay buffer configuration
replay_buffer:
batch_size: ${local_batch_size}
max_policy_age: ${off_by_n}
# This should match the dp_size of TorchTitan.
# Here it resolves to 2 via interpolation, because the trainer uses
# 2 GPUs with data_parallel_replicate_degree: 2 (pure replication, no sharding).
dp_size: ${trainer.parallelism.data_parallel_replicate_degree} # Must equal trainer DP degree

# Reference model configuration
ref_model:
model:
name: qwen3
flavor: 1.7B
hf_assets_path: hf://${model}
training:
seq_len: ${trainer.training.seq_len}
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
checkpoint:
initial_load_path: hf://${model}
initial_load_in_hf: true

# All resource allocations
services:
generator:
procs: ${generator.engine_args.tensor_parallel_size}
num_replicas: 2
mesh_name: generator
with_gpus: true
hosts: 1

ref_model:
procs: 1
num_replicas: 2
with_gpus: true
mesh_name: ref_model
hosts: 1
reward_actor:
procs: 1
num_replicas: 1
with_gpus: false
mesh_name: reward_actor

actors:
dataset:
procs: 1
with_gpus: false
mesh_name: dataset
trainer:
procs: 2
with_gpus: true
mesh_name: trainer
hosts: 1
replay_buffer:
procs: 1
with_gpus: false
mesh_name: replay_buffer
compute_advantages:
procs: 1
with_gpus: false
mesh_name: compute_advantages
163 changes: 163 additions & 0 deletions experimental/ssh/qwen3_8b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config experimental/ssh/qwen3_8b.yaml

# Global configuration
group_size: 16
local_batch_size: 4 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-8B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.generator.num_replicas} # Recommended to set equal to generator.num_replicas

provisioner:
launcher: ssh
monarch_port: 22222
ssh_hostfile: /job/hostfile
colocate: [ref_model, trainer]

# Observability configuration
metric_logging:
# wandb:
# project: torch_forge_grpo_gsm8k
# group: qwen3_8b
# name: ssh_mesh
# logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
console:
logging_mode: global_reduce

# Dataset configuration
dataset:
path: "openai/gsm8k"
revision: "main"
data_split: "train"
streaming: true
model: ${model}

# Generator configuration
generator:
engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
model: ${model}
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: true # ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
temperature: 1.0
top_p: 1.0

# Trainer configuration
trainer:
model:
name: qwen3
flavor: 8B
hf_assets_path: hf://${model}
optimizer:
name: AdamW
lr: 1e-5
eps: 1e-8
lr_scheduler:
warmup_steps: 1
training:
local_batch_size: ${local_batch_size}
seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
max_norm: 1.0
steps: 10
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 4
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
disable_loss_parallel: true
checkpoint:
enable: true
folder: /data-fast/torchforge/${trainer.model.flavor}
initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
last_save_in_hf: true
interval: 500
async_mode: "disabled"
activation_checkpoint:
mode: selective
selective_ac_option: op

# Replay buffer configuration
replay_buffer:
batch_size: ${local_batch_size}
max_policy_age: ${off_by_n}
# This should match the dp_size of TorchTitan.
# Here it resolves to 4 via interpolation, because we've set
# 4 GPUs for the trainer and we're using full FSDP (shard degree 4).
dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

# Reference model configuration
ref_model:
model:
name: qwen3
flavor: 8B
hf_assets_path: hf://${model}
training:
seq_len: ${trainer.training.seq_len}
dtype: bfloat16
gc_freq: 1
compile:
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 2
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
context_parallel_degree: 1
expert_parallel_degree: 1
checkpoint:
initial_load_path: hf://${model}
initial_load_in_hf: true

# All resource allocations
services:
generator:
procs: ${generator.engine_args.tensor_parallel_size}
num_replicas: 2
hosts: 1
with_gpus: true
mesh_name: generator
ref_model:
procs: 2
num_replicas: 2
hosts: 1
with_gpus: true
mesh_name: ref_model
reward_actor:
procs: 1
num_replicas: 1
with_gpus: false
mesh_name: reward_actor

actors:
dataset:
procs: 1
with_gpus: false
mesh_name: dataset
trainer:
procs: 4
hosts: 1
with_gpus: true
mesh_name: trainer
replay_buffer:
procs: 1
with_gpus: false
mesh_name: replay_buffer
compute_advantages:
procs: 1
with_gpus: false
mesh_name: compute_advantages
12 changes: 12 additions & 0 deletions src/forge/controller/launchers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Public API of the launcher subpackage.

Exposes the ``BaseLauncher`` interface and the ``get_launcher`` factory that
selects a concrete launcher implementation from a ``LauncherConfig``.
"""
from .base import BaseLauncher
from .launcher import get_launcher

__all__ = [
    "BaseLauncher",
    "get_launcher",
]
File renamed without changes.
39 changes: 39 additions & 0 deletions src/forge/controller/launchers/launcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Launcher factory"""


from forge.types import Launcher, LauncherConfig

from .base import BaseLauncher
from .slurm_launcher import Slurmlauncher
from .ssh_launcher import SSHLauncher

# Well-known keys for launcher-related config dictionaries.
# NOTE(review): neither constant is referenced in this module — presumably
# consumed by callers importing them; confirm before removing.
JOB_NAME_KEY = "job_name"
LAUNCHER_KEY = "launcher"


def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None:
    """Build the launcher implementation selected by ``cfg.launcher``.

    Args:
        cfg: Launcher configuration. If ``None`` (or omitted), no launcher
            is created.

    Returns:
        A concrete ``BaseLauncher`` for the configured backend, or ``None``
        when no configuration was provided.

    Raises:
        ValueError: If the SSH configuration fails validation, if MAST
            support is not installed, or if ``cfg.launcher`` names an
            unsupported backend.
    """
    # Explicit None check: a falsy-but-present config object should not be
    # silently ignored (the previous `if not cfg` would have dropped it).
    if cfg is None:
        return None
    if cfg.launcher == Launcher.SLURM:
        return Slurmlauncher(cfg)
    if cfg.launcher == Launcher.SSH:
        # SSH has extra prerequisites (hostfile, port, ...) — validate up
        # front so misconfiguration fails with a clear message.
        success, error_msg = SSHLauncher.validate_configuration(cfg)
        if not success:
            raise ValueError(error_msg)
        return SSHLauncher(cfg)
    if cfg.launcher == Launcher.MAST:
        # MAST support ships separately; import lazily so the other
        # launchers work without it. Keep the try body minimal so only a
        # genuinely missing package is reported as unavailable.
        try:
            from forge.fb.mast_launcher import MastLauncher
        except ImportError as err:
            raise ValueError("MAST is not available, cannot launch MAST jobs.") from err
        return MastLauncher(cfg, detached=False)
    raise ValueError(f"Unsupported config provided, got {cfg}")
Loading
Loading