90 commits
ddddf33
plan
ZhiyuLi-Nvidia May 1, 2026
8e03d85
plan: align Stage 4 with rl-arena/verl 1-hop pattern
ZhiyuLi-Nvidia May 2, 2026
9dc98d4
feat(data-plane): TransferQueue integration for GRPO with driver-side…
ZhiyuLi-Nvidia May 4, 2026
9f85f1a
refactor(data-plane): extract driver-side balanced packing into presh…
ZhiyuLi-Nvidia May 5, 2026
4a231a8
feat(data-plane): AsyncTrajectoryCollector writes rollouts to TQ when…
ZhiyuLi-Nvidia May 5, 2026
4024410
feat(data-plane): wire async-on-TQ end-to-end with driver-side balanc…
ZhiyuLi-Nvidia May 5, 2026
d42b607
fix(data-plane): preserve sample order and FLOPs semantics on @dp_dis…
ZhiyuLi-Nvidia May 5, 2026
6f91300
feat(data-plane): grpo_sync routes logprob/ref-logprob through @dp_di…
ZhiyuLi-Nvidia May 5, 2026
136bc46
refactor(data-plane): replace @dp_dispatch with TQPolicy subclass; ad…
ZhiyuLi-Nvidia May 5, 2026
a91f638
fix(data-plane): VLM extras, async fan-out, cleanup-on-failure
ZhiyuLi-Nvidia May 5, 2026
503273e
docs(data-plane): add API lifecycle doc with verl comparison
ZhiyuLi-Nvidia May 7, 2026
10cb0b8
feat(data-plane): sync 1-hop trajectory collector + per-sample key li…
ZhiyuLi-Nvidia May 7, 2026
37ac8c1
refactor(data-plane): extract make_actor_runtime_env, fix N² list copy
ZhiyuLi-Nvidia May 7, 2026
f0f804a
feat(data-plane): jagged tensors on TQ wire + naming/factory cleanup
ZhiyuLi-Nvidia May 7, 2026
4437381
refactor(data-plane): KVBatchMeta.subset/slice/concat methods
ZhiyuLi-Nvidia May 7, 2026
dcff72a
Mooncake cpu backend
ZhiyuLi-Nvidia May 7, 2026
a3087a7
Readability Refactor
ZhiyuLi-Nvidia May 8, 2026
21cba95
wip test mooncake
ZhiyuLi-Nvidia May 8, 2026
686041b
refactor(data-plane): drop dead set_wire_format/_PACK_JAGGED + adapte…
ZhiyuLi-Nvidia May 8, 2026
cb36841
refactor(ray.sub): drop NETWORK_INIT_CMDS — MC_TCP_BIND_ADDRESS suffices
ZhiyuLi-Nvidia May 8, 2026
8b50d2e
docs(data-plane): consolidate README; drop stale plan/verl refs
ZhiyuLi-Nvidia May 8, 2026
96605bd
feat(data-plane): non-tensor object support on TQ wire
ZhiyuLi-Nvidia May 8, 2026
8a2ecaf
feat(grpo-sync): equivalency fixes + content via TQ object column
ZhiyuLi-Nvidia May 9, 2026
fcd1c13
style: fix ruff lint errors and apply ruff format
ZhiyuLi-Nvidia May 9, 2026
45281d7
style: apply pre-commit auto-fixes (ruff)
ZhiyuLi-Nvidia May 9, 2026
6e7f80c
chore(pyrefly): whitelist all new data_plane files + fix type errors
ZhiyuLi-Nvidia May 9, 2026
8782aa1
remove unnecessary script
ZhiyuLi-Nvidia May 9, 2026
a374d49
feat(data-plane): decompose message_log at wire boundary
ZhiyuLi-Nvidia May 12, 2026
4231cd2
chore: regenerate uv.lock after rebase onto main
ZhiyuLi-Nvidia May 13, 2026
35f6b5d
refactor(data-plane): rename DataPlaneClient.get_meta → claim_meta
ZhiyuLi-Nvidia May 12, 2026
1f59951
docs(data-plane): tighten DataPlaneClient boundary docstring
ZhiyuLi-Nvidia May 12, 2026
e40fb19
fix(data-plane): treat DataPlaneConfig.enabled as required field
ZhiyuLi-Nvidia May 12, 2026
607d1f5
docs(data-plane): make build_data_plane_client docstring backend-agno…
ZhiyuLi-Nvidia May 12, 2026
77a5129
refactor(data-plane): promote codec imports to module top-level
ZhiyuLi-Nvidia May 12, 2026
5de226c
refactor(data-plane): rename driver_io → column_io
ZhiyuLi-Nvidia May 12, 2026
cffc921
refactor(data-plane): validate dp_world at TQPolicy config time
ZhiyuLi-Nvidia May 12, 2026
ba1282f
refactor(data-plane): centralize packing-meta keys in schema.py
ZhiyuLi-Nvidia May 13, 2026
5b58ca6
refactor(data-plane): drop redundant dp_world assert in shard_meta_fo…
ZhiyuLi-Nvidia May 13, 2026
9a045ec
refactor(data-plane): move DP_SEED_FIELDS to schema.py as DP_TRAIN_FI…
ZhiyuLi-Nvidia May 13, 2026
19a8336
fix(data-plane): reject empty meta in shard_meta_for_dp
ZhiyuLi-Nvidia May 13, 2026
b181104
refactor(data-plane): print_event → log_event via stdlib logging
ZhiyuLi-Nvidia May 13, 2026
7f4340f
style(data-plane): match repo logger naming convention
ZhiyuLi-Nvidia May 13, 2026
46ef02b
refactor(data-plane): convert DataPlaneStats to @dataclass
ZhiyuLi-Nvidia May 13, 2026
6f7c4bf
refactor(data-plane): type DataPlaneEvent as TypedDict
ZhiyuLi-Nvidia May 13, 2026
aaa9f98
refactor(data-plane): drop placeholder 0s from _run; make sizes kw-only
ZhiyuLi-Nvidia May 13, 2026
8c4d0f6
fix(data-plane): route check_consumption_status through _run
ZhiyuLi-Nvidia May 13, 2026
0282643
fix(data-plane): route close() through _run
ZhiyuLi-Nvidia May 13, 2026
ab56af3
perf(data-plane): single sync in to_nested_by_length
ZhiyuLi-Nvidia May 13, 2026
382386a
docs(data-plane): convert codec.py docstrings to Google style
ZhiyuLi-Nvidia May 13, 2026
cde0468
refactor(data-plane): centralize Layout type alias in schema.py
ZhiyuLi-Nvidia May 13, 2026
0458982
fix(data-plane): validate pad_to_multiple >= 1 in materialize
ZhiyuLi-Nvidia May 13, 2026
8aa016d
fix(data-plane): fail fast on empty local IP at Mooncake bootstrap
ZhiyuLi-Nvidia May 13, 2026
ba064f8
fix(data-plane): surface chmod failure when mooncake_master is not exec
ZhiyuLi-Nvidia May 13, 2026
d5f0065
refactor(data-plane): scope mooncake_cpu 1D workaround to TQDataPlane…
ZhiyuLi-Nvidia May 13, 2026
f008295
docs(data-plane): clarify TQ module vs client access convention
ZhiyuLi-Nvidia May 13, 2026
274efbd
docs(data-plane): note trust boundary at pack_object_array pickle site
ZhiyuLi-Nvidia May 13, 2026
46cacb4
refactor(data-plane): drop codec pickle, use TQ-native NonTensorStack
ZhiyuLi-Nvidia May 13, 2026
0e7dd77
refactor(data-plane): drop dead object-array codec helpers
ZhiyuLi-Nvidia May 13, 2026
ffecbe4
refactor(data-plane): centralize _meta_idx sentinel in schema.py
ZhiyuLi-Nvidia May 13, 2026
afb1c30
docs(data-plane): convert interfaces.py docstrings to Google style
ZhiyuLi-Nvidia May 13, 2026
671bf62
refactor(data-plane): align schema constant names with their values
ZhiyuLi-Nvidia May 13, 2026
f14d389
docs(data-plane): tighten preshard.py docstring to Google style
ZhiyuLi-Nvidia May 13, 2026
8177270
docs(data-plane): convert column_io.py docstrings to Google style
ZhiyuLi-Nvidia May 13, 2026
ef71cfd
docs(data-plane): convert factory.py docstring to Google style
ZhiyuLi-Nvidia May 13, 2026
f67e40e
docs(data-plane): add Args/Returns blocks to observability.py docstrings
ZhiyuLi-Nvidia May 13, 2026
c3b0a62
docs(data-plane): tighten transfer_queue.py docstrings, add Args/Retu…
ZhiyuLi-Nvidia May 13, 2026
fc1c0ff
docs(data-plane): add Args/Returns to worker_mixin.py docstrings
ZhiyuLi-Nvidia May 13, 2026
37f1f06
docs(data-plane): add Args/Returns blocks to tq_policy.py docstrings
ZhiyuLi-Nvidia May 13, 2026
6cbb921
docs(data-plane): convert sync_rollout_actor.py docstrings to Google …
ZhiyuLi-Nvidia May 13, 2026
2c0d4ae
docs(data-plane): add Args/Returns to grpo_sync.py dynamic-sampling h…
ZhiyuLi-Nvidia May 13, 2026
bd71482
refactor(data-plane): drop _to_wire's redundant promote_1d kwarg
ZhiyuLi-Nvidia May 13, 2026
c3ac223
fix(data-plane): survive TQ simple-backend NonTensorData wire-strip
ZhiyuLi-Nvidia May 14, 2026
420bbe9
build(data-plane): pin mooncake-transfer-engine-cuda13 wheel for cu13…
ZhiyuLi-Nvidia May 14, 2026
30bbe91
chore: ruff auto-fix and ruff-format pass
ZhiyuLi-Nvidia May 14, 2026
b48f21c
chore(pyrefly): rename driver_io → column_io in whitelist
ZhiyuLi-Nvidia May 14, 2026
0523f4c
chore(pyrefly): silence 5 latent type errors with targeted ignore com…
ZhiyuLi-Nvidia May 14, 2026
dec8339
chore(pyrefly): whitelist nemo_rl/data_plane/schema.py
ZhiyuLi-Nvidia May 14, 2026
4c035c7
fix(data-plane): preserve object-column identity through TQ wire
ZhiyuLi-Nvidia May 14, 2026
3ee70c4
fix(data-plane): gate TQ write-back on TP×CP×PP leader to avoid dupli…
ZhiyuLi-Nvidia May 14, 2026
209d266
chore: ruff auto-fix and D205 docstring fixes
ZhiyuLi-Nvidia May 14, 2026
212ce55
refactor(data-plane): drop async-grpo TQ scaffolding from sync PR
ZhiyuLi-Nvidia May 14, 2026
e3661b0
refactor(data-plane): consolidate producer codec, caller mints keys
ZhiyuLi-Nvidia May 14, 2026
0673a46
test(data-plane): align codec tests with current contract
ZhiyuLi-Nvidia May 14, 2026
200645a
refactor(grpo_sync): drop dead batch_cache; make TQPolicy attrs public
ZhiyuLi-Nvidia May 14, 2026
b38b473
refactor(data-plane): extract calibration field filter into named sch…
ZhiyuLi-Nvidia May 15, 2026
416df7c
refactor(data-plane): make kv_batch_get(select_fields) required
ZhiyuLi-Nvidia May 15, 2026
eecbcc4
refactor(sync-rollout-actor): remove unused wrappers; document full l…
ZhiyuLi-Nvidia May 15, 2026
25dcf41
test(data-plane): move data_plane unit tests under tests/unit/ for CI…
ZhiyuLi-Nvidia May 15, 2026
fff691d
test(data-plane): apply ruff --fix and import-sort to data_plane unit…
ZhiyuLi-Nvidia May 15, 2026
c069f92
docs: fix broken nemo-gym Core Components link
ZhiyuLi-Nvidia May 15, 2026
2 changes: 1 addition & 1 deletion docs/design-docs/nemo-gym-integration.md
@@ -181,7 +181,7 @@ sequenceDiagram
GRPO->>Policy: Compute loss and train
```

> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/latest/about/concepts/core-components.html)):
> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/about/core-components)):
> - **Agent Server**: Orchestrates the rollout loop
> - **Model Server**: HTTP proxy to vLLM; translates Responses API ↔ Chat Completions
> - **Resource Server**: Provides tools and rewards
14 changes: 14 additions & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -391,3 +391,17 @@ logger:
cluster:
gpus_per_node: 1
num_nodes: 1

# TransferQueue-mediated data plane for sync GRPO.
# Off by default — the legacy grpo_train trainer never engages this.
# Flip enabled=true and run grpo_train_sync to use TQ-mediated bulk
# transfer between rollout and train. See nemo_rl/data_plane/README.md.
data_plane:
enabled: false
impl: transfer_queue
# backend: "simple" # NotRequired: TQ storage backend ('simple' or 'mooncake_cpu')
# storage_capacity: 1000000 # NotRequired
# num_storage_units: 2 # NotRequired
# claim_meta_poll_interval_s: 0.5 # NotRequired: blocking-claim poll cadence
# observability: # NotRequired
# enabled: false
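
The block above ships disabled. For reference, a minimal sketch of the enabled variant, written as the dict that `run_grpo.py` reads via `config.get("data_plane")` — the keys mirror the commented YAML above, but the concrete values here are illustrative assumptions, not recommendations:

```python
# Hypothetical enabled data_plane config, as seen by the launcher.
# Only `enabled` and `impl` are uncommented in the shipped YAML; the
# optional keys below are taken from its NotRequired comments.
dp_cfg = {
    "enabled": True,
    "impl": "transfer_queue",
    "backend": "mooncake_cpu",          # or "simple"
    "storage_capacity": 1_000_000,
    "num_storage_units": 2,
    "claim_meta_poll_interval_s": 0.5,  # blocking-claim poll cadence
    "observability": {"enabled": False},
}
```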
42 changes: 37 additions & 5 deletions examples/run_grpo.py
@@ -99,6 +99,20 @@ def main() -> None:
val_task_to_env,
) = setup_response_data(tokenizer, config["data"], config["env"])

# Pick the policy factory at the launcher level so the legacy trainer
# stays data-plane-agnostic (architectural invariant — see
# tests/data_plane/unit/test_architecture_invariants.py).
_dp_cfg = config.get("data_plane") or {}
if _dp_cfg.get("enabled", False):
from nemo_rl.models.policy.tq_policy import TQPolicy

def _make_policy(**kwargs):
return TQPolicy(**kwargs, dp_cfg=_dp_cfg)

_policy_factory = _make_policy
else:
_policy_factory = None # setup() defaults to plain Policy

(
policy,
policy_generation,
@@ -110,7 +124,13 @@ def main() -> None:
checkpointer,
grpo_state,
master_config,
) = setup(config, tokenizer, dataset, val_dataset)
) = setup(
config,
tokenizer,
dataset,
val_dataset,
policy_factory=_policy_factory,
)

# Check if async mode is enabled
if "async_grpo" in config["grpo"] and config["grpo"]["async_grpo"]["enabled"]:
@@ -164,10 +184,22 @@
max_trajectory_age_steps=async_config["max_trajectory_age_steps"],
)
else:
print("🚀 Running synchronous GRPO training")

# Run standard GRPO training
grpo_train(
# Two parallel synchronous trainers (verl-style — main_ppo.py vs
# main_ppo_sync.py). data_plane.enabled selects which one runs:
# the legacy in-memory path or the TransferQueue-mediated fork.
# Same model, same data, same seed → diff the wandb runs to
# validate parity.
dp_cfg = master_config.get("data_plane", {})
if dp_cfg.get("enabled", False):
from nemo_rl.algorithms.grpo_sync import grpo_train_sync

print("🚀 Running synchronous GRPO training (TransferQueue)")
trainer = grpo_train_sync
else:
print("🚀 Running synchronous GRPO training (legacy)")
trainer = grpo_train

trainer(
policy,
policy_generation,
dataloader,
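
The trainer-selection comment above proposes validating parity by diffing the wandb runs of the two trainers. A sketch of what that comparison could look like, assuming both runs log the same metric keys — the run paths and metric names below are hypothetical placeholders, not names from this PR:

```python
# Hypothetical parity check between the legacy and TQ-mediated runs.
# Run paths ("entity/project/run_id") and metric keys are placeholders.
import wandb

api = wandb.Api()
legacy = api.run("my-entity/nemo-rl/legacy_run_id")
tq = api.run("my-entity/nemo-rl/tq_run_id")

for key in ["train/loss", "train/reward"]:
    a = legacy.history(keys=[key])[key]
    b = tq.history(keys=[key])[key]
    # Exact bitwise parity is unlikely across two data paths; a small
    # per-step numeric tolerance is the realistic bar.
    print(f"{key}: max |legacy - tq| = {(a - b).abs().max():.3e}")
```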
14 changes: 11 additions & 3 deletions nemo_rl/algorithms/grpo.py
@@ -17,7 +17,7 @@
import warnings
from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext
from typing import Any, NotRequired, Optional, TypedDict, TypeVar, cast
from typing import Any, Callable, NotRequired, Optional, TypedDict, TypeVar, cast

import numpy as np
import ray
@@ -58,6 +58,7 @@
get_keys_from_message_log,
)
from nemo_rl.data.utils import extract_necessary_env_names
from nemo_rl.data_plane.interfaces import DataPlaneConfig
from nemo_rl.distributed.batched_data_dict import BatchedDataDict
from nemo_rl.distributed.ray_actor_environment_registry import get_actor_python_env
from nemo_rl.distributed.virtual_cluster import ClusterConfig, RayVirtualCluster
@@ -206,6 +207,7 @@ class MasterConfig(TypedDict):
logger: GRPOLoggerConfig
cluster: ClusterConfig
checkpointing: CheckpointingConfig
data_plane: NotRequired[DataPlaneConfig]


# ===============================================================================
@@ -219,6 +221,7 @@ def setup(
dataset: AllTaskProcessedDataset | dict[str, AllTaskProcessedDataset],
val_dataset: Optional[AllTaskProcessedDataset],
processor: Optional[AutoProcessor] = None,
policy_factory: Optional[Callable[..., ColocatablePolicyInterface]] = None,
) -> tuple[
ColocatablePolicyInterface,
Optional[GenerationInterface],
@@ -582,10 +585,15 @@ def init_train_dataloader(dataset, suffix: str = ""):
"(reference model is not loaded)."
)

# Caller-supplied factory lets the sync trainer swap in a TQ-mediated
# Policy subclass without this shared setup needing to know the data
# plane exists. Default is the plain Policy class — legacy behavior.
_make_policy = policy_factory if policy_factory is not None else Policy

def init_policy():
"""Initialize policy training workers."""
t0 = time.perf_counter()
p = Policy(
p = _make_policy(
cluster=train_cluster,
config=policy_config,
tokenizer=tokenizer,
@@ -2554,7 +2562,7 @@ def async_grpo_train(
)

replay_buffer = ReplayBuffer.options(runtime_env=_replay_runtime_env).remote(
max_size=optimal_buffer_size
max_size=optimal_buffer_size,
)

_tc_py_exec = get_actor_python_env(
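
Distilled, the seam added across the two diffs above is an optional-factory pattern: the launcher decides which Policy subclass to build, and the shared `setup()` falls back to the plain class so it never learns the data plane exists. A self-contained sketch under those assumptions — class bodies are stand-ins and the constructor kwargs are illustrative, not the real signatures:

```python
from typing import Any, Callable, Optional


class Policy:
    """Stand-in for the legacy in-memory policy."""

    def __init__(self, **kwargs: Any) -> None:
        self.kwargs = kwargs


class TQPolicy(Policy):
    """Stand-in for the TransferQueue-mediated subclass from this PR."""

    def __init__(self, *, dp_cfg: dict[str, Any], **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.dp_cfg = dp_cfg


def setup(policy_factory: Optional[Callable[..., Policy]] = None) -> Policy:
    # Shared setup stays data-plane-agnostic: the factory defaults to the
    # plain class, so legacy behavior is untouched when none is supplied.
    make_policy = policy_factory if policy_factory is not None else Policy
    return make_policy()


# Launcher-level gate, mirroring run_grpo.py above.
dp_cfg = {"enabled": True}
factory = (lambda **kw: TQPolicy(**kw, dp_cfg=dp_cfg)) if dp_cfg["enabled"] else None
policy = setup(policy_factory=factory)
```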