NVIDIA-NeMo
diff --git a/examples/nemo_gym/run_grpo_nemo_gym.py b/examples/nemo_gym/run_grpo_nemo_gym.py
Lines changed: 3 additions & 0 deletions
diff --git a/examples/run_grpo.py b/examples/run_grpo.py
Lines changed: 3 additions & 0 deletions
diff --git a/examples/run_grpo_sliding_puzzle.py b/examples/run_grpo_sliding_puzzle.py
Lines changed: 2 additions & 0 deletions
diff --git a/examples/run_vlm_grpo.py b/examples/run_vlm_grpo.py
Lines changed: 2 additions & 0 deletions
diff --git a/nemo_rl/algorithms/distillation.py b/nemo_rl/algorithms/distillation.py
Lines changed: 24 additions & 42 deletions
@@ -195,6 +195,7 @@ def main() -> None:
         policy,
         policy_generation,
         cluster,
+        weight_sync,
         dataloader,
         val_dataloader,
         loss_fn,
@@ -281,6 +282,7 @@ def main() -> None:
             grpo_save_state=grpo_state,
             master_config=master_config,
             max_trajectory_age_steps=async_config["max_trajectory_age_steps"],
+            weight_sync=weight_sync,
         )
     else:
         print("🚀 Running synchronous GRPO training")
@@ -299,6 +301,7 @@ def main() -> None:
             checkpointer,
             grpo_state,
             master_config,
+            weight_sync=weight_sync,
         )
 
 
 
@@ -100,6 +100,7 @@ def main() -> None:
         policy,
         policy_generation,
         cluster,
+        weight_sync,
         dataloader,
         val_dataloader,
         loss_fn,
@@ -159,6 +160,7 @@ def main() -> None:
             grpo_save_state=grpo_state,
             master_config=master_config,
             max_trajectory_age_steps=async_config["max_trajectory_age_steps"],
+            weight_sync=weight_sync,
         )
     else:
         print("🚀 Running synchronous GRPO training")
@@ -177,6 +179,7 @@ def main() -> None:
             checkpointer,
             grpo_state,
             master_config,
+            weight_sync=weight_sync,
         )
 
 
 
@@ -253,6 +253,7 @@ def main():
         policy,
         policy_generation,
         cluster,
+        weight_sync,
         dataloader,
         val_dataloader,
         loss_fn,
@@ -275,6 +276,7 @@ def main():
         checkpointer,
         grpo_state,
         master_config,
+        weight_sync=weight_sync,
     )
 
 
 
@@ -107,6 +107,7 @@ def main() -> None:
         policy,
         policy_generation,
         cluster,
+        weight_sync,
         dataloader,
         val_dataloader,
         loss_fn,
@@ -129,6 +130,7 @@ def main() -> None:
         checkpointer,
         grpo_state,
         master_config,
+        weight_sync=weight_sync,
     )
 
 
 
@@ -23,7 +23,8 @@
 from transformers import AutoConfig, AutoTokenizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
-from nemo_rl.algorithms.grpo import _should_use_async_rollouts, refit_policy_generation
+from nemo_rl.algorithms.grpo import _should_use_async_rollouts
+from nemo_rl.weight_sync import WeightSynchronizer, create_weight_synchronizer
 from nemo_rl.algorithms.loss import (
     DistillationLossConfig,
     DistillationLossDataDict,
@@ -164,6 +165,7 @@ def setup(
     ColocatablePolicyInterface,  # student_policy
     ColocatablePolicyInterface,  # teacher_policy
     Optional[GenerationInterface],  # student_generation
+    Optional[WeightSynchronizer],  # weight_sync
     StatefulDataLoader,
     Optional[StatefulDataLoader],
     DistillationLossFn,
@@ -175,7 +177,7 @@ def setup(
     """Main entry point for distillation algorithm.
 
     Returns:
-        tuple of student_policy, teacher_policy, student_generation,
+        tuple of student_policy, teacher_policy, student_generation, weight_sync,
         train_dataloader, val_dataloader,
         loss_fn, logger, checkpointer, distillation_save_state, master_config
     """
@@ -457,26 +459,18 @@ def setup(
         init_reference_model=False,
     )
 
+    # Create weight synchronizer and initialize communication channels
+    weight_sync: Optional[WeightSynchronizer] = None
     if student_generation is not None:
-        state_dict_info = student_policy.prepare_refit_info()
-        student_generation.prepare_refit_info(state_dict_info)
-
-    # if it is not colocated inference, initialize collective communication for update weights
-    if not colocated_inference:
-        ip, port = train_cluster.get_master_address_and_port()
-        print(f"Using ip: {ip}, port: {port} for collective communication", flush=True)
-        train_world_size = train_cluster.world_size()
-        # inference cluster + head node of the train cluster
-        world_size = train_world_size + inference_nodes * inference_gpus_per_node
-        # init collective
-        futures_train = student_policy.init_collective(
-            ip, port, world_size, train_world_size=train_world_size
+        weight_sync = create_weight_synchronizer(
+            policy=student_policy,
+            generation=student_generation,
+            generation_backend=backend,
+            colocated=colocated_inference,
+            train_cluster=train_cluster if not colocated_inference else None,
+            inference_cluster=inference_cluster if not colocated_inference else None,
         )
-        futures_inference = student_generation.init_collective(
-            ip, port, world_size, train_world_size=train_world_size
-        )  # type: ignore
-        # wait for all futures to complete
-        ray.get(futures_train + futures_inference)
+        weight_sync.init_communicator()
 
     loss_fn = DistillationLossFn(loss_config)
 
@@ -488,6 +482,7 @@ def setup(
         student_policy,
         teacher_policy,
         student_generation,
+        weight_sync,
         dataloader,
         val_dataloader,
         loss_fn,
@@ -517,6 +512,7 @@ def distillation_train(
     checkpointer: CheckpointManager,
     distillation_save_state: DistillationSaveState,
     master_config: MasterConfig,
+    weight_sync: Optional[WeightSynchronizer] = None,
 ) -> None:
     """Run Distillation training algorithm."""
     timer = Timer()
@@ -526,12 +522,9 @@ def distillation_train(
     )
     timeout.start_iterations()
 
-    NEED_REFIT = True
     # If student_generation is None, use the student_policy as the generation interface (megatron framework backend)
     if student_generation is None:
         student_generation = student_policy  # type: ignore
-        NEED_REFIT = False
-    POLICY_GENERATION_STALE = True  # tracks if generation needs a refit before running
     assert student_generation is not None  # for mypy type check
 
     # common config/state items
@@ -558,11 +551,8 @@ def distillation_train(
     # Run validation at the start if configured
     if val_at_start and total_steps == 0:
         print("\n🔍 Running initial validation...", flush=True)
-        if NEED_REFIT and POLICY_GENERATION_STALE:
-            refit_policy_generation(
-                student_policy, student_generation, colocated_inference
-            )
-            POLICY_GENERATION_STALE = False
+        if weight_sync is not None and weight_sync.is_stale:
+            weight_sync.sync_weights()
         else:
             student_generation.prepare_for_generation()
         val_metrics, validation_timings = validate(
@@ -613,14 +603,8 @@ def distillation_train(
                     flush=True,
                 )
                 with timer.time("prepare_for_generation"):
-                    if NEED_REFIT and POLICY_GENERATION_STALE:
-                        refit_policy_generation(
-                            student_policy,
-                            student_generation,
-                            colocated_inference,
-                            timer=timer,
-                        )
-                        POLICY_GENERATION_STALE = False
+                    if weight_sync is not None and weight_sync.is_stale:
+                        weight_sync.sync_weights(timer=timer)
                     else:
                         student_generation.prepare_for_generation()
 
@@ -714,7 +698,8 @@ def distillation_train(
                 with timer.time("training_prep"):
                     teacher_policy.offload_after_refit()
                     student_policy.prepare_for_training()  # set model train and reload optim to GPU
-                    POLICY_GENERATION_STALE = True
+                    if weight_sync is not None:
+                        weight_sync.mark_stale()
 
                 print("▶ Training policy...", flush=True)
                 with timer.time("policy_training"):
@@ -733,11 +718,8 @@ def distillation_train(
                 if (val_period > 0 and (total_steps + 1) % val_period == 0) or (
                     val_at_end and is_last_step
                 ):
-                    if NEED_REFIT and POLICY_GENERATION_STALE:
-                        refit_policy_generation(
-                            student_policy, student_generation, colocated_inference
-                        )
-                        POLICY_GENERATION_STALE = False
+                    if weight_sync is not None and weight_sync.is_stale:
+                        weight_sync.sync_weights()
                     else:
                         student_generation.prepare_for_generation()
                     val_metrics, validation_timings = validate(