NVIDIA-NeMo
diff --git a/‎nemo_rl/models/generation/constants.py‎
Lines changed: 23 additions & 0 deletions b/‎nemo_rl/models/generation/constants.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎nemo_rl/weight_sync/__init__.py‎
Lines changed: 21 additions & 0 deletions b/‎nemo_rl/weight_sync/__init__.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎nemo_rl/weight_sync/collective_weight_synchronizer.py‎
Lines changed: 132 additions & 0 deletions b/‎nemo_rl/weight_sync/collective_weight_synchronizer.py‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎nemo_rl/weight_sync/factory.py‎
Lines changed: 106 additions & 0 deletions b/‎nemo_rl/weight_sync/factory.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎nemo_rl/weight_sync/http_weight_synchronizer.py‎
Lines changed: 101 additions & 0 deletions b/‎nemo_rl/weight_sync/http_weight_synchronizer.py‎
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Constants for generation backend names.
+
+These should be used instead of raw string literals when checking or
+comparing backend names in config values.
+"""
+
+VLLM_BACKEND = "vllm"  # backend name string for vLLM
+SGLANG_BACKEND = "sglang"  # backend name string for SGLang
+MEGATRON_BACKEND = "megatron"  # backend name string for Megatron
@@ -0,0 +1,21 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_rl.weight_sync.factory import create_weight_synchronizer
+from nemo_rl.weight_sync.interfaces import WeightSynchronizer
+
+__all__ = [  # names re-exported as the public API of nemo_rl.weight_sync
+    "WeightSynchronizer",  # imported from nemo_rl.weight_sync.interfaces
+    "create_weight_synchronizer",  # imported from nemo_rl.weight_sync.factory
+]
@@ -0,0 +1,132 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""NCCL collective weight synchronizer for non-colocated deployments.
+
+Handles weight transfer between policy and generation workers running on
+separate GPU clusters using NCCL collective communication. The policy
+broadcasts its weights, and generation workers receive them via the
+established NCCL process group.
+
+Lifecycle per sync:
+  1. policy.broadcast_weights_for_collective()    -- send via NCCL
+     generation.update_weights_from_collective()  -- receive via NCCL
+  2. Verify transfer success
+
+No offload/restore steps are needed since policy and generation run on
+separate GPUs with dedicated memory.
+"""
+
+from contextlib import nullcontext
+from typing import Any, Optional
+
+import ray
+
+from nemo_rl.utils.timer import Timer
+from nemo_rl.weight_sync.interfaces import WeightSynchronizer
+
+
+class CollectiveWeightSynchronizer(WeightSynchronizer):
+    """Weight synchronizer using NCCL collectives for non-colocated deployments.
+
+    Policy and generation workers run on separate GPU clusters. Weights are
+    synchronized via NCCL broadcast over a pre-established process group.
+
+    Args:
+        policy: Policy object implementing ColocatablePolicyInterface.
+        generation: Generation object implementing GenerationInterface.
+        train_cluster: RayVirtualCluster for the training workers, used to
+            obtain the master address/port and world size for collective init.
+        inference_cluster: RayVirtualCluster for the inference workers.
+    """
+
+    def __init__(
+        self,
+        policy: Any,
+        generation: Any,
+        train_cluster: Any,
+        inference_cluster: Any,
+    ):
+        self._policy = policy
+        self._generation = generation
+        self._train_cluster = train_cluster
+        self._inference_cluster = inference_cluster
+        self._stale = True
+
+    def sync_weights(
+        self,
+        *,
+        timer: Optional[Timer] = None,
+        kv_scales: Optional[dict[str, float]] = None,
+    ) -> None:
+        timer_context = (
+            timer.time("prepare_for_generation/transfer_and_update_weights")
+            if timer is not None
+            else nullcontext()
+        )
+        with timer_context:
+            futures_train = self._policy.broadcast_weights_for_collective(
+                kv_scales=kv_scales
+            )
+            futures_inference = (
+                self._generation.update_weights_from_collective()
+            )
+
+            ray.get(futures_train)
+            results = ray.get(futures_inference)
+            update_success = all(
+                result for result in results if result is not None
+            )
+
+            if not update_success:
+                raise RuntimeError(
+                    "Weight transfer failed during NCCL collective sync. "
+                    "This often indicates an issue with the NCCL process group "
+                    "or the generation backend worker."
+                )
+
+        self._stale = False
+
+    @property
+    def is_stale(self) -> bool:
+        return self._stale
+
+    def mark_stale(self) -> None:
+        self._stale = True
+
+    def init_communicator(self) -> None:
+        # prepare_refit_info is called before init_collective. This matches
+        # distillation.py ordering. Neither call depends on the other today,
+        # but we document this as the canonical ordering for future reference.
+        state_dict_info = self._policy.prepare_refit_info()
+        self._generation.prepare_refit_info(state_dict_info)
+
+        ip, port = self._train_cluster.get_master_address_and_port()
+        train_world_size = self._train_cluster.world_size()
+        inference_world_size = self._inference_cluster.world_size()
+        world_size = train_world_size + inference_world_size
+
+        futures_train = self._policy.init_collective(
+            ip, port, world_size, train_world_size=train_world_size
+        )
+        futures_inference = self._generation.init_collective(
+            ip, port, world_size, train_world_size=train_world_size
+        )
+        ray.get(futures_train + futures_inference)
+
+    def shutdown(self) -> None:
+        # The NCCL process group lifecycle is managed by Ray actor teardown.
+        # Explicit destroy_process_group() is not needed here because the
+        # workers that own the group are destroyed when the cluster shuts down.
+        pass
@@ -0,0 +1,106 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory for creating WeightSynchronizer instances.
+
+Selects the appropriate weight synchronizer based on the deployment
+topology (colocated vs. non-colocated) and the generation backend
+(colocated vLLM and Megatron use IPC/ZMQ, colocated SGLang uses HTTP,
+and non-colocated deployments use NCCL).
+"""
+
+from typing import Any, Optional
+
+from nemo_rl.models.generation.constants import (
+    MEGATRON_BACKEND,
+    SGLANG_BACKEND,
+    VLLM_BACKEND,
+)
+from nemo_rl.weight_sync.interfaces import WeightSynchronizer
+
+
+def create_weight_synchronizer(
+    policy: Any,
+    generation: Any,
+    generation_backend: str,
+    colocated: bool,
+    train_cluster: Optional[Any] = None,
+    inference_cluster: Optional[Any] = None,
+    refit_buffer_size_gb: Optional[int] = None,
+) -> WeightSynchronizer:
+    """Create the appropriate WeightSynchronizer for the given deployment.
+
+    Args:
+        policy: Policy object (ColocatablePolicyInterface).
+        generation: Generation object (GenerationInterface).
+        generation_backend: Name of the generation backend ("vllm", "sglang", "megatron").
+        colocated: Whether policy and generation share the same GPUs.
+        train_cluster: RayVirtualCluster for training workers (required for non-colocated).
+        inference_cluster: RayVirtualCluster for inference workers (required for non-colocated).
+        refit_buffer_size_gb: Optional fixed buffer size for IPC weight staging.
+
+    Returns:
+        A WeightSynchronizer instance appropriate for the deployment topology.
+
+    Raises:
+        NotImplementedError: If the requested configuration is not supported.
+        ValueError: If required arguments are missing.
+    """
+    _SUPPORTED_BACKENDS = {VLLM_BACKEND, SGLANG_BACKEND, MEGATRON_BACKEND}
+    if generation_backend not in _SUPPORTED_BACKENDS:
+        raise ValueError(
+            f"Unknown generation backend {generation_backend!r}. "
+            f"Supported backends: {sorted(_SUPPORTED_BACKENDS)}"
+        )
+
+    if colocated:
+        if generation_backend == SGLANG_BACKEND:
+            from nemo_rl.weight_sync.http_weight_synchronizer import (
+                HTTPWeightSynchronizer,
+            )
+
+            return HTTPWeightSynchronizer(
+                policy=policy,
+                generation=generation,
+            )
+        elif generation_backend in (VLLM_BACKEND, MEGATRON_BACKEND):
+            from nemo_rl.weight_sync.ipc_weight_synchronizer import (
+                IPCWeightSynchronizer,
+            )
+
+            return IPCWeightSynchronizer(
+                policy=policy,
+                generation=generation,
+                refit_buffer_size_gb=refit_buffer_size_gb,
+            )
+    else:
+        if generation_backend == SGLANG_BACKEND:
+            raise NotImplementedError(
+                "SGLang does not support non-colocated inference mode."
+            )
+        if train_cluster is None or inference_cluster is None:
+            raise ValueError(
+                "train_cluster and inference_cluster are required "
+                "for non-colocated weight synchronization."
+            )
+
+        from nemo_rl.weight_sync.collective_weight_synchronizer import (
+            CollectiveWeightSynchronizer,
+        )
+
+        return CollectiveWeightSynchronizer(
+            policy=policy,
+            generation=generation,
+            train_cluster=train_cluster,
+            inference_cluster=inference_cluster,
+        )
@@ -0,0 +1,101 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HTTP weight synchronizer for colocated SGLang generation.
+
+Handles weight transfer between a colocated policy and SGLang generation
+backend using HTTP streaming. SGLang exposes an HTTP endpoint for weight
+updates, so the policy streams weights directly to SGLang servers.
+
+Lifecycle per sync:
+  1. policy.offload_before_refit()       -- free GPU for weight staging
+  2. generation.prepare_for_generation(tags=["weights"])  -- allocate buffers
+  3. generation.invalidate_kv_cache()    -- clear stale KV cache
+  4. policy.stream_weights_via_http()    -- push weights via HTTP
+  5. policy.offload_after_refit()        -- restore optimizer state
+  6. generation.prepare_for_generation(tags=["kv_cache"]) -- rebuild KV cache
+"""
+
+from contextlib import nullcontext
+from typing import Any, Optional
+
+import ray
+
+from nemo_rl.utils.timer import Timer
+from nemo_rl.weight_sync.interfaces import WeightSynchronizer
+
+
+class HTTPWeightSynchronizer(WeightSynchronizer):
+    """Weight synchronizer using HTTP for colocated SGLang deployments.
+
+    Both the policy and generation workers run on the same GPUs. Weights
+    are streamed to SGLang servers via their HTTP weight-update API.
+
+    Args:
+        policy: Policy object implementing ColocatablePolicyInterface.
+        generation: SGLangGeneration instance exposing get_sglang_url_to_gpu_uuids().
+    """
+
+    def __init__(self, policy: Any, generation: Any):
+        self._policy = policy
+        self._generation = generation
+        self._stale = True
+
+    def sync_weights(
+        self,
+        *,
+        timer: Optional[Timer] = None,
+        kv_scales: Optional[dict[str, float]] = None,
+    ) -> None:
+        self._policy.offload_before_refit()
+        self._generation.prepare_for_generation(tags=["weights"])
+
+        timer_context = (
+            timer.time("prepare_for_generation/transfer_and_update_weights")
+            if timer is not None
+            else nullcontext()
+        )
+        with timer_context:
+            sglang_url_to_gpu_uuids = (
+                self._generation.get_sglang_url_to_gpu_uuids()
+            )
+
+            flush_success = self._generation.invalidate_kv_cache()
+            if not flush_success:
+                print(
+                    "SGLang KV cache invalidation failed before weight update. "
+                )
+
+            futures_train = self._policy.stream_weights_via_http(
+                sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids,
+            )
+            ray.get(futures_train)
+
+        self._policy.offload_after_refit()
+        self._generation.prepare_for_generation(tags=["kv_cache"])
+        self._stale = False
+
+    @property
+    def is_stale(self) -> bool:
+        return self._stale
+
+    def mark_stale(self) -> None:
+        self._stale = True
+
+    def init_communicator(self) -> None:
+        state_dict_info = self._policy.prepare_refit_info()
+        self._generation.prepare_refit_info(state_dict_info)
+
+    def shutdown(self) -> None:
+        pass