CC review

remi-or · remi-or · commit be8158c669b0 · 2026-05-11T08:16:27.000Z
diff --git a/benchmark_v2/benchmark_scripts/continuous_batching_overall.py b/benchmark_v2/benchmark_scripts/continuous_batching_overall.py
@@ -102,7 +102,7 @@ def score(outputs) -> float:
 
 
 # Data helpers
-def get_tokenized_gms8k(
+def get_tokenized_gsm8k(
     tokenizer: AutoTokenizer, n_fewshot: int = 8
 ) -> tuple[list[list[int]], Callable[[Any], float]]:
     """GSM8K-Platinum few-shot inputs and scorer using the same lighteval extractive_match as the gsm8k task."""
@@ -323,7 +323,7 @@ def diff(cur: float | None, base: float | None) -> str:
 
     # GSM8K benchmarks (256 max new tokens) — gsm8k_platinum dataset, 8-shot, lighteval extractive_match
     tokenizer = AutoTokenizer.from_pretrained(cli_args.model_id, padding_side="left")
-    gsm8k_data, gsm8k_score_fn = get_tokenized_gms8k(tokenizer)
+    gsm8k_data, gsm8k_score_fn = get_tokenized_gsm8k(tokenizer)
 
     ## No options
     results.add_benchmark(
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
@@ -1748,7 +1748,8 @@ class ContinuousBatchingConfig:
     disable_nccl_graph_mixing: bool = True
 
     def __post_init__(self):
-        if self.disable_nccl_graph_mixing:
+        # Only turn off graph mixing support under a distributed launch (WORLD_SIZE > 1), i.e. when TP may be on
+        if self.disable_nccl_graph_mixing and int(os.environ.get("WORLD_SIZE", "1")) > 1:
             os.environ.setdefault("NCCL_GRAPH_MIXING_SUPPORT", "0")
 
     def account_for_cb_deprecated_arguments(
diff --git a/src/transformers/generation/continuous_batching/cache_manager.py b/src/transformers/generation/continuous_batching/cache_manager.py
@@ -279,6 +279,7 @@ def compute_hash(self, parent_hash: int | None, tokens: list[int], group_id: int
         """Computes the hash of a block identified by the (tokens) it contains, its (parent_hash) and the layer
         (group_id) it belong to. If the block has no parent, the parent hash is None. Uses blake2b for a deterministic
         64-bit digest that is stable across processes (unlike Python's salted built-in `hash`)."""
+        # NOTE: blake2b is ~10–20× slower than hash() here; consider gating by tp_size>1 or switching to xxhash.
         h = hashlib.blake2b(digest_size=8)
         if parent_hash is not None:
             h.update(parent_hash.to_bytes(8, "little", signed=False))
diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
@@ -260,18 +260,19 @@ def _get_new_requests(self) -> None:
         """Pull new requests and cancellations from the queues and apply them to the scheduler. If the process is a TP
         driver, the input_queue and cancel_queue are not None and the process will drain them. Otherwise, the process
         will wait for the TP driver to send a payload containing the new requests and cancellations."""
-        # Only drains queues if this process is a TP driver
+        # On the TP driver, drain the queues; non-driver ranks start from an empty payload (two empty lists) that is
+        # replaced by the object broadcast below.
+        payload: tuple[list[RequestState], list[str]] = ([], [])
         if self.input_queue is not None and self.cancel_queue is not None:
-            new_states = drain_queue(self.input_queue)
-            cancellations = drain_queue(self.cancel_queue)
-            payload = (new_states, cancellations)
-        # Otherwise, the payload is None
-        else:
-            payload = ([], [])
+            payload = (drain_queue(self.input_queue), drain_queue(self.cancel_queue))
+
+        # Cheap CPU/gloo presence check: skip the (pickled) object broadcast entirely when there is nothing to send.
+        presence = torch.tensor([len(payload[0]) + len(payload[1])], dtype=torch.int64)
+        self.distributed_helper.tp_broadcast_cpu_from_rank_0(presence)
+        if presence.item() == 0:
+            return
 
-        # Broadcast within the TP group. No-op when tp_size == 1, returns the driver's payload unchanged.
-        payload = self.distributed_helper.tp_broadcast_object(payload)
-        new_states, cancellations = payload
+        new_states, cancellations = self.distributed_helper.tp_broadcast_object(payload)
 
         # All ranks apply the same updates in the same order.
         for state in new_states:
diff --git a/src/transformers/generation/continuous_batching/utils.py b/src/transformers/generation/continuous_batching/utils.py
@@ -279,6 +279,12 @@ def tp_broadcast_from_rank_0(self, value: torch.Tensor) -> torch.Tensor:
             dist.broadcast(value, src=self.tp_root_global_rank, async_op=False, group=self.tp_group)
         return value
 
+    def tp_broadcast_cpu_from_rank_0(self, value: torch.Tensor) -> torch.Tensor:
+        """Broadcasts a CPU tensor from rank 0 of each TP group over the gloo ingress group. No-op when TP is off."""
+        if self.tp_size > 1:
+            dist.broadcast(value, src=self.tp_root_global_rank, async_op=False, group=self.ingress_group)
+        return value
+
     def tp_all_reduce_min(self, value: torch.Tensor) -> torch.Tensor:
         """Inside each TP group, all-reduces a tensor with the MIN op. No-op when TP is off."""
         if self.tp_size > 1:
@@ -300,7 +306,7 @@ def tp_broadcast_object(self, obj: T) -> T:
     def maybe_warn_nccl_graph_mixing(self) -> None:
         """Throws a warning if TP is on and NCCL's graph mixing support was supposed to be disabled but isn't. That can
         happen if the distributed group is created before graph mixing is disabled. Typically, if the model is
-        initialized before the ContinousBatchingConfig is created."""
+        initialized before the ContinuousBatchingConfig is created."""
         tp_on = self.tp_size > 1
         graph_mixing_not_disabled = os.environ.get("NCCL_GRAPH_MIXING_SUPPORT") != "0"
         if tp_on and graph_mixing_not_disabled:
diff --git a/tests/generation/test_continuous_batching.py b/tests/generation/test_continuous_batching.py
@@ -356,7 +356,7 @@ def test_continuous_batching_will_allocation_be_successful(
         num_free_blocks: int,
         expected_result: bool,
     ) -> None:
-        """Test the will_allocation_be_successful method of PagedAttentionCache, overloading the elevant attributes of
+        """Test the will_allocation_be_successful method of PagedAttentionCache, overloading the relevant attributes of
         a dummy cache."""
 
         if torch_device is None:  # this check which should always pass and helps with type checking
@@ -532,15 +532,21 @@ def test_distributed_helper_set_tp_seed_no_dist(self) -> None:
         helper.set_tp_seed(seed=None, model_device=torch.device("cpu"))
 
     def test_continuous_batching_config_disables_nccl_graph_mixing(self) -> None:
-        """Test that constructing a ContinuousBatchingConfig sets NCCL_GRAPH_MIXING_SUPPORT=0 by default and only sets
-        it when the disable_nccl_graph_mixing flag is on."""
-        original = os.environ.pop("NCCL_GRAPH_MIXING_SUPPORT", None)
+        """Test that ContinuousBatchingConfig sets NCCL_GRAPH_MIXING_SUPPORT=0 only under a distributed launch
+        (WORLD_SIZE > 1) and respects the disable_nccl_graph_mixing flag."""
+        original_nccl = os.environ.pop("NCCL_GRAPH_MIXING_SUPPORT", None)
+        original_ws = os.environ.pop("WORLD_SIZE", None)
         try:
-            # Default: env var is set to "0"
+            # Single-GPU launch (no WORLD_SIZE): env var is left untouched
+            ContinuousBatchingConfig()
+            self.assertNotIn("NCCL_GRAPH_MIXING_SUPPORT", os.environ)
+
+            # Distributed launch (WORLD_SIZE > 1): env var is set to "0"
+            os.environ["WORLD_SIZE"] = "2"
             ContinuousBatchingConfig()
             self.assertEqual(os.environ.get("NCCL_GRAPH_MIXING_SUPPORT"), "0")
 
-            # Explicitly disabled flag: env var is left untouched
+            # Explicitly disabled flag: env var is left untouched even under a distributed launch
             os.environ.pop("NCCL_GRAPH_MIXING_SUPPORT", None)
             ContinuousBatchingConfig(disable_nccl_graph_mixing=False)
             self.assertNotIn("NCCL_GRAPH_MIXING_SUPPORT", os.environ)
@@ -550,10 +556,14 @@ def test_continuous_batching_config_disables_nccl_graph_mixing(self) -> None:
             ContinuousBatchingConfig()
             self.assertEqual(os.environ.get("NCCL_GRAPH_MIXING_SUPPORT"), "1")
         finally:
-            if original is None:
+            if original_nccl is None:
                 os.environ.pop("NCCL_GRAPH_MIXING_SUPPORT", None)
             else:
-                os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = original
+                os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = original_nccl
+            if original_ws is None:
+                os.environ.pop("WORLD_SIZE", None)
+            else:
+                os.environ["WORLD_SIZE"] = original_ws
 
 
 @require_torch_accelerator