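Fixes for continuous batching under TP: shared DistributedHelper, TP-synced cache sizes, corrected NCCL graph-mixing warning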

remi-or · remi-or · commit e04ce5e6acef · 2026-05-11T07:51:23.000Z
diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py
@@ -24,6 +24,7 @@
 from .cache_manager import BlockManager, CacheAllocator, FullAttentionCacheAllocator, SlidingAttentionCacheAllocator
 from .initialization import resolve_max_memory_percent
 from .requests import RequestState, RequestStatus, get_device_and_memory_breakdown, logger
+from .utils import DistributedHelper
 
 
 def group_layers_by_attn_type(config: PreTrainedConfig) -> tuple[list[list[int]], list[str]]:
@@ -122,8 +123,8 @@ def __init__(
         config: PreTrainedConfig,
         continuous_batching_config: ContinuousBatchingConfig,
         device: torch.device | str,
+        distributed_helper: DistributedHelper,
         dtype: torch.dtype = torch.float16,
-        tp_size: int | None = None,
     ) -> None:
         """Initialize a paged attention cache for efficient memory usage. Also turns in prefix sharing if the model has
         only full attention layers.
@@ -132,8 +133,8 @@ def __init__(
             config: Model configuration
             continuous_batching_config: Continuous batching configuration containing cache parameters
             device: Device for the cache tensors
+            distributed_helper: TP-aware helper. Used to dispatch attention heads and ensure coherent cache size
             dtype: Data type of the cache
-            tp_size: Tensor parallelism size
         """
         self.config = config
         self.dtype = dtype
@@ -165,7 +166,8 @@ def __init__(
 
         # Account for TP: each KV head is dispatched to a different GPU, so the effective number of KV heads per GPU is
         # simply divided by the TP size (number of GPUs)
-        if tp_size is not None and tp_size > 1:
+        tp_size = distributed_helper.tp_size
+        if tp_size > 1:
             if self.num_key_value_heads % tp_size != 0:
                 raise ValueError(
                     f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
@@ -214,6 +216,12 @@ def __init__(
             cache_dtype=self.dtype,
         )
 
+        # For TP, align num_blocks and max_batch_tokens to the minimal value across the TP group
+        if tp_size > 1:
+            sync = torch.tensor([num_blocks, max_batch_tokens], device=self.device, dtype=torch.int64)
+            distributed_helper.tp_all_reduce_min(sync)
+            num_blocks, max_batch_tokens = int(sync[0].item()), int(sync[1].item())
+
         # Add the inferred attributes to the class
         self.num_blocks = num_blocks
         self.max_batch_tokens = max_batch_tokens
diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
@@ -25,7 +25,6 @@
 
 import torch
 from torch import nn
-from torch.distributed.tensor.device_mesh import DeviceMesh
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
@@ -149,7 +148,7 @@ def __init__(
         model_device: torch.device,
         model_dtype: torch.dtype,
         scheduler: Scheduler,
-        device_mesh: DeviceMesh | None,
+        distributed_helper: DistributedHelper,
     ) -> None:
         """Initialize the continuous batch processor.
 
@@ -166,7 +165,7 @@ def __init__(
             model_device: Device for model inputs/outputs
             model_dtype: Data type for model inputs/outputs
             scheduler: The [`Scheduler`] to use
-            device_mesh: The device mesh if there is one
+            distributed_helper: The [`DistributedHelper`] used for communication within the TP group
         """
         self.cache = cache
         self.config = config
@@ -179,7 +178,7 @@ def __init__(
         self.model_device = model_device
         self.model_dtype = model_dtype
         self.scheduler = scheduler
-        self.distributed_helper = DistributedHelper(device_mesh=device_mesh)
+        self.distributed_helper = distributed_helper
 
         # Generation-related attributes
         self.do_sample = getattr(generation_config, "do_sample", True)
@@ -268,7 +267,7 @@ def _get_new_requests(self) -> None:
             payload = (new_states, cancellations)
-        # Otherwise, the payload is None
+        # Otherwise, the payload is an empty (new_states, cancellations) pair
         else:
-            payload = None
+            payload = ([], [])
 
         # Broadcast within the TP group. No-op when tp_size == 1, returns the driver's payload unchanged.
         payload = self.distributed_helper.tp_broadcast_object(payload)
@@ -521,11 +520,11 @@ def __init__(
         self._request_lock = threading.Lock()
 
         # Infer if this process is the driver of its own TP group
-        helper = DistributedHelper(device_mesh=getattr(self.model, "_device_mesh", None))
-        self.is_tp_driver = helper.is_tp_driver
+        self.distributed_helper = DistributedHelper(device_mesh=getattr(self.model, "_device_mesh", None))
+        self.is_tp_driver = self.distributed_helper.is_tp_driver
         # If TP is on, check if NCCL graph mixing is disabled (helps with performance)
         if continuous_batching_config.disable_nccl_graph_mixing:
-            helper.maybe_warn_nccl_graph_mixing()
+            self.distributed_helper.maybe_warn_nccl_graph_mixing()
 
         # Generation config related arguments
         num_return_sequences = getattr(generation_config, "num_return_sequences", None)
@@ -601,6 +600,7 @@ def stop(self, block: bool = True, timeout: float | None = None, keep_for_next_s
         # If the manager is not being kept for next session, we clear the batch processor
         if not keep_for_next_session:
             self.batch_processor = None
+            self.distributed_helper.destroy_ingress_group()
         # Otherwise, we keep the batch processor and cache the manager as a model attribute
         else:
             logger.info("Continuous batching manager will be kept for next session.")
@@ -792,15 +792,13 @@ def _generation_step(self) -> None:
         self.batch_processor._generation_step(self.model)
 
     def _create_batch_processor(self) -> ContinuousBatchProcessor:
-        # Retrieve the device mesh if there is one
-        device_mesh: DeviceMesh | None = getattr(self.model, "_device_mesh", None)
         # Create the PagedAttentionCache
         paged_attention_cache = PagedAttentionCache(
-            self.model.config,
-            self.continuous_batching_config,
-            self.model.device,
-            self.model.dtype,
-            tp_size=DistributedHelper(device_mesh=device_mesh).tp_size,  # consistent with the batch processor
+            config=self.model.config,
+            continuous_batching_config=self.continuous_batching_config,
+            device=self.model.device,
+            distributed_helper=self.distributed_helper,
+            dtype=self.model.dtype,
         )
         self._use_prefix_sharing = paged_attention_cache.use_prefix_sharing  # update the approximation
 
@@ -829,7 +827,7 @@ def _create_batch_processor(self) -> ContinuousBatchProcessor:
             model_device=self.model.device,
             model_dtype=self.model.dtype,
             scheduler=scheduler(paged_attention_cache),
-            device_mesh=device_mesh,
+            distributed_helper=self.distributed_helper,
         )
         return batch_processor
 
diff --git a/src/transformers/generation/continuous_batching/utils.py b/src/transformers/generation/continuous_batching/utils.py
@@ -16,7 +16,7 @@
 from collections import OrderedDict
 from dataclasses import dataclass
 from math import ceil, log2
-from typing import Any
+from typing import Any, TypeVar
 
 import torch
 import torch.distributed as dist
@@ -27,6 +27,9 @@
 from .requests import FutureRequestState, RequestState, RequestStatus, logger
 
 
+T = TypeVar("T")
+
+
 class CudaGraphBuffer:
     """A fixed-size dict for CUDA graphs with LRU eviction when full."""
 
@@ -264,13 +267,25 @@ def __init__(self, device_mesh: DeviceMesh | None) -> None:
         self.dp_rank = self.global_rank // self.tp_size
         self.dp_size = self.world_size // self.tp_size
 
+    def destroy_ingress_group(self) -> None:
+        """Destroys the ingress process group, if any, and resets `ingress_group` to None."""
+        if self.ingress_group is not None:
+            dist.destroy_process_group(self.ingress_group)
+        self.ingress_group = None
+
     def tp_broadcast_from_rank_0(self, value: torch.Tensor) -> torch.Tensor:
         """Inside each TP group, broadcasts the given value from rank 0 to all other ranks."""
         if self.tp_size > 1:
             dist.broadcast(value, src=self.tp_root_global_rank, async_op=False, group=self.tp_group)
         return value
 
-    def tp_broadcast_object(self, obj):
+    def tp_all_reduce_min(self, value: torch.Tensor) -> torch.Tensor:
+        """Inside each TP group, all-reduces a tensor with the MIN op. No-op when TP is off."""
+        if self.tp_size > 1:
+            dist.all_reduce(value, op=dist.ReduceOp.MIN, group=self.tp_group)
+        return value
+
+    def tp_broadcast_object(self, obj: T) -> T:
         """Inside each TP group, broadcasts an arbitrary picklable Python object from TP-rank 0 to all other ranks.
         Used to keep request ingress and cancellations consistent across TP workers without requiring all ranks to
         receive the same external request stream. Uses a dedicated CPU (gloo) `ingress_group` for broadcast."""
@@ -287,8 +302,8 @@ def maybe_warn_nccl_graph_mixing(self) -> None:
         happen if the distributed group is created before graph mixing is disabled. Typically, if the model is
         initialized before the ContinousBatchingConfig is created."""
         tp_on = self.tp_size > 1
-        graph_mixing_supported = os.environ.get("NCCL_GRAPH_MIXING_SUPPORT") != "0"
-        if tp_on or graph_mixing_supported:
+        graph_mixing_not_disabled = os.environ.get("NCCL_GRAPH_MIXING_SUPPORT") != "0"
+        if tp_on and graph_mixing_not_disabled:
             logger.warning(
                 "NCCL_GRAPH_MIXING_SUPPORT was not set to '0' before init_process_group: performance will be harmed. "
                 "Construct your `ContinuousBatchingConfig(...)` BEFORE calling `from_pretrained(tp_plan='auto')`, or "
@@ -304,7 +319,7 @@ def set_tp_seed(self, seed: int | None, model_device: torch.device) -> None:
         # Broadcast the seed to all ranks from rank 0 and memoize it
         tp_seed_tensor = self.tp_broadcast_from_rank_0(tp_seed_tensor)
         tp_seed = tp_seed_tensor.item()
-        if self.global_rank == 0:
-            logger.warning(f"Found no user-specified seed in the config. Setting the config seed to: {tp_seed}.")
+        if self.global_rank == 0 and seed is None:
+            logger.info(f"Found no user-specified seed in the config. Setting the config seed to: {tp_seed}.")
         # Set the seed while accounting for DP replicas
         torch.manual_seed(tp_seed + self.dp_rank)
diff --git a/tests/generation/test_continuous_batching.py b/tests/generation/test_continuous_batching.py
@@ -367,6 +367,7 @@ def test_continuous_batching_will_allocation_be_successful(
             config=AutoConfig.from_pretrained("HuggingFaceTB/SmolLM-1.7B", attn_implementation="sdpa"),
             continuous_batching_config=ContinuousBatchingConfig(block_size=16, num_blocks=8, max_batch_tokens=8),
             device=torch_device,
+            distributed_helper=DistributedHelper(device_mesh=None),
         )
 
         # Overload cache parameters to match test scenario
@@ -511,6 +512,11 @@ def test_distributed_helper_no_dist(self) -> None:
         obj = {"some_request": "payload"}
         self.assertIs(helper.tp_broadcast_object(obj), obj)
 
+        # All-reduce-min should be a no-op without a TP group
+        reduce_tensor = torch.tensor([7, 3], dtype=torch.int64)
+        self.assertIs(helper.tp_all_reduce_min(reduce_tensor), reduce_tensor)
+        self.assertTrue(torch.equal(reduce_tensor, torch.tensor([7, 3], dtype=torch.int64)))
+
     def test_distributed_helper_set_tp_seed_no_dist(self) -> None:
         """Test that set_tp_seed sets a torch seed without distributed initialized, both with and without a user seed."""
         helper = DistributedHelper(device_mesh=None)