19 changes: 16 additions & 3 deletions src/transformers/generation/utils.py
@@ -117,6 +117,10 @@

 logger = logging.get_logger(__name__)

+# `torch.multinomial` requires `input.shape[-1] <= 2**24` (a PyTorch limitation; see #45245). Sampling
+# beam search flattens `num_beams * vocab_size` categories; above this limit an equivalent Gumbel-top-k path is used.
+_BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT = 2**24
+
 if is_accelerate_available():
     from accelerate.hooks import AlignDevicesHook, add_hook_to_module

@@ -2973,9 +2977,18 @@ def _get_top_k_continuations(

         # Gather the top K scores from _all_ beams.
         if do_sample:
-            topk_indices = torch.multinomial(
-                nn.functional.softmax(accumulated_log_probs, dim=-1), num_samples=beams_to_keep
-            )
+            flat_dim = accumulated_log_probs.shape[-1]
+            if flat_dim >= _BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT:
+                # Gumbel-top-k is equivalent to `multinomial(softmax(logits), k, replacement=False)` here,
+                # without requiring a categorical dimension above PyTorch's multinomial limit.
+                log_probs_32 = accumulated_log_probs.to(torch.float32)
+                uniform = torch.rand_like(log_probs_32).clamp(1e-20, 1.0 - 1e-7)
+                gumbel_noise = -torch.log(-torch.log(uniform))
+                _, topk_indices = torch.topk(log_probs_32 + gumbel_noise, k=beams_to_keep, dim=-1)
+            else:
+                topk_indices = torch.multinomial(
+                    nn.functional.softmax(accumulated_log_probs, dim=-1), num_samples=beams_to_keep
+                )
             topk_log_probs = torch.gather(input=accumulated_log_probs, dim=1, index=topk_indices)
         else:
             topk_log_probs, topk_indices = torch.topk(accumulated_log_probs, k=beams_to_keep)
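A minimal standalone sketch (plain `torch`, illustrative sizes — not from the PR) of the equivalence the new comment relies on: taking the top-k of log-probabilities plus Gumbel(0, 1) noise selects k distinct categories with the same distribution as `torch.multinomial(softmax(log_probs), k)` without replacement, independent of the category count.

# Sketch: Gumbel-top-k vs. multinomial-without-replacement (illustrative sizes).
import torch

torch.manual_seed(0)
num_categories, k, n_trials = 8, 3, 100_000
log_probs = torch.log_softmax(torch.randn(num_categories), dim=-1)

# Gumbel-top-k: add Gumbel(0, 1) noise to the log-probabilities, keep the top-k indices.
uniform = torch.rand(n_trials, num_categories).clamp(1e-20, 1.0 - 1e-7)
gumbel_noise = -torch.log(-torch.log(uniform))
_, gumbel_idx = torch.topk(log_probs + gumbel_noise, k=k, dim=-1)

# Reference: multinomial without replacement over the same distribution.
probs = log_probs.softmax(dim=-1).repeat(n_trials, 1)
multinomial_idx = torch.multinomial(probs, num_samples=k)

# Per-category inclusion frequencies should agree up to Monte Carlo noise.
def inclusion_freq(idx):
    return torch.bincount(idx.reshape(-1), minlength=num_categories).float() / n_trials

print(inclusion_freq(gumbel_idx))
print(inclusion_freq(multinomial_idx))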
75 changes: 75 additions & 0 deletions tests/generation/test_beam_search_multinomial_limit.py
@@ -0,0 +1,75 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import GPT2Config, GPT2LMHeadModel, is_torch_available
from transformers.testing_utils import require_torch, torch_device


if is_torch_available():
    import torch

    import transformers.generation.utils as gen_utils

@require_torch
class BeamSearchMultinomialLimitTest(unittest.TestCase):
    def test_get_top_k_continuations_gumbel_when_flat_dim_over_limit(self):
        """Above _BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT, avoid torch.multinomial on the full flat dim (#45245)."""
        old_limit = gen_utils._BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT
        try:
            # Lower the module-level limit so a tiny tensor exercises the Gumbel-top-k branch.
            gen_utils._BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT = 16
            config = GPT2Config(
                vocab_size=100,
                n_positions=64,
                n_embd=32,
                n_layer=1,
                n_head=1,
                bos_token_id=0,
                eos_token_id=0,
            )
            model = GPT2LMHeadModel(config)
            model.to(torch_device)
            model.eval()

            batch_size, num_beams, vocab_size = 1, 2, 10
            flat = num_beams * vocab_size
            self.assertGreaterEqual(flat, gen_utils._BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT)

            accumulated = torch.randn(batch_size, flat, device=torch_device, dtype=torch.float32)
            max_length = 8
            cur_len, decoder_prompt_len = 2, 2
            running_sequences = torch.zeros(batch_size, num_beams, max_length, dtype=torch.long, device=torch_device)
            running_beam_indices = torch.zeros(
                batch_size, num_beams, max_length - decoder_prompt_len, dtype=torch.int32, device=torch_device
            )
            beams_to_keep = 4

            topk_log_probs, topk_running_sequences, topk_running_beam_indices = model._get_top_k_continuations(
                accumulated_log_probs=accumulated,
                running_sequences=running_sequences,
                running_beam_indices=running_beam_indices,
                cur_len=cur_len,
                decoder_prompt_len=decoder_prompt_len,
                do_sample=True,
                beams_to_keep=beams_to_keep,
                num_beams=num_beams,
                vocab_size=vocab_size,
                batch_size=batch_size,
            )
            self.assertEqual(topk_log_probs.shape, (batch_size, beams_to_keep))
            self.assertEqual(topk_running_sequences.shape[1], beams_to_keep)
            self.assertEqual(topk_running_beam_indices.shape[1], beams_to_keep)
        finally:
            # Restore the real limit so other tests are unaffected.
            gen_utils._BEAM_SEARCH_MULTINOMIAL_DIM_LIMIT = old_limit
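For context, a hedged repro sketch of the failure mode guarded against above (sizes illustrative; exact error text may vary across PyTorch versions): `torch.multinomial` rejects inputs with more than 2**24 categories, and sampling beam search previously issued one such call once `num_beams * vocab_size` crossed that bound (#45245).

# Repro sketch (illustrative): exceed torch.multinomial's 2**24 category bound.
import torch

weights = torch.full((1, 2**24 + 1), 1.0)  # multinomial accepts unnormalized weights
try:
    torch.multinomial(weights, num_samples=4)
except RuntimeError as err:
    print(err)  # expected to complain that the number of categories is too large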