chore: relocate torch_multi_arange

ixlmar · ixlmar · commit faad7dc423eb · 2026-06-16T12:21:43.000Z
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1012,7 +1012,6 @@ common-files: &common_files |
         tests/unittest/_torch/ray_orchestrator/single_gpu/test_cache_transceiver_comm.py |
         tests/unittest/_torch/sampler/test_beam_search.py |
         tests/unittest/_torch/sampler/test_best_of_n.py |
-        tests/unittest/_torch/sampler/test_torch_multi_arange.py |
         tests/unittest/_torch/sampler/test_trtllm_sampler.py |
         tests/unittest/_torch/speculative/test_draft_target.py |
         tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py |
@@ -1027,6 +1026,7 @@ common-files: &common_files |
         tests/unittest/_torch/speculative/test_torch_rejection_sampling.py |
         tests/unittest/_torch/speculative/test_user_provided.py |
         tests/unittest/_torch/test_connector.py |
+        tests/unittest/_torch/test_torch_multi_arange.py |
         tests/unittest/_torch/thop/parallel/deep_gemm_tests.py |
         tests/unittest/_torch/thop/parallel/test_causal_conv1d_op.py |
         tests/unittest/_torch/thop/parallel/test_cublas_mm.py |
@@ -2368,7 +2368,6 @@ legacy-files: &legacy_files |
         tests/unittest/_torch/ray_orchestrator/single_gpu/test_cache_transceiver_comm.py |
         tests/unittest/_torch/sampler/test_beam_search.py |
         tests/unittest/_torch/sampler/test_best_of_n.py |
-        tests/unittest/_torch/sampler/test_torch_multi_arange.py |
         tests/unittest/_torch/sampler/test_trtllm_sampler.py |
         tests/unittest/_torch/speculative/test_draft_target.py |
         tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py |
@@ -2383,6 +2382,7 @@ legacy-files: &legacy_files |
         tests/unittest/_torch/speculative/test_torch_rejection_sampling.py |
         tests/unittest/_torch/speculative/test_user_provided.py |
         tests/unittest/_torch/test_connector.py |
+        tests/unittest/_torch/test_torch_multi_arange.py |
         tests/unittest/_torch/thop/parallel/deep_gemm_tests.py |
         tests/unittest/_torch/thop/parallel/test_causal_conv1d_op.py |
         tests/unittest/_torch/thop/parallel/test_cublas_mm.py |
diff --git a/legacy-files.txt b/legacy-files.txt
@@ -1004,7 +1004,7 @@ tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py
 tests/unittest/_torch/ray_orchestrator/single_gpu/test_cache_transceiver_comm.py
 tests/unittest/_torch/sampler/test_beam_search.py
 tests/unittest/_torch/sampler/test_best_of_n.py
-tests/unittest/_torch/sampler/test_torch_multi_arange.py
+tests/unittest/_torch/test_torch_multi_arange.py
 tests/unittest/_torch/sampler/test_trtllm_sampler.py
 tests/unittest/_torch/speculative/test_draft_target.py
 tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -1062,7 +1062,6 @@ exclude = [
     "tests/unittest/_torch/ray_orchestrator/single_gpu/test_cache_transceiver_comm.py",
     "tests/unittest/_torch/sampler/test_beam_search.py",
     "tests/unittest/_torch/sampler/test_best_of_n.py",
-    "tests/unittest/_torch/sampler/test_torch_multi_arange.py",
     "tests/unittest/_torch/sampler/test_trtllm_sampler.py",
     "tests/unittest/_torch/speculative/test_draft_target.py",
     "tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py",
@@ -1077,6 +1076,7 @@ exclude = [
     "tests/unittest/_torch/speculative/test_torch_rejection_sampling.py",
     "tests/unittest/_torch/speculative/test_user_provided.py",
     "tests/unittest/_torch/test_connector.py",
+    "tests/unittest/_torch/test_torch_multi_arange.py",
     "tests/unittest/_torch/thop/parallel/deep_gemm_tests.py",
     "tests/unittest/_torch/thop/parallel/test_causal_conv1d_op.py",
     "tests/unittest/_torch/thop/parallel/test_cublas_mm.py",
diff --git a/ruff-legacy.toml b/ruff-legacy.toml
@@ -1021,7 +1021,6 @@ include = [
     "tests/unittest/_torch/ray_orchestrator/single_gpu/test_cache_transceiver_comm.py",
     "tests/unittest/_torch/sampler/test_beam_search.py",
     "tests/unittest/_torch/sampler/test_best_of_n.py",
-    "tests/unittest/_torch/sampler/test_torch_multi_arange.py",
     "tests/unittest/_torch/sampler/test_trtllm_sampler.py",
     "tests/unittest/_torch/speculative/test_draft_target.py",
     "tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py",
@@ -1036,6 +1035,7 @@ include = [
     "tests/unittest/_torch/speculative/test_torch_rejection_sampling.py",
     "tests/unittest/_torch/speculative/test_user_provided.py",
     "tests/unittest/_torch/test_connector.py",
+    "tests/unittest/_torch/test_torch_multi_arange.py",
     "tests/unittest/_torch/thop/parallel/deep_gemm_tests.py",
     "tests/unittest/_torch/thop/parallel/test_causal_conv1d_op.py",
     "tests/unittest/_torch/thop/parallel/test_cublas_mm.py",
diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py
@@ -10,14 +10,13 @@
 from flashinfer.jit.core import check_cuda_arch
 from typing_extensions import Self
 
-from tensorrt_llm._torch.pyexecutor.sampling_utils import torch_multi_arange
 from tensorrt_llm._utils import nvtx_range
 from tensorrt_llm.functional import AttentionMaskType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.models.modeling_utils import QuantConfig
 
 from ..metadata import KVCacheParams
-from ..utils import get_global_attrs, get_model_extra_attrs
+from ..utils import get_global_attrs, get_model_extra_attrs, torch_multi_arange
 from .interface import (AttentionBackend, AttentionForwardArgs,
                         AttentionInputType, AttentionMetadata,
                         CustomAttentionMask, MLAParams, PredefinedAttentionMask,
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -83,6 +83,7 @@
 from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
 from ..speculative.interface import get_force_num_accepted_tokens
 from ..speculative.spec_tree_manager import SpecTreeManager
+from ..utils import torch_multi_arange
 from .finish_reason import FinishedState
 from .llm_request import LlmRequest, LlmRequestState, get_draft_token_length
 from .resource_manager import ResourceManager, ResourceManagerType
@@ -100,7 +101,6 @@
     resolve_sampling_strategy,
     sample,
     sample_rejected,
-    torch_multi_arange,
 )
 from .scheduler import ScheduledRequests
 
diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils.py
@@ -614,95 +614,6 @@ def sample_grouped_strategies(
         )
 
 
-class _AcceptSyncCompute:
-    pass
-
-
-ACCEPT_SYNC_COMPUTE = _AcceptSyncCompute()
-
-
-# Inspired by https://github.com/pytorch/pytorch/issues/80577; note also the
-# suggestion to consider torch.nested.
-def torch_multi_arange(
-    ends: torch.Tensor,
-    *,
-    output_length: int | _AcceptSyncCompute,
-    starts: Optional[torch.Tensor] = None,
-    steps: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    """Efficiently compute torch.cat([torch.arange(b, e, d) for b, e, d in zip(starts, ends, steps)]).
-
-    Starts, ends, steps need to share dtype and shape. Invalid ranges like range(1, 2, -1) are
-    silently discarded. 'steps' defaults to 1 and 'starts' defaults to 0.
-
-    Provide 'output_length' to avoid synchronization when using device tensors or pass
-    `ACCEPT_SYNC_COMPUTE` to explicitly accept the possibility of a device sync (for device tensors)
-    or when tensors are known to reside on the host.
-    """
-    if steps is not None:
-        assert ends.dtype == steps.dtype
-        assert ends.shape == steps.shape
-        assert ends.device == steps.device
-    if starts is not None:
-        assert ends.dtype == starts.dtype
-        assert ends.shape == starts.shape
-        assert ends.device == starts.device
-    output_length_arg = None if isinstance(output_length, _AcceptSyncCompute) else output_length
-
-    if ends.numel() == 0:
-        return ends.clone()
-
-    # This algorithm combines torch.repeat_interleaved() and torch.cumsum() to
-    # construct the result.
-    #
-    # 1. Given N ranges (characterized by starts, ends, steps), construct a sequence
-    #    of 2N numbers, in which the non-overlapping pairs of consecutive numbers
-    #    correspond to the ranges. For a given range, the pair (a, b) is chosen such
-    #    that upon torch.cumsum() application 'a' turns the last element of the
-    #    preceding range into the start element for the current range and 'b' is
-    #    simply the step size for the current range.
-    #
-    repeats = ends  # number of elements in each range
-    if starts is not None:
-        repeats = repeats.clone()
-        repeats -= starts
-    if steps is not None:
-        repeats *= steps.sign()
-        steps_abs = steps.abs()
-        repeats = (repeats + steps_abs - 1).div(steps_abs, rounding_mode="floor")
-    repeats = repeats.clip(min=0)  # ignore invalid ranges
-    range_ends = repeats - 1  # last element in each range
-    if steps is not None:
-        range_ends *= steps
-    if starts is not None:
-        range_ends += starts
-    prev_range_ends = range_ends.roll(1)  # last element in preceding range (or 0)
-    prev_range_ends[0].fill_(0)
-    ones = torch.ones((), dtype=ends.dtype, device=ends.device)
-    zeros = torch.zeros((), dtype=ends.dtype, device=ends.device)
-    if steps is None:
-        steps = ones.broadcast_to(ends.shape)
-    jumps = -prev_range_ends  # delta from one range to the next
-    if starts is not None:
-        jumps += starts
-    #     NB: Apply correction for empty ranges
-    jumps_corrections = torch.where(repeats == 0, jumps, zeros).cumsum(0, dtype=ends.dtype)
-    jumps += jumps_corrections
-    seq = torch.cat((jumps.unsqueeze(-1), steps.unsqueeze(-1)), dim=1).view(-1)
-    #
-    # 2. Construct output via torch.repeat_interleave() and torch.cumsum()
-    #     NB: For a resulting empty range, repeats - 1 == -1. In this case, we
-    #         should set repeats for delta and increment both to 0 instead.
-    jump_repeats = torch.where(repeats == 0, zeros, ones)
-    step_repeats = torch.where(repeats == 0, zeros, repeats - 1)
-    seq_repeats = torch.cat((jump_repeats.unsqueeze(-1), step_repeats.unsqueeze(-1)), dim=1).view(
-        -1
-    )
-    seq = seq.repeat_interleave(seq_repeats, output_size=output_length_arg)
-    seq = seq.cumsum(0, dtype=ends.dtype)
-    return seq
-
-
 class _Fusions:
     @staticmethod
     @torch.compile(dynamic=None, fullgraph=True)
diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
@@ -520,3 +520,96 @@ def replace_parameter_and_save_metadata(
             raise ValueError(f"Invalid type {type(new_param)} for new_param")
 
     module.register_parameter(param_name, saved_param)
+
+
+class _AcceptSyncCompute:  # type of the ACCEPT_SYNC_COMPUTE sentinel below
+    pass
+
+
+ACCEPT_SYNC_COMPUTE = _AcceptSyncCompute()
+
+
+# Inspired by https://github.com/pytorch/pytorch/issues/80577; note also the
+# suggestion to consider torch.nested.
+def torch_multi_arange(
+    ends: torch.Tensor,
+    *,
+    output_length: int | _AcceptSyncCompute,
+    starts: torch.Tensor | None = None,
+    steps: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Efficiently compute torch.cat([torch.arange(b, e, d) for b, e, d in zip(starts, ends, steps)]).
+
+    Starts, ends, steps need to share dtype, shape and device (asserted); inputs are not modified.
+    Invalid ranges like range(1, 2, -1) are silently discarded. 'steps' defaults to 1, 'starts' to 0.
+
+    Provide 'output_length' to avoid synchronization when using device tensors or pass
+    `ACCEPT_SYNC_COMPUTE` to explicitly accept the possibility of a device sync (for device tensors)
+    or when tensors are known to reside on the host. The result has the dtype of 'ends'.
+    """
+    if steps is not None:
+        assert ends.dtype == steps.dtype
+        assert ends.shape == steps.shape
+        assert ends.device == steps.device
+    if starts is not None:
+        assert ends.dtype == starts.dtype
+        assert ends.shape == starts.shape
+        assert ends.device == starts.device
+    output_length_arg = None if isinstance(
+        output_length, _AcceptSyncCompute) else output_length
+
+    if ends.numel() == 0:
+        return ends.clone()
+
+    # This algorithm combines torch.repeat_interleave() and torch.cumsum() to
+    # construct the result.
+    #
+    # 1. Given N ranges (characterized by starts, ends, steps), construct a sequence
+    #    of 2N numbers, in which the non-overlapping pairs of consecutive numbers
+    #    correspond to the ranges. For a given range, the pair (a, b) is chosen such
+    #    that upon torch.cumsum() application 'a' turns the last element of the
+    #    preceding range into the start element for the current range and 'b' is
+    #    simply the step size for the current range.
+    #
+    repeats = ends  # number of elements in each range (aliases 'ends' for now)
+    if starts is not None:
+        repeats = repeats.clone()
+        repeats -= starts
+    if steps is not None:
+        repeats = repeats * steps.sign()  # out-of-place: may still alias 'ends'
+        steps_abs = steps.abs()
+        repeats = (repeats + steps_abs - 1).div(steps_abs,
+                                                rounding_mode="floor")
+    repeats = repeats.clip(min=0)  # ignore invalid ranges
+    range_ends = repeats - 1  # last element in each range
+    if steps is not None:
+        range_ends *= steps
+    if starts is not None:
+        range_ends += starts
+    prev_range_ends = range_ends.roll(
+        1)  # last element in preceding range (or 0)
+    prev_range_ends[0].fill_(0)
+    ones = torch.ones((), dtype=ends.dtype, device=ends.device)
+    zeros = torch.zeros((), dtype=ends.dtype, device=ends.device)
+    if steps is None:
+        steps = ones.broadcast_to(ends.shape)
+    jumps = -prev_range_ends  # delta from one range to the next
+    if starts is not None:
+        jumps += starts
+    # NB: Correct for empty ranges. NOTE(review): ends=[0, 2] seems to give [1, 2] -- verify.
+    jumps_corrections = torch.where(repeats == 0, jumps,
+                                    zeros).cumsum(0, dtype=ends.dtype)
+    jumps += jumps_corrections
+    seq = torch.cat((jumps.unsqueeze(-1), steps.unsqueeze(-1)), dim=1).view(-1)
+    #
+    # 2. Construct output via torch.repeat_interleave() and torch.cumsum()
+    #     NB: For a resulting empty range, repeats - 1 == -1. In this case, we
+    #         should set repeats for delta and increment both to 0 instead.
+    jump_repeats = torch.where(repeats == 0, zeros, ones)
+    step_repeats = torch.where(repeats == 0, zeros, repeats - 1)
+    seq_repeats = torch.cat(
+        (jump_repeats.unsqueeze(-1), step_repeats.unsqueeze(-1)),
+        dim=1).view(-1)
+    seq = seq.repeat_interleave(seq_repeats, output_size=output_length_arg)
+    seq = seq.cumsum(0, dtype=ends.dtype)
+    return seq
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
@@ -16,7 +16,7 @@
 from tqdm import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from tensorrt_llm._torch.pyexecutor.sampling_utils import torch_multi_arange
+from tensorrt_llm._torch.utils import torch_multi_arange
 from tensorrt_llm._utils import mpi_disabled
 from tensorrt_llm.inputs.multimodal import (DisaggPrefillMultimodalInputs,
                                             MultimodalParams)
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -15,7 +15,7 @@ l0_a10:
   tests:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/sampler/test_torch_sampler.py
-  - unittest/_torch/sampler/test_torch_multi_arange.py
+  - unittest/_torch/test_torch_multi_arange.py
   - unittest/utils/test_util.py
   - unittest/utils/test_logger.py
   - unittest/_torch/test_model_config.py
diff --git a/tests/unittest/_torch/test_torch_multi_arange.py b/tests/unittest/_torch/test_torch_multi_arange.py
@@ -21,8 +21,7 @@
 import torch
 from utils.util import assert_no_cuda_sync, force_ampere
 
-from tensorrt_llm._torch.pyexecutor.sampling_utils import (ACCEPT_SYNC_COMPUTE,
-                                                           torch_multi_arange)
+from tensorrt_llm._torch.utils import ACCEPT_SYNC_COMPUTE, torch_multi_arange
 
 BASE_CASES = [
     (None, [], None, []),