Skip to content

Commit 5495210

Browse files
authored
[sync] sync to upstream 03c4c4a (#1597)
Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent 7016e79 commit 5495210

150 files changed

Lines changed: 4913 additions & 719 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ repos:
8989
language: python
9090
types: [python]
9191
additional_dependencies: [regex]
92+
exclude: ^benchmarks/
9293
- id: validate-config
9394
name: Validate configuration has default values and that each field has a docstring
9495
entry: python tools/pre_commit/validate_config.py

aphrodite/_custom_ops.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,6 +1643,10 @@ def selective_scan_fwd(
16431643
has_initial_state: torch.Tensor | None,
16441644
ssm_states: torch.Tensor,
16451645
pad_slot_id: int,
1646+
block_size: int = 1024,
1647+
block_idx_first_scheduled_token: torch.Tensor | None = None,
1648+
block_idx_last_scheduled_token: torch.Tensor | None = None,
1649+
initial_state_idx: torch.Tensor | None = None,
16461650
):
16471651
torch.ops._C.selective_scan_fwd(
16481652
u,
@@ -1659,6 +1663,10 @@ def selective_scan_fwd(
16591663
has_initial_state,
16601664
ssm_states,
16611665
pad_slot_id,
1666+
block_size,
1667+
block_idx_first_scheduled_token,
1668+
block_idx_last_scheduled_token,
1669+
initial_state_idx,
16621670
)
16631671

16641672

@@ -1697,6 +1705,8 @@ def moe_align_block_size(
16971705
sorted_token_ids: torch.Tensor,
16981706
experts_ids: torch.Tensor,
16991707
num_tokens_post_pad: torch.Tensor,
1708+
adapter_enabled: torch.Tensor,
1709+
lora_ids: torch.Tensor,
17001710
) -> None:
17011711
torch.ops._moe_C.moe_align_block_size(
17021712
topk_ids,
@@ -1705,6 +1715,8 @@ def moe_align_block_size(
17051715
sorted_token_ids,
17061716
experts_ids,
17071717
num_tokens_post_pad,
1718+
adapter_enabled,
1719+
lora_ids,
17081720
)
17091721

17101722

aphrodite/attention/ops/vit_attn_wrappers.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import einops
1414
import torch
15+
import torch.nn.functional as F
1516

1617
from aphrodite.utils.torch_utils import direct_register_custom_op
1718

@@ -113,3 +114,53 @@ def vit_flash_attn_wrapper(
113114
return torch.ops.aphrodite.flash_attn_maxseqlen_wrapper(
114115
q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa
115116
)
117+
118+
119+
# TODO: Once torch 2.10 is available, we can use tensor slices
120+
# so we won't need to wrap this in custom ops
121+
def torch_sdpa_wrapper(
122+
q: torch.Tensor,
123+
k: torch.Tensor,
124+
v: torch.Tensor,
125+
cu_seqlens: torch.Tensor,
126+
) -> torch.Tensor:
127+
outputs = []
128+
for i in range(1, len(cu_seqlens)):
129+
start_idx = cu_seqlens[i - 1]
130+
end_idx = cu_seqlens[i]
131+
q_i = q[:, start_idx:end_idx]
132+
k_i = k[:, start_idx:end_idx]
133+
v_i = v[:, start_idx:end_idx]
134+
q_i, k_i, v_i = (einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i])
135+
output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
136+
output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
137+
outputs.append(output_i)
138+
context_layer = torch.cat(outputs, dim=1)
139+
context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
140+
return context_layer
141+
142+
143+
def torch_sdpa_wrapper_fake(
144+
q: torch.Tensor,
145+
k: torch.Tensor,
146+
v: torch.Tensor,
147+
cu_seqlens: torch.Tensor,
148+
) -> torch.Tensor:
149+
b, s, h, d = q.shape
150+
return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
151+
152+
153+
direct_register_custom_op(
154+
op_name="torch_sdpa_wrapper",
155+
op_func=torch_sdpa_wrapper,
156+
fake_impl=torch_sdpa_wrapper_fake,
157+
)
158+
159+
160+
def vit_torch_sdpa_wrapper(
161+
q: torch.Tensor,
162+
k: torch.Tensor,
163+
v: torch.Tensor,
164+
cu_seqlens: torch.Tensor,
165+
) -> torch.Tensor:
166+
return torch.ops.aphrodite.torch_sdpa_wrapper(q, k, v, cu_seqlens)

aphrodite/benchmarks/serve.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,14 @@ async def get_request(
178178
total_requests,
179179
request_rate,
180180
)
181+
assert current_request_rate > 0.0, f"Obtained non-positive request rate {current_request_rate}."
181182
request_rates.append(current_request_rate)
182183
if current_request_rate == float("inf"):
183184
delay_ts.append(0)
185+
elif burstiness == float("inf"):
186+
# when burstiness tends to infinity, the delay time becomes constant
187+
# and tends to the inverse of the request rate
188+
delay_ts.append(1.0 / current_request_rate)
184189
else:
185190
theta = 1.0 / (current_request_rate * burstiness)
186191

aphrodite/compilation/backends.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
4040
and hasattr(torch._inductor, "standalone_compile")
4141
):
4242
logger.debug("Using InductorStandaloneAdaptor")
43-
return InductorStandaloneAdaptor()
43+
return InductorStandaloneAdaptor(compilation_config.compile_cache_save_format)
4444
else:
4545
logger.debug("Using InductorAdaptor")
4646
return InductorAdaptor()

aphrodite/compilation/compiler_interface.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55
from collections.abc import Callable
66
from contextlib import ExitStack
7-
from typing import Any
7+
from typing import Any, Literal
88
from unittest.mock import patch
99

1010
import torch
@@ -171,6 +171,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
171171

172172
name = "inductor_standalone"
173173

174+
def __init__(self, save_format: Literal["binary", "unpacked"]):
175+
self.save_format = save_format
176+
174177
def compute_hash(self, aphrodite_config: AphroditeConfig) -> str:
175178
factors = get_inductor_factors()
176179
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()[:10]
@@ -212,7 +215,7 @@ def compile(
212215
assert key is not None
213216
path = os.path.join(self.cache_dir, key)
214217
if not envs.APHRODITE_DISABLE_COMPILE_CACHE:
215-
compiled_graph.save(path=path, format="unpacked")
218+
compiled_graph.save(path=path, format=self.save_format)
216219
compilation_counter.num_compiled_artifacts_saved += 1
217220
return compiled_graph, (key, path)
218221

@@ -228,7 +231,7 @@ def load(
228231
assert isinstance(handle[0], str)
229232
assert isinstance(handle[1], str)
230233
path = handle[1]
231-
inductor_compiled_graph = torch._inductor.CompiledArtifact.load(path=path, format="unpacked")
234+
inductor_compiled_graph = torch._inductor.CompiledArtifact.load(path=path, format=self.save_format)
232235
from torch._inductor.compile_fx import graph_returns_tuple
233236

234237
returns_tuple = graph_returns_tuple(graph)

aphrodite/config/aphrodite.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def with_hf_config(
275275
return replace(self, model_config=model_config)
276276

277277
def _post_init_kv_transfer_config(self) -> None:
278-
"""Update KVTransferConfig based on top-level configs in VllmConfig.
278+
"""Update KVTransferConfig based on top-level configs in AphroditeConfig.
279279
Right now, this function reads the offloading settings from
280280
CacheConfig and configures the KVTransferConfig accordingly.
281281
"""

aphrodite/config/compilation.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
from collections.abc import Callable
55
from dataclasses import asdict, field
66
from pathlib import Path
7-
from typing import TYPE_CHECKING, Any, ClassVar
7+
from typing import TYPE_CHECKING, Any, ClassVar, Literal
88

99
from pydantic import TypeAdapter, field_validator
1010
from pydantic.dataclasses import dataclass
1111

12+
import aphrodite.envs as envs
1213
from aphrodite.compilation.inductor_pass import CallableInductorPass, InductorPass
1314
from aphrodite.config.utils import config
1415
from aphrodite.logger import init_logger
@@ -204,6 +205,15 @@ class CompilationConfig:
204205
"""The directory to store the compiled graph, to accelerate Inductor
205206
compilation. By default, it will use model-related information to generate
206207
a cache directory."""
208+
compile_cache_save_format: Literal["binary", "unpacked"] = field(
209+
default_factory=lambda: envs.APHRODITE_COMPILE_CACHE_SAVE_FORMAT
210+
)
211+
"""Format for saving torch compile cache:\n
212+
- "binary": saves as binary file (multiprocess safe)\n
213+
- "unpacked": saves as directory structure for inspection/debugging
214+
(NOT multiprocess safe)\n
215+
Defaults to `APHRODITE_COMPILE_CACHE_SAVE_FORMAT` if not specified.
216+
"""
207217
backend: str = ""
208218
"""The backend for compilation. It needs to be a string:
209219
@@ -475,6 +485,7 @@ def compute_hash(self) -> str:
475485
factors.append(self.inductor_compile_config)
476486
factors.append(self.inductor_passes)
477487
factors.append(self.pass_config.uuid())
488+
factors.append(self.compile_cache_save_format)
478489
return hashlib.sha256(str(factors).encode()).hexdigest()
479490

480491
def __repr__(self) -> str:
@@ -514,6 +525,13 @@ def validate_cudagraph_mode_before(cls, value: Any) -> Any:
514525
return CUDAGraphMode[value.upper()]
515526
return value
516527

528+
@field_validator("compile_cache_save_format")
529+
@classmethod
530+
def validate_compile_cache_save_format(cls, value: str) -> str:
531+
if value not in ("binary", "unpacked"):
532+
raise ValueError(f"compile_cache_save_format must be 'binary' or 'unpacked', got: {value}")
533+
return value
534+
517535
def __post_init__(self) -> None:
518536
if self.level is not None:
519537
logger.warning(

aphrodite/config/load.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class LoadConfig:
3737
more information.\n
3838
- "runai_streamer" will load the Safetensors weights using Run:ai Model
3939
Streamer.\n
40+
- "runai_streamer_sharded" will load weights from pre-sharded checkpoint
41+
files using Run:ai Model Streamer.\n
4042
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
4143
- "sharded_state" will load weights from pre-sharded checkpoint files,
4244
supporting efficient loading of tensor-parallel models.\n

aphrodite/config/model.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,6 +1406,11 @@ def get_mamba_chunk_size(self) -> int | None:
14061406
if chunk_size is None:
14071407
# used by e.g. Mamba2, NemotronH, Zamba
14081408
chunk_size = getattr(self.hf_text_config, "chunk_size", None)
1409+
1410+
# Since Mamba1 does not have a chunk notion
1411+
# we use a default chunk size of 2048.
1412+
if chunk_size is None:
1413+
chunk_size = 2048
14091414
return chunk_size
14101415

14111416
def get_multimodal_config(self) -> MultiModalConfig:

0 commit comments

Comments
 (0)