dphnAI
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎aphrodite/_custom_ops.py‎
Lines changed: 12 additions & 0 deletions b/‎aphrodite/_custom_ops.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎aphrodite/attention/ops/vit_attn_wrappers.py‎
Lines changed: 51 additions & 0 deletions b/‎aphrodite/attention/ops/vit_attn_wrappers.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎aphrodite/benchmarks/serve.py‎
Lines changed: 5 additions & 0 deletions b/‎aphrodite/benchmarks/serve.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎aphrodite/compilation/backends.py‎
Lines changed: 1 addition & 1 deletion b/‎aphrodite/compilation/backends.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aphrodite/compilation/compiler_interface.py‎
Lines changed: 6 additions & 3 deletions b/‎aphrodite/compilation/compiler_interface.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎aphrodite/config/aphrodite.py‎
Lines changed: 1 addition & 1 deletion b/‎aphrodite/config/aphrodite.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aphrodite/config/compilation.py‎
Lines changed: 19 additions & 1 deletion b/‎aphrodite/config/compilation.py‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎aphrodite/config/load.py‎
Lines changed: 2 additions & 0 deletions b/‎aphrodite/config/load.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎aphrodite/config/model.py‎
Lines changed: 5 additions & 0 deletions b/‎aphrodite/config/model.py‎
Lines changed: 5 additions & 0 deletions
@@ -89,6 +89,7 @@ repos:
     language: python
     types: [python]
     additional_dependencies: [regex]
+    exclude: ^benchmarks/
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/pre_commit/validate_config.py
 
@@ -1643,6 +1643,10 @@ def selective_scan_fwd(
     has_initial_state: torch.Tensor | None,
     ssm_states: torch.Tensor,
     pad_slot_id: int,
+    block_size: int = 1024,
+    block_idx_first_scheduled_token: torch.Tensor | None = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
 ):
     torch.ops._C.selective_scan_fwd(
         u,
@@ -1659,6 +1663,10 @@ def selective_scan_fwd(
         has_initial_state,
         ssm_states,
         pad_slot_id,
+        block_size,
+        block_idx_first_scheduled_token,
+        block_idx_last_scheduled_token,
+        initial_state_idx,
     )
 
 
@@ -1697,6 +1705,8 @@ def moe_align_block_size(
     sorted_token_ids: torch.Tensor,
     experts_ids: torch.Tensor,
     num_tokens_post_pad: torch.Tensor,
+    adapter_enabled: torch.Tensor,
+    lora_ids: torch.Tensor,
 ) -> None:
     torch.ops._moe_C.moe_align_block_size(
         topk_ids,
@@ -1705,6 +1715,8 @@ def moe_align_block_size(
         sorted_token_ids,
         experts_ids,
         num_tokens_post_pad,
+        adapter_enabled,
+        lora_ids,
     )
 
 
 
@@ -12,6 +12,7 @@
 
 import einops
 import torch
+import torch.nn.functional as F
 
 from aphrodite.utils.torch_utils import direct_register_custom_op
 
@@ -113,3 +114,53 @@ def vit_flash_attn_wrapper(
     return torch.ops.aphrodite.flash_attn_maxseqlen_wrapper(
         q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa
     )
+
+
+# TODO: Once torch 2.10 is available, we can use tensor slices,
+# so we won't need to wrap this in a custom op.
+def torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    outputs = []
+    for i in range(1, len(cu_seqlens)):
+        start_idx = cu_seqlens[i - 1]
+        end_idx = cu_seqlens[i]
+        q_i = q[:, start_idx:end_idx]
+        k_i = k[:, start_idx:end_idx]
+        v_i = v[:, start_idx:end_idx]
+        q_i, k_i, v_i = (einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i])
+        output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+        output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
+        outputs.append(output_i)
+    context_layer = torch.cat(outputs, dim=1)
+    context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
+    return context_layer
+
+
+def torch_sdpa_wrapper_fake(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="torch_sdpa_wrapper",
+    op_func=torch_sdpa_wrapper,
+    fake_impl=torch_sdpa_wrapper_fake,
+)
+
+
+def vit_torch_sdpa_wrapper(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+) -> torch.Tensor:
+    return torch.ops.aphrodite.torch_sdpa_wrapper(q, k, v, cu_seqlens)
@@ -178,9 +178,14 @@ async def get_request(
             total_requests,
             request_rate,
         )
+        assert current_request_rate > 0.0, f"Obtained non-positive request rate {current_request_rate}."
         request_rates.append(current_request_rate)
         if current_request_rate == float("inf"):
             delay_ts.append(0)
+        elif burstiness == float("inf"):
+            # when burstiness tends to infinity, the delay time becomes constant
+            # and tends to the inverse of the request rate
+            delay_ts.append(1.0 / current_request_rate)
         else:
             theta = 1.0 / (current_request_rate * burstiness)
 
 
@@ -40,7 +40,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
             and hasattr(torch._inductor, "standalone_compile")
         ):
             logger.debug("Using InductorStandaloneAdaptor")
-            return InductorStandaloneAdaptor()
+            return InductorStandaloneAdaptor(compilation_config.compile_cache_save_format)
         else:
             logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
 
@@ -4,7 +4,7 @@
 import os
 from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any
+from typing import Any, Literal
 from unittest.mock import patch
 
 import torch
@@ -171,6 +171,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
 
     name = "inductor_standalone"
 
+    def __init__(self, save_format: Literal["binary", "unpacked"]):
+        self.save_format = save_format
+
     def compute_hash(self, aphrodite_config: AphroditeConfig) -> str:
         factors = get_inductor_factors()
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()[:10]
@@ -212,7 +215,7 @@ def compile(
         assert key is not None
         path = os.path.join(self.cache_dir, key)
         if not envs.APHRODITE_DISABLE_COMPILE_CACHE:
-            compiled_graph.save(path=path, format="unpacked")
+            compiled_graph.save(path=path, format=self.save_format)
             compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)
 
@@ -228,7 +231,7 @@ def load(
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
         path = handle[1]
-        inductor_compiled_graph = torch._inductor.CompiledArtifact.load(path=path, format="unpacked")
+        inductor_compiled_graph = torch._inductor.CompiledArtifact.load(path=path, format=self.save_format)
         from torch._inductor.compile_fx import graph_returns_tuple
 
         returns_tuple = graph_returns_tuple(graph)
 
@@ -275,7 +275,7 @@ def with_hf_config(
         return replace(self, model_config=model_config)
 
     def _post_init_kv_transfer_config(self) -> None:
-        """Update KVTransferConfig based on top-level configs in VllmConfig.
+        """Update KVTransferConfig based on top-level configs in AphroditeConfig.
         Right now, this function reads the offloading settings from
         CacheConfig and configures the KVTransferConfig accordingly.
         """
 
@@ -4,11 +4,12 @@
 from collections.abc import Callable
 from dataclasses import asdict, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
+import aphrodite.envs as envs
 from aphrodite.compilation.inductor_pass import CallableInductorPass, InductorPass
 from aphrodite.config.utils import config
 from aphrodite.logger import init_logger
@@ -204,6 +205,15 @@ class CompilationConfig:
     """The directory to store the compiled graph, to accelerate Inductor
     compilation. By default, it will use model-related information to generate
     a cache directory."""
+    compile_cache_save_format: Literal["binary", "unpacked"] = field(
+        default_factory=lambda: envs.APHRODITE_COMPILE_CACHE_SAVE_FORMAT
+    )
+    """Format for saving torch compile cache:\n
+    - "binary": saves as binary file (multiprocess safe)\n
+    - "unpacked": saves as directory structure for inspection/debugging
+    (NOT multiprocess safe)\n
+    Defaults to `APHRODITE_COMPILE_CACHE_SAVE_FORMAT` if not specified.
+    """
     backend: str = ""
     """The backend for compilation. It needs to be a string:
 
@@ -475,6 +485,7 @@ def compute_hash(self) -> str:
         factors.append(self.inductor_compile_config)
         factors.append(self.inductor_passes)
         factors.append(self.pass_config.uuid())
+        factors.append(self.compile_cache_save_format)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __repr__(self) -> str:
@@ -514,6 +525,13 @@ def validate_cudagraph_mode_before(cls, value: Any) -> Any:
             return CUDAGraphMode[value.upper()]
         return value
 
+    @field_validator("compile_cache_save_format")
+    @classmethod
+    def validate_compile_cache_save_format(cls, value: str) -> str:
+        if value not in ("binary", "unpacked"):
+            raise ValueError(f"compile_cache_save_format must be 'binary' or 'unpacked', got: {value}")
+        return value
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
 
@@ -37,6 +37,8 @@ class LoadConfig:
     more information.\n
     - "runai_streamer" will load the Safetensors weights using Run:ai Model
     Streamer.\n
+    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
+    files using Run:ai Model Streamer.\n
     - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
     - "sharded_state" will load weights from pre-sharded checkpoint files,
     supporting efficient loading of tensor-parallel models.\n
 
@@ -1406,6 +1406,11 @@ def get_mamba_chunk_size(self) -> int | None:
         if chunk_size is None:
             # used by e.g. Mamba2, NemotronH, Zamba
             chunk_size = getattr(self.hf_text_config, "chunk_size", None)
+
+        # Since Mamba1 does not have a chunk notion,
+        # we use a default chunk size of 2048.
+        if chunk_size is None:
+            chunk_size = 2048
         return chunk_size
 
     def get_multimodal_config(self) -> MultiModalConfig: