dphnAI
diff --git a/‎aphrodite/config/model.py‎
Lines changed: 12 additions & 0 deletions b/‎aphrodite/config/model.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎aphrodite/config/multimodal.py‎
Lines changed: 45 additions & 0 deletions b/‎aphrodite/config/multimodal.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎aphrodite/engine/arg_utils.py‎
Lines changed: 24 additions & 0 deletions b/‎aphrodite/engine/arg_utils.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎aphrodite/kernels/triton/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎aphrodite/kernels/triton/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎aphrodite/kernels/triton/qkv_padded_fp8_quant.py‎
Lines changed: 170 additions & 0 deletions b/‎aphrodite/kernels/triton/qkv_padded_fp8_quant.py‎
Lines changed: 170 additions & 0 deletions
@@ -323,6 +323,10 @@ class ModelConfig:
     mm_encoder_only: InitVar[bool | None] = None  # type: ignore[assignment]
     mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None  # type: ignore[assignment]
     mm_encoder_attn_backend: InitVar[AttentionBackendEnum | str | None] = None  # type: ignore[assignment]
+    mm_encoder_attn_dtype: InitVar[str | None] = None  # type: ignore[assignment]
+    mm_encoder_fp8_scale_path: InitVar[str | None] = None  # type: ignore[assignment]
+    mm_encoder_fp8_scale_save_path: InitVar[str | None] = None  # type: ignore[assignment]
+    mm_encoder_fp8_scale_save_margin: InitVar[float | None] = None  # type: ignore[assignment]
     interleave_mm_strings: InitVar[bool | None] = None  # type: ignore[assignment]
     skip_mm_profiling: InitVar[bool | None] = None  # type: ignore[assignment]
     video_pruning_rate: InitVar[float | None] = None  # type: ignore[assignment]
@@ -443,6 +447,10 @@ def __post_init__(
         mm_encoder_only: bool | None,
         mm_encoder_tp_mode: MMEncoderTPMode | None,
         mm_encoder_attn_backend: AttentionBackendEnum | str | None,
+        mm_encoder_attn_dtype: str | None,
+        mm_encoder_fp8_scale_path: str | None,
+        mm_encoder_fp8_scale_save_path: str | None,
+        mm_encoder_fp8_scale_save_margin: float | None,
         interleave_mm_strings: bool | None,
         skip_mm_profiling: bool | None,
         video_pruning_rate: float | None,
@@ -631,6 +639,10 @@ def __post_init__(
                 mm_encoder_only=mm_encoder_only,
                 mm_encoder_tp_mode=mm_encoder_tp_mode,
                 mm_encoder_attn_backend=mm_encoder_attn_backend,
+                mm_encoder_attn_dtype=mm_encoder_attn_dtype,
+                mm_encoder_fp8_scale_path=mm_encoder_fp8_scale_path,
+                mm_encoder_fp8_scale_save_path=mm_encoder_fp8_scale_save_path,
+                mm_encoder_fp8_scale_save_margin=mm_encoder_fp8_scale_save_margin,
                 interleave_mm_strings=interleave_mm_strings,
                 skip_mm_profiling=skip_mm_profiling,
                 video_pruning_rate=video_pruning_rate,
 
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
+from pathlib import Path
 from typing import Any, Literal, TypeAlias, TypedDict, final
 
 from pydantic import ConfigDict, Field, field_validator, model_validator
@@ -158,6 +159,24 @@ class MultiModalConfig:
     """Optional override for the multi-modal encoder attention backend when
     using vision transformers. Accepts any value from
     `aphrodite.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
+    mm_encoder_attn_dtype: Literal["fp8"] | None = None
+    """Optional dtype override for ViT encoder attention. Set to `"fp8"` to
+    enable FP8 quantization via the FlashInfer cuDNN backend. When set to
+    `"fp8"` without a scale file, dynamic scaling is used automatically.
+    See docs/features/quantization/fp8_vit_attn.md for details."""
+    mm_encoder_fp8_scale_path: str | None = None
+    """Path to a JSON file containing per-layer FP8 Q/K/V scales for ViT
+    encoder attention. When provided (with `mm_encoder_attn_dtype="fp8"`),
+    static scaling is used. When omitted, dynamic scaling is used."""
+    mm_encoder_fp8_scale_save_path: str | None = None
+    """When set with dynamic FP8 scaling (`mm_encoder_attn_dtype="fp8"`
+    and no `mm_encoder_fp8_scale_path`), saves the calibrated scales to
+    this file after the amax history buffer is full. The saved file can
+    then be used as `mm_encoder_fp8_scale_path` in subsequent runs."""
+    mm_encoder_fp8_scale_save_margin: float = Field(default=1.5, gt=0.0)
+    """Safety margin multiplied onto scales when auto-saving. A value > 1
+    leaves headroom so that inputs with larger activations than the
+    calibration set do not overflow FP8 range. Default 1.5."""
     interleave_mm_strings: bool = False
     """Enable fully interleaved support for multimodal prompts, while using
     --chat-template-content-format=string."""
@@ -227,6 +246,30 @@ def _validate_multimodal_config(self):
             raise ValueError(
                 "'mm_shm_cache_max_object_size_mb' should only be set when 'mm_processor_cache_type' is 'shm'."
             )
+        # Validate FP8 scale path combinations.
+        if self.mm_encoder_attn_dtype != "fp8" and (
+            self.mm_encoder_fp8_scale_path is not None or self.mm_encoder_fp8_scale_save_path is not None
+        ):
+            raise ValueError(
+                "'mm_encoder_fp8_scale_path' and "
+                "'mm_encoder_fp8_scale_save_path' require "
+                "'mm_encoder_attn_dtype' to be 'fp8'."
+            )
+        if self.mm_encoder_fp8_scale_path is not None and self.mm_encoder_fp8_scale_save_path is not None:
+            raise ValueError(
+                "'mm_encoder_fp8_scale_save_path' cannot be used with "
+                "'mm_encoder_fp8_scale_path' (saving requires dynamic scaling)."
+            )
+
+        # Validate file paths exist.
+        if self.mm_encoder_fp8_scale_path is not None:
+            scale_path = Path(self.mm_encoder_fp8_scale_path)
+            if not scale_path.is_file():
+                raise FileNotFoundError(f"FP8 scale file not found: {scale_path}")
+        if self.mm_encoder_fp8_scale_save_path is not None:
+            save_parent = Path(self.mm_encoder_fp8_scale_save_path).parent
+            if not save_parent.is_dir():
+                raise FileNotFoundError(f"Parent directory for FP8 scale save path not found: {save_parent}")
         return self
 
     def compute_hash(self) -> str:
@@ -244,6 +287,8 @@ def compute_hash(self) -> str:
         factors: list[Any] = [
             self.mm_encoder_attn_backend.name if self.mm_encoder_attn_backend is not None else None,
             self.mm_encoder_tp_mode,
+            self.mm_encoder_attn_dtype,
+            self.mm_encoder_fp8_scale_path,
         ]
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
@@ -508,6 +508,10 @@ class EngineArgs:
     mm_encoder_only: bool = MultiModalConfig.mm_encoder_only
     mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
     mm_encoder_attn_backend: AttentionBackendEnum | str | None = MultiModalConfig.mm_encoder_attn_backend
+    mm_encoder_attn_dtype: str | None = MultiModalConfig.mm_encoder_attn_dtype
+    mm_encoder_fp8_scale_path: str | None = MultiModalConfig.mm_encoder_fp8_scale_path
+    mm_encoder_fp8_scale_save_path: str | None = MultiModalConfig.mm_encoder_fp8_scale_save_path
+    mm_encoder_fp8_scale_save_margin: float = MultiModalConfig.mm_encoder_fp8_scale_save_margin
     io_processor_plugin: str | None = None
     renderer_num_workers: int = 1
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
@@ -1015,6 +1019,22 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--mm-encoder-attn-backend",
             **multimodal_kwargs["mm_encoder_attn_backend"],
         )
+        multimodal_group.add_argument(
+            "--mm-encoder-attn-dtype",
+            **multimodal_kwargs["mm_encoder_attn_dtype"],
+        )
+        multimodal_group.add_argument(
+            "--mm-encoder-fp8-scale-path",
+            **multimodal_kwargs["mm_encoder_fp8_scale_path"],
+        )
+        multimodal_group.add_argument(
+            "--mm-encoder-fp8-scale-save-path",
+            **multimodal_kwargs["mm_encoder_fp8_scale_save_path"],
+        )
+        multimodal_group.add_argument(
+            "--mm-encoder-fp8-scale-save-margin",
+            **multimodal_kwargs["mm_encoder_fp8_scale_save_margin"],
+        )
         multimodal_group.add_argument("--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"])
         multimodal_group.add_argument("--skip-mm-profiling", **multimodal_kwargs["skip_mm_profiling"])
 
@@ -1302,6 +1322,10 @@ def create_model_config(self) -> ModelConfig:
             mm_encoder_only=self.mm_encoder_only,
             mm_encoder_tp_mode=self.mm_encoder_tp_mode,
             mm_encoder_attn_backend=self.mm_encoder_attn_backend,
+            mm_encoder_attn_dtype=self.mm_encoder_attn_dtype,
+            mm_encoder_fp8_scale_path=self.mm_encoder_fp8_scale_path,
+            mm_encoder_fp8_scale_save_path=self.mm_encoder_fp8_scale_save_path,
+            mm_encoder_fp8_scale_save_margin=self.mm_encoder_fp8_scale_save_margin,
             pooler_config=self.pooler_config,
             generation_config=self.generation_config,
             override_generation_config=self.override_generation_config,
 
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Triton kernel implementations."""
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Stride-aware FP8 quantization with head_dim padding for ViT attention.
+
+Reads directly from non-contiguous QKV views using 3D strides and pads
+head_dim to a multiple of 16 for cuDNN compatibility.
+"""
+
+import torch
+
+from aphrodite.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from aphrodite.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+)
+from aphrodite.platforms import current_platform
+from aphrodite.triton_utils import HAS_TRITON, tl, triton
+from aphrodite.utils.math_utils import round_up
+
+# Clamp bounds handed to the Triton kernel as ``fp8_min`` / ``fp8_max``;
+# looked up once when this module is imported.
+_FP8_MIN, _FP8_MAX = get_fp8_min_max()
+
+
+@triton.jit
+def _quantize_pad_fp8_kernel(
+    x_ptr,
+    y_ptr,
+    scale_ptr,
+    stride_xs,
+    stride_xh,
+    stride_xd,
+    stride_ys,
+    stride_yh,
+    stride_yd,
+    num_heads,
+    n_rows,
+    n_cols,
+    n_cols_padded,
+    fp8_min,
+    fp8_max,
+    SKIP_SCALE: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask_m = offs_m < n_rows
+    mask_out = mask_m[:, None] & (offs_n[None, :] < n_cols_padded)
+    mask_in = mask_m[:, None] & (offs_n[None, :] < n_cols)
+
+    # Decompose flattened row into (token, head) for 3D stride indexing.
+    s = offs_m // num_heads
+    h = offs_m % num_heads
+
+    x_ptrs = x_ptr + s[:, None] * stride_xs + h[:, None] * stride_xh + offs_n[None, :] * stride_xd
+    x = tl.load(x_ptrs, mask=mask_in, other=0.0).to(tl.float32)
+    if SKIP_SCALE:
+        x_q = x
+    else:
+        scale = tl.load(scale_ptr)
+        x_q = x / scale
+    x_q = tl.clamp(x_q, fp8_min, fp8_max).to(y_ptr.dtype.element_ty)
+
+    y_ptrs = y_ptr + s[:, None] * stride_ys + h[:, None] * stride_yh + offs_n[None, :] * stride_yd
+    tl.store(y_ptrs, x_q, mask=mask_out)
+
+
+def _get_fp8_pad_quant_config(padded_head_dim: int) -> tuple[int, int, int]:
+    block_n = triton.next_power_of_2(padded_head_dim)
+    block_n = max(16, min(block_n, 128))
+    block_m = 16
+    num_warps = 4
+    return block_m, block_n, num_warps
+
+
+def quantize_fp8_pad_head_dim_triton(
+    tensor: torch.Tensor,
+    scale: torch.Tensor,
+    skip_scale: bool = False,
+    block_m: int | None = None,
+    block_n: int | None = None,
+    num_warps: int | None = None,
+) -> torch.Tensor:
+    """Quantize a 3D/4D tensor to FP8, padding head_dim to a multiple of 16.
+
+    Reads directly from the input using its 3D strides, so non-contiguous
+    views (e.g. Q/K/V slices from an interleaved QKV buffer) are handled
+    without an extra copy.  Output is always a fresh contiguous tensor
+    with shape (S, H, padded_D).
+    """
+    if not HAS_TRITON:
+        raise RuntimeError("Triton is required to quantize with head_dim padding.")
+
+    original_shape = tensor.shape
+    if tensor.dim() == 4:
+        tensor = tensor.view(-1, tensor.shape[-2], tensor.shape[-1])
+    assert tensor.dim() == 3, f"Expected 3D input (S, H, D), got {tensor.dim()}D"
+    S, H, D = tensor.shape
+    padded_head_dim = round_up(D, 16)
+    out_dtype = current_platform.fp8_dtype()
+    output = torch.empty(
+        (S, H, padded_head_dim),
+        device=tensor.device,
+        dtype=out_dtype,
+    )
+
+    scale_1d = scale.reshape(-1)
+    n_rows = S * H
+
+    if block_m is None or block_n is None or num_warps is None:
+        block_m, block_n, num_warps = _get_fp8_pad_quant_config(padded_head_dim)
+
+    grid = (
+        triton.cdiv(n_rows, block_m),
+        triton.cdiv(padded_head_dim, block_n),
+    )
+
+    _quantize_pad_fp8_kernel[grid](
+        tensor,
+        output,
+        scale_1d,
+        tensor.stride(0),
+        tensor.stride(1),
+        tensor.stride(2),
+        output.stride(0),
+        output.stride(1),
+        output.stride(2),
+        H,
+        n_rows,
+        D,
+        padded_head_dim,
+        _FP8_MIN,
+        _FP8_MAX,
+        SKIP_SCALE=skip_scale,
+        BLOCK_M=block_m,
+        BLOCK_N=block_n,
+        num_warps=num_warps,
+    )
+
+    return output.view((*original_shape[:-1], padded_head_dim))
+
+
+def quantize_fp8_maybe_pad_head_dim(
+    tensor: torch.Tensor,
+    scale: torch.Tensor,
+    fp8_quant: QuantFP8,
+    skip_scale: bool = False,
+) -> torch.Tensor:
+    """Quantize a 3D/4D tensor to FP8, padding head_dim to a multiple of 16
+    only when needed.
+
+    Accepts (S, H, D) or (B, S, H, D) input. Uses ``fp8_quant`` (a
+    :class:`QuantFP8` CustomOp) when head_dim is already aligned to 16
+    (no padding); otherwise falls back to a stride-aware Triton kernel
+    that pads head_dim to a multiple of 16.
+    """
+    head_dim = tensor.shape[-1]
+    if head_dim % 16 != 0:
+        return quantize_fp8_pad_head_dim_triton(tensor, scale, skip_scale=skip_scale)
+
+    if skip_scale:
+        return tensor.to(current_platform.fp8_dtype())
+
+    # QuantFP8 expects 2D: flatten all dims except (H, D).
+    orig_shape = tensor.shape
+    total_tokens = tensor.numel() // (orig_shape[-1] * orig_shape[-2])
+    tensor_2d = tensor.reshape(total_tokens, -1)
+    fp8_tensor, _ = fp8_quant(tensor_2d, scale=scale)
+    return fp8_tensor.reshape(orig_shape)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
	`3`	`+"""Triton kernel implementations."""`