Skip to content

Commit e5538f1

Browse files
authored
[v0] remove APHRODITE_USE_V1 from platform and v1 (#1590)
Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent 52d12ec commit e5538f1

8 files changed

Lines changed: 110 additions & 178 deletions

File tree

aphrodite/platforms/cuda.py

Lines changed: 76 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -252,16 +252,12 @@ def get_attn_backend_cls(
252252
"FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
253253
"APHRODITE_MLA_DISABLE=1 to disable MLA for this model."
254254
)
255-
if not use_v1:
256-
raise RuntimeError(
257-
"MLA attention backends require the V1 engine. Set APHRODITE_USE_V1=1 to enable them."
258-
)
259255

260256
from aphrodite.attention.ops.flashmla import is_flashmla_dense_supported
261257
from aphrodite.attention.utils.fa_utils import flash_attn_supports_mla
262258

263259
if use_sparse:
264-
logger.info_once("Using Sparse MLA backend on V1 engine.", scope="global")
260+
logger.info_once("Using Sparse MLA backend.", scope="global")
265261
return "aphrodite.v1.attention.backends.mla.flashmla_sparse.FlashMLASparseBackend"
266262

267263
use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or (
@@ -281,13 +277,13 @@ def get_attn_backend_cls(
281277
use_triton = selected_backend == _Backend.TRITON_MLA or (selected_backend is None)
282278

283279
if use_cutlassmla:
284-
logger.info_once("Using Cutlass MLA backend on V1 engine.", scope="local")
280+
logger.info_once("Using Cutlass MLA backend.", scope="local")
285281
return "aphrodite.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
286282
if use_flashinfermla:
287283
from aphrodite.v1.attention.backends.utils import set_kv_cache_layout
288284

289285
set_kv_cache_layout("HND")
290-
logger.info_once("Using FlashInfer MLA backend on V1 engine.", scope="global")
286+
logger.info_once("Using FlashInfer MLA backend.", scope="global")
291287
return "aphrodite.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
292288
if use_flashmla:
293289
if block_size % 64 != 0:
@@ -296,106 +292,96 @@ def get_attn_backend_cls(
296292
block_size,
297293
)
298294
else:
299-
logger.info_once("Using FlashMLA backend on V1 engine.", scope="global")
295+
logger.info_once("Using FlashMLA backend.", scope="global")
300296
return "aphrodite.v1.attention.backends.mla.flashmla.FlashMLABackend"
301297
if use_flashattn:
302-
logger.info_once("Using FlashAttention MLA backend on V1 engine.", scope="global")
298+
logger.info_once("Using FlashAttention MLA backend.", scope="global")
303299
return "aphrodite.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
304300
if use_triton:
305-
logger.info_once("Using Triton MLA backend on V1 engine.", scope="global")
301+
logger.info_once("Using Triton MLA backend.", scope="global")
306302
return "aphrodite.v1.attention.backends.mla.triton_mla.TritonMLABackend"
307-
if use_v1:
308-
FLASHINFER_V1 = "aphrodite.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
309-
FLEX_ATTENTION_V1 = "aphrodite.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
310-
TRITON_ATTN = "aphrodite.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
311-
FLASH_ATTN_V1 = "aphrodite.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
312-
TREE_ATTN_V1 = "aphrodite.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501
313-
XFORMERS_V1 = "aphrodite.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501
314303

315-
use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8")
304+
FLASHINFER_V1 = "aphrodite.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
305+
FLEX_ATTENTION_V1 = "aphrodite.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
306+
TRITON_ATTN = "aphrodite.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
307+
FLASH_ATTN_V1 = "aphrodite.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
308+
TREE_ATTN_V1 = "aphrodite.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501
309+
XFORMERS_V1 = "aphrodite.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501
316310

317-
if selected_backend == _Backend.FLASHINFER:
318-
logger.info_once("Using FlashInfer backend on V1 engine.", scope="global")
319-
if cls.has_device_capability(100):
320-
from aphrodite.v1.attention.backends.utils import set_kv_cache_layout
311+
use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8")
321312

322-
set_kv_cache_layout("HND")
323-
return FLASHINFER_V1
324-
elif selected_backend == _Backend.FLEX_ATTENTION:
325-
logger.info_once("Using FlexAttention backend on V1 engine.", scope="global")
326-
return FLEX_ATTENTION_V1
327-
elif selected_backend == _Backend.TRITON_ATTN:
328-
logger.info_once("Using Triton backend on V1 engine.", scope="global")
329-
return TRITON_ATTN
330-
elif selected_backend == _Backend.FLASH_ATTN:
331-
logger.info_once("Using Flash Attention backend on V1 engine.", scope="global")
332-
return FLASH_ATTN_V1
333-
elif selected_backend == _Backend.TREE_ATTN:
334-
logger.info_once("Using Tree Attention backend on V1 engine.", scope="global")
335-
return TREE_ATTN_V1
336-
elif selected_backend == _Backend.XFORMERS:
337-
logger.info_once("Using XFormers backend on V1 engine.", scope="global")
338-
return XFORMERS_V1
339-
340-
from aphrodite.attention.selector import is_attn_backend_supported
341-
342-
# Default backends for V1 engine
343-
# Prefer FlashInfer for Blackwell GPUs if installed
344-
if cls.is_device_capability(100):
345-
if is_default_backend_supported := is_attn_backend_supported(FLASHINFER_V1, head_size, dtype):
346-
from aphrodite.v1.attention.backends.utils import set_kv_cache_layout
313+
if selected_backend == _Backend.FLASHINFER:
314+
logger.info_once("Using FlashInfer backend.")
315+
if cls.has_device_capability(100):
316+
from aphrodite.v1.attention.backends.utils import set_kv_cache_layout
347317

348-
logger.info_once(
349-
"Using FlashInfer backend with HND KV cache layout on "
350-
"V1 engine by default for Blackwell (SM 10.0) GPUs.",
351-
scope="global",
352-
)
353-
set_kv_cache_layout("HND")
318+
set_kv_cache_layout("HND")
319+
return FLASHINFER_V1
320+
elif selected_backend == _Backend.FLEX_ATTENTION:
321+
logger.info_once("Using FlexAttention backend.")
322+
return FLEX_ATTENTION_V1
323+
elif selected_backend == _Backend.TRITON_ATTN:
324+
logger.info_once("Using Triton backend.")
325+
return TRITON_ATTN
326+
elif selected_backend == _Backend.FLASH_ATTN:
327+
logger.info_once("Using Flash Attention backend.")
328+
return FLASH_ATTN_V1
329+
elif selected_backend == _Backend.TREE_ATTN:
330+
logger.info_once("Using Tree Attention backend.")
331+
return TREE_ATTN_V1
332+
elif selected_backend == _Backend.XFORMERS:
333+
logger.info_once("Using XFormers backend.")
334+
return XFORMERS_V1
335+
336+
from aphrodite.attention.selector import is_attn_backend_supported
337+
338+
# Default backends for V1 engine
339+
# Prefer FlashInfer for Blackwell GPUs if installed
340+
if cls.is_device_capability(100):
341+
if is_default_backend_supported := is_attn_backend_supported(FLASHINFER_V1, head_size, dtype):
342+
from aphrodite.v1.attention.backends.utils import set_kv_cache_layout
354343

355-
return FLASHINFER_V1
344+
logger.info_once(
345+
"Using FlashInfer backend with HND KV cache layout on "
346+
"V1 engine by default for Blackwell (SM 10.0) GPUs.",
347+
scope="global",
348+
)
349+
set_kv_cache_layout("HND")
356350

357-
if not is_default_backend_supported.can_import:
358-
logger.warning_once(
359-
"FlashInfer failed to import for V1 engine on "
360-
"Blackwell (SM 10.0) GPUs; it is recommended to "
361-
"install FlashInfer for better performance.",
362-
scope="global",
363-
)
351+
return FLASHINFER_V1
364352

365-
# FlashAttention is the default for SM 8.0+ GPUs
366-
if cls.has_device_capability(80):
367-
if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90):
368-
logger.info_once("Using Triton backend on V1 engine.", scope="global")
369-
return TRITON_ATTN
370-
elif is_default_backend_supported := is_attn_backend_supported(
371-
FLASH_ATTN_V1, head_size, dtype, allow_import_error=False
372-
):
373-
logger.info_once("Using Flash Attention backend on V1 engine.", scope="global")
374-
return FLASH_ATTN_V1
375-
376-
# FlexAttention is the default for older GPUs
377-
else:
378-
logger.info_once("Using FlexAttention backend on V1 engine.", scope="global")
379-
return FLEX_ATTENTION_V1
353+
if not is_default_backend_supported.can_import:
354+
logger.warning_once(
355+
"FlashInfer failed to import for V1 engine on "
356+
"Blackwell (SM 10.0) GPUs; it is recommended to "
357+
"install FlashInfer for better performance.",
358+
scope="global",
359+
)
380360

381-
assert not is_default_backend_supported
361+
# FlashAttention is the default for SM 8.0+ GPUs
362+
if cls.has_device_capability(80):
363+
if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90):
364+
logger.info_once("Using Triton backend.", scope="global")
365+
return TRITON_ATTN
366+
elif is_default_backend_supported := is_attn_backend_supported(
367+
FLASH_ATTN_V1, head_size, dtype, allow_import_error=False
368+
):
369+
logger.info_once("Using Flash Attention backend.", scope="global")
370+
return FLASH_ATTN_V1
382371

383-
use_flex_attention_reason = {}
384-
if not is_default_backend_supported.head_size:
385-
use_flex_attention_reason["head_size"] = head_size
386-
if not is_default_backend_supported.dtype:
387-
use_flex_attention_reason["dtype"] = dtype
372+
assert not is_default_backend_supported
388373

389-
logger.info_once(
390-
"Using FlexAttention backend for %s on V1 engine.",
391-
", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
392-
scope="global",
393-
)
394-
return FLEX_ATTENTION_V1
374+
use_flex_attention_reason = {}
375+
if not is_default_backend_supported.head_size:
376+
use_flex_attention_reason["head_size"] = head_size
377+
if not is_default_backend_supported.dtype:
378+
use_flex_attention_reason["dtype"] = dtype
395379

396-
raise RuntimeError(
397-
"V0 attention backends have been removed. Set APHRODITE_USE_V1=1 to select a supported backend."
380+
logger.info_once(
381+
"Using FlexAttention backend for %s.",
382+
", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
398383
)
384+
return FLEX_ATTENTION_V1
399385

400386
@classmethod
401387
def get_punica_wrapper(cls) -> str:

aphrodite/platforms/interface.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -453,11 +453,7 @@ def use_all_gather(cls) -> bool:
453453
"""
454454
Whether to use allgather in LogitsProcessor to gather the logits.
455455
"""
456-
import aphrodite.envs as envs
457-
from aphrodite.config import get_current_aphrodite_config
458-
459-
parallel_config = get_current_aphrodite_config().parallel_config
460-
return envs.APHRODITE_USE_V1 or parallel_config.distributed_executor_backend == "external_launcher"
456+
return True
461457

462458
@classmethod
463459
def use_custom_allreduce(cls) -> bool:

aphrodite/platforms/rocm.py

Lines changed: 26 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ def use_rocm_custom_paged_attention(
146146
# disabled due to observed numerical discrepancy.
147147
if ON_GFX9:
148148
return (
149-
(not envs.APHRODITE_USE_V1 or sliding_window == 0 or sliding_window == (-1, -1))
149+
(sliding_window == 0 or sliding_window == (-1, -1))
150150
and (qtype == torch.half or qtype == torch.bfloat16)
151151
and (head_size == 64 or head_size == 128)
152152
and (block_size == 16 or block_size == 32)
@@ -160,7 +160,7 @@ def use_rocm_custom_paged_attention(
160160
else:
161161
return (
162162
ON_GFX11_GFX12
163-
and (not envs.APHRODITE_USE_V1 or sliding_window == 0 or sliding_window == (-1, -1))
163+
and (sliding_window == 0 or sliding_window == (-1, -1))
164164
and (qtype == torch.half or qtype == torch.bfloat16)
165165
and head_size == 128
166166
and block_size == 16
@@ -229,11 +229,6 @@ def get_attn_backend_cls(
229229
if use_sparse:
230230
raise NotImplementedError("Sparse Attention is not supported on ROCm.")
231231
if use_mla:
232-
if not use_v1:
233-
raise RuntimeError(
234-
"MLA attention backends require the V1 engine. Set APHRODITE_USE_V1=1 to enable them."
235-
)
236-
237232
from aphrodite.v1.attention.backends.mla.rocm_aiter_mla import is_aiter_mla_enabled
238233

239234
if selected_backend is None:
@@ -243,14 +238,14 @@ def get_attn_backend_cls(
243238

244239
if selected_backend == _Backend.TRITON_MLA:
245240
if block_size != 1:
246-
logger.info_once("Using Triton MLA backend on V1 engine.")
241+
logger.info_once("Using Triton MLA backend.", scope="global")
247242
return "aphrodite.v1.attention.backends.mla.triton_mla.TritonMLABackend"
248243
raise ValueError(
249244
f" The selected backend, {selected_backend.name},does not support block size {block_size}."
250245
)
251246
if selected_backend == _Backend.ROCM_AITER_MLA:
252247
if block_size == 1:
253-
logger.info("Using AITER MLA backend on V1 engine.")
248+
logger.info_once("Using AITER MLA backend.", scope="global")
254249
return "aphrodite.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501
255250
raise ValueError(
256251
f" The selected backend, {selected_backend.name},"
@@ -261,31 +256,27 @@ def get_attn_backend_cls(
261256
f" The selected backend, {selected_backend.name},is not MLA type while requested for MLA backend."
262257
)
263258

264-
if envs.APHRODITE_USE_V1:
265-
if selected_backend == _Backend.FLEX_ATTENTION:
266-
logger.info("Using FlexAttention backend on V1 engine.")
267-
return "aphrodite.v1.attention.backends.flex_attention.FlexAttentionBackend"
268-
if (
269-
envs.APHRODITE_ROCM_USE_AITER and envs.APHRODITE_ROCM_USE_AITER_MHA and on_gfx9()
270-
) or selected_backend == _Backend.ROCM_AITER_FA:
271-
logger.info("Using Aiter Flash Attention backend on V1 engine.")
272-
return "aphrodite.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
273-
if (
274-
envs.APHRODITE_ROCM_USE_AITER and envs.APHRODITE_ROCM_USE_AITER_UNIFIED_ATTENTION
275-
) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
276-
logger.info("Using Aiter Unified Attention backend on V1 engine.")
277-
return "aphrodite.v1.attention.backends.rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend"
278-
if envs.APHRODITE_V1_USE_PREFILL_DECODE_ATTENTION or selected_backend == _Backend.ROCM_ATTN:
279-
# rocm specific backend, with aiter and/or
280-
# triton prefix-prefill
281-
logger.info("Using Rocm Attention backend on V1 engine.")
282-
return "aphrodite.v1.attention.backends.rocm_attn.RocmAttentionBackend"
283-
# default case, using triton unified attention
284-
logger.info("Using Triton Attention backend on V1 engine.")
285-
return "aphrodite.v1.attention.backends.triton_attn.TritonAttentionBackend"
286-
raise RuntimeError(
287-
"V0 attention backends have been removed. Set APHRODITE_USE_V1=1 to select a supported backend."
288-
)
259+
if selected_backend == _Backend.FLEX_ATTENTION:
260+
logger.info("Using FlexAttention backend.")
261+
return "aphrodite.v1.attention.backends.flex_attention.FlexAttentionBackend"
262+
if (
263+
envs.APHRODITE_ROCM_USE_AITER and envs.APHRODITE_ROCM_USE_AITER_MHA and on_gfx9()
264+
) or selected_backend == _Backend.ROCM_AITER_FA:
265+
logger.info("Using Aiter Flash Attention backend.")
266+
return "aphrodite.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
267+
if (
268+
envs.APHRODITE_ROCM_USE_AITER and envs.APHRODITE_ROCM_USE_AITER_UNIFIED_ATTENTION
269+
) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
270+
logger.info("Using Aiter Unified Attention backend.")
271+
return "aphrodite.v1.attention.backends.rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend"
272+
if envs.APHRODITE_V1_USE_PREFILL_DECODE_ATTENTION or selected_backend == _Backend.ROCM_ATTN:
273+
# rocm specific backend, with aiter and/or
274+
# triton prefix-prefill
275+
logger.info("Using Rocm Attention backend.")
276+
return "aphrodite.v1.attention.backends.rocm_attn.RocmAttentionBackend"
277+
# default case, using triton unified attention
278+
logger.info("Using Triton Attention backend.")
279+
return "aphrodite.v1.attention.backends.triton_attn.TritonAttentionBackend"
289280

290281
@classmethod
291282
def set_device(cls, device: torch.device) -> None:
@@ -346,7 +337,6 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
346337
parallel_config = aphrodite_config.parallel_config
347338
is_eager_execution = compilation_config == CUDAGraphMode.NONE
348339

349-
use_v1 = envs.APHRODITE_USE_V1
350340
use_aiter_rms_norm = envs.APHRODITE_ROCM_USE_AITER and envs.APHRODITE_ROCM_USE_AITER_RMSNORM
351341

352342
if cache_config and cache_config.block_size is None:
@@ -355,12 +345,7 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
355345
if parallel_config.worker_cls == "auto":
356346
parallel_config.worker_cls = "aphrodite.v1.worker.gpu_worker.Worker"
357347
# Aiter rms norm perform best when CUDA Graph capture is enabled.
358-
if (
359-
use_v1
360-
and use_aiter_rms_norm
361-
and not is_eager_execution
362-
and "-rms_norm" not in compilation_config.custom_ops
363-
):
348+
if use_aiter_rms_norm and not is_eager_execution and "-rms_norm" not in compilation_config.custom_ops:
364349
compilation_config.custom_ops.append("+rms_norm")
365350

366351
@classmethod

aphrodite/platforms/tpu.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -189,10 +189,6 @@ def is_pin_memory_available(cls):
189189
def get_device_communicator_cls(cls) -> str:
190190
return "aphrodite.distributed.device_communicators.tpu_communicator.TpuCommunicator" # noqa
191191

192-
@classmethod
193-
def use_all_gather(cls) -> bool:
194-
return True
195-
196192
@classmethod
197193
def validate_request(
198194
cls,

0 commit comments

Comments
 (0)