2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -110,7 +110,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
+option(LLAMA_TESTS_INSTALL "llama: install tests" OFF)
 
 # 3rd party libs
 option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
22 changes: 12 additions & 10 deletions ggml/src/ggml-cuda/fattn-vec.cuh
@@ -637,6 +637,17 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
 
+// TurboQuant2 -- turbo2 K + turbo2 V
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+
+// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+
+#ifndef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
 // Mixed turbo3/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
@@ -651,11 +662,6 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
 
-// TurboQuant2 -- turbo2 K + turbo2 V
-extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-
 // Mixed turbo2/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
@@ -679,11 +685,6 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
 
-// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
-extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-
 // Mixed turbo4/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
@@ -715,3 +716,4 @@ extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+#endif // GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
35 changes: 21 additions & 14 deletions ggml/src/ggml-cuda/fattn.cu
@@ -306,16 +306,20 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     // TurboQuant3 KV cache types (always enabled)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0)
 
+    // TurboQuant2 KV cache types (always enabled)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0)
+
+    // TurboQuant4 KV cache types (always enabled)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0)
+
+#ifndef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
     // Mixed turbo3/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0)
 
     // Mixed f16/turbo3 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_TURBO3_0)
 
-    // TurboQuant2 KV cache types (always enabled)
-    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0)
-
     // Mixed turbo2/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0)
@@ -327,9 +331,6 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0)
 
-    // TurboQuant4 KV cache types (always enabled)
-    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0)
-
     // Mixed turbo4/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0)
@@ -344,6 +345,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     // Mixed turbo4/turbo2 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0)
+#endif // GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
 
     GGML_ABORT("fatal error");
 }
@@ -482,16 +484,21 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     }
 
     // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
 
 #ifdef GGML_USE_HIP
-    // HIP/ROCm: the TILE/MMA/WMMA FA paths allocate unbounded f16 temp buffers
-    // for quantized KV types (K_f16, V_f16 in launch_fattn). The pool retains
-    // peak allocation size, so the temp buffer VRAM exceeds KV compression savings.
-    // This causes quantized KV to OOM before f16 on the same context length.
-    // Force VEC path which does inline dequant with zero temp buffer overhead.
-    // Trade-off: prefill is slower (sequential query processing).
-    // Limitation: head_dim > 256 cannot use VEC (falls through to TILE).
+    // HIP/ROCm can optionally disable mixed TurboQuant vec FA. This is useful
+    // on gfx900/gfx906 where the mixed turbo vec path is not reliable, while
+    // preserving same-type turbo vec FA and the full mixed-turbo path on newer
+    // HIP targets.
+    const bool k_is_turbo = K->type == GGML_TYPE_TURBO2_0 || K->type == GGML_TYPE_TURBO3_0 || K->type == GGML_TYPE_TURBO4_0;
+    const bool v_is_turbo = V->type == GGML_TYPE_TURBO2_0 || V->type == GGML_TYPE_TURBO3_0 || V->type == GGML_TYPE_TURBO4_0;
+    const bool turbo_mixed_kv = (k_is_turbo || v_is_turbo) && K->type != V->type;
+#ifdef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
+    if (turbo_mixed_kv) {
+        can_use_vector_kernel = false;
+    }
+#endif
     if ((ggml_is_quantized(K->type) || ggml_is_quantized(V->type)) && can_use_vector_kernel) {
         return BEST_FATTN_KERNEL_VEC;
     }
43 changes: 30 additions & 13 deletions ggml/src/ggml-hip/CMakeLists.txt
@@ -72,6 +72,12 @@ list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB SRCS "../ggml-cuda/template-instances/mmf*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 
+set(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT OFF)
+if (CMAKE_HIP_ARCHITECTURES MATCHES "(^|;)(gfx900|gfx906)($|;)")
+    set(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT ON)
+endif()
+option(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA "ggml: disable mixed TurboQuant vec flash-attention on HIP (recommended for gfx900/gfx906)" ${GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT})
+
 if (GGML_CUDA_FA_ALL_QUANTS)
     file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
@@ -83,20 +89,27 @@ else()
         ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
+
+    if (NOT GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+        list(APPEND GGML_SOURCES_ROCM
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
+    endif()
 endif()
 
 ggml_add_backend_library(ggml-hip
@@ -127,6 +140,10 @@ if (GGML_HIP_GRAPHS)
     add_compile_definitions(GGML_HIP_GRAPHS)
 endif()
 
+if (GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+    add_compile_definitions(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+endif()
+
 if (GGML_HIP_NO_VMM)
     add_compile_definitions(GGML_HIP_NO_VMM)
 endif()
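Note on exercising the new toggle: GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA is an ordinary CMake cache option, so the gfx900/gfx906-based default can be overridden either way at configure time, e.g. by passing -DGGML_HIP_DISABLE_MIXED_TURBO_VEC_FA=OFF. Below is a minimal sketch of a consumer project forcing it on before ggml is configured; the project name and vendored ggml path are illustrative assumptions, not part of this PR.

# Hypothetical consumer CMakeLists.txt (project name and ggml path are assumptions).
cmake_minimum_required(VERSION 3.21)
project(my_hip_app LANGUAGES CXX)

# Seed the cache before add_subdirectory(): option() does not overwrite an
# existing cache entry, so this takes precedence over the arch-based default
# computed in ggml/src/ggml-hip/CMakeLists.txt.
set(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA ON CACHE BOOL "" FORCE)

# Assumes the ggml tree is vendored at this relative path.
add_subdirectory(ggml)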