diff --git a/CMakeLists.txt b/CMakeLists.txt
index caea48c5060..372f6c95356 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,7 +110,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
+option(LLAMA_TESTS_INSTALL "llama: install tests" OFF)

 # 3rd party libs
 option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
index c28a476a843..255e3cfe0c2 100644
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -637,6 +637,17 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);

+// TurboQuant2 -- turbo2 K + turbo2 V
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+
+// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+
+#ifndef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
 // Mixed turbo3/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
@@ -651,11 +662,6 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);

-// TurboQuant2 -- turbo2 K + turbo2 V
-extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
-
 // Mixed turbo2/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
@@ -679,11 +685,6 @@ extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);

-// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
-extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
-
 // Mixed turbo4/q8_0 KV cache types
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
@@ -715,3 +716,4 @@ extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
 extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
 extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
 extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+#endif // GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 38d222b7016..863e9af13b7 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -306,6 +306,13 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     // TurboQuant3 KV cache types (always enabled)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0)

+    // TurboQuant2 KV cache types (always enabled)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0)
+
+    // TurboQuant4 KV cache types (always enabled)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0)
+
+#ifndef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
     // Mixed turbo3/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0)
@@ -313,9 +320,6 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     // Mixed f16/turbo3 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_TURBO3_0)

-    // TurboQuant2 KV cache types (always enabled)
-    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0)
-
     // Mixed turbo2/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0)
@@ -327,9 +331,6 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0)

-    // TurboQuant4 KV cache types (always enabled)
-    FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0)
-
     // Mixed turbo4/q8_0 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0)
@@ -344,6 +345,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
     // Mixed turbo4/turbo2 KV cache types
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0)
+#endif // GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA

     GGML_ABORT("fatal error");
 }
@@ -482,16 +484,21 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     }

     // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
 #ifdef GGML_USE_HIP
-    // HIP/ROCm: the TILE/MMA/WMMA FA paths allocate unbounded f16 temp buffers
-    // for quantized KV types (K_f16, V_f16 in launch_fattn). The pool retains
-    // peak allocation size, so the temp buffer VRAM exceeds KV compression savings.
-    // This causes quantized KV to OOM before f16 on the same context length.
-    // Force VEC path which does inline dequant with zero temp buffer overhead.
-    // Trade-off: prefill is slower (sequential query processing).
-    // Limitation: head_dim > 256 cannot use VEC (falls through to TILE).
+    // HIP/ROCm can optionally disable mixed TurboQuant vec FA. This is useful
+    // on gfx900/gfx906 where the mixed turbo vec path is not reliable, while
+    // preserving same-type turbo vec FA and the full mixed-turbo path on newer
+    // HIP targets.
+    const bool k_is_turbo = K->type == GGML_TYPE_TURBO2_0 || K->type == GGML_TYPE_TURBO3_0 || K->type == GGML_TYPE_TURBO4_0;
+    const bool v_is_turbo = V->type == GGML_TYPE_TURBO2_0 || V->type == GGML_TYPE_TURBO3_0 || V->type == GGML_TYPE_TURBO4_0;
+    const bool turbo_mixed_kv = (k_is_turbo || v_is_turbo) && K->type != V->type;
+#ifdef GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA
+    if (turbo_mixed_kv) {
+        can_use_vector_kernel = false;
+    }
+#endif

     if ((ggml_is_quantized(K->type) || ggml_is_quantized(V->type)) && can_use_vector_kernel) {
         return BEST_FATTN_KERNEL_VEC;
     }
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index f40b1d754f5..7e3117af1e7 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -72,6 +72,12 @@ list(APPEND GGML_SOURCES_ROCM ${SRCS})

 file(GLOB SRCS "../ggml-cuda/template-instances/mmf*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})

+set(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT OFF)
+if (CMAKE_HIP_ARCHITECTURES MATCHES "(^|;)(gfx900|gfx906)($|;)")
+    set(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT ON)
+endif()
+option(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA "ggml: disable mixed TurboQuant vec flash-attention on HIP (recommended for gfx900/gfx906)" ${GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA_DEFAULT})
+
 if (GGML_CUDA_FA_ALL_QUANTS)
     file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
@@ -83,20 +89,27 @@ else()
         ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
         ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
+
+    if (NOT GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+        list(APPEND GGML_SOURCES_ROCM
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
+            ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
+    endif()
 endif()

 ggml_add_backend_library(ggml-hip
@@ -127,6 +140,10 @@ if (GGML_HIP_GRAPHS)
     add_compile_definitions(GGML_HIP_GRAPHS)
 endif()

+if (GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+    add_compile_definitions(GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA)
+endif()
+
 if (GGML_HIP_NO_VMM)
     add_compile_definitions(GGML_HIP_NO_VMM)
 endif()
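
Configure-time usage note (trailing note, not part of the patch): the new GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA option defaults to ON when CMAKE_HIP_ARCHITECTURES matches gfx900/gfx906 and OFF otherwise, and can be overridden explicitly. A minimal sketch, assuming the HIP build of this tree is configured with -DGGML_HIP=ON; only the GGML_HIP_DISABLE_MIXED_TURBO_VEC_FA flag is introduced by this patch, the other flags are the stock CMake/ggml HIP knobs:

    # gfx906: the option defaults to ON via the architecture match above
    cmake -S . -B build -DGGML_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx906

    # force the mixed TurboQuant vec FA instances back on, e.g. for testing
    cmake -S . -B build -DGGML_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx906 \
        -DGGML_HIP_DISABLE_MIXED_TURBO_VEC_FA=OFF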