cuda: add f16 fallbacks to half FA quant builds

Anbeeld · Anbeeld · commit e623d3984702 · 2026-06-09T11:40:25.000+02:00
HALF now compiles the K>=V half-matrix plus all K/f16 fallback pairs
needed by TurboQuant/TCQ dequant paths: 103 pairs instead of 91.

The previous literal K>=V rule missed reversed K/f16 pairs like
q8_0/f16 that arise when Turbo/TCQ decode-dequant dispatches an
internal f16 transient for the V side. ALL works because it includes
every pair; HALF now covers those too.

Changes:
- CMake (CUDA/HIP/MUSA): full nested loop with f16 predicate and
  count assertion (must equal 103)
- fattn.cu: runtime ggml_cuda_fattn_pair_compiled() HALF branch
  adds type_K/type_V == F16 fallback check
- fattn.cu: regenerated HALF dispatch block with 12 new K/f16 pairs
- gen-fattn-vec-dispatch.py: f16 fallback predicate, assertions for
  ALL=169, HALF=103, DEFAULT=27
- Docs/CI: make HALF the recommended build flag, ALL the escape hatch
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
@@ -34,7 +34,7 @@ RUN --mount=type=cache,target=/root/.ccache \
       -DGGML_NATIVE=OFF \
       -DGGML_CUDA=ON \
       -DGGML_CUDA_FA=ON \
-      -DGGML_CUDA_FA_ALL_QUANTS=ON \
+      -DGGML_CUDA_FA_HALF_QUANTS=ON \
       -DGGML_BACKEND_DL=ON \
       -DGGML_CPU_ALL_VARIANTS=ON \
       -DLLAMA_BUILD_TESTS=OFF \
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -496,7 +496,7 @@ jobs:
             -DGPU_TARGETS="${GPU_TARGETS}" \
             -DGGML_HIP=ON \
             -DGGML_CUDA_FA=ON \
-            -DGGML_CUDA_FA_ALL_QUANTS=ON \
+            -DGGML_CUDA_FA_HALF_QUANTS=ON \
             -DHIP_PLATFORM=amd \
             -DGGML_HIP_ROCWMMA_FATTN=ON \
             -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -700,7 +700,7 @@ jobs:
             -DGGML_NATIVE=OFF \
             -DGGML_CUDA=ON \
             -DGGML_CUDA_FA=ON \
-            -DGGML_CUDA_FA_ALL_QUANTS=ON \
+            -DGGML_CUDA_FA_HALF_QUANTS=ON \
             -DGGML_BACKEND_DL=ON \
             -DGGML_CPU_ALL_VARIANTS=ON \
             -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -1101,7 +1101,7 @@ jobs:
             -DGGML_CPU=OFF ^
             -DGGML_CUDA=ON ^
             -DGGML_CUDA_FA=ON ^
-            -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+            -DGGML_CUDA_FA_HALF_QUANTS=ON ^
             -DLLAMA_BUILD_BORINGSSL=ON ^
             -DCMAKE_C_COMPILER_LAUNCHER=ccache ^
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ^
@@ -1226,7 +1226,7 @@ jobs:
             -DGGML_HIP_ROCWMMA_FATTN=ON `
             -DGGML_HIP=ON `
             -DGGML_CUDA_FA=ON `
-            -DGGML_CUDA_FA_ALL_QUANTS=ON `
+            -DGGML_CUDA_FA_HALF_QUANTS=ON `
             -DCMAKE_C_COMPILER_LAUNCHER=ccache `
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
             -DCMAKE_HIP_COMPILER_LAUNCHER=ccache `
diff --git a/AGENTS.md b/AGENTS.md
@@ -22,13 +22,13 @@ Prebuilt Windows binaries (CUDA 12.4/13.1) are on the releases page. Otherwise b
 ```bash
 # Linux (GCC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 
 # Windows (MSVC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build --config Release --parallel
 
@@ -37,7 +37,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 ```
 
-`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
+`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for this fork. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. `GGML_CUDA_FA_ALL_QUANTS=ON` remains available for the full K/V matrix and arbitrary asymmetric cache-type combinations. These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
 
 Key binaries: `build/bin/llama-server`, `build/bin/llama-cli`, `build/bin/llama-bench`, `build/bin/llama-perplexity`.
 
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -22,13 +22,13 @@ Prebuilt Windows binaries (CUDA 12.4/13.1) are on the releases page. Otherwise b
 ```bash
 # Linux (GCC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 
 # Windows (MSVC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build --config Release --parallel
 
@@ -37,7 +37,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 ```
 
-`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
+`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for this fork. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. `GGML_CUDA_FA_ALL_QUANTS=ON` remains available for the full K/V matrix and arbitrary asymmetric cache-type combinations. These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
 
 Key binaries: `build/bin/llama-server`, `build/bin/llama-cli`, `build/bin/llama-bench`, `build/bin/llama-perplexity`.
 
diff --git a/README.md b/README.md
@@ -343,13 +343,13 @@ Building from source with `-DGGML_NATIVE=ON` *may* result in a *tiny* bit better
 ```bash
 # Linux (GCC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 
 # Windows (MSVC + CUDA)
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build --config Release --parallel
 
@@ -358,7 +358,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 ```
 
-`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
+`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only when you need the full K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
 
 ### Other Backends
 
diff --git a/docs/build.md b/docs/build.md
@@ -297,8 +297,8 @@ The following compilation options are also available to tweak performance:
 | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for V100, CDNA and RDNA4 which use FP32 compute type by default) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000).   |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                                                                                                                  |
-| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile CUDA FlashAttention vec kernels for the full supported K/V cache type matrix (169 pairs for the 13-type universe). Mutually exclusive with GGML_CUDA_FA_HALF_QUANTS.                                                                                                                                                                                                     |
-| GGML_CUDA_FA_HALF_QUANTS      | Boolean                | false   | Compile only the useful K>=V half of the K/V cache type matrix for FlashAttention vec kernels (91 pairs), where types are ranked from higher precision to lower: f16 > bf16 > q8_0 > q6_0 > q5_1 > q5_0 > turbo4 > q4_1 > q4_0 > turbo3_tcq > turbo3 > turbo2_tcq > turbo2. This avoids wasteful reversed asymmetric pairs. Mutually exclusive with GGML_CUDA_FA_ALL_QUANTS.          |
+| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile CUDA FlashAttention vec kernels for the full supported K/V cache type matrix: 169 pairs for the 13-type universe. This is useful for arbitrary asymmetric cache-type combinations. Mutually exclusive with GGML_CUDA_FA_HALF_QUANTS.                                                                                                         |
+| GGML_CUDA_FA_HALF_QUANTS      | Boolean                | false   | Compile the recommended CUDA FlashAttention vec K/V cache pair set for this fork: the useful K>=V half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. This compiles 103 pairs instead of 169 in GGML_CUDA_FA_ALL_QUANTS. Types are ranked from higher precision to lower: f16 > bf16 > q8_0 > q6_0 > q5_1 > q5_0 > turbo4 > q4_1 > q4_0 > turbo3_tcq > turbo3 > turbo2_tcq > turbo2. Mutually exclusive with GGML_CUDA_FA_ALL_QUANTS. |
 
 ## MUSA
 
diff --git a/docs/quickstart-gemma-4-31b-dflash.md b/docs/quickstart-gemma-4-31b-dflash.md
@@ -57,7 +57,7 @@ Building from source with `-DGGML_NATIVE=ON` may result in a tiny bit better per
 
 ```powershell
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build --config Release --parallel
 ```
@@ -66,12 +66,12 @@ cmake --build build --config Release --parallel
 
 ```bash
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 ```
 
-`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
+`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for these DFlash/TurboQuant quickstarts. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only if you need the full 169-pair K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
 
 **macOS (Metal).**
 
diff --git a/docs/quickstart-qwen36-dflash.md b/docs/quickstart-qwen36-dflash.md
@@ -57,7 +57,7 @@ Building from source with `-DGGML_NATIVE=ON` *may* result in a *tiny* bit better
 
 ```powershell
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build --config Release --parallel
 ```
@@ -66,12 +66,12 @@ cmake --build build --config Release --parallel
 
 ```bash
 cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
-  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
   -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j
 ```
 
-`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
+`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for these DFlash/TurboQuant quickstarts. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only if you need the full 169-pair K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
 
 **macOS (Metal).**
 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -205,8 +205,8 @@ set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
 option(GGML_CUDA_FA                         "ggml: compile ggml FlashAttention CUDA kernels"  ON)
-option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
-option(GGML_CUDA_FA_HALF_QUANTS             "ggml: compile only K>=V half of KV cache quant pairs for FlashAttention" OFF)
+option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile full K/V cache quant pair matrix for FlashAttention"     OFF)
+option(GGML_CUDA_FA_HALF_QUANTS             "ggml: compile K>=V half of KV cache quant pairs plus f16 fallbacks for FlashAttention" OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 option(GGML_CUDA_NCCL                       "ggml: use NVIDIA Collective Comm. Library"       ON)
 set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -152,15 +152,25 @@ if (CUDAToolkit_FOUND)
         list(LENGTH GGML_CUDA_FA_KV_TYPES_ORDERED GGML_CUDA_FA_KV_TYPES_LEN)
         math(EXPR GGML_CUDA_FA_KV_TYPES_LAST "${GGML_CUDA_FA_KV_TYPES_LEN} - 1")
 
+    set(GGML_CUDA_FA_HALF_PAIR_COUNT 0)
+
         foreach(KI RANGE 0 ${GGML_CUDA_FA_KV_TYPES_LAST})
             list(GET GGML_CUDA_FA_KV_TYPES_ORDERED ${KI} K_TYPE)
 
-            foreach(VI RANGE ${KI} ${GGML_CUDA_FA_KV_TYPES_LAST})
+            foreach(VI RANGE 0 ${GGML_CUDA_FA_KV_TYPES_LAST})
                 list(GET GGML_CUDA_FA_KV_TYPES_ORDERED ${VI} V_TYPE)
-                ggml_add_fattn_vec_pair(GGML_SOURCES_CUDA "${FA_VEC_PREFIX}" "${K_TYPE}" "${V_TYPE}")
+
+                if (KI LESS_EQUAL VI OR K_TYPE STREQUAL "f16" OR V_TYPE STREQUAL "f16")
+                    ggml_add_fattn_vec_pair(GGML_SOURCES_CUDA "${FA_VEC_PREFIX}" "${K_TYPE}" "${V_TYPE}")
+                    math(EXPR GGML_CUDA_FA_HALF_PAIR_COUNT "${GGML_CUDA_FA_HALF_PAIR_COUNT} + 1")
+                endif()
             endforeach()
         endforeach()
 
+        if (NOT GGML_CUDA_FA_HALF_PAIR_COUNT EQUAL 103)
+            message(FATAL_ERROR "GGML_CUDA_FA_HALF_QUANTS expected 103 FA vec pairs, got ${GGML_CUDA_FA_HALF_PAIR_COUNT}")
+        endif()
+
         add_compile_definitions(GGML_CUDA_FA_HALF_QUANTS)
     else()
         set(GGML_CUDA_FA_DEFAULT_KV_PAIRS
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
diff --git a/scripts/gen-fattn-vec-dispatch.py b/scripts/gen-fattn-vec-dispatch.py