Skip to content

Commit e623d39

Browse files
committed
cuda: add f16 fallbacks to half FA quant builds
HALF now compiles the K>=V half-matrix plus all K/f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 pairs instead of 91. The previous literal K>=V rule missed reversed K/f16 pairs like q8_0/f16 that arise when Turbo/TCQ decode-dequant dispatches an internal f16 transient for the V side. ALL works because it includes every pair; HALF now covers those too.

Changes:
- CMake (CUDA/HIP/MUSA): full nested loop with f16 predicate and count assertion (must equal 103)
- fattn.cu: runtime ggml_cuda_fattn_pair_compiled() HALF branch adds type_K/type_V == F16 fallback check
- fattn.cu: regenerated HALF dispatch block with 12 new K/f16 pairs
- gen-fattn-vec-dispatch.py: f16 fallback predicate, assertions for ALL=169, HALF=103, DEFAULT=27
- Docs/CI: make HALF the recommended build flag, ALL the escape hatch
1 parent a596a5c commit e623d39

14 files changed

Lines changed: 88 additions & 35 deletions

File tree

.devops/cuda.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ RUN --mount=type=cache,target=/root/.ccache \
3434
-DGGML_NATIVE=OFF \
3535
-DGGML_CUDA=ON \
3636
-DGGML_CUDA_FA=ON \
37-
-DGGML_CUDA_FA_ALL_QUANTS=ON \
37+
-DGGML_CUDA_FA_HALF_QUANTS=ON \
3838
-DGGML_BACKEND_DL=ON \
3939
-DGGML_CPU_ALL_VARIANTS=ON \
4040
-DLLAMA_BUILD_TESTS=OFF \

.github/workflows/release.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ jobs:
496496
-DGPU_TARGETS="${GPU_TARGETS}" \
497497
-DGGML_HIP=ON \
498498
-DGGML_CUDA_FA=ON \
499-
-DGGML_CUDA_FA_ALL_QUANTS=ON \
499+
-DGGML_CUDA_FA_HALF_QUANTS=ON \
500500
-DHIP_PLATFORM=amd \
501501
-DGGML_HIP_ROCWMMA_FATTN=ON \
502502
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -700,7 +700,7 @@ jobs:
700700
-DGGML_NATIVE=OFF \
701701
-DGGML_CUDA=ON \
702702
-DGGML_CUDA_FA=ON \
703-
-DGGML_CUDA_FA_ALL_QUANTS=ON \
703+
-DGGML_CUDA_FA_HALF_QUANTS=ON \
704704
-DGGML_BACKEND_DL=ON \
705705
-DGGML_CPU_ALL_VARIANTS=ON \
706706
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -1101,7 +1101,7 @@ jobs:
11011101
-DGGML_CPU=OFF ^
11021102
-DGGML_CUDA=ON ^
11031103
-DGGML_CUDA_FA=ON ^
1104-
-DGGML_CUDA_FA_ALL_QUANTS=ON ^
1104+
-DGGML_CUDA_FA_HALF_QUANTS=ON ^
11051105
-DLLAMA_BUILD_BORINGSSL=ON ^
11061106
-DCMAKE_C_COMPILER_LAUNCHER=ccache ^
11071107
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache ^
@@ -1226,7 +1226,7 @@ jobs:
12261226
-DGGML_HIP_ROCWMMA_FATTN=ON `
12271227
-DGGML_HIP=ON `
12281228
-DGGML_CUDA_FA=ON `
1229-
-DGGML_CUDA_FA_ALL_QUANTS=ON `
1229+
-DGGML_CUDA_FA_HALF_QUANTS=ON `
12301230
-DCMAKE_C_COMPILER_LAUNCHER=ccache `
12311231
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
12321232
-DCMAKE_HIP_COMPILER_LAUNCHER=ccache `

AGENTS.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ Prebuilt Windows binaries (CUDA 12.4/13.1) are on the releases page. Otherwise b
2222
```bash
2323
# Linux (GCC + CUDA)
2424
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
25-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
25+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
2626
-DCMAKE_BUILD_TYPE=Release
2727
cmake --build build -j
2828

2929
# Windows (MSVC + CUDA)
3030
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
31-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
31+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
3232
-DCMAKE_BUILD_TYPE=Release
3333
cmake --build build --config Release --parallel
3434

@@ -37,7 +37,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
3737
cmake --build build -j
3838
```
3939

40-
`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
40+
`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for this fork. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. `GGML_CUDA_FA_ALL_QUANTS=ON` remains available for the full K/V matrix and arbitrary asymmetric cache-type combinations. These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
4141

4242
Key binaries: `build/bin/llama-server`, `build/bin/llama-cli`, `build/bin/llama-bench`, `build/bin/llama-perplexity`.
4343

CLAUDE.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ Prebuilt Windows binaries (CUDA 12.4/13.1) are on the releases page. Otherwise b
2222
```bash
2323
# Linux (GCC + CUDA)
2424
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
25-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
25+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
2626
-DCMAKE_BUILD_TYPE=Release
2727
cmake --build build -j
2828

2929
# Windows (MSVC + CUDA)
3030
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
31-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
31+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
3232
-DCMAKE_BUILD_TYPE=Release
3333
cmake --build build --config Release --parallel
3434

@@ -37,7 +37,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
3737
cmake --build build -j
3838
```
3939

40-
`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
40+
`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for this fork. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. `GGML_CUDA_FA_ALL_QUANTS=ON` remains available for the full K/V matrix and arbitrary asymmetric cache-type combinations. These two flags are mutually exclusive. Add `-DCMAKE_CUDA_ARCHITECTURES=86` for RTX 3090, or `-DCMAKE_CUDA_ARCHITECTURES=89` for RTX 4090, if cross-compiling or building in CI without a GPU.
4141

4242
Key binaries: `build/bin/llama-server`, `build/bin/llama-cli`, `build/bin/llama-bench`, `build/bin/llama-perplexity`.
4343

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -343,13 +343,13 @@ Building from source with `-DGGML_NATIVE=ON` *may* result in a *tiny* bit better
343343
```bash
344344
# Linux (GCC + CUDA)
345345
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
346-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
346+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
347347
-DCMAKE_BUILD_TYPE=Release
348348
cmake --build build -j
349349

350350
# Windows (MSVC + CUDA)
351351
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
352-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
352+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
353353
-DCMAKE_BUILD_TYPE=Release
354354
cmake --build build --config Release --parallel
355355

@@ -358,7 +358,7 @@ cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
358358
cmake --build build -j
359359
```
360360

361-
`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
361+
`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths: 103 FA vec K/V pairs instead of 169 in ALL_QUANTS. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only when you need the full K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
362362

363363
### Other Backends
364364

docs/build.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,8 @@ The following compilation options are also available to tweak performance:
297297
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
298298
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for V100, CDNA and RDNA4 which use FP32 compute type by default) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
299299
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
300-
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile CUDA FlashAttention vec kernels for the full supported K/V cache type matrix (169 pairs for the 13-type universe). Mutually exclusive with GGML_CUDA_FA_HALF_QUANTS. |
301-
| GGML_CUDA_FA_HALF_QUANTS | Boolean | false | Compile only the useful K>=V half of the K/V cache type matrix for FlashAttention vec kernels (91 pairs), where types are ranked from higher precision to lower: f16 > bf16 > q8_0 > q6_0 > q5_1 > q5_0 > turbo4 > q4_1 > q4_0 > turbo3_tcq > turbo3 > turbo2_tcq > turbo2. This avoids wasteful reversed asymmetric pairs. Mutually exclusive with GGML_CUDA_FA_ALL_QUANTS. |
300+
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile CUDA FlashAttention vec kernels for the full supported K/V cache type matrix: 169 pairs for the 13-type universe. This is useful for arbitrary asymmetric cache-type combinations. Mutually exclusive with GGML_CUDA_FA_HALF_QUANTS. |
301+
| GGML_CUDA_FA_HALF_QUANTS | Boolean | false | Compile the recommended CUDA FlashAttention vec K/V cache pair set for this fork: the useful K>=V half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. This compiles 103 pairs instead of 169 in GGML_CUDA_FA_ALL_QUANTS. Types are ranked from higher precision to lower: f16 > bf16 > q8_0 > q6_0 > q5_1 > q5_0 > turbo4 > q4_1 > q4_0 > turbo3_tcq > turbo3 > turbo2_tcq > turbo2. Mutually exclusive with GGML_CUDA_FA_ALL_QUANTS. |
302302

303303
## MUSA
304304

docs/quickstart-gemma-4-31b-dflash.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ Building from source with `-DGGML_NATIVE=ON` may result in a tiny bit better per
5757

5858
```powershell
5959
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
60-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
60+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
6161
-DCMAKE_BUILD_TYPE=Release
6262
cmake --build build --config Release --parallel
6363
```
@@ -66,12 +66,12 @@ cmake --build build --config Release --parallel
6666

6767
```bash
6868
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
69-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
69+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
7070
-DCMAKE_BUILD_TYPE=Release
7171
cmake --build build -j
7272
```
7373

74-
`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
74+
`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for these DFlash/TurboQuant quickstarts. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only if you need the full 169-pair K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
7575

7676
**macOS (Metal).**
7777

docs/quickstart-qwen36-dflash.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ Building from source with `-DGGML_NATIVE=ON` *may* result in a *tiny* bit better
5757

5858
```powershell
5959
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON ^
60-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON ^
60+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON ^
6161
-DCMAKE_BUILD_TYPE=Release
6262
cmake --build build --config Release --parallel
6363
```
@@ -66,12 +66,12 @@ cmake --build build --config Release --parallel
6666

6767
```bash
6868
cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON \
69-
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_ALL_QUANTS=ON \
69+
-DGGML_CUDA_FA=ON -DGGML_CUDA_FA_HALF_QUANTS=ON \
7070
-DCMAKE_BUILD_TYPE=Release
7171
cmake --build build -j
7272
```
7373

74-
`GGML_CUDA_FA_ALL_QUANTS=ON` is required for TurboQuant and TCQ cache types. `GGML_CUDA_FA_HALF_QUANTS=ON` is an alternative that compiles only the useful K>=V half of the K/V pair matrix (compiling 91 FA vec K/V pairs instead of 169, reducing FA vec pair instances by ~46% vs ALL_QUANTS). These two flags are mutually exclusive.
74+
`GGML_CUDA_FA_HALF_QUANTS=ON` is the recommended CUDA FlashAttention build mode for these DFlash/TurboQuant quickstarts. It compiles the useful asymmetric K/V cache type half-matrix plus f16 fallback pairs needed by TurboQuant/TCQ dequant paths. Use `GGML_CUDA_FA_ALL_QUANTS=ON` only if you need the full 169-pair K/V matrix or arbitrary asymmetric cache-type combinations. These flags are mutually exclusive.
7575

7676
**macOS (Metal).**
7777

ggml/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,8 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
205205
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
206206
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
207207
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
208-
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
209-
option(GGML_CUDA_FA_HALF_QUANTS "ggml: compile only K>=V half of KV cache quant pairs for FlashAttention" OFF)
208+
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile full K/V cache quant pair matrix for FlashAttention" OFF)
209+
option(GGML_CUDA_FA_HALF_QUANTS "ggml: compile K>=V half of KV cache quant pairs plus f16 fallbacks for FlashAttention" OFF)
210210
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
211211
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
212212
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,25 @@ if (CUDAToolkit_FOUND)
152152
list(LENGTH GGML_CUDA_FA_KV_TYPES_ORDERED GGML_CUDA_FA_KV_TYPES_LEN)
153153
math(EXPR GGML_CUDA_FA_KV_TYPES_LAST "${GGML_CUDA_FA_KV_TYPES_LEN} - 1")
154154

155+
set(GGML_CUDA_FA_HALF_PAIR_COUNT 0)
156+
155157
foreach(KI RANGE 0 ${GGML_CUDA_FA_KV_TYPES_LAST})
156158
list(GET GGML_CUDA_FA_KV_TYPES_ORDERED ${KI} K_TYPE)
157159

158-
foreach(VI RANGE ${KI} ${GGML_CUDA_FA_KV_TYPES_LAST})
160+
foreach(VI RANGE 0 ${GGML_CUDA_FA_KV_TYPES_LAST})
159161
list(GET GGML_CUDA_FA_KV_TYPES_ORDERED ${VI} V_TYPE)
160-
ggml_add_fattn_vec_pair(GGML_SOURCES_CUDA "${FA_VEC_PREFIX}" "${K_TYPE}" "${V_TYPE}")
162+
163+
if (KI LESS_EQUAL VI OR K_TYPE STREQUAL "f16" OR V_TYPE STREQUAL "f16")
164+
ggml_add_fattn_vec_pair(GGML_SOURCES_CUDA "${FA_VEC_PREFIX}" "${K_TYPE}" "${V_TYPE}")
165+
math(EXPR GGML_CUDA_FA_HALF_PAIR_COUNT "${GGML_CUDA_FA_HALF_PAIR_COUNT} + 1")
166+
endif()
161167
endforeach()
162168
endforeach()
163169

170+
if (NOT GGML_CUDA_FA_HALF_PAIR_COUNT EQUAL 103)
171+
message(FATAL_ERROR "GGML_CUDA_FA_HALF_QUANTS expected 103 FA vec pairs, got ${GGML_CUDA_FA_HALF_PAIR_COUNT}")
172+
endif()
173+
164174
add_compile_definitions(GGML_CUDA_FA_HALF_QUANTS)
165175
else()
166176
set(GGML_CUDA_FA_DEFAULT_KV_PAIRS

0 commit comments

Comments
 (0)