Commit 2cddc45

Merge remote-tracking branch 'turboquant/feature/turboquant-kv-cache' into turboquant
2 parents a90e4dc + 69d8e4b

113 files changed

Lines changed: 24809 additions & 447 deletions

Some content is hidden: large commits have some content hidden by default, so not every changed file's diff appears below.

.devops/nix/package.nix

Lines changed: 2 additions & 1 deletion
@@ -16,9 +16,9 @@
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
+  spirv-headers,
   openssl,
   shaderc,
-  spirv-headers,
   useBlas ?
     builtins.all (x: !x) [
       useCuda
@@ -103,6 +103,7 @@ let
     vulkan-headers
     vulkan-loader
     shaderc
+    spirv-headers
   ];
 in

.github/FUNDING.yml

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+github: [TheTom]

.github/workflows/tqp-release.yml

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+name: TurboQuant+ Release
+
+on:
+  push:
+    tags:
+      - 'tqp-v*'
+
+env:
+  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON"
+
+jobs:
+  macos-metal:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        run: |
+          cmake -B build \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz \
+            -s ",./,turboquant-plus-${{ github.ref_name }}/," -C ./build/bin .
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: macos-arm64-metal
+          path: turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz
+
+  windows-cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4']
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        run: choco install ninja
+
+      - name: Build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+            ${{ env.CMAKE_ARGS }}
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS%
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/Release/
+          $dst='.\build\bin\Release\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip .\build\bin\Release\*
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: windows-x64-cuda${{ matrix.cuda }}
+          path: turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip
+
+  release:
+    needs: [macos-metal, windows-cuda]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v7
+        with:
+          path: ./release
+          merge-multiple: true
+
+      - name: Create Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ github.ref_name }}
+          name: TurboQuant+ ${{ github.ref_name }}
+          files: ./release/*
+          draft: false
+          prerelease: false

bench-smem-m5-baseline.txt

Lines changed: 362 additions & 0 deletions
Large diffs are not rendered by default.

bench-smem-m5-smem.txt

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 6 additions & 3 deletions
@@ -398,6 +398,9 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_IQ4_NL,
     GGML_TYPE_Q5_0,
     GGML_TYPE_Q5_1,
+    GGML_TYPE_TURBO2_0,
+    GGML_TYPE_TURBO3_0,
+    GGML_TYPE_TURBO4_0,
 };

 static ggml_type kv_cache_type_from_str(const std::string & s) {
@@ -4233,9 +4236,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("enable default speculative decoding config"),
         [](common_params & params) {
             params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            params.speculative.ngram_mod.n_match = 24;
-            params.speculative.ngram_mod.n_min = 48;
-            params.speculative.ngram_mod.n_max = 64;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

common/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -43,6 +43,12 @@
 #include <string.h>
 #include <fcntl.h>
 #include <io.h>
+#ifndef fileno
+#define fileno _fileno
+#endif
+#ifndef isatty
+#define isatty _isatty
+#endif
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>

docs/rocm-mi300x-test-results.md

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+# TurboQuant on AMD Instinct MI300X & MI355X (ROCm/HIP)
+
+## Summary
+
+TurboQuant KV cache compression (turbo2/turbo3/turbo4) builds and runs correctly on AMD Instinct MI300X (gfx942) and MI355X (gfx950). MI300X requires zero code changes. MI355X requires adding CDNA4 arch defines to the HIP vendor header.
+
+## Test Environment
+
+| Component | MI300X | MI355X |
+|-----------|--------|--------|
+| GPU | MI300X (gfx942), 192 GB HBM3 | MI355X (gfx950), 288 GB HBM3e |
+| ROCm | 7.0.2 | 7.0.1 |
+| Wave Size | 64 | 64 |
+| Build | `-DAMDGPU_TARGETS="gfx942"` | `-DAMDGPU_TARGETS="gfx950"` |
+| Model | Qwen2.5-1.5B Q4_K_M (1.04 GiB) | same |
+
+## WHT Kernel Correctness
+
+A standalone roundtrip test (forward WHT → inverse WHT) confirms that the Walsh-Hadamard Transform kernel works correctly on HIP with 64-wide wavefronts:
+
+```
+=== TurboQuant WHT Roundtrip Test (HIP/gfx942) ===
+Total elements: 512 (4 heads x 128 dim)
+Forward WHT zeros: 0 / 512
+Roundtrip max error: 2.980232e-07
+Roundtrip RMSE: 6.816018e-08
+Result: PASS ✅
+```
+
+The kernel uses shared memory + `__syncthreads()` (no warp shuffles), so it works correctly with GCN's 64-thread wavefronts without modification.
+
+## Performance Results
+
+### MI300X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16 | 24,453 ± 230 | 181.2 ± 2.0 | baseline | baseline |
+| turbo3 | ~25,200 | ~160 | **+3%** | 88% |
+| turbo4 | 25,427 ± 17 | 161.1 ± 0.2 | **+4%** | 89% |
+
+### MI355X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16+FA | 40,013 ± 902 | 254.5 ± 1.0 | baseline | baseline |
+| turbo3 | 39,140 ± 475 | 162.3 ± 0.1 | 98% | 64% |
+| turbo4 | 39,232 ± 508 | 214.1 ± 0.7 | 98% | **84%** |
+
+### Key Observations
+
+1. **MI300X prefill is faster with TurboQuant** (+3-4%) — less KV cache data to write to HBM.
+2. **MI300X decode at 88-89% of f16** — consistent with Apple Silicon community results.
+3. **MI355X turbo4 decode at 84%** — turbo4 outperforms turbo3 in decode due to simpler 4-bit dequant.
+4. **MI355X turbo3 decode at 64%** — the 3-bit codebook + sign extraction is more expensive on gfx950.
+5. **MI355X non-FA MMQ path crashes** (xf32 MFMA issue) — turbo types force FA and work correctly.
+
+## Build Instructions
+
+```bash
+git clone https://github.com/TheTom/llama-cpp-turboquant.git
+cd llama-cpp-turboquant
+git checkout feature/turboquant-kv-cache
+
+# MI300X (gfx942) — works without code changes
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx942"
+cmake --build build --config Release -j
+
+# MI355X (gfx950) — requires CDNA4 define patch (see commit)
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx950"
+cmake --build build --config Release -j
+
+# Test
+HIP_VISIBLE_DEVICES=0 ./build/bin/llama-bench \
+  -m model.gguf -ctk turbo3 -ctv turbo3 -ngl 99 -r 3 -p 512 -n 128
+```
+
+## Code Changes for gfx950 (MI355X)
+
+Three files were modified to add CDNA4 (gfx950) architecture support:
+
+1. **`ggml/src/ggml-cuda/vendors/hip.h`** — Add `CDNA4` define for `__gfx950__`, include it in the `CDNA` family
+2. **`ggml/src/ggml-cuda/common.cuh`** — Add `GGML_CUDA_CC_CDNA4` constant and `GGML_CUDA_CC_IS_CDNA4` macro
+3. **`ggml/src/ggml-cuda/mma.cuh`** — Route CDNA4 to compatible MFMA instructions (bf16_1k, i32x16x32_i8, f32x16x4f32 — NOT xf32, which doesn't exist on gfx950)
+
+## Known Limitations
+
+- **MI355X non-FA MMQ crashes**: The default (non-flash-attention) matrix multiply path crashes on gfx950 because the xf32 MFMA instruction (`mfma_f32_16x16x8_xf32`) is not available. TurboQuant types force flash attention and work correctly. Standard f16/q8_0 KV cache types need the `-fa 1` flag on MI355X.
+- **llama-cli text output**: Interactive mode produces empty tokens on ROCm (display issue), but `llama-bench` confirms the computation is correct.
+
+## Tested By
+
+Andy Luo (@andyluo7)
+- AMD Instinct MI300X (gfx942), ROCm 7.0.2 — April 2026
+- AMD Instinct MI355X (gfx950), ROCm 7.0.1 — April 2026
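
The roundtrip test described in the document above is easy to approximate on the CPU. The sketch below is a minimal C++ version of that check, assuming the orthonormal fast Walsh-Hadamard butterfly over 128-element groups; the per-group sign flips and InnerQ scaling used by the actual TurboQuant op are omitted, so it only illustrates why forward-then-inverse WHT should reproduce the input to within float rounding.

```cpp
// Minimal CPU sketch of a WHT roundtrip check (not the HIP kernel from this commit).
// An orthonormal Hadamard transform H/sqrt(d) is symmetric and orthogonal, so
// applying it twice recovers the input up to rounding error.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// In-place fast WHT over one group of d elements (d must be a power of two),
// followed by 1/sqrt(d) scaling so the transform is orthonormal (and self-inverse).
static void wht_group(float * x, int d) {
    for (int len = 1; len < d; len *= 2) {
        for (int i = 0; i < d; i += 2 * len) {
            for (int j = i; j < i + len; ++j) {
                const float a = x[j];
                const float b = x[j + len];
                x[j]       = a + b; // butterfly
                x[j + len] = a - b;
            }
        }
    }
    const float norm = 1.0f / std::sqrt((float) d);
    for (int j = 0; j < d; ++j) {
        x[j] *= norm;
    }
}

int main() {
    const int n_heads = 4, d = 128, n = n_heads * d; // 512 elements, as in the test above
    std::vector<float> src(n), buf(n);
    std::mt19937 rng(42);
    std::normal_distribution<float> dist(0.0f, 1.0f);
    for (float & v : src) { v = dist(rng); }
    buf = src;

    // forward WHT, then "inverse" (the same orthonormal transform applied again)
    for (int h = 0; h < n_heads; ++h) { wht_group(buf.data() + h * d, d); }
    for (int h = 0; h < n_heads; ++h) { wht_group(buf.data() + h * d, d); }

    float max_err = 0.0f, sq_sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = std::fabs(buf[i] - src[i]);
        max_err = std::max(max_err, e);
        sq_sum += e * e;
    }
    std::printf("Roundtrip max error: %e\n", max_err);
    std::printf("Roundtrip RMSE:      %e\n", std::sqrt(sq_sum / n));
    return 0;
}
```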

ggml/include/ggml-rpc.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ extern "C" {
 #define RPC_PROTO_PATCH_VERSION 0

 #ifdef __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif

 #define GGML_RPC_MAX_SERVERS 16

ggml/include/ggml.h

Lines changed: 18 additions & 2 deletions
@@ -181,7 +181,7 @@
 #            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default"))) extern
+#        define GGML_API __attribute__ ((visibility ("default")))
 #    endif
 #else
 #    define GGML_API extern
@@ -429,7 +429,12 @@ extern "C" {
         GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
         GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
         GGML_TYPE_Q1_0 = 41,
-        GGML_TYPE_COUNT = 42,
+        GGML_TYPE_TURBO2_0 = 42, // TurboQuant 2-bit KV cache: WHT + 2-bit PolarQuant
+        GGML_TYPE_TURBO3_0 = 43, // TurboQuant 3-bit KV cache: WHT + 3-bit PolarQuant
+        GGML_TYPE_TURBO4_0 = 44, // TurboQuant 4-bit KV cache: WHT + 4-bit PolarQuant
+        GGML_TYPE_TQ3_1S = 45, // TurboQuant 3-bit weight: WHT-rotated 8-level Lloyd-Max, block_size=32
+        GGML_TYPE_TQ4_1S = 46, // TurboQuant 4-bit weight: WHT-rotated 16-level Lloyd-Max, block_size=32
+        GGML_TYPE_COUNT = 47,
     };

     // precision
@@ -567,6 +572,7 @@ extern "C" {
         GGML_OP_RWKV_WKV7,
         GGML_OP_SOLVE_TRI,
         GGML_OP_GATED_DELTA_NET,
+        GGML_OP_TURBO_WHT,

         GGML_OP_UNARY,

@@ -2550,6 +2556,16 @@ extern "C" {
             struct ggml_tensor * beta,
             struct ggml_tensor * state);

+    // TurboQuant Walsh-Hadamard Transform (O(d log d) rotation for KV cache compression)
+    // Applies WHT rotation to 128-element groups along ne[0]: sign1 → butterfly → sign2 → normalize
+    // direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
+    GGML_API struct ggml_tensor * ggml_turbo_wht(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   direction,
+            int                   group_size, // 0 = auto (64 or 128 from ne[0])
+            struct ggml_tensor  * scale);     // NULL = no InnerQ scaling
+
     // custom operators

     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
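
As orientation for readers skimming the header change, here is a hedged sketch of how the new `ggml_turbo_wht` entry point could be exercised through the public ggml C API. It is not taken from this commit; the tensor shapes, thread count, the assumption that the CPU backend implements `GGML_OP_TURBO_WHT`, and the header that exposes `ggml_graph_compute_with_ctx` (moved to `ggml-cpu.h` in recent ggml trees) are all illustrative.

```cpp
// Hedged usage sketch for ggml_turbo_wht as declared above (not from this commit).
// Assumes the CPU backend implements GGML_OP_TURBO_WHT.
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx on newer ggml trees

#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64u * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // K-cache slice for one layer: head_dim = 128 (one WHT group) x 4 heads
    struct ggml_tensor * k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 4);
    ggml_set_f32(k, 1.0f); // placeholder data

    // forward rotation (direction = 0), auto group size, no InnerQ scale tensor
    struct ggml_tensor * k_rot = ggml_turbo_wht(ctx, k, /*direction=*/0, /*group_size=*/0, /*scale=*/nullptr);
    // inverse rotation (direction = 1) should recover k up to float error
    struct ggml_tensor * k_rec = ggml_turbo_wht(ctx, k_rot, /*direction=*/1, /*group_size=*/0, /*scale=*/nullptr);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, k_rec);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    std::printf("roundtrip element [0,0]: %f (expected ~1.0)\n", ggml_get_f32_1d(k_rec, 0));

    ggml_free(ctx);
    return 0;
}
```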
