Commit 2cddc45

Merge remote-tracking branch 'turboquant/feature/turboquant-kv-cache' into turboquant
2 parents a90e4dc + 69d8e4b

113 files changed

Lines changed: 24809 additions & 447 deletions

Some content is hidden: large commits have some content hidden by default, so not every changed file's diff appears below.

.devops/nix/package.nix

Lines changed: 2 additions & 1 deletion
@@ -16,9 +16,9 @@
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
+  spirv-headers,
   openssl,
   shaderc,
-  spirv-headers,
   useBlas ?
     builtins.all (x: !x) [
       useCuda
@@ -103,6 +103,7 @@ let
     vulkan-headers
     vulkan-loader
     shaderc
+    spirv-headers
   ];
 in

.github/FUNDING.yml

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+github: [TheTom]

.github/workflows/tqp-release.yml

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+name: TurboQuant+ Release
+
+on:
+  push:
+    tags:
+      - 'tqp-v*'
+
+env:
+  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON"
+
+jobs:
+  macos-metal:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        run: |
+          cmake -B build \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz \
+            -s ",./,turboquant-plus-${{ github.ref_name }}/," -C ./build/bin .
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: macos-arm64-metal
+          path: turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz
+
+  windows-cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4']
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        run: choco install ninja
+
+      - name: Build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+            ${{ env.CMAKE_ARGS }}
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS%
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/Release/
+          $dst='.\build\bin\Release\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip .\build\bin\Release\*
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: windows-x64-cuda${{ matrix.cuda }}
+          path: turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip
+
+  release:
+    needs: [macos-metal, windows-cuda]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v7
+        with:
+          path: ./release
+          merge-multiple: true
+
+      - name: Create Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ github.ref_name }}
+          name: TurboQuant+ ${{ github.ref_name }}
+          files: ./release/*
+          draft: false
+          prerelease: false

bench-smem-m5-baseline.txt

Lines changed: 362 additions & 0 deletions
Large diffs are not rendered by default.

bench-smem-m5-smem.txt

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 6 additions & 3 deletions
@@ -398,6 +398,9 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_IQ4_NL,
     GGML_TYPE_Q5_0,
     GGML_TYPE_Q5_1,
+    GGML_TYPE_TURBO2_0,
+    GGML_TYPE_TURBO3_0,
+    GGML_TYPE_TURBO4_0,
 };

 static ggml_type kv_cache_type_from_str(const std::string & s) {
@@ -4233,9 +4236,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("enable default speculative decoding config"),
         [](common_params & params) {
             params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            params.speculative.ngram_mod.n_match = 24;
-            params.speculative.ngram_mod.n_min = 48;
-            params.speculative.ngram_mod.n_max = 64;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

common/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -43,6 +43,12 @@
 #include <string.h>
 #include <fcntl.h>
 #include <io.h>
+#ifndef fileno
+#define fileno _fileno
+#endif
+#ifndef isatty
+#define isatty _isatty
+#endif
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>

docs/rocm-mi300x-test-results.md

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+# TurboQuant on AMD Instinct MI300X & MI355X (ROCm/HIP)
+
+## Summary
+
+TurboQuant KV cache compression (turbo2/turbo3/turbo4) builds and runs correctly on AMD Instinct MI300X (gfx942) and MI355X (gfx950). MI300X requires zero code changes. MI355X requires adding CDNA4 arch defines to the HIP vendor header.
+
+## Test Environment
+
+| Component | MI300X | MI355X |
+|-----------|--------|--------|
+| GPU | MI300X (gfx942), 192 GB HBM3 | MI355X (gfx950), 288 GB HBM3e |
+| ROCm | 7.0.2 | 7.0.1 |
+| Wave Size | 64 | 64 |
+| Build | `-DAMDGPU_TARGETS="gfx942"` | `-DAMDGPU_TARGETS="gfx950"` |
+| Model | Qwen2.5-1.5B Q4_K_M (1.04 GiB) | same |
+
+## WHT Kernel Correctness
+
+A standalone roundtrip test (forward WHT → inverse WHT) confirms that the Walsh-Hadamard Transform kernel works correctly on HIP with 64-wide wavefronts:
+
+```
+=== TurboQuant WHT Roundtrip Test (HIP/gfx942) ===
+Total elements: 512 (4 heads x 128 dim)
+Forward WHT zeros: 0 / 512
+Roundtrip max error: 2.980232e-07
+Roundtrip RMSE: 6.816018e-08
+Result: PASS ✅
+```
+
+The kernel uses shared memory + `__syncthreads()` (no warp shuffles), so it works correctly with GCN's 64-thread wavefronts without modification.
+
+## Performance Results
+
+### MI300X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16 | 24,453 ± 230 | 181.2 ± 2.0 | baseline | baseline |
+| turbo3 | ~25,200 | ~160 | **+3%** | 88% |
+| turbo4 | 25,427 ± 17 | 161.1 ± 0.2 | **+4%** | 89% |
+
+### MI355X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16+FA | 40,013 ± 902 | 254.5 ± 1.0 | baseline | baseline |
+| turbo3 | 39,140 ± 475 | 162.3 ± 0.1 | 98% | 64% |
+| turbo4 | 39,232 ± 508 | 214.1 ± 0.7 | 98% | **84%** |
+
+### Key Observations
+
+1. **MI300X prefill is faster with TurboQuant** (+3-4%) — less KV cache data to write to HBM.
+2. **MI300X decode at 88-89% of f16** — consistent with Apple Silicon community results.
+3. **MI355X turbo4 decode at 84%** — turbo4 outperforms turbo3 in decode due to simpler 4-bit dequant.
+4. **MI355X turbo3 decode at 64%** — the 3-bit codebook + sign extraction is more expensive on gfx950.
+5. **MI355X non-FA MMQ path crashes** (xf32 MFMA issue) — turbo types force FA and work correctly.
+
+## Build Instructions
+
+```bash
+git clone https://github.com/TheTom/llama-cpp-turboquant.git
+cd llama-cpp-turboquant
+git checkout feature/turboquant-kv-cache
+
+# MI300X (gfx942) — works without code changes
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx942"
+cmake --build build --config Release -j
+
+# MI355X (gfx950) — requires CDNA4 define patch (see commit)
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx950"
+cmake --build build --config Release -j
+
+# Test
+HIP_VISIBLE_DEVICES=0 ./build/bin/llama-bench \
+  -m model.gguf -ctk turbo3 -ctv turbo3 -ngl 99 -r 3 -p 512 -n 128
+```
+
+## Code Changes for gfx950 (MI355X)
+
+Three files were modified to add CDNA4 (gfx950) architecture support:
+
+1. **`ggml/src/ggml-cuda/vendors/hip.h`** — Add `CDNA4` define for `__gfx950__`, include it in the `CDNA` family
+2. **`ggml/src/ggml-cuda/common.cuh`** — Add `GGML_CUDA_CC_CDNA4` constant and `GGML_CUDA_CC_IS_CDNA4` macro
+3. **`ggml/src/ggml-cuda/mma.cuh`** — Route CDNA4 to compatible MFMA instructions (bf16_1k, i32x16x32_i8, f32x16x4f32 — NOT xf32, which doesn't exist on gfx950)
+
+## Known Limitations
+
+- **MI355X non-FA MMQ crashes**: The default (non-flash-attention) matrix multiply path crashes on gfx950 because the xf32 MFMA instruction (`mfma_f32_16x16x8_xf32`) is not available. TurboQuant types force flash attention and work correctly. Standard f16/q8_0 KV cache types need the `-fa 1` flag on MI355X.
+- **llama-cli text output**: Interactive mode produces empty tokens on ROCm (display issue), but `llama-bench` confirms the computation is correct.
+
+## Tested By
+
+Andy Luo (@andyluo7)
+- AMD Instinct MI300X (gfx942), ROCm 7.0.2 — April 2026
+- AMD Instinct MI355X (gfx950), ROCm 7.0.1 — April 2026
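
The roundtrip test described in the document above is easy to approximate on the CPU. The sketch below is a minimal C++ version of that check, assuming the orthonormal fast Walsh-Hadamard butterfly over 128-element groups; the per-group sign flips and InnerQ scaling used by the actual TurboQuant op are omitted, so it only illustrates why forward-then-inverse WHT should reproduce the input to within float rounding.

```cpp
// Minimal CPU sketch of a WHT roundtrip check (not the HIP kernel from this commit).
// An orthonormal Hadamard transform H/sqrt(d) is symmetric and orthogonal, so
// applying it twice recovers the input up to rounding error.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// In-place fast WHT over one group of d elements (d must be a power of two),
// followed by 1/sqrt(d) scaling so the transform is orthonormal (and self-inverse).
static void wht_group(float * x, int d) {
    for (int len = 1; len < d; len *= 2) {
        for (int i = 0; i < d; i += 2 * len) {
            for (int j = i; j < i + len; ++j) {
                const float a = x[j];
                const float b = x[j + len];
                x[j]       = a + b; // butterfly
                x[j + len] = a - b;
            }
        }
    }
    const float norm = 1.0f / std::sqrt((float) d);
    for (int j = 0; j < d; ++j) {
        x[j] *= norm;
    }
}

int main() {
    const int n_heads = 4, d = 128, n = n_heads * d; // 512 elements, as in the test above
    std::vector<float> src(n), buf(n);
    std::mt19937 rng(42);
    std::normal_distribution<float> dist(0.0f, 1.0f);
    for (float & v : src) { v = dist(rng); }
    buf = src;

    // forward WHT, then "inverse" (the same orthonormal transform applied again)
    for (int h = 0; h < n_heads; ++h) { wht_group(buf.data() + h * d, d); }
    for (int h = 0; h < n_heads; ++h) { wht_group(buf.data() + h * d, d); }

    float max_err = 0.0f, sq_sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = std::fabs(buf[i] - src[i]);
        max_err = std::max(max_err, e);
        sq_sum += e * e;
    }
    std::printf("Roundtrip max error: %e\n", max_err);
    std::printf("Roundtrip RMSE:      %e\n", std::sqrt(sq_sum / n));
    return 0;
}
```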

ggml/include/ggml-rpc.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ extern "C" {
 #define RPC_PROTO_PATCH_VERSION 0

 #ifdef __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif

 #define GGML_RPC_MAX_SERVERS 16

ggml/include/ggml.h

Lines changed: 18 additions & 2 deletions
@@ -181,7 +181,7 @@
 #            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default"))) extern
+#        define GGML_API __attribute__ ((visibility ("default")))
 #    endif
 #else
 #    define GGML_API extern
@@ -429,7 +429,12 @@ extern "C" {
         GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
         GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
         GGML_TYPE_Q1_0 = 41,
-        GGML_TYPE_COUNT = 42,
+        GGML_TYPE_TURBO2_0 = 42, // TurboQuant 2-bit KV cache: WHT + 2-bit PolarQuant
+        GGML_TYPE_TURBO3_0 = 43, // TurboQuant 3-bit KV cache: WHT + 3-bit PolarQuant
+        GGML_TYPE_TURBO4_0 = 44, // TurboQuant 4-bit KV cache: WHT + 4-bit PolarQuant
+        GGML_TYPE_TQ3_1S = 45, // TurboQuant 3-bit weight: WHT-rotated 8-level Lloyd-Max, block_size=32
+        GGML_TYPE_TQ4_1S = 46, // TurboQuant 4-bit weight: WHT-rotated 16-level Lloyd-Max, block_size=32
+        GGML_TYPE_COUNT = 47,
     };

     // precision
@@ -567,6 +572,7 @@ extern "C" {
         GGML_OP_RWKV_WKV7,
         GGML_OP_SOLVE_TRI,
         GGML_OP_GATED_DELTA_NET,
+        GGML_OP_TURBO_WHT,

         GGML_OP_UNARY,

@@ -2550,6 +2556,16 @@ extern "C" {
             struct ggml_tensor * beta,
             struct ggml_tensor * state);

+    // TurboQuant Walsh-Hadamard Transform (O(d log d) rotation for KV cache compression)
+    // Applies WHT rotation to 128-element groups along ne[0]: sign1 → butterfly → sign2 → normalize
+    // direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
+    GGML_API struct ggml_tensor * ggml_turbo_wht(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   direction,
+            int                   group_size, // 0 = auto (64 or 128 from ne[0])
+            struct ggml_tensor  * scale);     // NULL = no InnerQ scaling
+
     // custom operators

     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
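
As orientation for readers skimming the header change, here is a hedged sketch of how the new `ggml_turbo_wht` entry point could be exercised through the public ggml C API. It is not taken from this commit; the tensor shapes, thread count, the assumption that the CPU backend implements `GGML_OP_TURBO_WHT`, and the header that exposes `ggml_graph_compute_with_ctx` (moved to `ggml-cpu.h` in recent ggml trees) are all illustrative.

```cpp
// Hedged usage sketch for ggml_turbo_wht as declared above (not from this commit).
// Assumes the CPU backend implements GGML_OP_TURBO_WHT.
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx on newer ggml trees

#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64u * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // K-cache slice for one layer: head_dim = 128 (one WHT group) x 4 heads
    struct ggml_tensor * k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 4);
    ggml_set_f32(k, 1.0f); // placeholder data

    // forward rotation (direction = 0), auto group size, no InnerQ scale tensor
    struct ggml_tensor * k_rot = ggml_turbo_wht(ctx, k, /*direction=*/0, /*group_size=*/0, /*scale=*/nullptr);
    // inverse rotation (direction = 1) should recover k up to float error
    struct ggml_tensor * k_rec = ggml_turbo_wht(ctx, k_rot, /*direction=*/1, /*group_size=*/0, /*scale=*/nullptr);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, k_rec);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    std::printf("roundtrip element [0,0]: %f (expected ~1.0)\n", ggml_get_f32_1d(k_rec, 0));

    ggml_free(ctx);
    return 0;
}
```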
