Merge pull request #61 from andyluo7/add-rocm-mi300x-support

TheTom · web-flow · commit 157cb85a8f53 · 2026-04-09T08:20:56.000-05:00
feat: AMD Instinct MI300X + MI355X (gfx942/gfx950) ROCm support
diff --git a/docs/rocm-mi300x-test-results.md b/docs/rocm-mi300x-test-results.md
@@ -0,0 +1,95 @@
+# TurboQuant on AMD Instinct MI300X & MI355X (ROCm/HIP)
+
+## Summary
+
+TurboQuant KV cache compression (turbo2/turbo3/turbo4) builds and runs correctly on AMD Instinct MI300X (gfx942) and MI355X (gfx950). MI300X requires zero code changes. MI355X requires adding CDNA4 arch defines to the HIP vendor header.
+
+## Test Environment
+
+| Component | MI300X | MI355X |
+|-----------|--------|--------|
+| GPU | MI300X (gfx942), 192 GB HBM3 | MI355X (gfx950), 288 GB HBM3e |
+| ROCm | 7.0.2 | 7.0.1 |
+| Wave Size | 64 | 64 |
+| Build | `-DAMDGPU_TARGETS="gfx942"` | `-DAMDGPU_TARGETS="gfx950"` |
+| Model | Qwen2.5-1.5B Q4_K_M (1.04 GiB) | same |
+
+## WHT Kernel Correctness
+
+Standalone roundtrip test (forward WHT → inverse WHT) confirms the Walsh-Hadamard Transform kernel works correctly on HIP with 64-wide wavefronts:
+
+```
+=== TurboQuant WHT Roundtrip Test (HIP/gfx942) ===
+Total elements: 512 (4 heads x 128 dim)
+Forward WHT zeros: 0 / 512
+Roundtrip max error: 2.980232e-07
+Roundtrip RMSE:      6.816018e-08
+Result: PASS ✅
+```
+
+The kernel uses shared memory + `__syncthreads()` (no warp shuffles), so it works correctly with GCN's 64-thread wavefronts without modification.
+
+## Performance Results
+
+### MI300X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16 | 24,453 ± 230 | 181.2 ± 2.0 | baseline | baseline |
+| turbo3 | ~25,200 | ~160 | **+3%** | 88% |
+| turbo4 | 25,427 ± 17 | 161.1 ± 0.2 | **+4%** | 89% |
+
+### MI355X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16+FA | 40,013 ± 902 | 254.5 ± 1.0 | baseline | baseline |
+| turbo3 | 39,140 ± 475 | 162.3 ± 0.1 | 98% | 64% |
+| turbo4 | 39,232 ± 508 | 214.1 ± 0.7 | 98% | **84%** |
+
+### Key Observations
+
+1. **MI300X prefill is faster with TurboQuant** (+3-4%) — less KV cache data to write to HBM.
+2. **MI300X decode at 88-89% of f16** — consistent with Apple Silicon community results.
+3. **MI355X turbo4 decode at 84%** — turbo4 outperforms turbo3 in decode due to simpler 4-bit dequant.
+4. **MI355X turbo3 decode at 64%** — the 3-bit codebook + sign extraction is more expensive on gfx950.
+5. **MI355X non-FA MMQ path crashes** (xf32 MFMA issue) — turbo types force FA and work correctly.
+
+## Build Instructions
+
+```bash
+git clone https://github.com/TheTom/llama-cpp-turboquant.git
+cd llama-cpp-turboquant
+git checkout feature/turboquant-kv-cache
+
+# MI300X (gfx942) — works without code changes
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx942"
+cmake --build build --config Release -j
+
+# MI355X (gfx950) — requires CDNA4 define patch (see commit)
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx950"
+cmake --build build --config Release -j
+
+# Test
+HIP_VISIBLE_DEVICES=0 ./build/bin/llama-bench \
+  -m model.gguf -ctk turbo3 -ctv turbo3 -ngl 99 -r 3 -p 512 -n 128
+```
+
+## Code Changes for gfx950 (MI355X)
+
+Three files modified to add CDNA4 (gfx950) architecture support:
+
+1. **`ggml/src/ggml-cuda/vendors/hip.h`** — Add `CDNA4` define for `__gfx950__`, include in `CDNA` family
+2. **`ggml/src/ggml-cuda/common.cuh`** — Add `GGML_CUDA_CC_CDNA4` constant and `GGML_CUDA_CC_IS_CDNA4` macro
+3. **`ggml/src/ggml-cuda/mma.cuh`** — Route CDNA4 to compatible MFMA instructions (bf16_1k, i32x16x32_i8, f32x16x4f32 — NOT xf32 which doesn't exist on gfx950)
+
+## Known Limitations
+
+- **MI355X non-FA MMQ crashes**: The default (non-flash-attention) matrix multiply path crashes on gfx950 due to the xf32 MFMA instruction (`mfma_f32_16x16x8_xf32`) not being available. TurboQuant types force flash attention and work correctly. Standard f16/q8_0 KV cache types need `-fa 1` flag on MI355X.
+- **llama-cli text output**: Interactive mode produces empty tokens on ROCm (display issue), but `llama-bench` confirms computation is correct.
+
+## Tested By
+
+Andy Luo (@andyluo7)
+- AMD Instinct MI300X (gfx942), ROCm 7.0.2 — April 2026
+- AMD Instinct MI355X (gfx950), ROCm 7.0.1 — April 2026
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
@@ -67,6 +67,7 @@
 #define GGML_CUDA_CC_CDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300
+#define GGML_CUDA_CC_CDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x950)  // MI350X/MI355X
 
 // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
@@ -87,7 +88,8 @@
 #define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_CDNA1(cc)   (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
 #define GGML_CUDA_CC_IS_CDNA2(cc)   (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
-#define GGML_CUDA_CC_IS_CDNA3(cc)   (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc)   (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_CDNA4)
+#define GGML_CUDA_CC_IS_CDNA4(cc)   (cc >= GGML_CUDA_CC_CDNA4 && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
 #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
@@ -802,7 +804,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #if defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
     // ROCm does not support fp8 in software on devices with fp8 hardware,
-    // but CDNA3 supports only e4m3_fnuz (no inf).
+    // but CDNA3 supports only e4m3_fnuz (no inf). CDNA4 (gfx950) uses standard e4m3fn.
     const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
     const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
     return static_cast<float>(xf) / 2;
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
@@ -1025,7 +1025,7 @@ namespace ggml_cuda_mma {
         const floatx2_t& a_frag = reinterpret_cast<const floatx2_t&>(A.x[0]);
         const floatx2_t& b_frag = reinterpret_cast<const floatx2_t&>(B.x[0]);
         acc_frag = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a_frag, b_frag, acc_frag, 0, 0, 0);
-#elif defined(CDNA2) || defined(CDNA1)
+#elif defined(CDNA4) || defined(CDNA2) || defined(CDNA1)
 #pragma unroll
         for (int i = 0; i < 2; ++i) {
             acc_frag = __builtin_amdgcn_mfma_f32_16x16x4f32(A.x[i], B.x[i], acc_frag, 0, 0, 0);
@@ -1187,7 +1187,7 @@ namespace ggml_cuda_mma {
 #elif defined(AMD_MFMA_AVAILABLE)
         using floatx4_t = __attribute__((ext_vector_type(4))) float;
         floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
-#if defined(CDNA3) || defined(CDNA2)
+#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2)
         using bf16x4_t = __attribute__((ext_vector_type(4))) __bf16;
         const bf16x4_t& a_frag = reinterpret_cast<const bf16x4_t&>(A.x[0]);
         const bf16x4_t& b_frag = reinterpret_cast<const bf16x4_t&>(B.x[0]);
@@ -1216,12 +1216,12 @@ namespace ggml_cuda_mma {
 #if defined(AMD_MFMA_AVAILABLE)
         using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
         int32x4_t * acc = (int32x4_t *) D.x;
-#if defined(CDNA3)
+#if defined(CDNA4) || defined(CDNA3)
         acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
                                                        ((int64_t *) B.x)[0],
                                                        acc[0],
                                                        0, 0, 0);
-#elif defined(CDNA2) || defined(CDNA)
+#elif defined(CDNA2) || defined(CDNA1)
         acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
                                                       B.x[0],
                                                       acc[0],
@@ -1295,12 +1295,12 @@ namespace ggml_cuda_mma {
 #if defined(AMD_MFMA_AVAILABLE)
         using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
         int32x16_t * acc = (int32x16_t *) D.x;
-#if defined(CDNA3)
+#if defined(CDNA4) || defined(CDNA3)
         acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
                                                        ((int64_t *) B.x)[0],
                                                        acc[0],
                                                        0, 0, 0);
-#elif defined(CDNA2) || defined(CDNA)
+#elif defined(CDNA2) || defined(CDNA1)
         acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
                                                      B.x[0],
                                                      acc[0],
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
@@ -3629,7 +3629,7 @@ static __global__ void mul_mat_q(
              tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 
     constexpr int ITER_K = get_iter_k(type);
 
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
@@ -211,6 +211,10 @@
 #define GCN
 #endif // defined(GCN5) || defined(GCN4)
 
+#if defined(__gfx950__)
+#define CDNA4
+#endif // defined(__gfx950__)
+
 #if defined(__gfx942__)
 #define CDNA3
 #endif // defined(__gfx942__)
@@ -223,9 +227,9 @@
 #define CDNA1
 #endif // defined(__gfx908__)
 
-#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
+#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
 #define CDNA // For the entire family
-#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
+#endif // defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
 
 #if defined(__GFX12__)
 #define RDNA4

Original file line number	Diff line number	Diff line change
`@@ -3629,7 +3629,7 @@ static __global__ void mul_mat_q(`
`3629`	`3629`	`tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);`
`3630`	`3630`	`return;`
`3631`	`3631`	`}`
`3632`		`-#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) \|\| __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA`
	`3632`	`+#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) \|\| __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA`
`3633`	`3633`
`3634`	`3634`	`constexpr int ITER_K = get_iter_k(type);`
`3635`	`3635`