Commit 84ab75f

remove unused code, fix AMD MMA guard
1 parent 7c3501a commit 84ab75f

4 files changed: +7 -46 lines changed

ggml/src/ggml-cuda/dequantize.cuh

Lines changed: 5 additions & 5 deletions
@@ -15,12 +15,12 @@ static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const in
     const int byte_index_1 = bit_index_1 / 8;
     const int bit_offset_1 = bit_index_1 % 8;
 
-    // Extract bits: 1 = +d, 0 = -d
-    const uint8_t bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
-    const uint8_t bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
+    // Extract bits: 1 = +d, 0 = -d (branchless)
+    const int bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
+    const int bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
 
-    v.x = bit_0 ? d : neg_d;
-    v.y = bit_1 ? d : neg_d;
+    v.x = (2*bit_0 - 1) * d;
+    v.y = (2*bit_1 - 1) * d;
 }
 
 static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
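
The rewrite trades a per-component select for arithmetic on the extracted bit: 2*bit - 1 maps bit in {0, 1} onto the sign in {-1, +1}, so bit ? d : neg_d becomes a single multiply. A minimal host-side sketch (not part of the commit) verifying the mapping:

// Illustrative check of the branchless mapping used above.
#include <cassert>

static inline float dequant_bit(int bit, float d) {
    return (2*bit - 1) * d; // bit = 1 -> +d, bit = 0 -> -d
}

int main() {
    assert(dequant_bit(1, 0.5f) ==  0.5f);
    assert(dequant_bit(0, 0.5f) == -0.5f);
    return 0;
}

Either form typically compiles to branch-free code on GPUs; the arithmetic version just makes the intent explicit.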

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
@@ -305,8 +305,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         return false;
     }
 
-    // Q1_0 requires MMA (Turing+) — no DP4A fallback path
-    if (type == GGML_TYPE_Q1_0 && !turing_mma_available(cc)) {
+    // Q1_0 requires MMA — no DP4A fallback path
+    if (type == GGML_TYPE_Q1_0 && !turing_mma_available(cc) && !amd_mfma_available(cc) && !amd_wmma_available(cc)) {
         return false;
     }
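
This is the AMD MMA guard fix named in the commit message: the old condition keyed only on NVIDIA Turing MMA, so Q1_0 MMQ was rejected on AMD GPUs even when MFMA (CDNA) or WMMA (RDNA) matrix instructions were available. A standalone C++ sketch of the support predicate, with the availability helpers stubbed for illustration (the real helpers live in ggml's CUDA common headers and decode the device's compute-capability/architecture id cc):

// Stubs for illustration only; return values here are placeholders.
static bool turing_mma_available(int cc) { return cc >= 750; }       // assumption: NVIDIA Turing+
static bool amd_mfma_available(int cc)   { (void)cc; return false; } // stub: AMD CDNA MFMA
static bool amd_wmma_available(int cc)   { (void)cc; return false; } // stub: AMD RDNA WMMA

// Q1_0 has no DP4A fallback: MMQ is usable only if some
// matrix-multiply-accumulate instruction set is present.
static bool q1_0_mmq_supported(int cc) {
    return turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc);
}

By De Morgan's law, the guard in the diff is the negation of q1_0_mmq_supported: it returns false exactly when all three checks fail.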

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ using namespace ggml_cuda_mma;
 
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
-#define MMQ_ITER_K_Q1_0 128 // For Q1_0: QK1_0=128, QI1_0=4, so threads_per_row = 128/(4*4) = 8
 #define MMQ_ITER_K_MXFP4_FP4 512
 #define MMQ_NWARPS 8
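
The deleted constant is dead code per the commit message ("remove unused code"). For reference, the arithmetic in its comment does check out; a throwaway C++ assertion (not from the commit):

// QK1_0 = 128 weights per block, QI1_0 = 4 -> 128 / (4 * 4) = 8 threads per row.
static_assert(128 / (4 * 4) == 8, "Q1_0 threads_per_row");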

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 0 additions & 38 deletions
@@ -109,44 +109,6 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
 #define VDR_Q1_0_Q8_1_MMVQ 1 // Process one 32-element chunk at a time for parallelism
 #define VDR_Q1_0_Q8_1_MMQ 4 // Q1_0 has 128 bits (4 ints) per block
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q1_0_q8_1_impl(
-    const int * v, const int * u, const float & d1, const half2 & ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi = v[i];
-
-        // Unpack 32 bits into 32 signed values (-1 or +1)
-        // Each bit: 0 -> -1, 1 -> +1
-        int vi_bytes[8];
-
-#pragma unroll
-        for (int j = 0; j < 8; ++j) {
-            const int shift = j * 4;
-            const int bits4 = (vi >> shift) & 0x0F;
-
-            const int b0 = (bits4 & 0x01) ? 1 : -1;
-            const int b1 = (bits4 & 0x02) ? 1 : -1;
-            const int b2 = (bits4 & 0x04) ? 1 : -1;
-            const int b3 = (bits4 & 0x08) ? 1 : -1;
-
-            vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
-        }
-
-#pragma unroll
-        for (int j = 0; j < 8; ++j) {
-            sumi = ggml_cuda_dp4a(vi_bytes[j], u[8*i + j], sumi);
-        }
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // Q1_0 is symmetric (no offset), so we just multiply by scales
-    return d1 * ds8f.x * sumi;
-}
-
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ 4
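
The deleted helper was the DP4A-path dot product for Q1_0, unreachable now that the guard above requires an MMA path. Its core trick, expanding sign bits into packed int8 lanes of ±1 so dp4a can consume four weights per instruction, is captured in this host-side C++ reconstruction (illustrative only; pack_signs and dot4_ref are made-up names, and dot4_ref stands in for what dp4a computes per packed int):

#include <cassert>
#include <cstdint>

// Expand 4 sign bits into one int32 holding 4 signed bytes (each -1 or +1),
// mirroring the vi_bytes[j] packing in the deleted helper.
static int32_t pack_signs(uint32_t bits4) {
    uint32_t out = 0;
    for (int k = 0; k < 4; ++k) {
        const uint8_t s = ((bits4 >> k) & 1) ? 0x01 : 0xFF; // +1 or -1 as int8
        out |= (uint32_t)s << (8 * k);
    }
    return (int32_t)out;
}

// Reference for one dp4a step: dot product of the 4 signed-byte lanes.
static int dot4_ref(int32_t v, int32_t u) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) {
        const int8_t a = (int8_t)(((uint32_t)v >> (8 * k)) & 0xFF);
        const int8_t b = (int8_t)(((uint32_t)u >> (8 * k)) & 0xFF);
        sum += a * b;
    }
    return sum;
}

int main() {
    // bits4 = 0b0101: lanes 0 and 2 carry +1, lanes 1 and 3 carry -1.
    const int32_t v = pack_signs(0b0101u);
    const int32_t u = (int32_t)(10u | (20u << 8) | (30u << 16) | (40u << 24));
    assert(dot4_ref(v, u) == 10 - 20 + 30 - 40); // = -20
    return 0;
}

Note the & 0xFF truncation in the deleted code: it is what stores -1 as the byte 0xFF so each packed lane reads back as a signed int8.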
