Commit 84ab75f

remove unused code, fix AMD MMA guard
1 parent 7c3501a commit 84ab75f

4 files changed: +7 -46 lines changed

ggml/src/ggml-cuda/dequantize.cuh

Lines changed: 5 additions & 5 deletions
@@ -15,12 +15,12 @@ static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const in
     const int byte_index_1 = bit_index_1 / 8;
     const int bit_offset_1 = bit_index_1 % 8;
 
-    // Extract bits: 1 = +d, 0 = -d
-    const uint8_t bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
-    const uint8_t bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
+    // Extract bits: 1 = +d, 0 = -d (branchless)
+    const int bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
+    const int bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
 
-    v.x = bit_0 ? d : neg_d;
-    v.y = bit_1 ? d : neg_d;
+    v.x = (2*bit_0 - 1) * d;
+    v.y = (2*bit_1 - 1) * d;
 }
 
 static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
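
The rewrite trades a per-component select for arithmetic on the extracted bit: 2*bit - 1 maps bit in {0, 1} onto the sign in {-1, +1}, so bit ? d : neg_d becomes a single multiply. A minimal host-side sketch (not part of the commit) verifying the mapping:

// Illustrative check of the branchless mapping used above.
#include <cassert>

static inline float dequant_bit(int bit, float d) {
    return (2*bit - 1) * d; // bit = 1 -> +d, bit = 0 -> -d
}

int main() {
    assert(dequant_bit(1, 0.5f) ==  0.5f);
    assert(dequant_bit(0, 0.5f) == -0.5f);
    return 0;
}

Either form typically compiles to branch-free code on GPUs; the arithmetic version just makes the intent explicit.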

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
@@ -305,8 +305,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         return false;
     }
 
-    // Q1_0 requires MMA (Turing+) — no DP4A fallback path
-    if (type == GGML_TYPE_Q1_0 && !turing_mma_available(cc)) {
+    // Q1_0 requires MMA — no DP4A fallback path
+    if (type == GGML_TYPE_Q1_0 && !turing_mma_available(cc) && !amd_mfma_available(cc) && !amd_wmma_available(cc)) {
         return false;
     }
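
This is the AMD MMA guard fix named in the commit message: the old condition keyed only on NVIDIA Turing MMA, so Q1_0 MMQ was rejected on AMD GPUs even when MFMA (CDNA) or WMMA (RDNA) matrix instructions were available. A standalone C++ sketch of the support predicate, with the availability helpers stubbed for illustration (the real helpers live in ggml's CUDA common headers and decode the device's compute-capability/architecture id cc):

// Stubs for illustration only; return values here are placeholders.
static bool turing_mma_available(int cc) { return cc >= 750; }       // assumption: NVIDIA Turing+
static bool amd_mfma_available(int cc)   { (void)cc; return false; } // stub: AMD CDNA MFMA
static bool amd_wmma_available(int cc)   { (void)cc; return false; } // stub: AMD RDNA WMMA

// Q1_0 has no DP4A fallback: MMQ is usable only if some
// matrix-multiply-accumulate instruction set is present.
static bool q1_0_mmq_supported(int cc) {
    return turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc);
}

By De Morgan's law, the guard in the diff is the negation of q1_0_mmq_supported: it returns false exactly when all three checks fail.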

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ using namespace ggml_cuda_mma;
 
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
-#define MMQ_ITER_K_Q1_0 128 // For Q1_0: QK1_0=128, QI1_0=4, so threads_per_row = 128/(4*4) = 8
 #define MMQ_ITER_K_MXFP4_FP4 512
 #define MMQ_NWARPS 8
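
The deleted constant is dead code per the commit message ("remove unused code"). For reference, the arithmetic in its comment does check out; a throwaway C++ assertion (not from the commit):

// QK1_0 = 128 weights per block, QI1_0 = 4 -> 128 / (4 * 4) = 8 threads per row.
static_assert(128 / (4 * 4) == 8, "Q1_0 threads_per_row");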

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 0 additions & 38 deletions
@@ -109,44 +109,6 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
 #define VDR_Q1_0_Q8_1_MMVQ 1 // Process one 32-element chunk at a time for parallelism
 #define VDR_Q1_0_Q8_1_MMQ 4 // Q1_0 has 128 bits (4 ints) per block
 
-template <int vdr> static __device__ __forceinline__ float vec_dot_q1_0_q8_1_impl(
-    const int * v, const int * u, const float & d1, const half2 & ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi = v[i];
-
-        // Unpack 32 bits into 32 signed values (-1 or +1)
-        // Each bit: 0 -> -1, 1 -> +1
-        int vi_bytes[8];
-
-#pragma unroll
-        for (int j = 0; j < 8; ++j) {
-            const int shift = j * 4;
-            const int bits4 = (vi >> shift) & 0x0F;
-
-            const int b0 = (bits4 & 0x01) ? 1 : -1;
-            const int b1 = (bits4 & 0x02) ? 1 : -1;
-            const int b2 = (bits4 & 0x04) ? 1 : -1;
-            const int b3 = (bits4 & 0x08) ? 1 : -1;
-
-            vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
-        }
-
-#pragma unroll
-        for (int j = 0; j < 8; ++j) {
-            sumi = ggml_cuda_dp4a(vi_bytes[j], u[8*i + j], sumi);
-        }
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // Q1_0 is symmetric (no offset), so we just multiply by scales
-    return d1 * ds8f.x * sumi;
-}
-
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ 4
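
The deleted helper was the DP4A-path dot product for Q1_0, unreachable now that the guard above requires an MMA path. Its core trick, expanding sign bits into packed int8 lanes of ±1 so dp4a can consume four weights per instruction, is captured in this host-side C++ reconstruction (illustrative only; pack_signs and dot4_ref are made-up names, and dot4_ref stands in for what dp4a computes per packed int):

#include <cassert>
#include <cstdint>

// Expand 4 sign bits into one int32 holding 4 signed bytes (each -1 or +1),
// mirroring the vi_bytes[j] packing in the deleted helper.
static int32_t pack_signs(uint32_t bits4) {
    uint32_t out = 0;
    for (int k = 0; k < 4; ++k) {
        const uint8_t s = ((bits4 >> k) & 1) ? 0x01 : 0xFF; // +1 or -1 as int8
        out |= (uint32_t)s << (8 * k);
    }
    return (int32_t)out;
}

// Reference for one dp4a step: dot product of the 4 signed-byte lanes.
static int dot4_ref(int32_t v, int32_t u) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) {
        const int8_t a = (int8_t)(((uint32_t)v >> (8 * k)) & 0xFF);
        const int8_t b = (int8_t)(((uint32_t)u >> (8 * k)) & 0xFF);
        sum += a * b;
    }
    return sum;
}

int main() {
    // bits4 = 0b0101: lanes 0 and 2 carry +1, lanes 1 and 3 carry -1.
    const int32_t v = pack_signs(0b0101u);
    const int32_t u = (int32_t)(10u | (20u << 8) | (30u << 16) | (40u << 24));
    assert(dot4_ref(v, u) == 10 - 20 + 30 - 40); // = -20
    return 0;
}

Note the & 0xFF truncation in the deleted code: it is what stores -1 as the byte 0xFF so each packed lane reads back as a signed int8.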
