qualcomm
diff --git a/‎ggml/src/ggml-opencl/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎ggml/src/ggml-opencl/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎ggml/src/ggml-opencl/ggml-opencl.cpp‎
Lines changed: 417 additions & 5 deletions b/‎ggml/src/ggml-opencl/ggml-opencl.cpp‎
Lines changed: 417 additions & 5 deletions
diff --git a/‎ggml/src/ggml-opencl/kernels/cvt.cl‎
Lines changed: 100 additions & 0 deletions b/‎ggml/src/ggml-opencl/kernels/cvt.cl‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl‎
Lines changed: 173 additions & 0 deletions b/‎ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl‎
Lines changed: 173 additions & 0 deletions
@@ -87,6 +87,10 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_1_f32_flat
     mul_mv_q4_k_f32
     mul_mv_q4_k_f32_flat
+    mul_mv_q5_0_f32
+    mul_mv_q5_0_f32_flat
+    mul_mv_q5_1_f32
+    mul_mv_q5_1_f32_flat
     mul_mv_q5_k_f32
     mul_mv_q5_k_f32_flat
     mul_mv_q6_k_f32
@@ -126,6 +130,8 @@ set(GGML_OPENCL_KERNELS
     mul_mm_f16_f32_l4_lm
     mul_mm_q4_0_f32_l4_lm
     mul_mm_q4_1_f32_l4_lm
+    mul_mm_q5_0_f32_l4_lm
+    mul_mm_q5_1_f32_l4_lm
     mul_mm_q8_0_f32_l4_lm
     mul_mm_iq4_nl_f32_l4_lm
     mul_mm_q4_k_f32_l4_lm
 
@@ -537,6 +537,53 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
     ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
 }
 
+//------------------------------------------------------------------------------
+// kernel_convert_block_q5_0
+// Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q5_0(
+    global struct block_q5_0 * src0,
+    global uchar * dst_qs,
+    global uint  * dst_qh,
+    global half  * dst_d,
+    ulong n_blk
+) {
+    if (get_global_id(0) >= n_blk) {
+        return;
+    }
+
+    global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
+    global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0);
+    global uint  * qh = (global uint  *) dst_qh + get_global_id(0);
+    global half  * d  = (global half  *) dst_d  + get_global_id(0);
+
+    *d = b->d;
+    *qh = *((global uint *)(b->qh));
+
+    for (int i = 0; i < QK5_0/2; ++i) {
+        qs[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q5_0(
+    global uchar * src_qs,
+    global uint  * src_qh,
+    global half  * src_d,
+    global struct block_q5_0 * dst
+) {
+    global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
+    global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0);
+    global uint  * qh = (global uint  *) src_qh + get_global_id(0);
+    global half  * d  = (global half  *) src_d  + get_global_id(0);
+
+    b->d = *d;
+    *((global uint *)(b->qh)) = *qh;
+    for (int i = 0; i < QK5_0/2; ++i) {
+        b->qs[i] = qs[i];
+    }
+}
+
 kernel void kernel_convert_block_q5_0_trans4_ns(
     __global struct block_q5_0 * src0,
     __global uint * dst_qs,
@@ -636,6 +683,59 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
     ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
 }
 
+//------------------------------------------------------------------------------
+// kernel_convert_block_q5_1
+// Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q5_1(
+    global struct block_q5_1 * src0,
+    global uchar * dst_qs,
+    global uint  * dst_qh,
+    global half  * dst_d,
+    global half  * dst_m,
+    ulong n_blk
+) {
+    if (get_global_id(0) >= n_blk) {
+        return;
+    }
+
+    global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
+    global uchar * qs = (global uchar *) dst_qs + (QK5_1/2)*get_global_id(0);
+    global uint  * qh = (global uint  *) dst_qh + get_global_id(0);
+    global half  * d  = (global half  *) dst_d  + get_global_id(0);
+    global half  * m  = (global half  *) dst_m  + get_global_id(0);
+
+    *d = b->d;
+    *m = b->m;
+    *qh = *((global uint *)(b->qh));
+
+    for (int i = 0; i < QK5_1/2; ++i) {
+        qs[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q5_1(
+    global uchar * src_qs,
+    global uint  * src_qh,
+    global half  * src_d,
+    global half  * src_m,
+    global struct block_q5_1 * dst
+) {
+    global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
+    global uchar * qs = (global uchar *) src_qs + (QK5_1/2)*get_global_id(0);
+    global uint  * qh = (global uint  *) src_qh + get_global_id(0);
+    global half  * d  = (global half  *) src_d  + get_global_id(0);
+    global half  * m  = (global half  *) src_m  + get_global_id(0);
+
+    b->d = *d;
+    b->m = *m;
+    *((global uint *)(b->qh)) = *qh;
+    for (int i = 0; i < QK5_1/2; ++i) {
+        b->qs[i] = qs[i];
+    }
+}
+
 kernel void kernel_convert_block_q5_1_trans4_ns(
     __global struct block_q5_1 * src0,
     __global uint * dst_qs,
 
@@ -0,0 +1,173 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define LOAD_VEC_A 8
+#define LOAD_VEC_B 4
+
+#define BM 64
+#define BN 64
+#define BK 32
+#define TM 4
+#define TN 8
+
+kernel void kernel_mul_mm_q5_0_f32_l4_lm(
+    global uchar4 * src0_qs,
+    global uint   * src0_qh,
+    global half   * src0_d,
+    global float4 * src1,
+    ulong offset1,
+    global float  * dst,
+    ulong offsetd,
+
+    int ne00,
+    int ne01,
+    int ne02,
+    int ne11,
+    int ne12,
+
+    int stride_a,
+    int stride_b,
+    int stride_d,
+
+    int batch_stride_a,
+    int batch_stride_b,
+    int batch_stride_d,
+
+    int r2,
+    int r3
+) {
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    local float buf_a[BM * BK];
+    local float buf_b[BN * BK];
+
+    const int batch_idx = get_global_id(2);
+
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
+
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
+
+    const int batch_idx_a = i03 * ne02 + i02;
+
+    const int ir = get_group_id(0);
+    const int ic = get_group_id(1);
+
+    const int tid = get_local_id(0);
+    const int th_r  = tid % (BM / TM);
+    const int th_c  = tid / (BM / TM);
+
+    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
+    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
+    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
+    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
+
+    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
+    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
+
+    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
+    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
+
+    float sums[TM * TN];
+    float cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (ir*BM + loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib  = idx / 4;
+                int iqs = idx % 4;
+
+                float d = (float)src0_d[ib];
+                uint qh_val = src0_qh[ib];
+
+                global uchar4 * qs_ptr = src0_qs + ib*4 + iqs;
+                uchar4 q = *qs_ptr;
+
+                uint qh_lo = qh_val >> (iqs * 4);
+                uint qh_hi = qh_val >> (iqs * 4 + 16);
+
+                uchar4 b_lo = (uchar4)((uchar)qh_lo, (uchar)(qh_lo >> 1), (uchar)(qh_lo >> 2), (uchar)(qh_lo >> 3)) & (uchar)1;
+                uchar4 b_hi = (uchar4)((uchar)qh_hi, (uchar)(qh_hi >> 1), (uchar)(qh_hi >> 2), (uchar)(qh_hi >> 3)) & (uchar)1;
+
+                float4 v1 = (convert_float4((q & (uchar)0x0F) | (b_lo << (uchar)4)) - 16.0f) * d;
+                float4 v2 = (convert_float4((q >> (uchar)4) | (b_hi << (uchar)4)) - 16.0f) * d;
+
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = v1.s0;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = v1.s1;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = v1.s2;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = v1.s3;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
+            } else {
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (ic*BN + loadc_b + l < ne11) {
+                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        pos_a += BK / LOAD_VEC_A;
+        pos_b += BK / LOAD_VEC_B;
+
+        for (int i = 0; i < BK; i++) {
+            for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
+            }
+
+            for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
+            }
+
+            for (int cc = 0; cc < TN; cc++) {
+                for (int cr = 0; cr < TM; cr++) {
+                    const int sums_idx = cc*TM + cr;
+                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    const int dr = ir * BM + th_r * TM;
+    const int dc = ic * BN + th_c * TN;
+
+    const int offsets = batch_idx * batch_stride_d;
+
+    for (int cc = 0; cc < TN; cc++) {
+        for (int cr = 0; cr < TM; cr++) {
+            if (dr + cr < ne01 && dc + cc < ne11) {
+                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
+            }
+        }
+    }
+}