
opencl: Q1_0 support first attempt #25

Draft
khosravipasha wants to merge 1 commit into prism from prism-android-new

Conversation

@khosravipasha
Collaborator

Just for testing...


Copilot AI left a comment


Pull request overview

Adds initial OpenCL backend support for GGML_TYPE_Q1_0, including AoS↔SoA conversion and new Q1_0 matmul/matvec kernels wired into the backend kernel loader and dispatch paths.

Changes:

  • Added block_q1_0 convert/restore kernels in cvt.cl and integrated them into tensor set/get paths.
  • Added new Q1_0 matvec kernels (*_8x_flat, *_1d_8x_flat) and a Q1_0 GEMM kernel (mul_mat_q1_0_Ab_Bi_8x4), with dispatch integration in ggml-opencl.cpp.
  • Added a new float32 transpose variant kernel (kernel_transpose_32_32) and registered new kernels in OpenCL CMake lists.

Reviewed changes

Copilot reviewed 7 out of 7 changed files in this pull request and generated 5 comments.

Summary per file:

  • ggml/src/ggml-opencl/kernels/transpose.cl: Adds a bounds-checked float32 transpose kernel variant intended for padded shapes.
  • ggml/src/ggml-opencl/kernels/mul_mv_q1_0_f32_8x_flat.cl: New Q1_0 SoA matvec kernel (8 outputs per subgroup).
  • ggml/src/ggml-opencl/kernels/mul_mv_q1_0_f32_1d_8x_flat.cl: New Q1_0 SoA matvec kernel variant for 1d/batch dispatch.
  • ggml/src/ggml-opencl/kernels/mul_mat_q1_0_Ab_Bi_8x4.cl: New Q1_0 GEMM kernel computing an 8x4 output tile per work-item.
  • ggml/src/ggml-opencl/kernels/cvt.cl: Adds block_q1_0 definition + convert/restore kernels (AoS↔SoA).
  • ggml/src/ggml-opencl/ggml-opencl.cpp: Wires Q1_0 programs/kernels, tensor set/get conversion, and mul_mat dispatch paths.
  • ggml/src/ggml-opencl/CMakeLists.txt: Registers the new OpenCL kernels for build/embed.


Comment on lines +51 to +63
// Pointers for 4 weight columns (SOA layout, row-major)
// For Q1_0: each block is 16 bytes (128 bits)
global const uchar* weight_base0 = src0_q + (gx_4 + 0) * num_blocks * 16;
global const uchar* weight_base1 = src0_q + (gx_4 + 1) * num_blocks * 16;
global const uchar* weight_base2 = src0_q + (gx_4 + 2) * num_blocks * 16;
global const uchar* weight_base3 = src0_q + (gx_4 + 3) * num_blocks * 16;

// Scale pointers for 4 columns
global const half* scale_ptr0 = src0_d + (gx_4 + 0) * num_blocks;
global const half* scale_ptr1 = src0_d + (gx_4 + 1) * num_blocks;
global const half* scale_ptr2 = src0_d + (gx_4 + 2) * num_blocks;
global const half* scale_ptr3 = src0_d + (gx_4 + 3) * num_blocks;
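The per-column base pointers above are plain strided offsets into the flat SoA buffers: column c starts at byte c * num_blocks * 16 in the quant buffer and at element c * num_blocks in the scale buffer. A host-side C sketch of the same arithmetic (the values are illustrative, not from the PR):

```c
#include <assert.h>
#include <stddef.h>

/* Byte offset of column `col` in the flat quant buffer:
 * num_blocks blocks per column, 16 bytes per Q1_0 block. */
static size_t weight_offset(int col, int num_blocks) {
    return (size_t)col * (size_t)num_blocks * 16;
}

/* Element offset of column `col` in the flat fp16 scale buffer:
 * one scale per block. */
static size_t scale_offset(int col, int num_blocks) {
    return (size_t)col * (size_t)num_blocks;
}
```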

Comment on lines +192 to +215
if (row_base + 0 < n_no_padding) {
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + (row_base + 0) * m + (gx << 2));
}
if (row_base + 1 < n_no_padding) {
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + (row_base + 1) * m + (gx << 2));
}
if (row_base + 2 < n_no_padding) {
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + (row_base + 2) * m + (gx << 2));
}
if (row_base + 3 < n_no_padding) {
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + (row_base + 3) * m + (gx << 2));
}
if (row_base + 4 < n_no_padding) {
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + (row_base + 4) * m + (gx << 2));
}
if (row_base + 5 < n_no_padding) {
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + (row_base + 5) * m + (gx << 2));
}
if (row_base + 6 < n_no_padding) {
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + (row_base + 6) * m + (gx << 2));
}
if (row_base + 7 < n_no_padding) {
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + (row_base + 7) * m + (gx << 2));
}
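The eight guarded vstore4 calls above scatter an 8-row by 4-column tile into a row-major output, skipping rows that fall into padding. A compact C model of that store pattern (names like `n_no_padding` follow the kernel; the loop form is a sketch, the kernel unrolls it):

```c
#include <assert.h>

/* Write an 8x4 tile into row-major dst of width m, starting at
 * (row_base, col_base). Rows at or past n_no_padding are padding
 * and must be left untouched, matching the per-row guards in the
 * kernel. */
static void store_tile(float *dst, int m, int row_base, int col_base,
                       float tile[8][4], int n_no_padding) {
    for (int r = 0; r < 8; ++r) {
        if (row_base + r < n_no_padding) {
            for (int c = 0; c < 4; ++c) {
                dst[(row_base + r) * m + col_base + c] = tile[r][c];
            }
        }
    }
}
```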
Comment on lines +1 to +10
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
Comment on lines +1 to +10
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
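Each vendor branch above defines only its own macro: the Intel path defines REQD_SUBGROUP_SIZE_16 and the Qualcomm path defines REQD_SUBGROUP_SIZE_64, with no empty fallback for the other. If a kernel in this file is annotated with the macro from the opposite path (and nothing elsewhere supplies a fallback, which it may), compilation would fail. A defensive pattern, sketched in plain C since only the preprocessor is involved:

```c
#include <assert.h>

/* Fallback: make both annotation macros expand to nothing when the
 * matching vendor extension is absent, so annotated kernels still
 * compile everywhere. This is a sketch; the real file may already
 * handle this differently. */
#ifndef REQD_SUBGROUP_SIZE_16
#define REQD_SUBGROUP_SIZE_16
#endif
#ifndef REQD_SUBGROUP_SIZE_64
#define REQD_SUBGROUP_SIZE_64
#endif

REQD_SUBGROUP_SIZE_64  /* expands to nothing without the extension */
static int annotated_stub(void) { return 1; }
```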
Comment on lines +10789 to +10791
size_t global_work_size[3] = {(size_t)((N + 7) / 8), (size_t)(M / 4), 1};
size_t local_work_size[3] = {1, 128, 1};
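With a local size of {1, 128, 1}, dimension 1 of the launch only satisfies OpenCL 1.x's uniform work-group requirement when M/4 is a multiple of 128 (dimension 0 is trivially fine with a local size of 1). The usual fix is to round the global size up to the next multiple of the local size and bounds-check inside the kernel; a sketch of the rounding helper (a common idiom, not code from this PR):

```c
#include <assert.h>
#include <stddef.h>

/* Round `global` up to the next multiple of `local`, as required for
 * uniform work-groups in OpenCL 1.x NDRange launches. */
static size_t round_up(size_t global, size_t local) {
    return (global + local - 1) / local * local;
}
```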


2 participants