Draft
Conversation
There was a problem hiding this comment.
Pull request overview
Adds initial OpenCL backend support for GGML_TYPE_Q1_0, including AoS↔SoA conversion and new Q1_0 matmul/matvec kernels wired into the backend kernel loader and dispatch paths.
Changes:
- Added block_q1_0 convert/restore kernels in cvt.cl and integrated them into tensor set/get paths.
- Added new Q1_0 matvec kernels (*_8x_flat, *_1d_8x_flat) and a Q1_0 GEMM kernel (mul_mat_q1_0_Ab_Bi_8x4), with dispatch integration in ggml-opencl.cpp.
- Added a new float32 transpose variant kernel (kernel_transpose_32_32) and registered the new kernels in the OpenCL CMake lists.
Reviewed changes
Copilot reviewed 7 out of 7 changed files in this pull request and generated 5 comments.
Show a summary per file
| File | Description |
|---|---|
| ggml/src/ggml-opencl/kernels/transpose.cl | Adds a bounds-checked float32 transpose kernel variant intended for padded shapes. |
| ggml/src/ggml-opencl/kernels/mul_mv_q1_0_f32_8x_flat.cl | New Q1_0 SoA matvec kernel (8 outputs per subgroup). |
| ggml/src/ggml-opencl/kernels/mul_mv_q1_0_f32_1d_8x_flat.cl | New Q1_0 SoA matvec kernel variant for 1d/batch dispatch. |
| ggml/src/ggml-opencl/kernels/mul_mat_q1_0_Ab_Bi_8x4.cl | New Q1_0 GEMM kernel computing an 8x4 output tile per work-item. |
| ggml/src/ggml-opencl/kernels/cvt.cl | Adds block_q1_0 definition + convert/restore kernels (AoS↔SoA). |
| ggml/src/ggml-opencl/ggml-opencl.cpp | Wires Q1_0 programs/kernels, tensor set/get conversion, and mul_mat dispatch paths. |
| ggml/src/ggml-opencl/CMakeLists.txt | Registers the new OpenCL kernels for build/embed. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Comment on lines
+51
to
+63
| // Pointers for 4 weight columns (SOA layout, row-major) | ||
| // For Q1_0: each block is 16 bytes (128 bits) | ||
| global const uchar* weight_base0 = src0_q + (gx_4 + 0) * num_blocks * 16; | ||
| global const uchar* weight_base1 = src0_q + (gx_4 + 1) * num_blocks * 16; | ||
| global const uchar* weight_base2 = src0_q + (gx_4 + 2) * num_blocks * 16; | ||
| global const uchar* weight_base3 = src0_q + (gx_4 + 3) * num_blocks * 16; | ||
|
|
||
| // Scale pointers for 4 columns | ||
| global const half* scale_ptr0 = src0_d + (gx_4 + 0) * num_blocks; | ||
| global const half* scale_ptr1 = src0_d + (gx_4 + 1) * num_blocks; | ||
| global const half* scale_ptr2 = src0_d + (gx_4 + 2) * num_blocks; | ||
| global const half* scale_ptr3 = src0_d + (gx_4 + 3) * num_blocks; | ||
|
|
Comment on lines
+192
to
+215
| if (row_base + 0 < n_no_padding) { | ||
| vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + (row_base + 0) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 1 < n_no_padding) { | ||
| vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + (row_base + 1) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 2 < n_no_padding) { | ||
| vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + (row_base + 2) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 3 < n_no_padding) { | ||
| vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + (row_base + 3) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 4 < n_no_padding) { | ||
| vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + (row_base + 4) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 5 < n_no_padding) { | ||
| vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + (row_base + 5) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 6 < n_no_padding) { | ||
| vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + (row_base + 6) * m + (gx << 2)); | ||
| } | ||
| if (row_base + 7 < n_no_padding) { | ||
| vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + (row_base + 7) * m + (gx << 2)); | ||
| } |
Comment on lines
+1
to
+10
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||
| #pragma OPENCL EXTENSION cl_khr_subgroups : enable | ||
|
|
||
| #ifdef cl_intel_required_subgroup_size | ||
| #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable | ||
| #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) | ||
| #elif defined(cl_qcom_reqd_sub_group_size) | ||
| #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||
| #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) | ||
| #endif |
Comment on lines
+1
to
+10
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||
| #pragma OPENCL EXTENSION cl_khr_subgroups : enable | ||
|
|
||
| #ifdef cl_intel_required_subgroup_size | ||
| #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable | ||
| #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) | ||
| #elif defined(cl_qcom_reqd_sub_group_size) | ||
| #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||
| #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) | ||
| #endif |
Comment on lines
+10789
to
+10791
| size_t global_work_size[3] = {(size_t)((N + 7) / 8), (size_t)(M / 4), 1}; | ||
| size_t local_work_size[3] = {1, 128, 1}; | ||
|
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Just for testing...