Skip to content

Commit c928f71

Browse files
Merge pull request #572 from SKaiNET-developers/feature/native-q4k-matmul
feat(native-cpu): native FFM Q4_K matmul kernel (PR 2 of 5)
2 parents 961d68f + 094023e commit c928f71

9 files changed

Lines changed: 551 additions & 22 deletions

File tree

skainet-backends/skainet-backend-native-cpu/build.gradle.kts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ kotlin {
1515
val jvmTest by getting {
1616
dependencies {
1717
implementation(libs.kotlin.test)
18+
// Parity tests compare NativeQ4KMatmulKernel output
19+
// against PanamaVectorQ4KMatmulKernel; the Panama
20+
// kernel pulls in parallelChunks which transitively
21+
// requires kotlinx-coroutines.
22+
implementation(project(":skainet-backends:skainet-backend-cpu"))
23+
implementation(libs.kotlinx.coroutines)
1824
}
1925
}
2026
}
@@ -106,10 +112,15 @@ tasks.named("jvmProcessResources") {
106112
dependsOn(packageNativeKernels)
107113
}
108114

115+
// Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test
116+
// JVM so Q4KMatmulMicrobenchTest activates. Skipped silently otherwise.
117+
val runBenchProperty = providers.systemProperty("skainet.runBench")
118+
109119
tasks.withType<Test>().configureEach {
110-
jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED")
120+
jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
121+
runBenchProperty.orNull?.let { systemProperty("skainet.runBench", it) }
111122
}
112123

113124
tasks.withType<JavaExec>().configureEach {
114-
jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED")
125+
jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
115126
}

skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ endif()
1111

1212
add_library(skainet_kernels SHARED
1313
src/skainet_smoke.c
14+
src/q4k_matmul.c
1415
)
1516

1617
target_include_directories(skainet_kernels PUBLIC
@@ -23,8 +24,15 @@ if(WIN32)
2324
set_target_properties(skainet_kernels PROPERTIES PREFIX "")
2425
endif()
2526

26-
# Hide non-exported symbols on ELF / Mach-O for a smaller surface area.
27+
# Hide non-exported symbols on ELF / Mach-O for a smaller surface area
28+
# and let the compiler auto-vectorize the Q4_K hot loop.
2729
if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
28-
target_compile_options(skainet_kernels PRIVATE -fvisibility=hidden -Wall -Wextra)
30+
target_compile_options(skainet_kernels PRIVATE
31+
-fvisibility=hidden
32+
-Wall -Wextra
33+
-O3
34+
-ffast-math
35+
-funroll-loops
36+
)
2937
set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
3038
endif()

skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,30 @@ extern "C" {
2525
*/
2626
SKAINET_API void skainet_smoke_double(const float* input, float* output, int32_t length);
2727

28+
/*
29+
* Q4_K matrix-vector multiply.
30+
*
31+
* output[output_offset + o] = sum_j input[input_offset + j] *
32+
* dequant(weight[block, o, j])
33+
*
34+
* Block layout: canonical ggml Q4_K, 256 elements per super-block, 144
35+
* bytes per block, with packed weights laid out as
36+
* weight + weight_byte_offset + (block_idx * output_dim + o) * 144
37+
*
38+
* Caller owns input/weight/output memory; the kernel does not retain
39+
* pointers past return. input_dim must be a multiple of 256.
40+
*/
41+
SKAINET_API void skainet_q4k_matmul(
42+
const float* input,
43+
int32_t input_offset,
44+
const uint8_t* weight,
45+
int32_t weight_byte_offset,
46+
int32_t input_dim,
47+
int32_t output_dim,
48+
float* output,
49+
int32_t output_offset
50+
);
51+
2852
#ifdef __cplusplus
2953
}
3054
#endif
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#include "skainet_kernels.h"
2+
3+
#include <stddef.h>
4+
#include <stdint.h>
5+
6+
#define Q4K_BLOCK_SIZE 256
7+
#define Q4K_SUB_BLOCK_SIZE 32
8+
#define Q4K_SUB_BLOCKS 8
9+
#define Q4K_BYTES_PER_BLOCK 144
10+
11+
/*
 * Widen an IEEE 754 binary16 bit pattern (already assembled from its
 * little-endian bytes by the caller) to a binary32 float.
 *
 * Handles every exponent class explicitly:
 *   - exponent 0, mantissa 0  -> signed zero
 *   - exponent 0, mantissa !0 -> subnormal, value = mantissa * 2^-24
 *   - exponent 31             -> infinity, or a quiet NaN when mantissa != 0
 *   - anything else           -> normal number, exponent re-biased 15 -> 127
 *
 * Intended to mirror PanamaVectorQ4KMatmulKernel.halfToFloat.
 */
static inline float skainet_half_to_float(uint16_t hbits) {
    union { uint32_t u; float f; } bits;
    const uint32_t sign_bit = ((uint32_t) (hbits >> 15) & 0x1u) << 31;
    const uint32_t exponent = (hbits >> 10) & 0x1Fu;
    const uint32_t mantissa = hbits & 0x3FFu;

    switch (exponent) {
    case 0u:
        if (mantissa != 0u) {
            /* Subnormal half: no implicit leading one, fixed 2^-14 scale. */
            const float magnitude = ((float) mantissa) / 1024.0f * (1.0f / 16384.0f);
            return sign_bit ? -magnitude : magnitude;
        }
        /* Signed zero: only the sign bit survives. */
        bits.u = sign_bit;
        break;
    case 0x1Fu:
        /* Inf keeps a zero mantissa; any NaN collapses to a quiet NaN. */
        bits.u = sign_bit | 0x7F800000u | (mantissa ? 0x00400000u : 0u);
        break;
    default:
        /* Normal: re-bias the exponent and left-align the 10-bit mantissa. */
        bits.u = sign_bit | ((exponent - 15u + 127u) << 23) | (mantissa << 13);
        break;
    }
    return bits.f;
}
37+
38+
/*
 * Unpack the 12-byte sub-scale region of a Q4_K block (bytes 4..15)
 * into eight 6-bit scale indices and eight 6-bit min indices, following
 * ggml's get_scale_min_k4 layout:
 *
 *   bytes 0..3  : low 6 bits = scale[0..3], high 2 bits = scale[4..7] bits 4..5
 *   bytes 4..7  : low 6 bits = min[0..3],   high 2 bits = min[4..7]   bits 4..5
 *   bytes 8..11 : low nibble = scale[4..7] bits 0..3,
 *                 high nibble = min[4..7]  bits 0..3
 *
 * scales    : pointer to the 12 packed bytes.
 * scale_idx : receives 8 values in [0, 63].
 * min_idx   : receives 8 values in [0, 63].
 */
static inline void skainet_q4k_decode_scales(
    const uint8_t* scales,
    int* scale_idx,
    int* min_idx
) {
    for (int k = 0; k < 4; ++k) {
        const int scale_byte = scales[k];
        const int min_byte = scales[k + 4];
        const int nibble_byte = scales[k + 8];

        /* Sub-blocks 0..3 live entirely in the low 6 bits. */
        scale_idx[k] = scale_byte & 0x3F;
        min_idx[k] = min_byte & 0x3F;

        /* Sub-blocks 4..7 are split: 4 low bits in a nibble, 2 high bits
         * borrowed from the top of the matching byte above. */
        scale_idx[k + 4] = (nibble_byte & 0x0F) | (((scale_byte >> 6) & 0x03) << 4);
        min_idx[k + 4] = ((nibble_byte >> 4) & 0x0F) | (((min_byte >> 6) & 0x03) << 4);
    }
}
61+
62+
/*
63+
* Native Q4_K matrix-vector multiply matching the
64+
* sk.ainet.backend.api.kernel.Q4KMatmulKernel SPI contract. Single
65+
* input row times an `outputDim x inputDim` Q4_K-packed weight tensor
66+
* laid out (blockIdx * outputDim + o) * 144 bytes.
67+
*
68+
* Lazy-dmin pattern: per sub-block accumulate
69+
* codeSum[s] = sum_i input[i] * code[i]
70+
* inputSum[s] = sum_i input[i]
71+
* and combine once via
72+
* acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s]
73+
*
74+
* Scalar single-threaded for PR 2; the tight inner loop is
75+
* straight-line FP arithmetic so -O3 auto-vectorizes the
76+
* codeSum/inputSum accumulators on AVX2/NEON.
77+
*/
78+
SKAINET_API void skainet_q4k_matmul(
79+
const float* __restrict__ input,
80+
int32_t input_offset,
81+
const uint8_t* __restrict__ weight,
82+
int32_t weight_byte_offset,
83+
int32_t input_dim,
84+
int32_t output_dim,
85+
float* __restrict__ output,
86+
int32_t output_offset
87+
) {
88+
if (output_dim <= 0 || input_dim <= 0) return;
89+
90+
const int32_t blocks_per_input_dim = input_dim / Q4K_BLOCK_SIZE;
91+
const float* in_base = input + input_offset;
92+
float* out_base = output + output_offset;
93+
94+
int scale_idx[Q4K_SUB_BLOCKS];
95+
int min_idx[Q4K_SUB_BLOCKS];
96+
97+
for (int32_t o = 0; o < output_dim; ++o) {
98+
float acc = 0.0f;
99+
100+
for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
101+
const uint8_t* block = weight + weight_byte_offset
102+
+ (size_t)(block_idx * output_dim + o) * Q4K_BYTES_PER_BLOCK;
103+
104+
/* d, dMin (FP16 LE -> FP32). */
105+
const uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
106+
const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8);
107+
const float d = skainet_half_to_float(d_bits);
108+
const float d_min = skainet_half_to_float(d_min_bits);
109+
110+
/* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */
111+
skainet_q4k_decode_scales(block + 4, scale_idx, min_idx);
112+
113+
const uint8_t* qs = block + 16;
114+
const float* in_block = in_base + (size_t) block_idx * Q4K_BLOCK_SIZE;
115+
116+
/* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
117+
for (int group_j = 0; group_j < 4; ++group_j) {
118+
const uint8_t* qs_group = qs + group_j * Q4K_SUB_BLOCK_SIZE;
119+
const int sb_lo = 2 * group_j;
120+
const int sb_hi = sb_lo + 1;
121+
const float* in_lo = in_block + sb_lo * Q4K_SUB_BLOCK_SIZE;
122+
const float* in_hi = in_block + sb_hi * Q4K_SUB_BLOCK_SIZE;
123+
124+
float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
125+
float code_sum_hi = 0.0f, input_sum_hi = 0.0f;
126+
127+
/* 32 iterations — auto-vectorizes cleanly under -O3. */
128+
for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) {
129+
const uint8_t b = qs_group[i];
130+
const float code_lo = (float)(b & 0x0F);
131+
const float code_hi = (float)(b >> 4);
132+
const float v_lo = in_lo[i];
133+
const float v_hi = in_hi[i];
134+
code_sum_lo += v_lo * code_lo;
135+
input_sum_lo += v_lo;
136+
code_sum_hi += v_hi * code_hi;
137+
input_sum_hi += v_hi;
138+
}
139+
140+
const float scale_lo = d * (float) scale_idx[sb_lo];
141+
const float offset_lo = d_min * (float) min_idx[sb_lo];
142+
const float scale_hi = d * (float) scale_idx[sb_hi];
143+
const float offset_hi = d_min * (float) min_idx[sb_hi];
144+
acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
145+
acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
146+
}
147+
}
148+
149+
out_base[o] = acc;
150+
}
151+
}

skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,24 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel
88
* Native (FFM) [KernelProvider]. Sits at priority `100`, above
99
* [PanamaVectorKernelProvider] (`50`) and the scalar reference (`0`).
1010
*
11-
* PR 1 of the staged native-FFM rollout (see the `native-ffm-plan`
12-
* asciidoc) only ships the module scaffolding: the Gradle ↔ CMake
13-
* pipeline that produces a host-arch shared library, its bundling into
14-
* JAR resources, and an end-to-end FFM smoke downcall test. No real
15-
* matmul kernel is wired into the public SPI yet.
11+
* Availability is gated on [NativeQ4KMatmulKernel.isAvailable] — the
12+
* bundled `libskainet_kernels` shared library has to load AND the
13+
* `skainet_q4k_matmul` symbol has to resolve via FFM. When either
14+
* fails (missing arch, sandbox, JDK without FFM, kill-switch),
15+
* `KernelRegistry.bestAvailable()` cleanly cascades to
16+
* [PanamaVectorKernelProvider] at priority 50.
1617
*
17-
* Until [NativeQ4KMatmulKernel] (or its `MemSegment`-input sibling)
18-
* lands in PR 2, this provider deliberately reports `isAvailable() =
19-
* false` and returns `null` from every kernel accessor. That keeps
20-
* `KernelRegistry.bestAvailable()` cleanly cascading down to the
21-
* Panama priority-50 provider on every shape we measure today, so
22-
* adding the new module to the classpath produces no behavior change.
18+
* PR 2 of the staged rollout: real Q4_K matmul wired into the SPI.
19+
* `matmulFp32` follows in a later PR alongside a native FP32 kernel.
2320
*/
2421
public object NativeKernelProvider : KernelProvider {
2522
override val name: String = "native-ffm"
2623
override val priority: Int = 100
2724

28-
override fun isAvailable(): Boolean = false
25+
override fun isAvailable(): Boolean = NativeQ4KMatmulKernel.isAvailable()
2926

3027
override fun matmulFp32(): Fp32MatmulKernel? = null
3128

32-
override fun matmulQ4K(): Q4KMatmulKernel? = null
29+
override fun matmulQ4K(): Q4KMatmulKernel? =
30+
if (NativeQ4KMatmulKernel.isAvailable()) NativeQ4KMatmulKernel else null
3331
}
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package sk.ainet.exec.kernel
2+
3+
import java.lang.foreign.Arena
4+
import java.lang.foreign.FunctionDescriptor
5+
import java.lang.foreign.Linker
6+
import java.lang.foreign.MemorySegment
7+
import java.lang.foreign.ValueLayout
8+
import java.lang.invoke.MethodHandle
9+
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
10+
11+
/**
12+
* Native (FFM) implementation of [Q4KMatmulKernel].
13+
*
14+
* Wraps the bundled C symbol
15+
*
16+
* void skainet_q4k_matmul(
17+
* const float* input, int32_t input_offset,
18+
* const uint8_t* weight, int32_t weight_byte_offset,
19+
* int32_t input_dim, int32_t output_dim,
20+
* float* output, int32_t output_offset);
21+
*
22+
* The C kernel implements the same lazy-`dmin` accumulation as
23+
* [PanamaVectorQ4KMatmulKernel] (sum input·code and sum input per
24+
* sub-block, combine via `d * scaleIdx[s] * codeSum - dMin * minIdx[s] * inputSum`)
25+
* and shares the canonical 256-element / 144-byte super-block layout.
26+
*
27+
* Numerical parity vs the Panama kernel is asserted by
28+
* [NativeQ4KMatmulKernelParityTest] within `1e-4` relative tolerance,
29+
* matching the parity bar `PanamaVectorQ4KMatmulKernelTest` uses.
30+
*
31+
* PR 2 of the staged native-FFM rollout: ships a single-threaded
32+
* scalar C kernel (`-O3 -ffast-math`, auto-vectorized inner loop).
33+
* NEON / AVX2 intrinsics, `MemorySegment`-input zero-copy variant,
34+
* and cross-arch CI shipping are deferred to PRs 3–5.
35+
*/
36+
internal object NativeQ4KMatmulKernel : Q4KMatmulKernel {
37+
38+
private const val BLOCK_SIZE = 256
39+
40+
fun isAvailable(): Boolean = handle != null
41+
42+
override fun matmul(
43+
input: FloatArray, inputOffset: Int,
44+
weight: ByteArray, weightByteOffset: Int,
45+
inputDim: Int, outputDim: Int,
46+
output: FloatArray, outputOffset: Int,
47+
) {
48+
require(inputDim % BLOCK_SIZE == 0) {
49+
"NativeQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
50+
}
51+
if (outputDim == 0 || inputDim == 0) return
52+
val mh = handle
53+
?: error("NativeQ4KMatmulKernel.matmul invoked while native library unavailable")
54+
55+
// The native kernel writes outputDim floats and only reads
56+
// inputDim floats + (inputDim/256)*outputDim*144 weight bytes,
57+
// so the segments size exactly to those windows. Heap-array
58+
// segments would also work but allocating off-heap copies keeps
59+
// the native side oblivious to the JVM heap layout (and lets
60+
// the same wrapper take MemorySegment-backed inputs in PR 3).
61+
Arena.ofConfined().use { arena ->
62+
val inSeg = arena.allocate(
63+
inputDim.toLong() * java.lang.Float.BYTES,
64+
ValueLayout.JAVA_FLOAT.byteAlignment(),
65+
)
66+
val outSeg = arena.allocate(
67+
outputDim.toLong() * java.lang.Float.BYTES,
68+
ValueLayout.JAVA_FLOAT.byteAlignment(),
69+
)
70+
val weightBytesUsed = ((inputDim / BLOCK_SIZE).toLong() * outputDim) * 144L
71+
val weightSeg = arena.allocate(weightBytesUsed, 1L)
72+
73+
MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim)
74+
MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt())
75+
76+
mh.invoke(
77+
inSeg, 0,
78+
weightSeg, 0,
79+
inputDim, outputDim,
80+
outSeg, 0,
81+
)
82+
83+
MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim)
84+
}
85+
}
86+
87+
private val handle: MethodHandle? by lazy {
88+
val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
89+
val symbol = lookup.find("skainet_q4k_matmul").orElse(null) ?: return@lazy null
90+
val descriptor = FunctionDescriptor.ofVoid(
91+
ValueLayout.ADDRESS, // input
92+
ValueLayout.JAVA_INT, // input_offset
93+
ValueLayout.ADDRESS, // weight
94+
ValueLayout.JAVA_INT, // weight_byte_offset
95+
ValueLayout.JAVA_INT, // input_dim
96+
ValueLayout.JAVA_INT, // output_dim
97+
ValueLayout.ADDRESS, // output
98+
ValueLayout.JAVA_INT, // output_offset
99+
)
100+
runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
101+
}
102+
}

0 commit comments

Comments
 (0)