Skip to content

Commit a3a22aa

Browse files
Merge pull request #786 from SKaiNET-developers/feature/native-cpu-arm64-neon-verify
native-cpu: verify NEON kernels on aarch64, add linuxArm64 test path
2 parents d6bdc34 + 453ff40 commit a3a22aa

8 files changed

Lines changed: 309 additions & 17 deletions

File tree

skainet-backends/skainet-backend-native-cpu/build.gradle.kts

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,22 @@ kotlin {
5959
}
6060
val linuxX64Main by getting { dependsOn(nativeMain) }
6161
val linuxArm64Main by getting { dependsOn(nativeMain) }
62-
val linuxX64Test by getting {
62+
// Shared K/N parity tests (cinterop kernel vs commonMain scalar reference),
63+
// run on BOTH linuxX64 (host, scalar/auto-vec archive) and linuxArm64
64+
// (cross-built NEON archive, executed under the K/N-bundled qemu-aarch64
65+
// or on the SL2610 board). Same tests, two codegens — this is how the
66+
// NEON paths get parity-checked (within tolerance) without board-only test code.
67+
val nativeTest by creating {
68+
dependsOn(commonTest.get())
6369
dependencies {
6470
implementation(libs.kotlin.test)
65-
// ScalarQ5_KMatmulKernel reference for the cinterop parity test.
71+
// ScalarQ*_KMatmulKernel / ScalarQ8_0MatmulKernel / ScalarMatmulKernel
72+
// references for the cinterop parity tests.
6673
implementation(project(":skainet-backends:skainet-backend-cpu"))
6774
}
6875
}
76+
val linuxX64Test by getting { dependsOn(nativeTest) }
77+
val linuxArm64Test by getting { dependsOn(nativeTest) }
6978
}
7079
}
7180

@@ -162,8 +171,9 @@ val packageNativeKernels by tasks.registering(Copy::class) {
162171
// host opts in. NativeLibraryLoader resolves native/linux-arm64/ from os.arch
163172
// at runtime, so the consuming side needs no change once this .so is bundled.
164173
//
165-
// BOARD-VERIFY-PENDING: the NEON code is syntax-validated for aarch64 but has
166-
// not been executed; run the parity tests under QEMU or on the SL2610.
174+
// NEON parity verified under qemu-aarch64 (see skainet_simd.h banner):
175+
// ./gradlew :skainet-backends:skainet-backend-native-cpu:linuxArm64Test -PcrossArm64=true
176+
// runs the shared nativeTest parity suite against the cross-built archive.
167177
val crossArm64Enabled: Boolean = (findProperty("crossArm64") as String?)?.toBoolean() == true
168178
val aarch64Cc: String = (findProperty("skainetAarch64Cc") as String?) ?: "aarch64-linux-gnu-gcc"
169179
val cmakeBuildArm64Path: String = layout.buildDirectory.dir("native/cmake-build-arm64").get().asFile.absolutePath
@@ -173,7 +183,10 @@ val toolchainFilePath = "$nativeSourcePath/toolchain-aarch64.cmake"
173183
val configureNativeKernelsArm64 by tasks.registering(Exec::class) {
174184
group = "build"
175185
description = "CMake configure for the aarch64 (NEON) native kernels (cross-compile)."
176-
onlyIf { crossArm64Enabled }
186+
// Local copy so the onlyIf lambda captures a Boolean, not the build script
187+
// (script object references break configuration-cache serialization).
188+
val enabled = crossArm64Enabled
189+
onlyIf { enabled }
177190
inputs.file("$nativeSourcePath/CMakeLists.txt")
178191
inputs.dir("$nativeSourcePath/src")
179192
inputs.dir("$nativeSourcePath/include")
@@ -191,7 +204,8 @@ val configureNativeKernelsArm64 by tasks.registering(Exec::class) {
191204
val buildNativeKernelsArm64 by tasks.registering(Exec::class) {
192205
group = "build"
193206
description = "Cross-build the aarch64 (NEON) native kernels shared library."
194-
onlyIf { crossArm64Enabled }
207+
val enabled = crossArm64Enabled
208+
onlyIf { enabled }
195209
dependsOn(configureNativeKernelsArm64)
196210
inputs.file("$nativeSourcePath/CMakeLists.txt")
197211
inputs.dir("$nativeSourcePath/src")
@@ -203,7 +217,8 @@ val buildNativeKernelsArm64 by tasks.registering(Exec::class) {
203217
val packageNativeKernelsArm64 by tasks.registering(Copy::class) {
204218
group = "build"
205219
description = "Stage the cross-built aarch64 native kernels into JVM resources."
206-
onlyIf { crossArm64Enabled }
220+
val enabled = crossArm64Enabled
221+
onlyIf { enabled }
207222
dependsOn(buildNativeKernelsArm64)
208223
from(cmakeBuildArm64Path) {
209224
include("libskainet_kernels.so")
@@ -229,6 +244,32 @@ if (crossArm64Enabled) {
229244
tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxArm64") }.configureEach {
230245
dependsOn(buildNativeKernelsArm64)
231246
}
247+
248+
// The Kotlin Gradle plugin does not create a run task for non-host K/N test
249+
// binaries (only linkDebugTestLinuxArm64 / linuxArm64TestBinaries), so wire
250+
// one explicitly: run test.kexe under qemu-aarch64 (user-mode emulation).
251+
// Defaults point at the K/N-bundled dependencies (~/.konan/dependencies):
252+
// the same qemu K/N itself uses for linux_x64 -> linux_arm64 test emulation
253+
// (konan.properties emulatorExecutable.linux_x64-linux_arm64) and the glibc
254+
// sysroot the binary was linked against. Override with -PskainetQemu /
255+
// -PskainetAarch64Sysroot (e.g. /usr/bin/qemu-aarch64-static and
256+
// /usr/aarch64-linux-gnu for the distro toolchain). On the SL2610 board
257+
// itself, just push and run test.kexe directly — no qemu involved.
258+
val konanDeps = "${System.getProperty("user.home")}/.konan/dependencies"
259+
val qemuAarch64: String = (findProperty("skainetQemu") as String?)
260+
?: "$konanDeps/qemu-aarch64-static-5.1.0-linux-2/qemu-aarch64"
261+
val aarch64Sysroot: String = (findProperty("skainetAarch64Sysroot") as String?)
262+
?: "$konanDeps/aarch64-unknown-linux-gnu-gcc-8.3.0-glibc-2.25-kernel-4.9-2/aarch64-unknown-linux-gnu/sysroot"
263+
val testKexePath: String =
264+
layout.buildDirectory.file("bin/linuxArm64/debugTest/test.kexe").get().asFile.absolutePath
265+
266+
tasks.register<Exec>("linuxArm64Test") {
267+
group = "verification"
268+
description = "Run the linuxArm64 K/N tests under qemu-aarch64 (NEON kernel parity vs scalar reference)."
269+
dependsOn("linkDebugTestLinuxArm64")
270+
inputs.file(testKexePath)
271+
commandLine(qemuAarch64, "-L", aarch64Sysroot, testKexePath)
272+
}
232273
}
233274

234275
// Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test

skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,19 @@
1111
* with the right -march). `__ARM_FEATURE_DOTPROD` / `__ARM_FEATURE_MATMUL_INT8`
1212
* are gated on the build flags (`-march=armv8.2-a+dotprod`, etc.).
1313
*
14-
* BOARD-VERIFY-PENDING: the NEON paths in this tree compile to the scalar
15-
* fallback on the x86 build host and have NOT been executed on aarch64.
16-
* They must be built with the cross toolchain and bit-exact-checked under
17-
* QEMU or on the SL2610 before being relied on.
14+
* AARCH64-VERIFIED (2026-07-02): the NEON paths (fp32 vfmaq_f32, q4k
15+
* vdotq_s32 dotprod, q5k, q6k, q8_0) were cross-built with
16+
* `-march=armv8.2-a+fp16+dotprod` (aarch64 gcc 8.3, the K/N-bundled
17+
* toolchain) and parity-checked against the commonMain scalar references:
18+
* - under qemu-aarch64 via
19+
* ./gradlew :skainet-backends:skainet-backend-native-cpu:linuxArm64Test -PcrossArm64=true
20+
* - AND on the physical SL2610 board (Cortex-A55, aarch64): the same
21+
* test.kexe run natively on-device — 23/23 tests green, no SIGILL.
22+
* `/proc/cpuinfo` confirmed asimddp + fphp/asimdhp present and i8mm
23+
* absent, matching the chosen -march (no +i8mm).
24+
* The linked archive was confirmed to contain udot/sdot + fmla, i.e. the
25+
* SIMD paths — not the scalar fallback — executed. bf16 and q4_0 have no
26+
* NEON path (scalar only).
1827
*/
1928

2029
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package sk.ainet.exec.kernel
2+
3+
import kotlin.math.abs
4+
import kotlin.random.Random
5+
import kotlin.test.Test
6+
import kotlin.test.assertTrue
7+
import kotlinx.cinterop.ExperimentalForeignApi
8+
import kotlinx.cinterop.addressOf
9+
import kotlinx.cinterop.usePinned
10+
import sk.ainet.kernels.cinterop.skainet_fp32_matmul
11+
12+
/**
 * Parity test for the FP32 SGEMM reached through Kotlin/Native cinterop.
 *
 * The C entry point `skainet_fp32_matmul` is invoked directly (no NativeKn
 * wrapper object exists for FP32 yet; `NativeKnKernelProvider.matmulFp32()`
 * is null) and its output is compared with the commonMain
 * [ScalarMatmulKernel] reference. Agreement is required only within an FMA +
 * `-ffast-math` reassociation tolerance, not bit-for-bit.
 *
 * The suite is shared between linuxX64 (host archive: scalar/auto-vectorized)
 * and linuxArm64 (cross-built archive: NEON `vfmaq_f32` over the n
 * dimension), so the aarch64 run exercises the `SKAINET_HAVE_NEON` path in
 * fp32_matmul.c. Shapes with n % 4 != 0 are included on purpose so that both
 * the 4-lane vector loop and the scalar tail are hit.
 */
@OptIn(ExperimentalForeignApi::class)
class NativeKnFp32MatmulParityTest {

    /**
     * Pins [a], [b] and [c] for the duration of the call and hands their base
     * addresses to `skainet_fp32_matmul`.
     *
     * Each buffer is passed as (pointer, 0, k | n | n) followed by m, n, k —
     * presumably (base, offset, row stride); confirm against the C header.
     */
    private fun cinteropMatmul(
        a: FloatArray, b: FloatArray, c: FloatArray,
        m: Int, n: Int, k: Int,
    ) {
        a.usePinned { pinnedA ->
            b.usePinned { pinnedB ->
                c.usePinned { pinnedC ->
                    val aPtr = pinnedA.addressOf(0)
                    val bPtr = pinnedB.addressOf(0)
                    val cPtr = pinnedC.addressOf(0)
                    skainet_fp32_matmul(
                        aPtr, 0, k,
                        bPtr, 0, n,
                        cPtr, 0, n,
                        m, n, k,
                    )
                }
            }
        }
    }

    /**
     * Multiplies a seeded random (m x k) by (k x n) pair with both kernels and
     * asserts that every output element matches.
     *
     * An element passes when its absolute difference is at most [tol] OR its
     * relative difference is below 1e-4.
     */
    private fun assertParity(m: Int, n: Int, k: Int, seed: Int, tol: Float) {
        val random = Random(seed)
        // One generator feeds both operands: all of `lhs` is drawn before `rhs`.
        val lhs = FloatArray(m * k) { random.nextFloat() - 0.5f }
        val rhs = FloatArray(k * n) { random.nextFloat() - 0.5f }

        val expected = FloatArray(m * n)
        ScalarMatmulKernel.matmul(lhs, 0, k, rhs, 0, n, expected, 0, n, m, n, k)

        val actual = FloatArray(m * n)
        cinteropMatmul(lhs, rhs, actual, m, n, k)

        expected.forEachIndexed { idx, want ->
            val got = actual[idx]
            val diff = abs(want - got)
            // The 1e-9 term keeps the ratio finite when the reference is exactly 0.
            val rel = diff / (abs(want) + 1e-9f)
            val withinTolerance = diff <= tol || rel < 1e-4f
            assertTrue(
                withinTolerance,
                "elem $idx (row ${idx / n}, col ${idx % n}) diverged: " +
                    "scalar=$want cinterop=$got diff=$diff rel=$rel tol=$tol",
            )
        }
    }

    @Test
    fun tail_only_shape() {
        assertParity(m = 2, n = 3, k = 16, seed = 42, tol = 1e-4f)
    }

    @Test
    fun vector_plus_tail_shape() {
        assertParity(m = 3, n = 7, k = 32, seed = 7, tol = 1e-4f)
    }

    @Test
    fun aligned_shape() {
        assertParity(m = 4, n = 64, k = 128, seed = 123, tol = 1e-3f)
    }

    @Test
    fun matvec_row_shape() {
        assertParity(m = 1, n = 513, k = 256, seed = 321, tol = 1e-3f)
    }

    @Test
    fun llm_typical_shape() {
        assertParity(m = 8, n = 300, k = 1024, seed = 999, tol = 1e-2f)
    }
}
}

skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt renamed to skainet-backends/skainet-backend-native-cpu/src/nativeTest/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt

File renamed without changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package sk.ainet.exec.kernel
2+
3+
import kotlin.math.sqrt
4+
import kotlin.random.Random
5+
import kotlin.test.Test
6+
import kotlin.test.assertTrue
7+
8+
/**
 * Parity test for the Kotlin/Native cinterop path of the Q4_K matmul:
 * [NativeKnQ4KMatmulKernel] (which calls the C `skainet_q4k_matmul` via
 * cinterop, linked from libskainet_kernels.a) must agree with the commonMain
 * [ScalarQ4_KMatmulKernel] exact-float reference.
 *
 * Runs on linuxX64 (host archive: scalar/auto-vectorized) AND linuxArm64
 * (cross-built archive: NEON + vdotq_s32), so the aarch64 run parity-checks
 * the `SKAINET_HAVE_DOTPROD` path in q4k_matmul.c against the scalar
 * reference.
 *
 * IMPORTANT: the C kernel quantizes the activation to int8 (Q8) for the
 * ggml-style dotprod fast path — deliberately lossy, so it is NOT bit-exact vs
 * the float reference. Per-row relative error is the wrong gate (a true-zero
 * row shows unbounded relative error); the meaningful metric is aggregate
 * error energy RMS(error)/RMS(signal), same as the JVM
 * `NativeQ4KMatmulKernelParityTest`.
 */
class NativeKnQ4KMatmulKernelParityTest {

    /**
     * Builds [numBlocks] Q4_K blocks of seeded random bytes, then overwrites the
     * first four bytes of every block with two little-endian 0x3C00 half floats.
     */
    private fun randomQ4KBytes(numBlocks: Int, seed: Int): ByteArray {
        val bytes = ByteArray(numBlocks * BYTES_PER_BLOCK)
        Random(seed).nextBytes(bytes)
        for (block in 0 until numBlocks) {
            val base = block * BYTES_PER_BLOCK
            // 0x3C00 == 1.0f16 for d and dMin so dequant stays finite.
            bytes[base + 0] = 0x00.toByte()
            bytes[base + 1] = 0x3C.toByte()
            bytes[base + 2] = 0x00.toByte()
            bytes[base + 3] = 0x3C.toByte()
        }
        return bytes
    }

    /**
     * Runs both kernels on the same seeded data and bounds the aggregate error.
     *
     * Passes when RMS(error)/RMS(signal) is below [AGG_REL_TOL] OR the absolute
     * RMS error is below [tol].
     *
     * @param inputDim number of input elements per row; must be a positive
     *   multiple of [BLOCK_SIZE], otherwise the packed buffer would silently
     *   drop the trailing partial block.
     * @param outputDim number of output rows; must be positive so the RMS
     *   averages are well defined.
     * @param seed seed for both the packed weights and the activation vector.
     * @param tol absolute floor on the RMS error.
     */
    private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) {
        require(inputDim > 0 && inputDim % BLOCK_SIZE == 0) {
            "inputDim must be a positive multiple of $BLOCK_SIZE, was $inputDim"
        }
        require(outputDim > 0) { "outputDim must be positive, was $outputDim" }

        val numBlocks = (inputDim / BLOCK_SIZE) * outputDim
        val packed = randomQ4KBytes(numBlocks, seed)
        // Every element comes from its own generator seeded with seed + index;
        // kept as-is so the test data stays identical to earlier runs.
        val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f }

        val refOut = FloatArray(outputDim)
        ScalarQ4_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0)

        val knOut = FloatArray(outputDim)
        NativeKnQ4KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, knOut, 0)

        // Aggregate RMS gate (see class kdoc): Q8-activation quantization makes
        // per-row float parity meaningless; bound the total error energy instead.
        var sqErr = 0.0
        var sqSig = 0.0
        for (o in 0 until outputDim) {
            val ref = refOut[o].toDouble()
            val d = ref - knOut[o].toDouble()
            sqErr += d * d
            sqSig += ref * ref
        }
        val rmsErr = sqrt(sqErr / outputDim)
        val rmsSig = sqrt(sqSig / outputDim)
        val relRms = rmsErr / (rmsSig + 1e-9)
        assertTrue(
            relRms < AGG_REL_TOL || rmsErr < tol,
            // Report BOTH bounds: the relative one and the absolute floor.
            "Q8 parity exceeded: relRms=$relRms (rmsErr=$rmsErr rmsSig=$rmsSig) " +
                "over $outputDim rows, relTol=$AGG_REL_TOL absTol=$tol",
        )
    }

    @Test
    fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f)

    @Test
    fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f)

    @Test
    fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f)

    @Test
    fun llm_typical_shape() = assertParity(4096, 64, 999, 5e-1f)

    private companion object {
        // Q4_K layout used by this test: 256 elements packed into 144 bytes.
        const val BLOCK_SIZE = 256
        const val BYTES_PER_BLOCK = 144

        // Aggregate Q8-activation RMS-relative-error bound (uniform-random worst
        // case) — same bar as the JVM NativeQ4KMatmulKernelParityTest.
        const val AGG_REL_TOL = 0.03
    }
}

skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt renamed to skainet-backends/skainet-backend-native-cpu/src/nativeTest/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ import kotlin.test.assertTrue
1111
* agree with the commonMain [ScalarQ5_KMatmulKernel] reference within FMA +
1212
* `-ffast-math` reassociation tolerance.
1313
*
14-
* This is the host (linuxX64) de-risking of the board (linuxArm64) consumption:
15-
* the cinterop mechanism + kernel correctness are verified here; only the NEON
16-
* codegen differs on aarch64 (board-verify-pending).
14+
* Runs on linuxX64 (host archive: scalar/auto-vectorized) AND linuxArm64
15+
* (cross-built archive: NEON), so the aarch64 run parity-checks the
16+
* `SKAINET_HAVE_NEON` path in q5k_matmul.c against the scalar reference.
1717
*/
1818
class NativeKnQ5KMatmulKernelParityTest {
1919

skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ6KMatmulKernelParityTest.kt renamed to skainet-backends/skainet-backend-native-cpu/src/nativeTest/kotlin/sk/ainet/exec/kernel/NativeKnQ6KMatmulKernelParityTest.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ import kotlin.test.assertTrue
1111
* agree with the commonMain [ScalarQ6_KMatmulKernel] reference within FMA +
1212
* `-ffast-math` reassociation tolerance.
1313
*
14-
* This is the host (linuxX64) de-risking of the board (linuxArm64) consumption:
15-
* the cinterop mechanism + kernel correctness are verified here; only the NEON
16-
* codegen differs on aarch64 (board-verify-pending). Q6_K magnitudes (codes
14+
* Runs on linuxX64 (host archive: scalar/auto-vectorized) AND linuxArm64
15+
* (cross-built archive: NEON), so the aarch64 run parity-checks the
16+
* `SKAINET_HAVE_NEON` path in q6k_matmul.c. Q6_K magnitudes (codes
1717
* [-32, 31] × signed int8 scales) are larger than Q5_K, so absolute tolerances
1818
* are a touch looser; the `rel < 1e-4` relative check is the real gate.
1919
*/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package sk.ainet.exec.kernel
2+
3+
import kotlin.math.abs
4+
import kotlin.random.Random
5+
import kotlin.test.Test
6+
import kotlin.test.assertTrue
7+
8+
/**
 * Parity test for the Kotlin/Native cinterop path of the Q8_0 matmul:
 * [NativeKnQ8_0MatmulKernel] (which calls the C `skainet_q8_0_matmul` via
 * cinterop, linked from libskainet_kernels.a) must agree with the commonMain
 * [ScalarQ8_0MatmulKernel] reference within FMA + `-ffast-math`
 * reassociation tolerance.
 *
 * Runs on linuxX64 (host archive: scalar/auto-vectorized) AND linuxArm64
 * (cross-built archive: NEON + sdot), so the aarch64 run parity-checks the
 * `SKAINET_HAVE_NEON` path in q8_0_matmul.c against the scalar reference.
 * Q8_0 blocks are 32 elements / 34 bytes (FP16 `d` + 32 signed int8 codes);
 * full-range random codes exercise the signed widening/dot paths.
 */
class NativeKnQ8_0MatmulKernelParityTest {

    /**
     * Builds [numBlocks] Q8_0 blocks of seeded random bytes, then overwrites the
     * first two bytes of every block with a little-endian 0x3C00 half float.
     */
    private fun randomQ8_0Bytes(numBlocks: Int, seed: Int): ByteArray {
        val bytes = ByteArray(numBlocks * BYTES_PER_BLOCK)
        Random(seed).nextBytes(bytes)
        for (block in 0 until numBlocks) {
            val base = block * BYTES_PER_BLOCK
            // 0x3C00 == 1.0f16 for the per-block scale so dequant stays finite.
            bytes[base + 0] = 0x00.toByte()
            bytes[base + 1] = 0x3C.toByte()
        }
        return bytes
    }

    /**
     * Runs both kernels on the same seeded data and compares every output row.
     *
     * A row passes when its absolute difference is at most [tol] OR its
     * relative difference is below 1e-4.
     *
     * @param inputDim number of input elements per row; must be a positive
     *   multiple of [BLOCK_SIZE], otherwise the packed buffer would silently
     *   drop the trailing partial block.
     * @param outputDim number of output rows; must be positive.
     * @param seed seed for both the packed weights and the activation vector.
     * @param tol absolute per-row tolerance.
     */
    private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) {
        require(inputDim > 0 && inputDim % BLOCK_SIZE == 0) {
            "inputDim must be a positive multiple of $BLOCK_SIZE, was $inputDim"
        }
        require(outputDim > 0) { "outputDim must be positive, was $outputDim" }

        val numBlocks = (inputDim / BLOCK_SIZE) * outputDim
        val packed = randomQ8_0Bytes(numBlocks, seed)
        // Every element comes from its own generator seeded with seed + index;
        // kept as-is so the test data stays identical to earlier runs.
        val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f }

        val refOut = FloatArray(outputDim)
        ScalarQ8_0MatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0)

        val knOut = FloatArray(outputDim)
        NativeKnQ8_0MatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, knOut, 0)

        for (o in 0 until outputDim) {
            val diff = abs(refOut[o] - knOut[o])
            // The 1e-9 term keeps the ratio finite when the reference is exactly 0.
            val rel = diff / (abs(refOut[o]) + 1e-9f)
            assertTrue(
                diff <= tol || rel < 1e-4f,
                "row $o diverged: scalar=${refOut[o]} cinterop=${knOut[o]} diff=$diff rel=$rel tol=$tol",
            )
        }
    }

    @Test
    fun single_block_single_row() = assertParity(32, 1, 42, 1e-2f)

    @Test
    fun single_block_multi_row() = assertParity(32, 16, 7, 1e-2f)

    @Test
    fun multi_block_multi_row() = assertParity(1024, 64, 123, 2e-1f)

    @Test
    fun llm_typical_shape() = assertParity(4096, 64, 999, 2e0f)

    private companion object {
        // Q8_0 layout used by this test: 32 elements packed into 34 bytes.
        const val BLOCK_SIZE = 32
        const val BYTES_PER_BLOCK = 34
    }
}

0 commit comments

Comments
 (0)