Skip to content

Commit e088933

Browse files
michalharakal and claude committed
Add Q8_0MatmulMicrobenchTest (gated on -Dskainet.runBench=true)
Median-of-N wall-clock comparison of ScalarQ8_0MatmulKernel vs PanamaVectorQ8_0MatmulKernel vs NativeQ8_0MatmulKernel at LLM-typical matrix-vector projection shapes. Mirrors Q4KMatmulMicrobenchTest; same `-Dskainet.runBench=true` gate.

Result on Linux x86_64 / JDK 21 (Intel reference machine):

[inputDim=1024 outputDim=1024]
  scalar: median=3267 µs
  panama: median=658 µs (4.96x scalar)
  native: median=350 µs (9.33x scalar, 1.88x panama)

[inputDim=2048 outputDim=2048]
  scalar: median=7419 µs
  panama: median=2637 µs (2.81x scalar)
  native: median=1611 µs (4.60x scalar, 1.64x panama)

[inputDim=4096 outputDim=4096]
  scalar: median=30608 µs
  panama: median=11485 µs (2.67x scalar)
  native: median=10714 µs (2.86x scalar, 1.07x panama)

Panama is competitive here (unlike BF16): the `reduceLanes(ADD) * d` fold per block keeps dequant out of the inner FMA, so the vectorized accumulator actually pays off. At 4096² the Panama and native kernels converge (1.07x apart) because memory bandwidth dominates over compute cleverness.

Refs #604.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c22fc2e commit e088933

1 file changed

Lines changed: 109 additions & 0 deletions

File tree

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package sk.ainet.exec.kernel
2+
3+
import kotlin.math.abs
4+
import kotlin.random.Random
5+
import kotlin.test.Test
6+
import kotlin.test.assertTrue
7+
8+
/**
 * Wall-clock microbench comparing [ScalarQ8_0MatmulKernel],
 * [PanamaVectorQ8_0MatmulKernel] and [NativeQ8_0MatmulKernel] at
 * LLM-typical matrix-vector projection shapes (`inputDim × outputDim`).
 *
 * Skipped by default; pass `-Dskainet.runBench=true` to enable. Mirrors
 * `Q4KMatmulMicrobenchTest` and `Bf16MatmulMicrobenchTest` in structure.
 *
 * For every shape each kernel is warmed up, then timed over a fixed number of
 * samples. The per-kernel median and minimum are printed, followed by the
 * speed-up ratios derived from the medians.
 */
class Q8_0MatmulMicrobenchTest {

    /** Number of weights covered by one Q8_0 block; `inputDim` must be a multiple of this. */
    private val blockSize = 32

    /**
     * Encoded size of one Q8_0 block in bytes.
     *
     * NOTE(review): presumably a 2-byte fp16 scale followed by 32 int8 quants
     * (2 + 32 = 34) — confirm against the kernels' decoding.
     */
    private val bytesPerBlock = 34

    /**
     * Builds a deterministic pseudo-random weight buffer holding
     * `blocksPerInputDim * outputDim` blocks of [bytesPerBlock] bytes each.
     *
     * The whole buffer is first filled from a [Random] seeded with [seed]; the
     * first two bytes of every block are then overwritten with `0x00, 0x22`.
     *
     * NOTE(review): those two bytes look like the block's little-endian fp16
     * scale (0x2200 ≈ 0.0117), pinned so that random bytes cannot decode to a
     * NaN/Inf scale — confirm against the kernels.
     *
     * @param blocksPerInputDim number of blocks per output row (`inputDim / blockSize`).
     * @param outputDim number of output rows.
     * @param seed seed for the byte generator; the same seed yields the same buffer.
     * @return the encoded weight bytes.
     */
    private fun randomQ8_0Bytes(blocksPerInputDim: Int, outputDim: Int, seed: Int): ByteArray {
        val blockCount = blocksPerInputDim * outputDim
        val bytes = ByteArray(blockCount * bytesPerBlock)
        Random(seed).nextBytes(bytes)
        for (block in 0 until blockCount) {
            val base = block * bytesPerBlock
            bytes[base] = 0x00.toByte()
            bytes[base + 1] = 0x22.toByte()
        }
        return bytes
    }

    /**
     * Returns the median of [values]; the input array is not modified.
     *
     * For an odd count this is the middle element of the sorted values. For an
     * even count it is the integer mean of the two middle elements (rounded
     * towards the lower one) rather than simply the upper-middle element.
     *
     * @throws IllegalArgumentException if [values] is empty.
     */
    private fun median(values: LongArray): Long {
        require(values.isNotEmpty()) { "median of an empty sample set is undefined" }
        val sorted = values.sortedArray()
        val mid = sorted.size / 2
        if (sorted.size % 2 == 1) return sorted[mid]
        val lower = sorted[mid - 1]
        // Midpoint written as lower + half the gap to avoid overflowing on the sum.
        return lower + (sorted[mid] - lower) / 2
    }

    /**
     * Times [run] and prints a one-line summary for it.
     *
     * [run] is first invoked [warmup] times untimed, then [samples] times with
     * each invocation measured individually via [System.nanoTime].
     *
     * @param label name printed in front of the summary line.
     * @param warmup number of untimed invocations before measuring.
     * @param samples number of timed invocations; must be positive.
     * @param run the workload to measure.
     * @return the median sample duration in nanoseconds.
     * @throws IllegalArgumentException if [samples] is not positive.
     */
    private fun benchOne(label: String, warmup: Int, samples: Int, run: () -> Unit): Long {
        require(samples > 0) { "samples must be positive, was $samples" }
        repeat(warmup) { run() }
        val timings = LongArray(samples)
        for (i in timings.indices) {
            val start = System.nanoTime()
            run()
            timings[i] = System.nanoTime() - start
        }
        val med = median(timings)
        val fastest = timings.min()
        // Durations are reported in whole microseconds (integer division truncates).
        println(" $label: median=${med / 1_000} µs min=${fastest / 1_000} µs (n=$samples)")
        return med
    }

    /**
     * Runs the three kernels over each benchmark shape and prints timings and
     * speed-up ratios. Returns immediately (reporting a skip message) unless
     * the `skainet.runBench` system property equals `"true"`.
     */
    @Test
    fun bench_q8_0_scalar_vs_panama_vs_native() {
        if (System.getProperty("skainet.runBench") != "true") {
            println("Q8_0MatmulMicrobenchTest skipped — pass -Dskainet.runBench=true to enable.")
            return
        }
        assertTrue(NativeQ8_0MatmulKernel.isAvailable(), "Native Q8_0 kernel must be available for the bench")

        // LLM-typical attention / FFN projection shapes (matvec).
        // inputDim must be a multiple of 32 (the Q8_0 block size).
        val shapes = listOf(
            1024 to 1024,
            2048 to 2048,
            4096 to 4096,
        )

        println()
        println("Q8_0 matmul microbench — Scalar vs Panama Vector vs Native (FFM)")
        println("Host: ${System.getProperty("os.name")} ${System.getProperty("os.arch")} | JDK ${System.getProperty("java.version")}")
        println()

        for ((inputDim, outputDim) in shapes) {
            // Enforce the block-size rule: a truncating division below would
            // silently under-size the weight buffer.
            require(inputDim % blockSize == 0) {
                "inputDim ($inputDim) must be a multiple of the Q8_0 block size ($blockSize)"
            }
            val blocksPerInputDim = inputDim / blockSize
            val shapeSeed = inputDim + outputDim
            val rng = Random(shapeSeed)
            // Activations are uniform in [-0.5, 0.5).
            val input = FloatArray(inputDim) { rng.nextFloat() - 0.5f }
            val weight = randomQ8_0Bytes(blocksPerInputDim, outputDim, shapeSeed)

            val outScalar = FloatArray(outputDim)
            val outPanama = FloatArray(outputDim)
            val outNative = FloatArray(outputDim)

            println("[inputDim=$inputDim outputDim=$outputDim]")
            // The scalar kernel gets fewer warm-up and timed iterations than the other two.
            val scalarNs = benchOne("scalar", warmup = 3, samples = 5) {
                ScalarQ8_0MatmulKernel.matmul(input, 0, weight, 0, inputDim, outputDim, outScalar, 0)
            }
            val panamaNs = benchOne("panama", warmup = 10, samples = 21) {
                PanamaVectorQ8_0MatmulKernel.matmul(input, 0, weight, 0, inputDim, outputDim, outPanama, 0)
            }
            val nativeNs = benchOne("native", warmup = 10, samples = 21) {
                NativeQ8_0MatmulKernel.matmul(input, 0, weight, 0, inputDim, outputDim, outNative, 0)
            }

            // Ratios above 1.0 mean the kernel named first is faster.
            val panamaSpeedup = scalarNs.toDouble() / panamaNs.toDouble()
            val nativeSpeedup = scalarNs.toDouble() / nativeNs.toDouble()
            val nativeVsPanama = panamaNs.toDouble() / nativeNs.toDouble()
            println(
                " speedups: panama is %.2fx scalar | native is %.2fx scalar | native is %.2fx panama (%.1f%% %s)".format(
                    panamaSpeedup,
                    nativeSpeedup,
                    nativeVsPanama,
                    abs((nativeVsPanama - 1.0) * 100.0),
                    if (nativeVsPanama >= 1.0) "faster" else "slower",
                ),
            )
            println()
        }
    }
}

0 commit comments

Comments
 (0)