Skip to content

Commit 1487c3a

Browse files
Merge pull request #558 from SKaiNET-developers/feature/jvm-panama-kernel-jmh
bench(kernel): KernelMatmulBench — scalar vs Panama (M5 evidence)
2 parents a5e5f93 + 11cd177 commit 1487c3a

3 files changed

Lines changed: 81 additions & 1 deletion

File tree

docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ This page explains how to run the JMH benchmarks for the JVM CPU backend and how
66

77
* Elementwise: FP32 `add` on 1,000,000 elements
88
* Reductions: FP32 `sum` and `mean` on 1,000,000 elements
9-
* Matmul: FP32 square `matmul` with sizes 256, 512, and 1024
9+
* Matmul (op-level): FP32 square `matmul` with sizes 256, 512, and 1024 — exercises `ctx.ops.matmul`, i.e. the production routing path
10+
* Matmul (kernel-level): direct `Fp32MatmulKernel.matmul` invocation, scalar vs Panama Vector, sizes 256/512/1024 — used to validate the M5 milestone target (Panama ≥1.5× scalar) without entanglement from the rest of the op pipeline
1011

1112
Benchmarks are implemented in the module:
1213

@@ -17,6 +18,7 @@ Source files:
1718
* `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt`
1819
* `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt`
1920
* `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt`
21+
* `src/jmh/kotlin/sk/ainet/bench/KernelMatmulBench.kt`
2022

2123
===== Prerequisites
2224

@@ -75,6 +77,13 @@ This will build and execute all JMH benchmarks with the default parameters defin
7577
-Pjmh.param.blasEnabled=true
7678
....
7779

80+
* Kernel-level scalar vs Panama at all sizes (M5 ≥1.5× target):
81+
82+
....
83+
./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \
84+
-Pjmh.include=KernelMatmulBench
85+
....
86+
7887
* Matmul at 512 only, comparing BLAS on/off with vector on:
7988

8089
....

skainet-backends/benchmarks/jvm-cpu-jmh/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ plugins {
55

66
dependencies {
    implementation(project(":skainet-lang:skainet-lang-core"))
    // Provides Fp32MatmulKernel, referenced directly by KernelMatmulBench.
    implementation(project(":skainet-backends:skainet-backend-api"))
    implementation(project(":skainet-backends:skainet-backend-cpu"))
}
1011

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
package sk.ainet.bench

import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations.Benchmark
import org.openjdk.jmh.annotations.BenchmarkMode
import org.openjdk.jmh.annotations.Level
import org.openjdk.jmh.annotations.Mode
import org.openjdk.jmh.annotations.OutputTimeUnit
import org.openjdk.jmh.annotations.Param
import org.openjdk.jmh.annotations.Scope
import org.openjdk.jmh.annotations.Setup
import org.openjdk.jmh.annotations.State
import sk.ainet.backend.api.kernel.Fp32MatmulKernel
import sk.ainet.exec.kernel.PanamaVectorMatmulKernel
import sk.ainet.exec.kernel.ScalarMatmulKernel

/**
 * Kernel-level matmul benchmark: times `Fp32MatmulKernel.matmul` alone,
 * keeping the `TensorOps` wrapper, dispatch, and context allocation out
 * of the measured region. Used to validate the M5 milestone target —
 * Panama Vector kernel ≥ 1.5× scalar — independently of the rest of the
 * op pipeline.
 *
 * See `MatmulBench` for the op-level counterpart that goes through
 * `ctx.ops.matmul` (production routing). Until `DefaultCpuOpsJvm.matmul`
 * is wired through `KernelRegistry`, only this bench reflects pure
 * kernel-vs-kernel performance.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
open class KernelMatmulBench {

    // Square matrix side length (M = N = K).
    @Param("256", "512", "1024")
    var size: Int = 512

    // Which Fp32MatmulKernel implementation to measure.
    @Param("scalar", "panama")
    var provider: String = "panama"

    private lateinit var kernel: Fp32MatmulKernel
    private lateinit var a: FloatArray
    private lateinit var b: FloatArray
    private lateinit var out: FloatArray

    /** Resolves the kernel under test and seeds the operands once per trial. */
    @Setup(Level.Trial)
    fun setup() {
        kernel = when (provider) {
            "scalar" -> ScalarMatmulKernel
            "panama" -> PanamaVectorMatmulKernel
            else -> error("unknown provider: $provider")
        }
        val side = size
        val elements = side * side
        // Same input seeding as MatmulBench so numbers compare cleanly.
        a = FloatArray(elements) { i -> (i % 251 - 125) / 127f }
        b = FloatArray(elements) { i -> (i * 13 % 257 - 128) / 127f }
        out = FloatArray(elements)
    }

    /**
     * One square FP32 matmul through the selected kernel. Returning `out`
     * hands the result to JMH's blackhole so the call cannot be elided;
     * the buffer is fully overwritten each invocation, so reuse is safe.
     */
    @Benchmark
    fun matmul_fp32_square(): FloatArray {
        val side = size
        kernel.matmul(
            a, 0, side,
            b, 0, side,
            out, 0, side,
            side, side, side,
        )
        return out
    }
}

0 commit comments

Comments
 (0)