Skip to content

Commit 04e6a90

Browse files
michalharakal and claude committed
feat(backend): commonMain scalar Q5_1/Q5_0/Q4_K/Q6_K kernels + SPI (Native parity)
Part of #708. Brings quantized matmul to Kotlin/Native (and JS/WASM), which previously only had FP32/BF16/Q8_0/Q4_0 scalar kernels — Q4_K/Q6_K/Q5_x were JVM-only (Panama/FFM), so on non-JVM targets packed-quant matmul had no kernel. SPI (skainet-backend-api, commonMain): - New Q5_1MatmulKernel / Q5_0MatmulKernel / Q6KMatmulKernel interfaces (block-major `(blockIdx*outputDim+o)*BYTES_PER_BLOCK`, exact dequant in kdoc). - KernelProvider: matmulQ5_1()/matmulQ5_0()/matmulQ6K() accessors (default null) + supports() keys for "Q5_1"/"Q5_0"/"Q6_K". Scalar kernels (skainet-backend-cpu, commonMain — available on every target): - ScalarQ5_1/Q5_0/Q4_K/Q6_KMatmulKernel, math ported from JvmQuantizedVectorKernels / DequantOps (Q4_K get_scale_min_k4 + sub-block codeSum*scale - inputSum*offset; Q6_K ql/qh 6-bit reassembly). Shared decodeHalf() FP16 helper. - ScalarKernelProvider now overrides matmulQ4K/Q6K/Q5_1/Q5_0 → the scalar floor carries every packed format. Test: ScalarPackedKernelParityTest (commonTest) validates each kernel's matmul against an independent inline dequant; passes on jvmTest AND linuxX64Test, proving Native packed-matmul correctness (relative tol for the FP reassociation of the per-sub-block accumulation). Note: dispatch wiring (so ops.matmul routes packed tensors to these kernels on non-JVM) + non-JVM provider registration land in follow-up commits; this commit is the kernels + SPI surface. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 9b5abe9 commit 04e6a90

12 files changed

Lines changed: 654 additions & 0 deletions

File tree

skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,24 @@ public interface KernelProvider {
7373
*/
7474
public fun matmulQ4_0(): Q4_0MatmulKernel? = null
7575

76+
/**
77+
* F32 × Q6_K matmul kernel exposed by this provider, or `null` if
78+
* this provider does not specialize Q6_K. Same fall-through pattern.
79+
*/
80+
public fun matmulQ6K(): Q6KMatmulKernel? = null
81+
82+
/**
83+
* F32 × Q5_1 matmul kernel exposed by this provider, or `null` if
84+
* this provider does not specialize Q5_1. Same fall-through pattern.
85+
*/
86+
public fun matmulQ5_1(): Q5_1MatmulKernel? = null
87+
88+
/**
89+
* F32 × Q5_0 matmul kernel exposed by this provider, or `null` if
90+
* this provider does not specialize Q5_0. Same fall-through pattern.
91+
*/
92+
public fun matmulQ5_0(): Q5_0MatmulKernel? = null
93+
7694
/**
7795
* Capability query: does this provider carry a kernel for
7896
* [opName] with the given [dtypeKeys]?
@@ -107,6 +125,9 @@ public interface KernelProvider {
107125
"Q4_K" -> matmulQ4K() != null
108126
"Q8_0" -> matmulQ8_0() != null
109127
"Q4_0" -> matmulQ4_0() != null
128+
"Q6_K" -> matmulQ6K() != null
129+
"Q5_1" -> matmulQ5_1() != null
130+
"Q5_0" -> matmulQ5_0() != null
110131
else -> false
111132
}
112133
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package sk.ainet.backend.api.kernel
2+
3+
/**
 * Kernel contract: multiply one FP32 input row by a Q5_0-quantized weight
 * matrix stored in the canonical ggml block layout.
 *
 * For every output cell `o ∈ [0, outputDim)`:
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *                                for j ∈ [0, inputDim)
 *
 * ### Block format
 * One block packs 32 elements into 22 bytes (see the kdoc of
 * [sk.ainet.lang.tensor.data.Q5_0BlockTensorData]):
 *  - bytes 0..1  : `d`, the block scale (FP16, little-endian)
 *  - bytes 2..5  : `qh[0..3]`, the fifth (high) bit of each of the 32 codes
 *  - bytes 6..21 : `qs[0..15]`, the low four bits, two nibbles per byte
 *
 * ### Dequantization
 * With `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`,
 * `bitLo = (qh[j/8] >>> (j%8)) & 1` and
 * `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`, for `j ∈ [0, 16)`:
 *
 *     element[j]      = d * (lo + (bitLo shl 4) - 16)
 *     element[j + 16] = d * (hi + (bitHi shl 4) - 16)
 *
 * Subtracting 16 centres the unsigned 5-bit code around zero; Q5_0 carries no
 * per-block minimum. This matches
 * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_0FromBytes`.
 *
 * ### Weight addressing
 * Weights follow the **block-major** row contract: the block for output row
 * `o` and input block `blockIdx` sits at byte
 * `(blockIdx * outputDim + o) * 22`, matching
 * `Q5_0BlockTensorData.packedData`.
 *
 * ### Requirements
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `inputDim` MUST be a multiple of 32 (the Q5_0 block size).
 */
public interface Q5_0MatmulKernel {
    /**
     * Computes `outputDim` dot products of the input row against the packed
     * Q5_0 weight rows.
     *
     * @param input FP32 input vector.
     * @param inputOffset element offset into [input] where the row starts.
     * @param weight packed Q5_0 bytes in the block-major layout described on the interface.
     * @param weightByteOffset byte offset into [weight] that the block-major addressing is relative to.
     * @param inputDim contraction dimension; must be a multiple of 32.
     * @param outputDim number of output cells to produce.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] where results are written.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package sk.ainet.backend.api.kernel
2+
3+
/**
 * Kernel contract: multiply one FP32 input row by a Q5_1-quantized weight
 * matrix stored in the canonical ggml block layout.
 *
 * For every output cell `o ∈ [0, outputDim)`:
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *                                for j ∈ [0, inputDim)
 *
 * ### Block format
 * One block packs 32 elements into 24 bytes (see the kdoc of
 * [sk.ainet.lang.tensor.data.Q5_1BlockTensorData]):
 *  - bytes 0..1  : `d`, the block scale (FP16, little-endian)
 *  - bytes 2..3  : `m`, the block minimum (FP16, little-endian)
 *  - bytes 4..7  : `qh[0..3]`, the fifth (high) bit of each of the 32 codes
 *  - bytes 8..23 : `qs[0..15]`, the low four bits, two nibbles per byte
 *
 * ### Dequantization
 * With `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`,
 * `bitLo = (qh[j/8] >>> (j%8)) & 1` and
 * `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`, for `j ∈ [0, 16)`:
 *
 *     element[j]      = d * (lo + (bitLo shl 4)) + m
 *     element[j + 16] = d * (hi + (bitHi shl 4)) + m
 *
 * This matches `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_1FromBytes`.
 *
 * ### Weight addressing
 * Weights follow the **block-major** row contract: the block for output row
 * `o` and input block index `blockIdx` sits at byte
 * `(blockIdx * outputDim + o) * 24`. This matches
 * `Q5_1BlockTensorData.packedData` after the GGUF row-major →
 * input-block-major re-layout.
 *
 * ### Requirements
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MAY assume the arrays do not alias each other or `output`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `inputDim` MUST be a multiple of 32 (the Q5_1 block size).
 */
public interface Q5_1MatmulKernel {
    /**
     * Computes `outputDim` dot products of the input row against the packed
     * Q5_1 weight rows.
     *
     * @param input FP32 input vector (single row).
     * @param inputOffset element offset into [input] where the row starts.
     * @param weight packed Q5_1 bytes for the full `outputDim × inputDim` weight tensor.
     * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
     * @param inputDim contraction dimension (must be a multiple of 32).
     * @param outputDim number of output cells.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] where the row starts.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package sk.ainet.backend.api.kernel
2+
3+
/**
 * Kernel contract: multiply one FP32 input row by a Q6_K-quantized weight
 * matrix stored in the canonical ggml block layout.
 *
 * For every output cell `o ∈ [0, outputDim)`:
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *                                for j ∈ [0, inputDim)
 *
 * ### Super-block format
 * One super-block packs 256 elements into 210 bytes (see
 * [sk.ainet.lang.tensor.data.Q6_KBlockTensorData]):
 *  - bytes 0..127   : `ql[0..127]`, the lower 4 bits of each code
 *  - bytes 128..191 : `qh[0..63]`, the upper 2 bits of each code
 *  - bytes 192..207 : `scales[0..15]`, int8 scales, one per 16-element sub-block
 *  - bytes 208..209 : `d`, the super-block scale (FP16, little-endian)
 *
 * ### Dequantization
 * Each 6-bit code is reassembled from `ql`/`qh` (see ggml
 * `dequantize_row_q6_K`) and recentred by subtracting 32, so per element
 * `dequant = d * scales[sub] * (code - 32)`.
 * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ6KFromBytes` is the
 * authoritative reference; implementations MUST agree with it.
 *
 * ### Weight addressing
 * Weights follow the **block-major** row contract: the super-block for output
 * row `o` and input block `blockIdx` sits at byte
 * `(blockIdx * outputDim + o) * 210`, matching
 * `Q6_KBlockTensorData.packedData`.
 *
 * ### Requirements
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `inputDim` MUST be a multiple of 256 (the Q6_K super-block size).
 */
public interface Q6KMatmulKernel {
    /**
     * Computes `outputDim` dot products of the input row against the packed
     * Q6_K weight rows.
     *
     * @param input FP32 input vector.
     * @param inputOffset element offset into [input] where the row starts.
     * @param weight packed Q6_K bytes in the block-major layout described on the interface.
     * @param weightByteOffset byte offset into [weight] that the block-major addressing is relative to.
     * @param inputDim contraction dimension; must be a multiple of 256.
     * @param outputDim number of output cells to produce.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] where results are written.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}

skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProvider : sk/ainet/ba
5454
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
5555
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
5656
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
57+
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
58+
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
59+
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
5760
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
5861
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
5962
}
@@ -67,6 +70,9 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProviderFactory : sk/a
6770
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
6871
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
6972
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
73+
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
74+
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
75+
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
7076
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
7177
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
7278
}
@@ -105,6 +111,9 @@ public final class sk/ainet/exec/kernel/ScalarKernelProvider : sk/ainet/backend/
105111
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
106112
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
107113
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
114+
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
115+
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
116+
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
108117
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
109118
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
110119
}
@@ -118,6 +127,9 @@ public final class sk/ainet/exec/kernel/ScalarKernelProviderFactory : sk/ainet/b
118127
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
119128
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
120129
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
130+
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
131+
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
132+
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
121133
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
122134
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
123135
}
@@ -132,6 +144,26 @@ public final class sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel : sk/ainet/backen
132144
public fun matmul ([FI[BIII[FI)V
133145
}
134146

147+
public final class sk/ainet/exec/kernel/ScalarQ4_KMatmulKernel : sk/ainet/backend/api/kernel/Q4KMatmulKernel {
148+
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ4_KMatmulKernel;
149+
public fun matmul ([FI[BIII[FI)V
150+
}
151+
152+
public final class sk/ainet/exec/kernel/ScalarQ5_0MatmulKernel : sk/ainet/backend/api/kernel/Q5_0MatmulKernel {
153+
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ5_0MatmulKernel;
154+
public fun matmul ([FI[BIII[FI)V
155+
}
156+
157+
public final class sk/ainet/exec/kernel/ScalarQ5_1MatmulKernel : sk/ainet/backend/api/kernel/Q5_1MatmulKernel {
158+
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ5_1MatmulKernel;
159+
public fun matmul ([FI[BIII[FI)V
160+
}
161+
162+
public final class sk/ainet/exec/kernel/ScalarQ6_KMatmulKernel : sk/ainet/backend/api/kernel/Q6KMatmulKernel {
163+
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ6_KMatmulKernel;
164+
public fun matmul ([FI[BIII[FI)V
165+
}
166+
135167
public final class sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel {
136168
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ8_0MatmulKernel;
137169
public fun matmul ([FI[BIII[FI)V
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package sk.ainet.exec.kernel
2+
3+
/**
 * Widens an IEEE-754 half-precision (binary16) bit pattern to FP32.
 *
 * Only the low 16 bits of [hbits] are examined; higher bits are masked off.
 * The scalar packed-quant kernels in this package (Q5_1/Q5_0/Q4_K/Q6_K) share
 * this helper; it mirrors the inlined helpers in [ScalarQ4_0MatmulKernel] /
 * [ScalarQ8_0MatmulKernel].
 *
 * @param hbits half-precision bit pattern, carried in the low 16 bits.
 * @return the decoded value; signed zeros, subnormals, infinities and NaN
 *   payload bits are all carried over to the FP32 encoding.
 */
internal fun decodeHalf(hbits: Int): Float {
    val signBit = (hbits and 0x8000) shl 16
    val exponent = (hbits ushr 10) and 0x1F
    val fraction = hbits and 0x03FF

    // Exponent all ones: infinity (fraction == 0) or NaN (payload preserved).
    if (exponent == 31) {
        return Float.fromBits(signBit or (0xFF shl 23) or (fraction shl 13))
    }
    // Normal number: re-bias the exponent from 15 to 127 (a net +112).
    if (exponent != 0) {
        return Float.fromBits(signBit or ((exponent + 112) shl 23) or (fraction shl 13))
    }
    // Signed zero.
    if (fraction == 0) {
        return Float.fromBits(signBit)
    }

    // Subnormal: shift the fraction left until its leading one reaches the
    // implicit-bit position (bit 10), lowering the exponent once per shift.
    // A subnormal fraction is at most 0x3FF, so at least one shift is needed.
    var normalised = fraction
    var biasedExponent = 113 // -14 + 127, before any shift
    do {
        normalised = normalised shl 1
        biasedExponent--
    } while ((normalised and 0x400) == 0)

    val fractionBits = (normalised and 0x3FF) shl 13
    return Float.fromBits(signBit or (biasedExponent shl 23) or fractionBits)
}

skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@ package sk.ainet.exec.kernel
33
import sk.ainet.backend.api.kernel.Bf16MatmulKernel
44
import sk.ainet.backend.api.kernel.Fp32MatmulKernel
55
import sk.ainet.backend.api.kernel.KernelProvider
6+
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
67
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
8+
import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
9+
import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
10+
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
711
import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
812

913
/**
@@ -27,4 +31,8 @@ public object ScalarKernelProvider : KernelProvider {
2731
override fun matmulBf16(): Bf16MatmulKernel = ScalarBf16MatmulKernel
2832
override fun matmulQ8_0(): Q8_0MatmulKernel = ScalarQ8_0MatmulKernel
2933
override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
34+
override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel
35+
override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel
36+
override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel
37+
override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel
3038
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package sk.ainet.exec.kernel
2+
3+
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
4+
5+
/**
 * Portable scalar [Q4KMatmulKernel]. It lives in commonMain so Q4_K packed
 * matmul is available on Kotlin/Native, JS and WASM, not only through the JVM
 * SIMD path.
 *
 * A Q4_K super-block covers 256 elements in 144 bytes and is addressed
 * block-major at `(blockIdx * outputDim + o) * 144`:
 *  - bytes 0..1    : `d` (FP16, little-endian)
 *  - bytes 2..3    : `dMin` (FP16, little-endian)
 *  - bytes 4..15   : 12 packed scale/min bytes (ggml `get_scale_min_k4` packing)
 *  - bytes 16..143 : 128 code bytes, two 4-bit codes per byte
 *
 * Each of the 8 sub-blocks (32 elements) contributes
 * `codeSum * scale - inputSum * offset` with `scale = d * scaleIdx` and
 * `offset = dMin * minIdx`. The math mirrors
 * `JvmQuantizedVectorKernels.matmulQ4_KVec` / `DequantOps.dequantQ4KFromBytes`.
 */
public object ScalarQ4_KMatmulKernel : Q4KMatmulKernel {

    private const val BLOCK_SIZE = 256
    private const val SUB_BLOCK = 32
    private const val BYTES_PER_BLOCK = 144
    private const val SUB_BLOCKS_PER_BLOCK = 8
    private const val SCALES_OFFSET = 4
    private const val CODES_OFFSET = 16

    /**
     * Writes `outputDim` dot products of the FP32 input row against the packed
     * Q4_K weight rows into [output], starting at [outputOffset].
     *
     * @param input FP32 input values.
     * @param inputOffset index of the first input element to read.
     * @param weight packed Q4_K bytes, block-major.
     * @param weightByteOffset byte offset the block-major addressing is relative to.
     * @param inputDim contraction length; must be a multiple of 256.
     * @param outputDim number of output cells to write.
     * @param output destination array.
     * @param outputOffset index of the first output cell to write.
     * @throws IllegalArgumentException if [inputDim] is not a multiple of 256.
     */
    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "ScalarQ4_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0) return
        if (inputDim == 0) {
            // Empty contraction: every dot product is zero.
            for (row in 0 until outputDim) {
                output[outputOffset + row] = 0f
            }
            return
        }

        val superBlockCount = inputDim / BLOCK_SIZE
        // Scratch for the 8 six-bit scale / min indices, reused for every block.
        val scales = IntArray(SUB_BLOCKS_PER_BLOCK)
        val mins = IntArray(SUB_BLOCKS_PER_BLOCK)

        for (row in 0 until outputDim) {
            var rowSum = 0f
            for (block in 0 until superBlockCount) {
                val base = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
                val d = halfAt(weight, base)
                val dMin = halfAt(weight, base + 2)
                unpackScalesAndMins(weight, base + SCALES_OFFSET, scales, mins)

                val codesBase = base + CODES_OFFSET
                val inputBase = inputOffset + block * BLOCK_SIZE
                for (sub in 0 until SUB_BLOCKS_PER_BLOCK) {
                    // Sub-blocks 2g and 2g+1 share one 32-byte region: the even
                    // one reads the low nibbles, the odd one the high nibbles.
                    val region = codesBase + (sub ushr 1) * 32
                    val nibbleShift = (sub and 1) shl 2
                    val inputStart = inputBase + sub * SUB_BLOCK

                    var codeSum = 0f
                    var inputSum = 0f
                    for (k in 0 until SUB_BLOCK) {
                        val packed = weight[region + k].toInt() and 0xFF
                        val code = (packed ushr nibbleShift) and 0x0F
                        val x = input[inputStart + k]
                        codeSum += x * code
                        inputSum += x
                    }
                    rowSum += codeSum * (d * scales[sub]) - inputSum * (dMin * mins[sub])
                }
            }
            output[outputOffset + row] = rowSum
        }
    }

    /** Reads the little-endian FP16 stored at `weight[at]`, `weight[at + 1]` as FP32. */
    private fun halfAt(weight: ByteArray, at: Int): Float {
        val high = weight[at + 1].toInt() and 0xFF
        val low = weight[at].toInt() and 0xFF
        return decodeHalf((high shl 8) or low)
    }

    /**
     * Expands the 12 packed bytes starting at [at] into eight 6-bit scale
     * indices and eight 6-bit min indices (ggml `get_scale_min_k4`).
     *
     * Bytes 0..3 hold scales 0..3 in their low six bits and the top two bits
     * of scales 4..7; bytes 4..7 do the same for the mins; bytes 8..11 hold
     * the low four bits of scale `j + 4` (low nibble) and min `j + 4` (high
     * nibble).
     */
    private fun unpackScalesAndMins(weight: ByteArray, at: Int, scales: IntArray, mins: IntArray) {
        for (j in 0 until 4) {
            val scaleByte = weight[at + j].toInt() and 0xFF
            val minByte = weight[at + j + 4].toInt() and 0xFF
            val sharedByte = weight[at + j + 8].toInt() and 0xFF

            scales[j] = scaleByte and 0x3F
            mins[j] = minByte and 0x3F
            scales[j + 4] = (sharedByte and 0x0F) or ((scaleByte ushr 6) shl 4)
            mins[j + 4] = (sharedByte ushr 4) or ((minByte ushr 6) shl 4)
        }
    }
}

0 commit comments

Comments
 (0)