
Commit 3d3c6ff

michalharakal and claude committed
fix(apertus): force-dequant token_embd under NATIVE_OPTIMIZED
ApertusWeightLoader.streamingTensorToTensor / readerTensorToTensor wrap
quantized weights in a byte-level rank-1 shape under
QuantPolicy.NATIVE_OPTIMIZED so the native FFM kernels can address the
block layout directly. That works for matmul (the kernel knows the
logical shape from metadata) but breaks Embedding.gather, which requires
the logical rank-2 [vocab, dim] shape: a rank-1 weight tensor fails with
"gather: unsupported input rank 1".

Surfaced by ApertusNetworkLoader.fromGguf().load() against a real
unsloth/Apertus-8B-Instruct-2509 Q4_K_S file: token_embd is stored as
Q4_K in the GGUF and gets the byte-level shape, so the very first
forward pass through the embedding layer dies before any logit math.

Add loadStreamingTensor / loadReaderTensor wrappers around the existing
*ToTensor helpers. They route token_embd.weight through the dequant path
(DequantOps.dequantFromBytes → createTensor with the logical
[vocab, dim] shape) when quantPolicy is NATIVE_OPTIMIZED and the tensor
is a quantized type. All other tensors keep their NATIVE_OPTIMIZED
byte-level layout for kernel dispatch.

The integration test class kdoc documents the next blocker preventing
end-to-end inference: linearProject in MultiHeadAttention calls
ops.transpose on byte-shape weights for the Q/K/V/O and FFN projections,
which Gemma solves via Q4_KBlockTensorData but Apertus does not yet
implement. Tracked as #100.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 583ebbc commit 3d3c6ff

2 files changed: 86 additions & 4 deletions
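To see why the byte-level layout is rank-1, it helps to run the numbers for GGML's standard Q4_K format, which packs super-blocks of 256 elements into 144 bytes each. A back-of-the-envelope sketch in Kotlin follows; the vocab/dim figures are illustrative placeholders, not values read from the Apertus GGUF:

// Back-of-the-envelope sketch of the shape mismatch. The Q4_K block geometry
// (256 elements per 144-byte super-block) is standard GGML; vocab and dim
// are illustrative, not read from the model file.
fun main() {
    val vocab = 131_072L              // illustrative vocabulary size
    val dim = 4_096L                  // illustrative embedding width
    val nElements = vocab * dim       // 536_870_912 logical elements

    // Shape Embedding.gather needs: logical rank-2 [vocab, dim].
    val logicalShape = listOf(vocab, dim)

    // Shape NATIVE_OPTIMIZED actually produces: one flat buffer of raw blocks.
    val elementsPerBlock = 256L
    val bytesPerBlock = 144L
    val byteLevelShape = listOf(nElements / elementsPerBlock * bytesPerBlock)

    println("logical:    rank ${logicalShape.size}, $logicalShape")    // rank 2, [131072, 4096]
    println("byte-level: rank ${byteLevelShape.size}, $byteLevelShape") // rank 1, [301989888]
}

A quant-aware matmul kernel can recover the logical [vocab, dim] view from metadata, but Embedding.gather sees only the flat rank-1 tensor and has no rows to index, hence the "gather: unsupported input rank 1" error.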


llm-inference/apertus/src/commonMain/kotlin/sk/ainet/models/apertus/ApertusWeightLoader.kt

Lines changed: 58 additions & 4 deletions
@@ -120,12 +120,13 @@ public class ApertusWeightLoader private constructor(
         requiredTensorNames(metadata).forEach { name ->
             val rt = tensorByName[name]
                 ?: error("Missing required tensor in GGUF payload: $name")
-            byName[name] = readerTensorToTensor(ctx, dtype, reader, rt)
+            byName[name] = loadReaderTensor(ctx, dtype, reader, rt, name)
         }

         // Load optional rope_freqs tensor
         tensorByName[ApertusTensorNames.ROPE_FREQS]?.let { rt ->
-            byName[ApertusTensorNames.ROPE_FREQS] = readerTensorToTensor(ctx, dtype, reader, rt)
+            byName[ApertusTensorNames.ROPE_FREQS] =
+                loadReaderTensor(ctx, dtype, reader, rt, ApertusTensorNames.ROPE_FREQS)
         }

         // Extract xIELU params: try metadata fields first, then per-layer tensors
@@ -162,12 +163,13 @@
         requiredTensorNames(metadata).forEach { name ->
             val st = tensorByName[name]
                 ?: error("Missing required tensor in GGUF payload: $name")
-            byName[name] = streamingTensorToTensor(ctx, dtype, reader, st)
+            byName[name] = loadStreamingTensor(ctx, dtype, reader, st, name)
         }

         // Load optional rope_freqs tensor
         tensorByName[ApertusTensorNames.ROPE_FREQS]?.let { st ->
-            byName[ApertusTensorNames.ROPE_FREQS] = streamingTensorToTensor(ctx, dtype, reader, st)
+            byName[ApertusTensorNames.ROPE_FREQS] =
+                loadStreamingTensor(ctx, dtype, reader, st, ApertusTensorNames.ROPE_FREQS)
         }

         // Extract xIELU params: try metadata fields first, then per-layer tensors
@@ -560,6 +562,58 @@

     // ============== Tensor conversion ==============

+    /**
+     * NATIVE_OPTIMIZED stores quantized tensors as byte-level rank-1 buffers so the
+     * native FFM kernels can address the raw block layout directly. That works for
+     * matmul (the kernel knows the logical shape from metadata) but breaks the
+     * token embedding, where `Embedding.gather()` requires the logical rank-2
+     * `[vocab, dim]` shape. Force `token_embd.weight` through the dequant path so
+     * the embedding lookup gets a real `[vocab, dim]` FP32/FP16 tensor regardless
+     * of the policy chosen for the rest of the model.
+     */
+    private fun <T : DType, V> loadStreamingTensor(
+        ctx: ExecutionContext,
+        dtype: KClass<T>,
+        reader: StreamingGGUFReader,
+        st: StreamingTensorInfo,
+        name: String
+    ): Tensor<T, V> {
+        if (name == ApertusTensorNames.TOKEN_EMBEDDINGS &&
+            quantPolicy == QuantPolicy.NATIVE_OPTIMIZED &&
+            st.tensorType != GGMLQuantizationType.F32 &&
+            st.tensorType != GGMLQuantizationType.F16 &&
+            st.tensorType != GGMLQuantizationType.BF16
+        ) {
+            val shape = Shape(*st.shape.map { it.toInt() }.toIntArray())
+            val bytes = reader.loadTensorData(st)
+            val floats = DequantOps.dequantFromBytes(bytes, st.tensorType, st.nElements.toInt())
+            return createTensor(ctx, dtype, shape, floats)
+        }
+        return streamingTensorToTensor(ctx, dtype, reader, st)
+    }
+
+    private fun <T : DType, V> loadReaderTensor(
+        ctx: ExecutionContext,
+        dtype: KClass<T>,
+        reader: GGUFReader,
+        rt: ReaderTensor,
+        name: String
+    ): Tensor<T, V> {
+        if (name == ApertusTensorNames.TOKEN_EMBEDDINGS &&
+            quantPolicy == QuantPolicy.NATIVE_OPTIMIZED &&
+            rt.tensorType != GGMLQuantizationType.F32 &&
+            rt.tensorType != GGMLQuantizationType.F16 &&
+            rt.tensorType != GGMLQuantizationType.BF16
+        ) {
+            val shape = Shape(*rt.shape.map { it.toInt() }.toIntArray())
+            val raw = if (rt.data.isEmpty()) reader.materialize(rt) else rt.data
+            val bytes: ByteArray = DequantOps.toByteArray(raw, rt.name)
+            val floats = DequantOps.dequantFromBytes(bytes, rt.tensorType, rt.nElements)
+            return createTensor(ctx, dtype, shape, floats)
+        }
+        return readerTensorToTensor(ctx, dtype, reader, rt)
+    }
+
     @Suppress("UNCHECKED_CAST")
     private fun <T : DType, V> readerTensorToTensor(
         ctx: ExecutionContext,
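For reference, the guard shared by the two new wrappers reduces to a single predicate. The restatement below is illustrative, not shipped code: the GGMLQuantizationType values mirror the diff, while QuantPolicy.DEQUANT is a placeholder name for the non-native policy:

// Standalone restatement of the dequant-forcing guard from the diff above.
// GGMLQuantizationType values mirror the diff; QuantPolicy.DEQUANT is a
// placeholder, and this helper is illustrative rather than the shipped code.
enum class GGMLQuantizationType { F32, F16, BF16, Q4_K, Q5_K, Q6_K }
enum class QuantPolicy { DEQUANT, NATIVE_OPTIMIZED }

fun mustForceDequant(
    name: String,
    policy: QuantPolicy,
    type: GGMLQuantizationType
): Boolean =
    name == "token_embd.weight" &&                // only the embedding table
        policy == QuantPolicy.NATIVE_OPTIMIZED && // only when bytes would stay packed
        type != GGMLQuantizationType.F32 &&       // float tensors already carry their
        type != GGMLQuantizationType.F16 &&       // logical shape, so the guard
        type != GGMLQuantizationType.BF16         // leaves them alone

Every other (name, policy, type) combination falls through to the existing *ToTensor helpers, so kernel-facing weights keep their packed byte-level layout.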

llm-inference/apertus/src/jvmTest/kotlin/sk/ainet/models/apertus/ApertusRealGgufLoadingTest.kt

Lines changed: 28 additions & 0 deletions
@@ -180,6 +180,34 @@ class ApertusRealGgufLoadingTest {
         println("[real-load fromGguf NATIVE_OPTIMIZED] top-modules=${topNames.size}")
     }

+    /**
+     * End-to-end inference (forward / generate / tool calling) is intentionally
+     * NOT covered here.
+     *
+     * `ApertusNetworkLoader.fromGguf().load()` succeeds end-to-end (verified by
+     * the test above), and the embedding lookup works after the
+     * `loadStreamingTensor` token-embd dequant special case. But the rest of
+     * the forward pass — Q/K/V/O projections, FFN matmuls — relies on the
+     * standard `linearProject(ops, input, weight) = ops.matmul(input, ops.transpose(weight))`
+     * helper, which assumes a logical rank-2 weight. Under
+     * `QuantPolicy.NATIVE_OPTIMIZED` the loader stores quantized weights as
+     * raw byte-level rank-1 `Int8` tensors so the native FFM kernels can
+     * address the block layout directly — but `ops.transpose(byteShape)` then
+     * fails.
+     *
+     * Gemma's Q4_K end-to-end test works because Gemma's loader uses
+     * `Q4_KBlockTensorData(logicalShape, blockMajorBytes)` with a lazy
+     * `transpose` override and a quant-aware `matmul` dispatch (see
+     * `GemmaDslQ4KTest`, `relayoutQ4_KRowMajorToBlockMajor`). Apertus's
+     * loader stores raw Int8 bytes instead, so `linearProject` blows up at
+     * the first attention projection.
+     *
+     * Tracking issue: #100. The Apertus loader needs per-quant-type
+     * tensor-data wrappers (`Q4_KBlockTensorData` / `Q5_KBlockTensorData` /
+     * `Q6_KBlockTensorData`) with row-major → block-major relayout,
+     * mirroring Gemma's path.
+     */
+
     private fun locateModel(): File? {
         System.getenv("APERTUS_GGUF_PATH")?.let { p ->
             val f = File(p)
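The kdoc above outlines the shape of the follow-up tracked in #100: per-quant-type tensor-data wrappers with a lazy transpose and row-major → block-major relayout. A minimal sketch of what such a wrapper could look like, assuming hypothetical surrounding interfaces (Gemma's actual Q4_KBlockTensorData may differ):

// Minimal sketch of a block-major quantized tensor-data wrapper with a lazy
// transpose, in the spirit of what the kdoc describes for Gemma. The class
// name echoes the kdoc; the fields and the matmul hook are assumptions.
class Q4KBlockTensorDataSketch(
    val logicalShape: IntArray,      // e.g. [outFeatures, inFeatures]
    val blockMajorBytes: ByteArray,  // raw Q4_K blocks after row-major → block-major relayout
    val transposed: Boolean = false  // lazy transpose flag: no byte movement
) {
    // ops.transpose(weight) must not touch packed bytes; it only swaps the
    // logical dims and flips the flag, deferring the real work to the kernel.
    fun transpose(): Q4KBlockTensorDataSketch = Q4KBlockTensorDataSketch(
        logicalShape = intArrayOf(logicalShape[1], logicalShape[0]),
        blockMajorBytes = blockMajorBytes,   // shared buffer, untouched
        transposed = !transposed
    )
}

// A quant-aware matmul would then dispatch on the wrapper instead of
// demanding a dequantized rank-2 float tensor (kernel entry is hypothetical):
//   when (weight) {
//       is Q4KBlockTensorDataSketch -> nativeQ4KMatmul(input, weight)
//       else                        -> genericMatmul(input, weight)
//   }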
