Merge pull request #174 from SKaiNET-developers/fix/654-q4_1-decoder-gguf-dequant

michalharakal · web-flow · commit 64b16d05cdde · 2026-06-10T18:16:24.000+02:00
fix(llama): dequantize Q4_1 (and all non-packed quant types) in DecoderGgufMemSegConverter
diff --git a/llm-inference/llama/src/jvmMain/kotlin/sk/ainet/models/llama/DecoderGgufMemSegConverter.kt b/llm-inference/llama/src/jvmMain/kotlin/sk/ainet/models/llama/DecoderGgufMemSegConverter.kt
@@ -27,18 +27,20 @@ import java.lang.foreign.Arena
  *   [Q8MemorySegmentTensorData] with the **logical** matrix shape derived
  *   from metadata. Upstream `DefaultCpuOpsJvm.matmul` and `transpose`
  *   detect the markers and dispatch quant-aware kernels at forward time.
- * - **Q4_K / Q5_K / Q6_K** → dequantized to FP32. The packed K-quant kernels
- *   are MemSeg-only on a hot path the DSL doesn't yet route through, so this
- *   trades memory for correctness. Same trade-off the legacy converter
- *   makes for K-quants.
+ * - **Every other quant type** (Q4_1, Q5_0, Q5_1, Q8_1, the K-quants
+ *   Q4_K / Q5_K / Q6_K, IQ4_NL / IQ4_XS, TQ1_0 / TQ2_0, ...) → dequantized
+ *   to FP32. None of these has a packed MemSeg kernel on the hot path the DSL
+ *   routes through, so this trades memory for correctness — the same
+ *   trade-off the legacy converter makes for K-quants.
+ *   [DequantOps.dequantFromBytes] throws for genuinely unknown types, so an
+ *   unsupported model fails explicitly at load time instead of silently
+ *   passing bytes through and crashing later inside matmul (see issue #654).
  * - **token_embd.weight** → always dequantized to FP32 regardless of quant
  *   type. The Embedding layer consumes this via `gather`, not matmul, so it
  *   needs real floats with the logical 2D shape — packed quant bytes would
  *   be misread as FP32 values, and the loader's intermediate Int8 wrapper
  *   stores a 1D byte-count shape that `gather` rejects.
  * - **FP32 (no entry in `quantTypes`)** → passed through unchanged.
- * - **Other quant types** → warning logged, passed through (will fail later
- *   if the model actually hits them via matmul).
  *
  * Why logical shape matters here: the loader stores raw quant bytes via
  * `ctx.fromByteArray(Shape(bytes.size), Int8, bytes)` — a 1D byte-count
@@ -168,19 +170,17 @@ public object DecoderGgufMemSegConverter {
                 @Suppress("UNCHECKED_CAST")
                 ctx.fromData(newData as TensorData<FP32, Float>, FP32::class)
             }
-            GGMLQuantizationType.Q4_K,
-            GGMLQuantizationType.Q5_K,
-            GGMLQuantizationType.Q6_K -> {
+            // Every other GGUF quant type (Q4_1, Q5_0, Q5_1, Q8_1, the
+            // K-quants, IQ4_NL / IQ4_XS, TQ1_0 / TQ2_0, ...) has no packed
+            // MemSeg kernel on the DSL forward path, so dequantize to FP32
+            // here — the same memory-for-correctness trade-off the K-quants
+            // already made. DequantOps throws for genuinely unknown types,
+            // turning what used to be a silent pass-through (and a confusing
+            // crash deep inside matmul) into a load-time failure. See #654.
+            else -> {
                 val floats = DequantOps.dequantFromBytes(bytes, quantType, logicalShape.volume)
                 ctx.fromFloatArray(logicalShape, FP32::class, floats)
             }
-            else -> {
-                println(
-                    "WARNING: DecoderGgufMemSegConverter: unsupported quant type $quantType for '$name'; " +
-                        "passing through unchanged. Forward pass may fail at matmul.",
-                )
-                tensor
-            }
         }
     }
 
diff --git a/llm-inference/llama/src/jvmTest/kotlin/sk/ainet/models/llama/DecoderGgufMemSegConverterTest.kt b/llm-inference/llama/src/jvmTest/kotlin/sk/ainet/models/llama/DecoderGgufMemSegConverterTest.kt
@@ -104,6 +104,36 @@ class DecoderGgufMemSegConverterTest {
         }
     }
 
+    @Test
+    fun `Q4_1 tensor is dequantized to FP32 with logical shape`() {
+        // Regression for #654: Q4_1 used to hit the silent pass-through
+        // `else` branch and crash later inside matmul. It must now come out
+        // as a 2D tensor with ffn_down's logical (dim, ffn) shape and no
+        // packed-quant marker. NOTE(review): element values are not asserted.
+        val rawQ4_1 = rawQ4_1Tensor(rows = dim, cols = ffn)
+        val weights = DecoderGgufWeights<FP32, Float>(
+            metadata = metadata,
+            tensors = mapOf("blk.0.ffn_down.weight" to rawQ4_1),
+            quantTypes = mapOf("blk.0.ffn_down.weight" to GGMLQuantizationType.Q4_1),
+        )
+
+        Arena.ofConfined().use { arena ->
+            val out = DecoderGgufMemSegConverter.convert(weights, ctx, arena)
+            val down = out.tensors.getValue("blk.0.ffn_down.weight")
+
+            assertEquals(
+                Shape(dim, ffn),
+                down.shape,
+                "Q4_1 weight must be dequantized to its logical 2D shape, not passed through as 1D bytes",
+            )
+            assertTrue(
+                down.data !is Q4MemorySegmentMarker && down.data !is Q8MemorySegmentMarker,
+                "Q4_1 has no packed MemSeg path; it must be plain dequantized FP32, got ${down.data::class.simpleName}",
+            )
+            assertTrue(out.quantTypes.isEmpty(), "quantTypes should be cleared post-convert")
+        }
+    }
+
     @Test
     fun `tensor count and key set are preserved`() {
         val q4 = rawQ4Tensor(dim, dim)
@@ -157,6 +187,36 @@ class DecoderGgufMemSegConverterTest {
         return tensor as Tensor<FP32, Float>
     }
 
+    /** Build a 1D raw-byte tensor of Q4_1 blocks that simulates a NATIVE_OPTIMIZED Q4_1 load. */
+    private fun rawQ4_1Tensor(rows: Int, cols: Int): Tensor<FP32, Float> {
+        val nElements = rows * cols
+        val blockSize = 32
+        val bytesPerBlock = 20 // 2B d (f16) + 2B m (f16) + 16B packed nibbles
+        val nBlocks = nElements / blockSize // integer division: a trailing partial block is silently dropped
+        val nBytes = nBlocks * bytesPerBlock
+
+        val bytes = ByteArray(nBytes)
+        for (block in 0 until nBlocks) {
+            val off = block * bytesPerBlock
+            // f16 scale d = 0.5, written low byte first (little-endian)
+            val dBits = floatToHalf(0.5f)
+            bytes[off] = (dBits and 0xFF).toByte()
+            bytes[off + 1] = ((dBits shr 8) and 0xFF).toByte()
+            // f16 min m = 0.25, same low-byte-first order, right after d
+            val mBits = floatToHalf(0.25f)
+            bytes[off + 2] = (mBits and 0xFF).toByte()
+            bytes[off + 3] = ((mBits shr 8) and 0xFF).toByte()
+            // Nibble codes: 0x88 puts code 8 in both halves, so w = d*8 + m = 4.25
+            for (i in 0 until 16) {
+                bytes[off + 4 + i] = 0x88.toByte()
+            }
+        }
+
+        val tensor = ctx.fromByteArray<Int8, Byte>(Shape(nBytes), Int8::class, bytes)
+        @Suppress("UNCHECKED_CAST")
+        return tensor as Tensor<FP32, Float>
+    }
+
     /** Build a raw-byte tensor that simulates a NATIVE_OPTIMIZED Q8_0 load. */
     private fun rawQ8Tensor(rows: Int, cols: Int): Tensor<FP32, Float> {
         val nElements = rows * cols