@@ -120,12 +120,13 @@ public class ApertusWeightLoader private constructor(
         requiredTensorNames(metadata).forEach { name ->
             val rt = tensorByName[name]
                 ?: error("Missing required tensor in GGUF payload: $name")
-            byName[name] = readerTensorToTensor(ctx, dtype, reader, rt)
+            byName[name] = loadReaderTensor(ctx, dtype, reader, rt, name)
         }
 
         // Load optional rope_freqs tensor
         tensorByName[ApertusTensorNames.ROPE_FREQS]?.let { rt ->
-            byName[ApertusTensorNames.ROPE_FREQS] = readerTensorToTensor(ctx, dtype, reader, rt)
+            byName[ApertusTensorNames.ROPE_FREQS] =
+                loadReaderTensor(ctx, dtype, reader, rt, ApertusTensorNames.ROPE_FREQS)
         }
 
         // Extract xIELU params: try metadata fields first, then per-layer tensors
@@ -162,12 +163,13 @@ public class ApertusWeightLoader private constructor(
         requiredTensorNames(metadata).forEach { name ->
             val st = tensorByName[name]
                 ?: error("Missing required tensor in GGUF payload: $name")
-            byName[name] = streamingTensorToTensor(ctx, dtype, reader, st)
+            byName[name] = loadStreamingTensor(ctx, dtype, reader, st, name)
         }
 
         // Load optional rope_freqs tensor
         tensorByName[ApertusTensorNames.ROPE_FREQS]?.let { st ->
-            byName[ApertusTensorNames.ROPE_FREQS] = streamingTensorToTensor(ctx, dtype, reader, st)
+            byName[ApertusTensorNames.ROPE_FREQS] =
+                loadStreamingTensor(ctx, dtype, reader, st, ApertusTensorNames.ROPE_FREQS)
         }
 
         // Extract xIELU params: try metadata fields first, then per-layer tensors
@@ -560,6 +562,58 @@ public class ApertusWeightLoader private constructor(
 
     // ============== Tensor conversion ==============
 
+    /**
+     * NATIVE_OPTIMIZED stores quantized tensors as byte-level rank-1 buffers so the
+     * native FFM kernels can address the raw block layout directly. That works for
+     * matmul (the kernel knows the logical shape from metadata) but breaks the
+     * token embedding, where `Embedding.gather()` requires the logical rank-2
+     * `[vocab, dim]` shape. Force `token_embd.weight` through the dequant path so
+     * the embedding lookup gets a real `[vocab, dim]` FP32/FP16 tensor regardless
+     * of the policy chosen for the rest of the model.
+     */
+    private fun <T : DType, V> loadStreamingTensor(
+        ctx: ExecutionContext,
+        dtype: KClass<T>,
+        reader: StreamingGGUFReader,
+        st: StreamingTensorInfo,
+        name: String
+    ): Tensor<T, V> {
+        if (name == ApertusTensorNames.TOKEN_EMBEDDINGS &&
+            quantPolicy == QuantPolicy.NATIVE_OPTIMIZED &&
+            st.tensorType != GGMLQuantizationType.F32 &&
+            st.tensorType != GGMLQuantizationType.F16 &&
+            st.tensorType != GGMLQuantizationType.BF16
+        ) {
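+            // Bypass NATIVE_OPTIMIZED for the token embedding: rebuild the logical
+            // [vocab, dim] shape and dequantize so Embedding.gather() gets real floats.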
+            val shape = Shape(*st.shape.map { it.toInt() }.toIntArray())
+            val bytes = reader.loadTensorData(st)
+            val floats = DequantOps.dequantFromBytes(bytes, st.tensorType, st.nElements.toInt())
+            return createTensor(ctx, dtype, shape, floats)
+        }
+        return streamingTensorToTensor(ctx, dtype, reader, st)
+    }
+
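+    /** Same token-embedding dequant override as [loadStreamingTensor], for the non-streaming [GGUFReader] path. */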
+    private fun <T : DType, V> loadReaderTensor(
+        ctx: ExecutionContext,
+        dtype: KClass<T>,
+        reader: GGUFReader,
+        rt: ReaderTensor,
+        name: String
+    ): Tensor<T, V> {
+        if (name == ApertusTensorNames.TOKEN_EMBEDDINGS &&
+            quantPolicy == QuantPolicy.NATIVE_OPTIMIZED &&
+            rt.tensorType != GGMLQuantizationType.F32 &&
+            rt.tensorType != GGMLQuantizationType.F16 &&
+            rt.tensorType != GGMLQuantizationType.BF16
+        ) {
+            val shape = Shape(*rt.shape.map { it.toInt() }.toIntArray())
+            val raw = if (rt.data.isEmpty()) reader.materialize(rt) else rt.data
+            val bytes: ByteArray = DequantOps.toByteArray(raw, rt.name)
+            val floats = DequantOps.dequantFromBytes(bytes, rt.tensorType, rt.nElements)
+            return createTensor(ctx, dtype, shape, floats)
+        }
+        return readerTensorToTensor(ctx, dtype, reader, rt)
+    }
+
     @Suppress("UNCHECKED_CAST")
     private fun <T : DType, V> readerTensorToTensor(
         ctx: ExecutionContext,
@@ -631,7 +685,7 @@ public class ApertusWeightLoader private constructor(
     }
 
     @Suppress("UNCHECKED_CAST")
-    private fun <T : DType, V> streamingTensorToTensor(
+    internal fun <T : DType, V> streamingTensorToTensor(
         ctx: ExecutionContext,
         dtype: KClass<T>,
         reader: StreamingGGUFReader,
@@ -676,9 +730,19 @@ public class ApertusWeightLoader private constructor(
             GGMLQuantizationType.IQ4_NL, GGMLQuantizationType.IQ4_XS,
             GGMLQuantizationType.TQ1_0, GGMLQuantizationType.TQ2_0 -> {
                 when (quantPolicy) {
-                    QuantPolicy.RAW_BYTES, QuantPolicy.NATIVE_OPTIMIZED -> {
+                    QuantPolicy.RAW_BYTES -> {
+                        require(dtype == Int8::class) {
+                            "Quantized tensor ${st.name} requires dtype Int8 with quantPolicy=RAW_BYTES"
+                        }
                         ctx.fromByteArray<Int8, Byte>(shape, Int8::class, bytes) as Tensor<T, V>
                     }
+                    QuantPolicy.NATIVE_OPTIMIZED -> {
+                        // Store raw quantized bytes; dtype can be FP32 (mixed mode).
+                        // The streaming reader reports the logical shape, which doesn't
+                        // match the raw byte count, so store under a rank-1 byte-level shape.
+                        val byteShape = Shape(bytes.size)
+                        @Suppress("UNCHECKED_CAST")
+                        ctx.fromByteArray<Int8, Byte>(byteShape, Int8::class, bytes) as Tensor<T, V>
+                    }
                     QuantPolicy.DEQUANTIZE_TO_FP32 -> {
                         val floats = DequantOps.dequantFromBytes(bytes, st.tensorType, st.nElements.toInt())
                         createTensor(ctx, dtype, shape, floats)