Skip to content

Commit 86cc067

Browse files
michalharakalclaude
andcommitted
fix(gguf): handle unsigned numeric metadata fields
GgufModelMetadata.from() — and any consumer using `(value as? Number)?.toInt()` on `reader.fields` — silently dropped UInt/ULong-typed values. Modern GGUFs store dimensions and counts as uint32, but Kotlin's unsigned types do not extend kotlin.Number, so the cast yielded null. As a result contextLength, embeddingLength, layerCount, headCount, vocabSize fallback, bosTokenId, eosTokenId all came back null on real-world GGUFs and the loader fell back to defaults (e.g. blockCount=0 → zero-layer transformer). Add public Map<String, Any?> extensions in GgufFieldAccessors.kt: getInt / getLong / getString / getIntList / getStringList. The numeric accessors handle Int / UInt / Long / ULong / Short / UShort / Byte / UByte plus the matching primitive arrays for the list variant, and the string- encoded numeric variant some GGUF metadata uses. Route GgufModelMetadata.from() through the new public accessors and remove the buggy private helpers. Add a regression test covering uint32/uint64 scalars, uint-typed lists, and every numeric type the accessor accepts. Closes #585 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 351f727 commit 86cc067

3 files changed

Lines changed: 207 additions & 54 deletions

File tree

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package sk.ainet.io.gguf
2+
3+
/**
4+
* Typed accessors for the raw `Map<String, Any?>` returned by `StreamingGGUFReader.fields`
5+
* (and any other producer of GGUF metadata).
6+
*
7+
* GGUF stores most numeric fields as `uint32` / `uint64`, which the reader preserves as
8+
* Kotlin `UInt` / `ULong`. Those unsigned types do **not** extend `kotlin.Number`, so the
9+
* naïve idiom `(value as? Number)?.toInt()` silently returns `null` for any modern GGUF.
10+
* These accessors handle every signed and unsigned integer type the reader can emit, plus
11+
* the string-encoded numeric variant some metadata fields use.
12+
*
13+
* All accessors take a vararg of keys and return the first match — useful for fields that
14+
* have moved namespaces between architectures (`llama.context_length` vs.
15+
* `general.context_length` vs. `model.context_length`).
16+
*/
17+
18+
public fun Map<String, Any?>.getString(vararg keys: String): String? {
19+
for (key in keys) {
20+
val value = this[key]
21+
if (value is String) return value
22+
}
23+
return null
24+
}
25+
26+
public fun Map<String, Any?>.getInt(vararg keys: String): Int? {
27+
for (key in keys) {
28+
when (val value = this[key]) {
29+
is Int -> return value
30+
is UInt -> return value.toInt()
31+
is Long -> return value.toInt()
32+
is ULong -> return value.toInt()
33+
is Short -> return value.toInt()
34+
is UShort -> return value.toInt()
35+
is Byte -> return value.toInt()
36+
is UByte -> return value.toInt()
37+
is Number -> return value.toInt()
38+
is String -> value.toIntOrNull()?.let { return it }
39+
}
40+
}
41+
return null
42+
}
43+
44+
public fun Map<String, Any?>.getLong(vararg keys: String): Long? {
45+
for (key in keys) {
46+
when (val value = this[key]) {
47+
is Long -> return value
48+
is ULong -> return value.toLong()
49+
is Int -> return value.toLong()
50+
is UInt -> return value.toLong()
51+
is Short -> return value.toLong()
52+
is UShort -> return value.toLong()
53+
is Byte -> return value.toLong()
54+
is UByte -> return value.toLong()
55+
is Number -> return value.toLong()
56+
is String -> value.toLongOrNull()?.let { return it }
57+
}
58+
}
59+
return null
60+
}
61+
62+
public fun Map<String, Any?>.getIntList(vararg keys: String): List<Int>? {
63+
for (key in keys) {
64+
val value = this[key] ?: continue
65+
val ints = when (value) {
66+
is List<*> -> value.mapNotNull { it.numberToIntOrNull() }
67+
is Array<*> -> value.mapNotNull { it.numberToIntOrNull() }
68+
is IntArray -> value.toList()
69+
is LongArray -> value.map { it.toInt() }
70+
is ShortArray -> value.map { it.toInt() }
71+
is ByteArray -> value.map { it.toInt() }
72+
is UIntArray -> value.map { it.toInt() }
73+
is ULongArray -> value.map { it.toInt() }
74+
is UShortArray -> value.map { it.toInt() }
75+
is UByteArray -> value.map { it.toInt() }
76+
else -> null
77+
}
78+
if (ints != null && ints.isNotEmpty()) return ints
79+
}
80+
return null
81+
}
82+
83+
public fun Map<String, Any?>.getStringList(vararg keys: String): List<String>? {
84+
for (key in keys) {
85+
val value = this[key]
86+
when (value) {
87+
is List<*> -> value.filterIsInstance<String>().takeIf { it.isNotEmpty() }
88+
?.let { return it }
89+
is Array<*> -> value.filterIsInstance<String>().takeIf { it.isNotEmpty() }
90+
?.let { return it }
91+
}
92+
}
93+
return null
94+
}
95+
96+
private fun Any?.numberToIntOrNull(): Int? = when (this) {
97+
is Int -> this
98+
is UInt -> this.toInt()
99+
is Long -> this.toInt()
100+
is ULong -> this.toInt()
101+
is Short -> this.toInt()
102+
is UShort -> this.toInt()
103+
is Byte -> this.toInt()
104+
is UByte -> this.toInt()
105+
is Number -> this.toInt()
106+
else -> null
107+
}

skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/GgufModelMetadata.kt

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -165,59 +165,5 @@ public data class GgufModelMetadata(
165165
rawFields = fields
166166
)
167167
}
168-
169-
// ========== Helper Extensions ==========
170-
171-
private fun Map<String, Any?>.getString(vararg keys: String): String? {
172-
for (key in keys) {
173-
val value = this[key]
174-
if (value is String) return value
175-
}
176-
return null
177-
}
178-
179-
private fun Map<String, Any?>.getInt(vararg keys: String): Int? {
180-
for (key in keys) {
181-
val value = this[key]
182-
when (value) {
183-
is Number -> return value.toInt()
184-
is String -> value.toIntOrNull()?.let { return it }
185-
}
186-
}
187-
return null
188-
}
189-
190-
private fun Map<String, Any?>.getIntList(vararg keys: String): List<Int>? {
191-
for (key in keys) {
192-
val value = this[key] ?: continue
193-
val ints = when (value) {
194-
is List<*> -> value.mapNotNull { (it as? Number)?.toInt() }
195-
is Array<*> -> value.mapNotNull { (it as? Number)?.toInt() }
196-
is IntArray -> value.toList()
197-
is LongArray -> value.map { it.toInt() }
198-
else -> null
199-
}
200-
if (ints != null && ints.isNotEmpty()) return ints
201-
}
202-
return null
203-
}
204-
205-
@Suppress("UNCHECKED_CAST")
206-
private fun Map<String, Any?>.getStringList(vararg keys: String): List<String>? {
207-
for (key in keys) {
208-
val value = this[key]
209-
when (value) {
210-
is List<*> -> {
211-
val strings = value.filterIsInstance<String>()
212-
if (strings.isNotEmpty()) return strings
213-
}
214-
is Array<*> -> {
215-
val strings = value.filterIsInstance<String>()
216-
if (strings.isNotEmpty()) return strings
217-
}
218-
}
219-
}
220-
return null
221-
}
222168
}
223169
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package sk.ainet.io.gguf
2+
3+
import kotlin.test.Test
4+
import kotlin.test.assertEquals
5+
6+
/**
7+
* Regression tests for GGUF metadata fields stored as unsigned integers.
8+
*
9+
* Modern GGUF files (recent llama.cpp converters) emit `uint32` / `uint64` for
10+
* dimensions and counts. The reader preserves them as `UInt` / `ULong`, which
11+
* are NOT subtypes of [Number] in Kotlin. The previous private `getInt` helper
12+
* relied on `(value as? Number)?.toInt()` and silently returned null for those
13+
* values, leaving every numeric field on [GgufModelMetadata] populated as null.
14+
*/
15+
class GgufModelMetadataUnsignedTest {
16+
17+
@Test
18+
fun `extracts uint32 numeric fields`() {
19+
val md = GgufModelMetadata.from(mapOf(
20+
"general.architecture" to "llama",
21+
"llama.context_length" to 8192u,
22+
"llama.embedding_length" to 4096u,
23+
"llama.block_count" to 32u,
24+
"llama.attention.head_count" to 32u,
25+
"llama.vocab_size" to 128256u,
26+
"tokenizer.ggml.bos_token_id" to 128000u,
27+
"tokenizer.ggml.eos_token_id" to 128001u,
28+
))
29+
30+
assertEquals("llama", md.architecture)
31+
assertEquals(8192, md.contextLength)
32+
assertEquals(4096, md.embeddingLength)
33+
assertEquals(32, md.layerCount)
34+
assertEquals(32, md.headCount)
35+
assertEquals(128256, md.vocabSize)
36+
assertEquals(128000, md.bosTokenId)
37+
assertEquals(128001, md.eosTokenId)
38+
}
39+
40+
@Test
41+
fun `extracts uint64 numeric fields`() {
42+
val md = GgufModelMetadata.from(mapOf(
43+
"general.architecture" to "llama",
44+
"llama.context_length" to 8192uL,
45+
"llama.embedding_length" to 4096uL,
46+
"llama.block_count" to 32uL,
47+
))
48+
49+
assertEquals(8192, md.contextLength)
50+
assertEquals(4096, md.embeddingLength)
51+
assertEquals(32, md.layerCount)
52+
}
53+
54+
@Test
55+
fun `extracts uint-typed token type list`() {
56+
val md = GgufModelMetadata.from(mapOf(
57+
"general.architecture" to "qwen2",
58+
"tokenizer.ggml.tokens" to listOf("a", "b", "c"),
59+
"tokenizer.ggml.token_type" to listOf(1u, 1u, 3u),
60+
))
61+
62+
assertEquals(listOf(1, 1, 3), md.tokenizerTokenTypes)
63+
}
64+
65+
@Test
66+
fun `extension accessors handle every numeric type`() {
67+
val fields = mapOf<String, Any?>(
68+
"as_int" to 1,
69+
"as_uint" to 2u,
70+
"as_long" to 3L,
71+
"as_ulong" to 4uL,
72+
"as_short" to 5.toShort(),
73+
"as_ushort" to 6.toUShort(),
74+
"as_byte" to 7.toByte(),
75+
"as_ubyte" to 8.toUByte(),
76+
"as_string" to "9",
77+
)
78+
79+
assertEquals(1, fields.getInt("as_int"))
80+
assertEquals(2, fields.getInt("as_uint"))
81+
assertEquals(3, fields.getInt("as_long"))
82+
assertEquals(4, fields.getInt("as_ulong"))
83+
assertEquals(5, fields.getInt("as_short"))
84+
assertEquals(6, fields.getInt("as_ushort"))
85+
assertEquals(7, fields.getInt("as_byte"))
86+
assertEquals(8, fields.getInt("as_ubyte"))
87+
assertEquals(9, fields.getInt("as_string"))
88+
89+
assertEquals(1L, fields.getLong("as_int"))
90+
assertEquals(4L, fields.getLong("as_ulong"))
91+
assertEquals(9L, fields.getLong("as_string"))
92+
}
93+
94+
@Test
95+
fun `getInt returns first matching key`() {
96+
val fields = mapOf<String, Any?>("b" to 2u, "a" to 1u)
97+
assertEquals(1, fields.getInt("a", "b"))
98+
assertEquals(2, fields.getInt("missing", "b", "a"))
99+
}
100+
}

0 commit comments

Comments
 (0)