fix: support tied embeddings in LlamaWeightLoader

michalharakal · claude · michalharakal · commit e212240abad1 · 2026-04-12T16:31:54.000+02:00
Small models like Qwen2.5-0.5B/1.5B tie their input and output embeddings (output.weight = token_embd.weight) to save parameters. In this case the GGUF file omits output.weight, which caused "Missing required tensor" errors during load. The loader now detects a missing output.weight when token_embd.weight is present and aliases the lookup so that the embedding tensor is reused as the LM head. It logs "Tied word embeddings" when this path is taken. Verified with Qwen2.5-0.5B-Instruct-Q8_0, which now loads correctly and reaches the tool-calling demo. (Output quality is still limited by a separate byte-level BPE tokenizer issue.) Refs: #49 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaWeightLoader.kt b/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaWeightLoader.kt
@@ -243,8 +243,20 @@ public class LlamaWeightLoader private constructor(
         val required = requiredTensorNames(metadata)
         val tensorByName = reader.tensors.associateBy { it.name }
 
+        // Tied embeddings (Qwen2.5-0.5B/1.5B, Gemma, etc.): reuse token_embd.weight as output.weight
+        val tiedEmbeddings = tensorByName[LlamaTensorNames.OUTPUT_WEIGHT] == null &&
+            tensorByName[LlamaTensorNames.TOKEN_EMBEDDINGS] != null
+        if (tiedEmbeddings) {
+            println("Tied word embeddings: output.weight = token_embd.weight")
+        }
+
         required.forEach { name ->
-            val rt = tensorByName[name]
+            val lookupName = if (name == LlamaTensorNames.OUTPUT_WEIGHT && tiedEmbeddings) {
+                LlamaTensorNames.TOKEN_EMBEDDINGS
+            } else {
+                name
+            }
+            val rt = tensorByName[lookupName]
                 ?: error("Missing required tensor in GGUF payload: $name")
             validateTensorShape(name, rt, metadata)
             val tensor: Tensor<T, V> = readerTensorToTensor(ctx, dtype, reader, rt)
@@ -299,9 +311,24 @@ public class LlamaWeightLoader private constructor(
             val required = requiredTensorNames(metadata)
             val tensorByName = reader.tensors.associateBy { it.name }
 
+            // Tied embeddings: small models (Qwen2.5-0.5B/1.5B, etc.) omit output.weight
+            // and reuse token_embd.weight as the LM head. Detect and alias.
+            val tiedEmbeddings = tensorByName[LlamaTensorNames.OUTPUT_WEIGHT] == null &&
+                tensorByName[LlamaTensorNames.TOKEN_EMBEDDINGS] != null
+            if (tiedEmbeddings) {
+                println("Tied word embeddings: output.weight = token_embd.weight")
+            }
+
             required.forEach { name ->
-                val st = tensorByName[name]
+                val lookupName = if (name == LlamaTensorNames.OUTPUT_WEIGHT && tiedEmbeddings) {
+                    LlamaTensorNames.TOKEN_EMBEDDINGS
+                } else {
+                    name
+                }
+                val st = tensorByName[lookupName]
                     ?: error("Missing required tensor in GGUF payload: $name")
+                // Shape validation uses the logical name (e.g., OUTPUT_WEIGHT) even when
+                // the physical tensor is TOKEN_EMBEDDINGS — both must have [vocab, dim] shape.
                 validateStreamingTensorShape(name, st, metadata)
                 val tensor: Tensor<T, V> = streamingTensorToTensor(ctx, dtype, reader, st)
                 onTensorLoaded(name, tensor)