beehive-lab
diff --git a/src/main/java/org/beehive/gpullama3/inference/weights/tornado/Phi3TornadoWeightsQ8_0.java b/src/main/java/org/beehive/gpullama3/inference/weights/tornado/Phi3TornadoWeightsQ8_0.java
Lines changed: 53 additions & 0 deletions
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/Phi3ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/Phi3ModelLoader.java
Lines changed: 27 additions & 4 deletions
@@ -0,0 +1,53 @@
+package org.beehive.gpullama3.inference.weights.tornado;
+
+import org.beehive.gpullama3.core.model.GGMLType;
+import org.beehive.gpullama3.core.model.tensor.Q8_0QuantizedTensor;
+import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+
+/** Phi3 weight set for the TornadoVM path with Q8_0-quantized tensors; adds Phi3's combined-QKV and FFN down/up arrays on top of Q8_0Weights. */
+public class Phi3TornadoWeightsQ8_0 extends Q8_0Weights {
+
+    // Phi3-specific weight arrays (public, non-final; assigned in the constructor)
+    public Q8_0QuantizedTensor[] wqkvLayered;    // Combined QKV weights: (layer, op_size, dim) where op_size = dim + 2 * (n_kv_heads * head_dim)
+    public Q8_0QuantizedTensor[] wDownLayered;   // FFN down projection: (layer, dim, hidden_dim)
+    public Q8_0QuantizedTensor[] wUpLayered;     // FFN up projection: (layer, hidden_dim, dim)
+    // Constructor: every argument except wqkvLayered, wDownLayered and wUpLayered is forwarded unchanged to Q8_0Weights.
+    // @formatter:off
+    public Phi3TornadoWeightsQ8_0(
+            FloatArray tokenEmbeddingTable,
+            FloatArray[] rms_att_weightLayered,
+            Q8_0QuantizedTensor[] wqkvLayered,        // Combined QKV weights for Phi3
+            Q8_0QuantizedTensor[] woLayered,
+            FloatArray[] rms_ffn_weightLayered,
+            Q8_0QuantizedTensor[] wDownLayered,       // FFN down weights
+            Q8_0QuantizedTensor[] wUpLayered,         // FFN up weights
+            FloatArray rms_final_weight_as_floatArray,
+            FloatArray freq_cis_realFlat,
+            FloatArray freq_cis_imagFlat,
+            Q8_0QuantizedTensor wclsByteArray,
+            GGMLType weightType) {
+        // This class supplies no separate wq/wk/wv or w1/w2/w3 tensors, so those superclass slots receive null.
+        // NOTE(review): presumably Q8_0Weights stores them as-is, so shared code must not read those fields for Phi3 - confirm.
+        super(tokenEmbeddingTable,
+                rms_att_weightLayered,
+                null,  // wqLayered - not used in Phi3, using combined wqkv instead
+                null,  // wkLayered - not used in Phi3, using combined wqkv instead
+                null,  // wvLayered - not used in Phi3, using combined wqkv instead
+                woLayered,
+                rms_ffn_weightLayered,
+                null,  // w1Layered - not used in Phi3, using wUp instead
+                null,  // w2Layered - not used in Phi3, using wDown instead
+                null,  // w3Layered - not used in Phi3, using wUp instead
+                rms_final_weight_as_floatArray,
+                freq_cis_realFlat,
+                freq_cis_imagFlat,
+                wclsByteArray,
+                weightType);
+
+        // Keep the Phi3-specific tensors on this subclass; the superclass has no slots for them
+        this.wqkvLayered = wqkvLayered;
+        this.wDownLayered = wDownLayered;
+        this.wUpLayered = wUpLayered;
+    }
+    // @formatter:on
+}
@@ -12,6 +12,7 @@
 import org.beehive.gpullama3.inference.weights.Weights;
 import org.beehive.gpullama3.inference.weights.standard.Phi3StandardWeights;
 import org.beehive.gpullama3.inference.weights.tornado.Phi3TornadoWeights;
+import org.beehive.gpullama3.inference.weights.tornado.Phi3TornadoWeightsQ8_0;
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.format.ChatFormat;
 import org.beehive.gpullama3.model.phi3.Phi3;
@@ -100,20 +101,42 @@ private Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, Configur
 
         if (useTornadovm) {
             if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-                System.out.println("Loading model weights in TornadoVM format (loading " + outputWeight.ggmlType() + " -> " + GGMLType.F16 + ")");
+                System.out.println("Loading model weights in TornadoVM format (loading " + outputWeight.ggmlType() + ")");
+            }
+            if (outputWeight.ggmlType() == GGMLType.Q8_0) {
+                return createTornadoVMWeightsQ8_0(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
+            } else {
+                return createTornadoVMWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
             }
-            return createTornadoVMWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
         } else {
             return createStandardWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
         }
     }
     // @formatter:on
 
     // @formatter:off
-    @Override
-    public Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Configuration config,
+    public Weights createTornadoVMWeightsQ8_0(Map<String, GGMLTensorEntry> tensorEntries, Configuration config,
             Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
             GGMLTensorEntry outputWeight) {
+        return new Phi3TornadoWeightsQ8_0(  // Phi3 TornadoVM weights: embeddings, norms and RoPE tables as FloatArray, projection matrices as Q8_0 tensors
+                loadTensorAsFloatArray(tokenEmbeddings),  // token embedding table
+                loadArrayAsFloatArrayFromBuffer(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),  // attention norm weights, one entry per layer
+                loadArrayAsQ8_0QuantizedTensor(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".attn_qkv.weight")),      // combined QKV projection, one tensor per layer
+                loadArrayAsQ8_0QuantizedTensor(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".attn_output.weight")),   // attention output projection (wo)
+                loadArrayAsFloatArrayFromBuffer(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".ffn_norm.weight")),  // FFN norm weights, one entry per layer
+                loadArrayAsQ8_0QuantizedTensor(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".ffn_down.weight")),      // FFN down projection (wDown)
+                loadArrayAsQ8_0QuantizedTensor(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".ffn_up.weight")),        // FFN up projection (wUp); author's note: "not combined in reference" - NOTE(review): clarify which reference
+                floatBufferToFloatArray(tensorEntries.get("output_norm.weight")),  // final norm weights
+                FloatArray.fromArray(ropeFreqs.first()),   // passed as freq_cis_realFlat
+                FloatArray.fromArray(ropeFreqs.second()),  // passed as freq_cis_imagFlat
+                loadQ8_0QuantizedTensor(outputWeight),  // passed as wclsByteArray, kept as a Q8_0 tensor
+                outputWeight.ggmlType()  // weightType comes from the output tensor's GGML type
+        );
+    }
+
+    public Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Configuration config,
+                                          Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
+                                          GGMLTensorEntry outputWeight) {
         return new Phi3TornadoWeights(
                 loadTensorAsFloatArray(tokenEmbeddings),
                 loadArrayAsFloatArrayFromBuffer(config.numberOfLayers(), i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),