Skip to content

Commit a73a5ff

Browse files
authored
Merge pull request #92 from beehive-lab/feat/models/ibm-granite
[models] Support for IBM Granite Models 3.2, 3.3 & 4.0 with FP16 and Q8
2 parents 5383440 + 59eb425 commit a73a5ff

26 files changed

+2834
-8
lines changed

.github/workflows/build-and-run.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,20 @@ jobs:
128128
./llama-tornado --gpu --${{ matrix.backend.name }} \
129129
--model /$MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \
130130
--prompt "Say hello"
131+
- name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
132+
run: |
133+
cd ${{ github.workspace }}
134+
export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
135+
./llama-tornado --gpu --${{ matrix.backend.name }} \
136+
--model /$MODELS_DIR/granite-3.2-2b-instruct-f16.gguf \
137+
--prompt "Say hello"
138+
- name: FP16 - Run Granite-4.0-1b-F16.gguf
139+
run: |
140+
cd ${{ github.workspace }}
141+
export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
142+
./llama-tornado --gpu --${{ matrix.backend.name }} \
143+
--model /$MODELS_DIR/granite-4.0-1b-F16.gguf \
144+
--prompt "Say hello"
131145
- name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
132146
run: |
133147
cd ${{ github.workspace }}
@@ -163,3 +177,18 @@ jobs:
163177
./llama-tornado --gpu --${{ matrix.backend.name }} \
164178
--model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \
165179
--prompt "Say hello"
180+
- name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
181+
run: |
182+
cd ${{ github.workspace }}
183+
export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
184+
./llama-tornado --gpu --${{ matrix.backend.name }} \
185+
--model /$MODELS_DIR/granite-3.2-2b-instruct-Q8_0.gguf \
186+
--prompt "Say hello"
187+
- name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
188+
run: |
189+
cd ${{ github.workspace }}
190+
export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
191+
./llama-tornado --gpu --${{ matrix.backend.name }} \
192+
--model /$MODELS_DIR/granite-4.0-1b-Q8_0.gguf \
193+
--prompt "Say hello"
194+

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ clean:
1515
$(MVN) clean
1616

1717
install:
18-
$(MVN) install -DskipTests
18+
$(MVN) install -DskipTests
1919

2020
# Package the project without running tests
2121
package:

README.md

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
<strong>Llama3</strong> models written in <strong>native Java</strong> automatically accelerated on GPUs with <a href="https://github.com/beehive-lab/TornadoVM" target="_blank"><strong>TornadoVM</strong></a>.
2020
Runs Llama3 inference efficiently using TornadoVM's GPU acceleration.
2121
<br><br>
22-
Currently, supports <strong>Llama3</strong>, <strong>Mistral</strong>, <strong>Qwen2.5</strong>, <strong>Qwen3</strong> and <strong>Phi3</strong> models in the GGUF format.
22+
Currently, supports <strong>Llama3</strong>, <strong>Mistral</strong>, <strong>Qwen2.5</strong>, <strong>Qwen3</strong>, <strong>Phi-3</strong>, <strong>IBM Granite 3.2+</strong> and <strong>IBM Granite 4.0</strong> models in the GGUF format.
2323
Also, it is used as GPU inference engine in
2424
<a href="https://docs.quarkiverse.io/quarkus-langchain4j/dev/gpullama3-chat-model.html" target="_blank">Quarkus</a>
2525
and
@@ -89,7 +89,7 @@ All pre-built SDKs are available on the TornadoVM [Releases Page](https://github
8989
wget https://github.com/beehive-lab/TornadoVM/releases/download/v2.1.0/tornadovm-2.1.0-opencl-linux-amd64.zip
9090
unzip tornadovm-2.1.0-opencl-linux-amd64.zip
9191
# Replace <path-to-sdk> manually with the absolute path of the extracted folder
92-
export TORNADO_SDK="<path-to-sdk>/tornadovm-2.1.0-opencl"
92+
export TORNADOVM_HOME="<path-to-sdk>/tornadovm-2.1.0-opencl"
9393
export PATH=$TORNADOVM_HOME/bin:$PATH
9494

9595
tornado --devices
@@ -102,7 +102,7 @@ tornado --version
102102
wget https://github.com/beehive-lab/TornadoVM/releases/download/v2.1.0/tornadovm-2.1.0-opencl-mac-aarch64.zip
103103
unzip tornadovm-2.1.0-opencl-mac-aarch64.zip
104104
# Replace <path-to-sdk> manually with the absolute path of the extracted folder
105-
export TORNADO_SDK="<path-to-sdk>/tornadovm-2.1.0-opencl"
105+
export TORNADOVM_HOME="<path-to-sdk>/tornadovm-2.1.0-opencl"
106106
export PATH=$TORNADOVM_HOME/bin:$PATH
107107

108108
tornado --devices
@@ -251,7 +251,7 @@ You can run llama-tornado as a pure Java script using [JBang](https://www.jbang.
251251
### Prerequisites for JBang
252252

253253
1. **Install JBang**: Follow the [JBang installation guide](https://www.jbang.dev/download/)
254-
2. **TornadoVM SDK**: You still need TornadoVM installed and `TORNADO_SDK` environment variable set (see Setup section above)
254+
2. **TornadoVM SDK**: You still need TornadoVM installed and `TORNADOVM_HOME` environment variable set (see Setup section above)
255255

256256
### Quick Start with JBang
257257

@@ -295,6 +295,13 @@ jbang LlamaTornadoCli.java -m beehive-llama-3.2-1b-instruct-fp16.gguf \
295295
### Llama3.2 Collection
296296
[https://huggingface.co/collections/beehive-lab/llama3-gpullama3java](https://huggingface.co/collections/beehive-lab/llama3-gpullama3java)
297297

298+
### IBM Granite 4.0 Collection
299+
[https://huggingface.co/collections/beehive-lab/granite-40-language-models-gpullama3java](https://huggingface.co/collections/beehive-lab/granite-40-language-models-gpullama3java)
300+
301+
302+
### IBM Granite 3.3 Collection
303+
[https://huggingface.co/collections/beehive-lab/granite-33-language-models-gpullama3java](https://huggingface.co/collections/beehive-lab/granite-33-language-models-gpullama3java)
304+
298305
### Qwen 2.5 Collection
299306
[https://huggingface.co/collections/beehive-lab/qwen-25-gpullama3java](https://huggingface.co/collections/beehive-lab/qwen-25-gpullama3java)
300307

src/main/java/org/beehive/gpullama3/inference/InferenceCore.java

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights;
1212
import org.beehive.gpullama3.model.Configuration;
1313
import org.beehive.gpullama3.model.Model;
14+
import org.beehive.gpullama3.model.granite.GraniteConfiguration;
1415
import org.beehive.gpullama3.model.phi3.Phi3Configuration;
1516
import org.beehive.gpullama3.model.qwen2.Qwen2Configuration;
1617
import org.beehive.gpullama3.model.qwen3.Qwen3Configuration;
@@ -546,6 +547,127 @@ public static FloatTensor forwardJavaPhi3(Model model, Phi3State state, int toke
546547
return state.logits;
547548
}
548549

550+
/**
551+
* Forward pass for Granite models with µP scaling factors applied.
552+
* <p>
553+
* Granite uses the same transformer architecture as Llama but with maximal update parameterization (µP)
554+
* scaling factors applied at specific points:
555+
* <ul>
556+
* <li>Embedding scaling: multiply embeddings after lookup</li>
557+
* <li>Attention scaling: use custom multiplier instead of 1/sqrt(headDim)</li>
558+
* <li>Residual scaling: multiply residual connections</li>
559+
* <li>Logit scaling: divide logits by the scaling factor</li>
560+
* </ul>
561+
*/
562+
public static FloatTensor forwardGranite(Model model, State state, int token, int position) {
563+
final GraniteConfiguration config = (GraniteConfiguration) model.configuration();
564+
final StandardWeights weights = (StandardWeights) model.weights();
565+
int dim = config.dim();
566+
int headSize = config.headSize();
567+
int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads();
568+
int kvMul = config.numberOfHeads() / config.numberOfKeyValueHeads();
569+
float attentionScale = config.attentionScale();
570+
float residualScale = config.residualScale();
571+
float embeddingScale = config.embeddingScale();
572+
float logitScale = config.logitScale();
573+
574+
// copy the token embedding into x
575+
weights.token_embedding_table.copyTo(token * dim, state.x, 0, dim);
576+
// Apply Granite embedding scaling
577+
state.x.mapInPlace(v -> v * embeddingScale);
578+
579+
// forward all the layers
580+
for (int l = 0; l < config.numberOfLayers(); l++) {
581+
// attention rmsnorm
582+
rmsnorm(state.xb, state.x, weights.rms_att_weight[l], 0, dim, config.rmsNormEps());
583+
584+
// qkv matmuls for this position
585+
weights.wq[l].matmul(state.xb, state.q, dim, dim);
586+
weights.wk[l].matmul(state.xb, state.k, kvDim, dim);
587+
weights.wv[l].matmul(state.xb, state.v, kvDim, dim);
588+
589+
// RoPE relative positional encoding
590+
for (int i = 0; i < dim; i += 2) {
591+
int head_dim = i % headSize;
592+
float fcr = weights.freq_cis_real.getFloat(position * (headSize / 2) + (head_dim / 2));
593+
float fci = weights.freq_cis_imag.getFloat(position * (headSize / 2) + (head_dim / 2));
594+
int rotn = i < kvDim ? 2 : 1;
595+
for (int v = 0; v < rotn; v++) {
596+
FloatTensor vec = v == 0 ? state.q : state.k;
597+
float v0 = vec.getFloat(i);
598+
float v1 = vec.getFloat(i + 1);
599+
vec.setFloat(i, v0 * fcr - v1 * fci);
600+
vec.setFloat(i + 1, v0 * fci + v1 * fcr);
601+
}
602+
}
603+
604+
// save key,value at this time step to kv cache
605+
state.k.copyTo(0, state.keyCache[l], position * kvDim, kvDim);
606+
state.v.copyTo(0, state.valueCache[l], position * kvDim, kvDim);
607+
608+
int curLayer = l;
609+
610+
// multihead attention with Granite attention scaling
611+
Parallel.parallelFor(0, config.numberOfHeads(), h -> {
612+
int qOffset = h * headSize;
613+
int attOffset = h * config.contextLength();
614+
615+
for (int t = 0; t <= position; t++) {
616+
int keyCacheOffset = t * kvDim + (h / kvMul) * headSize;
617+
float score = state.q.dot(qOffset, state.keyCache[curLayer], keyCacheOffset, headSize);
618+
// Granite uses custom attention multiplier instead of 1/sqrt(headSize)
619+
score *= attentionScale;
620+
state.att.setFloat(attOffset + t, score);
621+
}
622+
623+
state.att.softmaxInPlace(attOffset, position + 1);
624+
625+
int xbOffset = h * headSize;
626+
state.xb.fillInPlace(xbOffset, headSize, 0f);
627+
628+
for (int t = 0; t <= position; t++) {
629+
int vOffset = t * kvDim + (h / kvMul) * headSize;
630+
float a = state.att.getFloat(attOffset + t);
631+
state.xb.saxpyInPlace(xbOffset, state.valueCache[curLayer], vOffset, headSize, a);
632+
}
633+
});
634+
635+
// final matmul to get the output of the attention
636+
weights.wo[l].matmul(state.xb, state.xb2, dim, dim);
637+
638+
// residual connection with Granite scaling
639+
state.xb2.mapInPlace(v -> v * residualScale);
640+
state.x.addInPlace(state.xb2);
641+
642+
// ffn rmsnorm
643+
rmsnorm(state.xb, state.x, weights.rms_ffn_weight[l], 0, dim, config.rmsNormEps());
644+
645+
// FFN: self.w2(F.silu(self.w1(x)) * self.w3(x))
646+
weights.w1[l].matmul(state.xb, state.hb, config.hiddenDim(), dim);
647+
weights.w3[l].matmul(state.xb, state.hb2, config.hiddenDim(), dim);
648+
649+
// SwiGLU non-linearity
650+
state.hb.mapInPlace(value -> value / (float) (1.0 + Math.exp(-value)));
651+
state.hb.multiplyInPlace(state.hb2);
652+
653+
// final matmul to get the output of the ffn
654+
weights.w2[l].matmul(state.hb, state.xb, dim, config.hiddenDim());
655+
656+
// residual connection with Granite scaling
657+
state.xb.mapInPlace(v -> v * residualScale);
658+
state.x.addInPlace(state.xb);
659+
}
660+
661+
rmsnorm(state.x, state.x, weights.rms_final_weight, 0, dim, config.rmsNormEps());
662+
663+
weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);
664+
665+
// Apply Granite logit scaling (divide by the scaling factor)
666+
state.logits.mapInPlace(v -> v * logitScale);
667+
668+
return state.logits;
669+
}
670+
549671
static void copyChunk(FloatTensor in, FloatTensor out, int dim1In, int dim1Out, int nChunks, int chunkNo) {
550672
assert (dim1In == dim1Out * nChunks);
551673
final int startOffsetInDim1 = chunkNo * dim1Out;

src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,4 +531,138 @@ public static List<Integer> generateTokensGPUPhi3(Model model, State state, int
531531

532532
return generatedTokens;
533533
}
534+
535+
/**
536+
* Generates tokens using the Granite model with CPU inference.
537+
* Identical pattern to generateTokensLlama but calls forwardGranite.
538+
*/
539+
public static List<Integer> generateTokensGranite(Model model, State state, int startPosition,
540+
List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
541+
IntConsumer onTokenGenerated) {
542+
long startNanos = System.nanoTime();
543+
long inferenceStartNanos = 0;
544+
545+
Object logits;
546+
if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
547+
maxTokens = model.configuration().contextLength();
548+
}
549+
550+
List<Integer> generatedTokens = new ArrayList<>();
551+
552+
int currentToken = state.latestToken;
553+
int nextToken;
554+
int promptIndex = 0;
555+
int pos = startPosition;
556+
557+
while (pos < maxTokens) {
558+
// Call Granite-specific forward pass
559+
logits = InferenceCore.forwardGranite(model, state, currentToken, pos);
560+
561+
if (promptIndex < promptTokens.size()) {
562+
nextToken = promptTokens.get(promptIndex++);
563+
if (echo) {
564+
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
565+
}
566+
} else {
567+
if (inferenceStartNanos == 0) {
568+
inferenceStartNanos = System.nanoTime();
569+
}
570+
571+
nextToken = sampler.sampleToken(logits);
572+
573+
if (echo) {
574+
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
575+
}
576+
577+
generatedTokens.add(nextToken);
578+
579+
if (onTokenGenerated != null) {
580+
onTokenGenerated.accept(nextToken);
581+
}
582+
583+
if (stopTokens.contains(nextToken)) {
584+
break;
585+
}
586+
}
587+
588+
currentToken = nextToken;
589+
state.latestToken = currentToken;
590+
pos++;
591+
}
592+
593+
long endNanos = System.nanoTime();
594+
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
595+
int totalTokens = promptIndex + generatedTokens.size();
596+
597+
LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
598+
599+
return generatedTokens;
600+
}
601+
602+
/**
603+
* Generates tokens using the Granite model with GPU (TornadoVM) inference.
604+
* Identical pattern to generateTokensGPULlama.
605+
*/
606+
public static List<Integer> generateTokensGPUGranite(Model model, State state, int startPosition,
607+
List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
608+
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMMasterPlan) {
609+
long startNanos = System.nanoTime();
610+
long inferenceStartNanos = 0;
611+
612+
Object logits;
613+
if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
614+
maxTokens = model.configuration().contextLength();
615+
}
616+
617+
List<Integer> generatedTokens = new ArrayList<>();
618+
619+
int currentToken = state.latestToken;
620+
int nextToken;
621+
int promptIndex = 0;
622+
int pos = startPosition;
623+
624+
while (pos < maxTokens) {
625+
// Call TornadoVM forward pass (same as Llama for now)
626+
logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMMasterPlan);
627+
628+
if (promptIndex < promptTokens.size()) {
629+
nextToken = promptTokens.get(promptIndex++);
630+
if (echo) {
631+
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
632+
}
633+
} else {
634+
if (inferenceStartNanos == 0) {
635+
inferenceStartNanos = System.nanoTime();
636+
}
637+
638+
nextToken = sampler.sampleToken(logits);
639+
640+
if (echo) {
641+
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
642+
}
643+
644+
generatedTokens.add(nextToken);
645+
646+
if (onTokenGenerated != null) {
647+
onTokenGenerated.accept(nextToken);
648+
}
649+
650+
if (stopTokens.contains(nextToken)) {
651+
break;
652+
}
653+
}
654+
655+
currentToken = nextToken;
656+
state.latestToken = currentToken;
657+
pos++;
658+
}
659+
660+
long endNanos = System.nanoTime();
661+
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
662+
int totalTokens = promptIndex + generatedTokens.size();
663+
664+
LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
665+
666+
return generatedTokens;
667+
}
534668
}

0 commit comments

Comments
 (0)