Commit 7b1d172

reduction fuse opt in RMS normalization layer for llama after the recent half float update
1 parent 68f7d1f commit 7b1d172

3 files changed

Lines changed: 90 additions & 4 deletions


src/main/java/org/beehive/gpullama3/tornadovm/kernels/TransformerComputeKernelsLayered.java

Lines changed: 78 additions & 0 deletions
@@ -408,6 +408,84 @@ public static void reductionOneBlockWithLayerFuse(KernelContext context, FloatAr
         }
     }

+    /**
+     * Performs RMS (Root Mean Square) normalization using parallel reduction. It first computes the mean of the squares and the scaling factor across all work groups,
+     * then applies the computed normalization factor to the input and weight elements.
+     *
+     * <p>
+     * Formula: output[i] = weight[i] * (normalizationFactor * x[i])
+     *
+     * Algorithm: 1. Each thread computes the square of its input element. 2. Each work group performs a parallel reduction of the squares. 3. Partial sums are stored per work group.
+     * 4. Every thread combines all partial sums and computes the normalization factor. 5. The normalization factor is applied to the input and weight elements.
+     *
+     * @param context
+     *            Kernel execution context
+     * @param outputFP16
+     *            Half-float array receiving the normalized output
+     * @param x
+     *            Input array to normalize
+     * @param weights
+     *            Weight values for each element
+     * @param temp
+     *            Temporary array holding one partial sum per work group
+     * @param size
+     *            Number of elements to process
+     * @param ermsNorm
+     *            Epsilon value squared for numerical stability
+     * @param localMemSize
+     *            Size of the local memory allocation (must match the work group size)
+     */
+    public static void reductionOneBlockWithLayerFuseFP16(KernelContext context, HalfFloatArray outputFP16, FloatArray x, FloatArray weights, FloatArray temp, int size, float ermsNorm, int localMemSize) {
+        int gid = context.globalIdx;
+        int lid = context.localIdx;
+        int groupId = context.groupIdx;
+        int groupSize = context.localGroupSizeX;
+
+        // Allocate local memory with the provided size
+        float[] localX = context.allocateFloatLocalArray(localMemSize);
+
+        // Load the input value and compute its square
+        if (gid < size) {
+            float v = x.get(gid);
+            localX[lid] = v * v;
+        } else {
+            localX[lid] = 0.0f;
+        }
+
+        // Perform parallel reduction within the work group
+        for (int stride = (groupSize / 2); stride > 0; stride /= 2) {
+            context.localBarrier();
+            if (lid < stride) {
+                localX[lid] += localX[lid + stride];
+            }
+        }
+
+        // Each work group stores its partial sum in its own slot
+        if (lid == 0) {
+            temp.set(groupId, localX[0]);
+        }
+
+        context.globalBarrier();
+
+        // Every thread combines the partial sums from all work groups
+        float localss = 0.0f;
+        int numGroups = (size + groupSize - 1) / groupSize;
+        for (int i = 0; i < numGroups; i++) {
+            localss += temp.get(i);
+        }
+        localss /= size;
+        localss += ermsNorm;
+        localss = 1.0f / TornadoMath.sqrt(localss);
+
+        // Apply the normalization factor and store the result as half floats
+        if (gid < size) {
+            float in = x.get(gid);
+            float w = weights.get(gid);
+            outputFP16.set(gid, new HalfFloat(w * (localss * in)));
+        }
+    }
+
     /**
      * Applies the computed normalization factor to input and weight elements. This is the second phase of RMS normalization.
      * <p>

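For readers skimming the diff, a minimal sequential sketch of what the fused kernel computes may help. This is an illustrative plain-Java reference, not code from this commit: the helper name rmsNormReference and the use of float[] instead of TornadoVM's FloatArray/HalfFloatArray are assumptions for the example.

    static float[] rmsNormReference(float[] x, float[] weights, float eps) {
        int size = x.length;
        // 1) sum of squares, then mean
        float ss = 0.0f;
        for (float v : x) {
            ss += v * v;
        }
        ss /= size;
        // 2) normalization factor: 1 / sqrt(mean(x^2) + eps)
        float factor = (float) (1.0 / Math.sqrt(ss + eps));
        // 3) scale each element by the factor and its weight
        float[] out = new float[size];
        for (int i = 0; i < size; i++) {
            out[i] = weights[i] * (factor * x[i]);
        }
        return out;
    }

The GPU kernel distributes step 1 across work groups (one partial sum per group), has every thread recompute step 2 from the partial sums after a global barrier, and performs step 3 directly into outputFP16 as HalfFloat values, which is what lets the separate attn_rms_apply task be dropped below.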
src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/LlamaFP16FFNLayers.java

Lines changed: 6 additions & 2 deletions
@@ -50,7 +50,7 @@ public GridScheduler updateGridScheduler(GridScheduler tornadoForwardScheduler)
         for (int i = 0; i < config.numberOfLayers(); i++) {
             // === Attention Block ===
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_reduce", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_apply_fp16", rmsNormWorker);
+            //tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_apply_fp16", rmsNormWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".qkv_projection", fusedQKVWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".rope_and_kv_cache", ropeWithCacheWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attention", parallelAttentionWorker);
@@ -199,6 +199,10 @@ TaskGraph setupSingleFFNLayer(LlamaTornadoWeights weights, Configuration config,
         // === Attention Block ===
         // RMS Normalization
         unifiedLayer.task("attn_rms_reduce",
+                TransformerComputeKernelsLayered::reductionOneBlockWithLayerFuseFP16,
+                context, state.wrapXbFP16, state.wrapX, weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp,
+                config.dim(), config.rmsNormEps(), state.localSize);
+        /*unifiedLayer.task("attn_rms_reduce",
                 TransformerComputeKernelsLayered::reductionOneBlockWithLayer,
                 context, state.temp, state.wrapX,
                 config.dim(), config.rmsNormEps(), state.localSize);
@@ -212,7 +216,7 @@ TaskGraph setupSingleFFNLayer(LlamaTornadoWeights weights, Configuration config,
         unifiedLayer.task("attn_rms_apply_fp16",
                 TransformerComputeKernels::mapContextWithQuantize,
                 context, state.wrapXbFP16, state.wrapX,
-                weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp);
+                weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp);*/

         // QKV Projection (fused)
         unifiedLayer.task("qkv_projection",

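Because the fused task reuses the rmsNormWorker grid, the state.localSize handed to the kernel has to agree with that grid's local work size (the javadoc's "must match work group size" note). A rough configuration sketch, assuming TornadoVM's WorkerGrid1D/GridScheduler classes (uk.ac.manchester.tornado.api) and made-up dim/localSize values:

    int dim = 2048;        // hidden size, i.e. config.dim() (example value)
    int localSize = 256;   // work-group size; must equal the localSize/localMemSize passed to the kernel
    WorkerGrid rmsNormWorker = new WorkerGrid1D(dim);   // one thread per element of x
    rmsNormWorker.setLocalWork(localSize, 1, 1);
    GridScheduler scheduler = new GridScheduler();
    scheduler.addWorkerGrid("layer_0.attn_rms_reduce", rmsNormWorker);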
src/main/java/org/beehive/gpullama3/tornadovm/layers/type/q8_0/LlamaQ8_0FFNLayers.java

Lines changed: 6 additions & 2 deletions
@@ -161,6 +161,10 @@ TaskGraph setupSingleFFNLayer(LlamaTornadoWeights weights, Configuration config,
         // === Attention Block ===
         // RMS Normalization
         unifiedLayer.task("attn_rms_reduce",
+                TransformerComputeKernelsLayered::reductionOneBlockWithLayerFuse,
+                context, state.wrapXb, state.wrapX, weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp,
+                config.dim(), config.rmsNormEps(), state.localSize);
+        /*unifiedLayer.task("attn_rms_reduce",
                 TransformerComputeKernelsLayered::reductionOneBlockWithLayer,
                 context, state.temp, state.wrapX,
                 config.dim(), config.rmsNormEps(), state.localSize);
@@ -174,7 +178,7 @@ TaskGraph setupSingleFFNLayer(LlamaTornadoWeights weights, Configuration config,
         unifiedLayer.task("attn_rms_apply",
                 TransformerComputeKernelsLayered::reductionOneBlock2WithLayer,
                 context, state.wrapXb, state.wrapX,
-                weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp);
+                weights.rms_att_weightLayered[layerIndex].asFloatArray(), state.temp);*/

         // QKV Projection (fused with Q8 dequantization)
         unifiedLayer.task("qkv_projection",
@@ -306,7 +310,7 @@ public GridScheduler updateGridScheduler(GridScheduler tornadoForwardScheduler)
             // --- Attention Block ---
             // RMS Normalization
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_reduce", rmsNormWorker);
-            tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_apply", rmsNormWorker);
+            //tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attn_rms_apply", rmsNormWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".qkv_projection", fusedQkvWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".rope_and_kv_cache", ropeWithCacheWorker);
             tornadoForwardScheduler.addWorkerGrid("layer_" + i + ".attention", parallelAttentionWorker);

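One further consequence of the fused reduction: the kernel stores one partial sum per work group in temp, so that buffer must hold at least ceil(size / groupSize) entries. A small sizing sketch with illustrative values (FloatArray is the TornadoVM float buffer type already used by the kernel):

    int dim = 2048;                                     // config.dim() (example value)
    int localSize = 256;                                // work-group size (example value)
    int numGroups = (dim + localSize - 1) / localSize;  // same formula the kernel uses
    FloatArray temp = new FloatArray(numGroups);        // one partial sum per work group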