perf(gpt2): transpose weights at load time for SIMD-contiguous matmul

claude · claude · commit c794695ec47b · 2026-03-29T20:27:39.000Z
Per-layer weight matrices are now pre-transposed from [in_dim, out_dim] to [out_dim, in_dim] during safetensors loading, so matmul_vec_simd reads each weight row contiguously via F32x16::from_slice + mul_add, with a scalar tail for any in_dim not divisible by 16 — full SIMD utilization at 768D (768 = 48 × F32x16). https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
diff --git a/src/hpc/gpt2/inference.rs b/src/hpc/gpt2/inference.rs
@@ -364,13 +364,27 @@ fn softmax_simd(x: &mut [f32]) {
 /// Matrix-vector multiply: out = input @ weight^T + bias.
 /// Weight stored as [input_dim, output_dim] (row-major, transposed access).
 /// SIMD accelerated for the dot product.
+/// Matrix-vector multiply: out[o] = dot(input, weight[o]) + bias[o], where weight[o] is row o of the stored matrix.
+/// Weight is PRE-TRANSPOSED to [out_dim, in_dim] for contiguous SIMD access.
+/// Each output element reads a contiguous row of in_dim floats.
 fn matmul_vec_simd(input: &[f32], weight: &[f32], bias: &[f32], output: &mut [f32], in_dim: usize, out_dim: usize) {
-    // GPT-2 stores weights as [in_dim, out_dim] (row-major).
-    // Strided access per output — TODO: transpose at load time for SIMD.
+    let chunks = in_dim / 16;
+    let remainder = in_dim % 16;
+
     for o in 0..out_dim {
-        let mut dot = 0.0f32;
-        for i in 0..in_dim {
-            dot += input[i] * weight[i * out_dim + o];
+        let row_offset = o * in_dim;
+        let mut acc = F32x16::splat(0.0);
+        for c in 0..chunks {
+            let off = c * 16;
+            let vi = F32x16::from_slice(&input[off..off + 16]);
+            let vw = F32x16::from_slice(&weight[row_offset + off..row_offset + off + 16]);
+            acc = vi.mul_add(vw, acc);
+        }
+        let mut dot = acc.reduce_sum();
+        // Scalar tail
+        let tail_start = chunks * 16;
+        for i in 0..remainder {
+            dot += input[tail_start + i] * weight[row_offset + tail_start + i];
         }
         output[o] = dot + bias[o];
     }
diff --git a/src/hpc/gpt2/weights.rs b/src/hpc/gpt2/weights.rs
@@ -111,10 +111,35 @@ impl Gpt2Weights {
             });
         }
 
-        Ok(Gpt2Weights {
+        let mut weights = Gpt2Weights {
             wte, wpe, layers, ln_f_weight, ln_f_bias,
-        })
+        };
+        weights.transpose_weights_for_simd();
+        Ok(weights)
+    }
+
+    /// Transpose each layer's attn_qkv, attn_out, mlp_fc and mlp_proj weight matrices from [in_dim, out_dim] to [out_dim, in_dim].
+    /// After this, matmul can read weight rows contiguously for F32x16 SIMD.
+    fn transpose_weights_for_simd(&mut self) {
+        for layer in &mut self.layers {
+            transpose_matrix(&mut layer.attn_qkv_weight, EMBED_DIM, 3 * EMBED_DIM);
+            transpose_matrix(&mut layer.attn_out_weight, EMBED_DIM, EMBED_DIM);
+            transpose_matrix(&mut layer.mlp_fc_weight, EMBED_DIM, MLP_DIM);
+            transpose_matrix(&mut layer.mlp_proj_weight, MLP_DIM, EMBED_DIM);
+        }
+    }
+}
+
+/// Transpose a row-major [rows, cols] matrix to [cols, rows]; allocates a new buffer and stores it in `data` (panics if data.len() != rows * cols).
+fn transpose_matrix(data: &mut Vec<f32>, rows: usize, cols: usize) {
+    assert_eq!(data.len(), rows * cols);
+    let mut transposed = vec![0.0f32; rows * cols];
+    for r in 0..rows {
+        for c in 0..cols {
+            transposed[c * rows + r] = data[r * cols + c];
+        }
     }
+    *data = transposed;
 }
 
 /// Tensor metadata from safetensors header.