AyoubMDL
diff --git a/‎Cargo.lock‎
Lines changed: 3 additions & 2 deletions b/‎Cargo.lock‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/components/activation.rs‎
Lines changed: 1 addition & 1 deletion b/‎src/components/activation.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/components/attention.rs‎
Lines changed: 12 additions & 9 deletions b/‎src/components/attention.rs‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎src/components/matmul.rs‎
Lines changed: 45 additions & 29 deletions b/‎src/components/matmul.rs‎
Lines changed: 45 additions & 29 deletions
diff --git a/‎src/components/mod.rs‎
Lines changed: 0 additions & 1 deletion b/‎src/components/mod.rs‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/components/norm.rs‎
Lines changed: 4 additions & 6 deletions b/‎src/components/norm.rs‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/components/quant.rs‎
Lines changed: 7 additions & 37 deletions b/‎src/components/quant.rs‎
Lines changed: 7 additions & 37 deletions
diff --git a/‎src/components/rotary_embedding.rs‎
Lines changed: 2 additions & 2 deletions b/‎src/components/rotary_embedding.rs‎
Lines changed: 2 additions & 2 deletions
@@ -9,6 +9,7 @@ memmap2 = "0.9"
 rayon = "1.10"
 tokenizers = { version = "0.22", features = ["http"] }
 safetensors = "0.7.0"
+serde_json = "1"
 
 [[bin]]
 name = "rustllm"
 
@@ -69,7 +69,7 @@ mod tests {
         let mut x = vec![1000.0_f32, 1001.0];
         softmax(&mut x);
         // softmax([1000, 1001]) == softmax([-1, 0]) == [e^-1/(e^-1+1), 1/(e^-1+1)]
-        let expected = vec![
+        let expected = [
             1.0_f32.exp().recip() / (1.0_f32.exp().recip() + 1.0),
             1.0 / (1.0_f32.exp().recip() + 1.0),
         ];
 
@@ -4,6 +4,7 @@ fn dot(x: &[f32], y: &[f32]) -> f32 {
     x.iter().zip(y.iter()).map(|(a, b)| a * b).sum()
 }
 
+#[allow(clippy::too_many_arguments)]
 pub fn group_query_attention(
     attn_out: &mut [f32],
     query: &[f32],
@@ -38,24 +39,24 @@ pub fn group_query_attention(
         // score against every cached key at positions 0..=pos
         let mut scores = vec![0.0_f32; pos + 1];
 
-        for t in 0..=pos {
+        for (t, score) in scores.iter_mut().enumerate() {
             // index into key_cache for layer, position t, kv head kv_h
             let kv_off = layer_idx * max_seq_len * kv_dim + t * kv_dim + kv_h * head_dim;
             let k_t = &key_cache[kv_off..kv_off + head_dim];
-            scores[t] = dot(q_head, k_t) / (head_dim as f32).sqrt();
+            *score = dot(q_head, k_t) / (head_dim as f32).sqrt();
         }
 
         softmax(&mut scores);
 
         // weighted sum of values
         let out_head = &mut attn_out[h * head_dim..(h + 1) * head_dim];
 
-        for t in 0..=pos {
+        for (t, &score) in scores.iter().enumerate() {
             let kv_off = layer_idx * max_seq_len * kv_dim + t * kv_dim + kv_h * head_dim;
             let v_t = &value_cache[kv_off..kv_off + head_dim];
 
             for j in 0..head_dim {
-                out_head[j] += scores[t] * v_t[j];
+                out_head[j] += score * v_t[j];
             }
         }
     }
@@ -73,16 +74,17 @@ mod tests {
         let head_dim = 2;
         let max_seq_len = 4;
         let n_embed = n_heads * head_dim; // 4
+        let n_layers = 1;
 
         // query: [head0: 1,0, head1: 0,1]
         let query = vec![1.0, 0.0, 0.0, 1.0];
 
         // key_cache: 1 layer × 4 positions × kv_dim(2). Fill pos=0 with [1, 0]
-        let mut key_cache = vec![0.0_f32; 1 * max_seq_len * n_kv_heads * head_dim];
+        let mut key_cache = vec![0.0_f32; n_layers * max_seq_len * n_kv_heads * head_dim];
         key_cache[0] = 1.0; // pos 0, kv_head 0, dim 0
 
         // value_cache: same layout. Fill pos=0 with [3, 7]
-        let mut value_cache = vec![0.0_f32; 1 * max_seq_len * n_kv_heads * head_dim];
+        let mut value_cache = vec![0.0_f32; n_layers * max_seq_len * n_kv_heads * head_dim];
         value_cache[0] = 3.0;
         value_cache[1] = 7.0;
 
@@ -115,20 +117,21 @@ mod tests {
         let n_kv_heads = 1;
         let head_dim = 2;
         let max_seq_len = 4;
+        let n_layers = 1;
 
         // query head: [1, 0]
         let query = vec![1.0_f32, 0.0];
 
         // key at pos=0: [1, 0]  → dot with query = 1.0 → score = 1/sqrt(2)
         // key at pos=1: [0, 1]  → dot with query = 0.0 → score = 0/sqrt(2)
-        let mut key_cache = vec![0.0_f32; 1 * max_seq_len * n_kv_heads * head_dim];
+        let mut key_cache = vec![0.0_f32; n_layers * max_seq_len * n_kv_heads * head_dim];
         key_cache[0] = 1.0; // pos=0, dim=0
         key_cache[1] = 0.0; // pos=0, dim=1
         key_cache[2] = 0.0; // pos=1, dim=0
         key_cache[3] = 1.0; // pos=1, dim=1
 
         // value at pos=0: [10, 0], pos=1: [0, 10]
-        let mut value_cache = vec![0.0_f32; 1 * max_seq_len * n_kv_heads * head_dim];
+        let mut value_cache = vec![0.0_f32; n_layers * max_seq_len * n_kv_heads * head_dim];
         value_cache[0] = 10.0;
         value_cache[3] = 10.0;
 
@@ -153,7 +156,7 @@ mod tests {
         let total = s + 1.0; // sum(exp(xi))
         let w0 = s / total; // softmax weight for pos=0
         let w1 = 1.0 / total; // softmax weight for pos=1
-        let expected = vec![w0 * 10.0, w1 * 10.0];
+        let expected = [w0 * 10.0, w1 * 10.0];
 
         for (got, exp) in attn_out.iter().zip(expected.iter()) {
             assert!((got - exp).abs() < 1e-5, "got {got}, expected {exp}");
 
@@ -1,24 +1,40 @@
-use crate::components::weight::Weight;
+use crate::{components::quant::vec_dot, format::gguf::GgufType, tensor::Tensor};
 use rayon::prelude::*;
 
-// matrix-vector multiplication
-// x shape: (in_channels,)
-// w shape: (out_channels, in_channels) stored in row-major order (like safetensors)
-// out shape: (out_channels,)
-#[allow(dead_code)]
-pub fn naive_matmul<W: Weight>(out: &mut [f32], x: &[f32], weight: &[W]) {
-    for i in 0..out.len() {
-        let mut sum = 0.0_f32;
-
-        for k in 0..x.len() {
-            sum += x[k] * weight[k + i * x.len()].to_f32();
-        }
-        out[i] = sum;
+pub fn bf16_to_f32(n: u16) -> f32 {
+    f32::from_bits((n as u32) << 16)
+}
+
+pub trait FloatType: Copy + Sync + Send {
+    fn to_f32(self) -> f32;
+}
+
+impl FloatType for f32 {
+    fn to_f32(self) -> f32 {
+        self
     }
 }
 
-// parallel matmul
-pub fn matmul<W: Weight>(out: &mut [f32], x: &[f32], weight: &[W]) {
+impl FloatType for u16 {
+    // f32:  [1 sign] [8 exponent] [23 mantissa]  = 32 bits
+    // bf16: [1 sign] [8 exponent] [ 7 mantissa]  = 16 bits
+    // To convert BF16 → f32, place these 16 bits in the upper half of a 32-bit word
+    // and zero-fill the lower 16 bits (the mantissa bits BF16 does not store)
+    fn to_f32(self) -> f32 {
+        bf16_to_f32(self)
+    }
+}
+
+pub fn matmul_gguf(out: &mut [f32], x: &[f32], weight: &[u8], dtype: GgufType, n_cols: usize) {
+    let row_bytes = dtype.row_bytes(n_cols);
+
+    out.par_iter_mut().enumerate().for_each(|(i, o)| {
+        let row = &weight[i * row_bytes..(i + 1) * row_bytes];
+        *o = vec_dot(row, x, dtype);
+    });
+}
+
+pub fn matmul_float<W: FloatType>(out: &mut [f32], x: &[f32], weight: &[W]) {
     out.par_iter_mut().enumerate().for_each(|(i, o)| {
         // i = row index, o = &mut f32 (that output element)
         let in_channels: usize = x.len();
@@ -30,6 +46,15 @@ pub fn matmul<W: Weight>(out: &mut [f32], x: &[f32], weight: &[W]) {
     });
 }
 
+/// Matrix-vector multiply, dispatching on the `weight` tensor variant (F32, BF16, or Quantized)
+pub fn matmul(out: &mut [f32], x: &[f32], weight: Tensor) {
+    match weight {
+        Tensor::F32(w) => matmul_float::<f32>(out, x, w),
+        Tensor::BF16(w) => matmul_float::<u16>(out, x, w),
+        Tensor::Quantized { data, dtype } => matmul_gguf(out, x, data, dtype, x.len()),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -41,40 +66,31 @@ mod tests {
 
     #[test]
     fn test_matmul_2x3() {
-        // weight: 2 rows × 3 cols (in_channels=3, out has 2 elements)
-        // row 0: [1, 2, 3]
-        // row 1: [4, 5, 6]
         let weight = vec![1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0];
         let x = vec![1.0_f32, 2.0, 3.0];
         let mut out = vec![0.0_f32; 2];
-        let mut parallel_out = vec![0.0_f32; 2];
 
-        naive_matmul(&mut out, &x, &weight);
-        matmul(&mut parallel_out, &x, &weight);
+        matmul(&mut out, &x, Tensor::F32(&weight));
 
         assert!(approx(out[0], 14.0));
         assert!(approx(out[1], 32.0));
-
-        // check parallel output
-        assert!(approx(parallel_out[0], 14.0));
-        assert!(approx(parallel_out[1], 32.0));
     }
 
     #[test]
     fn test_matmul_bf16() {
-        // Helper: convert f32 to bf16 (top 16 bits)
         fn f32_to_bf16(x: f32) -> u16 {
             (x.to_bits() >> 16) as u16
         }
 
-        let weight: Vec<u16> = vec![1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0]
+        let weight: Vec<u16> = [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0]
             .iter()
             .map(|&v| f32_to_bf16(v))
             .collect();
         let x = vec![1.0_f32, 2.0, 3.0];
         let mut out = vec![0.0_f32; 2];
 
-        matmul(&mut out, &x, &weight);
+        matmul(&mut out, &x, Tensor::BF16(&weight));
+
         assert!(approx(out[0], 14.0));
         assert!(approx(out[1], 32.0));
     }
 
@@ -5,4 +5,3 @@ pub mod norm;
 pub mod quant;
 pub mod rotary_embedding;
 pub mod sampler;
-pub mod weight;
@@ -1,15 +1,13 @@
-use crate::components::weight::Weight;
-
 // rmsnorm (in-place)
-pub fn rmsnorm<W: Weight>(out: &mut [f32], x: &[f32], weight: &[W]) {
+pub fn rmsnorm(out: &mut [f32], x: &[f32], weight: &[f32]) {
     let sum: f32 = x.iter().map(|&v| v * v).sum();
 
     // x.len() is usize, that's why we cast it to f32
     let mean_sq = sum / x.len() as f32;
     let scale = 1.0 / (mean_sq + 1e-5_f32).sqrt();
 
     for i in 0..out.len() {
-        out[i] = x[i] * scale * weight[i].to_f32();
+        out[i] = x[i] * scale * weight[i];
     }
 }
 
@@ -26,7 +24,7 @@ mod tests {
     fn test_rmsnorm_uniform_weight() {
         let x = vec![1.0_f32, 2.0, 3.0, 4.0];
         let weight = vec![1.0_f32; 4];
-        let expected = vec![0.36514813_f32, 0.73029625, 1.09544444, 1.46059251];
+        let expected = [0.36514813_f32, 0.73029625, 1.095_444_4, 1.460_592_5];
         let mut out = vec![0.0_f32; 4];
 
         rmsnorm(&mut out, &x, &weight);
@@ -40,7 +38,7 @@ mod tests {
     fn test_rmsnorm_nonuniform_weight() {
         let x = vec![1.0_f32, 2.0, 3.0, 4.0];
         let weight = vec![0.5_f32, 1.0, 2.0, 0.5];
-        let expected = vec![0.18257406_f32, 0.73029625, 2.19088888, 0.73029625];
+        let expected = [0.18257406_f32, 0.73029625, 2.190_889, 0.73029625];
         let mut out = vec![0.0_f32; 4];
 
         rmsnorm(&mut out, &x, &weight);
 
@@ -1,5 +1,3 @@
-use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
-
 use crate::format::gguf::GgufType;
 
 fn fp16_to_fp32(number: u16) -> f32 {
@@ -191,7 +189,7 @@ pub fn dequantize_row_q6_k(src: &[u8], dst: &mut [f32]) {
 
             for l in 0..32 {
                 // Reconstruct 6-bit signed quant: (low4 | high2 << 4) - 32
-                let q1 = (ql[ql_off + l] & 0xF) as i32 | (((qh[qh_off + l] >> 0) & 3) as i32) << 4;
+                let q1 = (ql[ql_off + l] & 0xF) as i32 | ((qh[qh_off + l] & 3) as i32) << 4;
                 let q2 =
                     (ql[ql_off + l + 32] & 0xF) as i32 | (((qh[qh_off + l] >> 2) & 3) as i32) << 4;
                 let q3 = (ql[ql_off + l] >> 4) as i32 | (((qh[qh_off + l] >> 4) & 3) as i32) << 4;
@@ -234,7 +232,7 @@ pub fn vec_dot_q6_k(src: &[u8], x: &[f32]) -> f32 {
             let xp_off = xp + chunk * 128;
 
             for l in 0..16 {
-                let q1 = (ql[ql_off + l] & 0xF) as i32 | (((qh[qh_off + l] >> 0) & 3) as i32) << 4;
+                let q1 = (ql[ql_off + l] & 0xF) as i32 | ((qh[qh_off + l] & 3) as i32) << 4;
                 let q2 =
                     (ql[ql_off + l + 32] & 0xF) as i32 | (((qh[qh_off + l] >> 2) & 3) as i32) << 4;
                 let q3 = (ql[ql_off + l] >> 4) as i32 | (((qh[qh_off + l] >> 4) & 3) as i32) << 4;
@@ -248,7 +246,7 @@ pub fn vec_dot_q6_k(src: &[u8], x: &[f32]) -> f32 {
             }
 
             for l in 16..32 {
-                let q1 = (ql[ql_off + l] & 0xF) as i32 | (((qh[qh_off + l] >> 0) & 3) as i32) << 4;
+                let q1 = (ql[ql_off + l] & 0xF) as i32 | ((qh[qh_off + l] & 3) as i32) << 4;
                 let q2 =
                     (ql[ql_off + l + 32] & 0xF) as i32 | (((qh[qh_off + l] >> 2) & 3) as i32) << 4;
                 let q3 = (ql[ql_off + l] >> 4) as i32 | (((qh[qh_off + l] >> 4) & 3) as i32) << 4;
@@ -272,16 +270,16 @@ pub fn vec_dot_q6_k(src: &[u8], x: &[f32]) -> f32 {
 
 pub fn vec_dot(src: &[u8], x: &[f32], dtype: GgufType) -> f32 {
     match dtype {
-        GgufType::Q4_K => vec_dot_q4_k(src, x),
-        GgufType::Q6_K => vec_dot_q6_k(src, x),
+        GgufType::Q4K => vec_dot_q4_k(src, x),
+        GgufType::Q6K => vec_dot_q6_k(src, x),
         _ => panic!("not implemented"),
     }
 }
 
 pub fn dequantize_row(src: &[u8], dst: &mut [f32], dtype: GgufType) {
     match dtype {
-        GgufType::Q4_K => dequantize_row_q4_k(src, dst),
-        GgufType::Q6_K => dequantize_row_q6_k(src, dst),
+        GgufType::Q4K => dequantize_row_q4_k(src, dst),
+        GgufType::Q6K => dequantize_row_q6_k(src, dst),
         GgufType::F32 => {
             // reinterpret bytes as f32 and copy
             let floats =
@@ -296,31 +294,3 @@ pub fn dequantize_row(src: &[u8], dst: &mut [f32], dtype: GgufType) {
         _ => panic!("dequantize_row: unsupported dtype {:?}", dtype),
     }
 }
-
-pub fn matmul_gguf(out: &mut [f32], x: &[f32], weight: &[u8], dtype: GgufType, n_cols: usize) {
-    let row_bytes = dtype.row_bytes(n_cols);
-
-    out.par_iter_mut().enumerate().for_each(|(i, o)| {
-        let row = &weight[i * row_bytes..(i + 1) * row_bytes];
-        *o = vec_dot(row, x, dtype);
-    });
-}
-
-/// Diagnostic: dequantize each row to f32 first, then do a plain dot product.
-/// Same result as matmul_gguf if vec_dot is correct. Use to isolate vec_dot bugs.
-pub fn matmul_gguf_naive(
-    out: &mut [f32],
-    x: &[f32],
-    weight: &[u8],
-    dtype: GgufType,
-    n_cols: usize,
-) {
-    let row_bytes = dtype.row_bytes(n_cols);
-    let mut tmp = vec![0.0f32; n_cols];
-
-    for (i, o) in out.iter_mut().enumerate() {
-        let row = &weight[i * row_bytes..(i + 1) * row_bytes];
-        dequantize_row(row, &mut tmp, dtype);
-        *o = tmp.iter().zip(x.iter()).map(|(a, b)| a * b).sum();
-    }
-}
@@ -143,8 +143,8 @@ mod tests {
         // Split-half pairs: (x[0],x[2]) with cos=1,sin=0 and (x[1],x[3]) with cos=0,sin=1
         // q: (3,1)→(3*1-1*0, 3*0+1*1)=(3,1), (4,2)→(4*0-2*1, 4*1+2*0)=(-2,4)
         // result: [3, -2, 1, 4]
-        let expected_query = vec![3.0_f32, -2.0, 1.0, 4.0];
-        let expected_key = vec![5.0_f32, -8.0, 7.0, 6.0];
+        let expected_query = [3.0_f32, -2.0, 1.0, 4.0];
+        let expected_key = [5.0_f32, -8.0, 7.0, 6.0];
 
         // check query
         for (got, exp) in query.iter().zip(expected_query.iter()) {