@@ -435,3 +435,127 @@ test "simd_rms_norm" {
435435 // RMS norm should produce non-zero output
436436 try std .testing .expect (output [0 ] > 0 );
437437}
438+
439+ // ═══════════════════════════════════════════════════════════════════════════════
440+ // SIMD ATTENTION WEIGHTED SUM (OPT-001 Enhancement)
441+ // ═══════════════════════════════════════════════════════════════════════════════
442+
443+ /// SIMD-optimized attention weighted sum
444+ /// output[i] = sum(scores[t] * v_cache[t][i]) for all t
445+ /// This is the inner loop of attention computation
446+ pub fn simdAttentionWeightedSum (output : []f32 , scores : []const f32 , v_cache : []const f32 , seq_len : usize , head_dim : usize , kv_stride : usize ) void {
447+ const aligned_dim = head_dim & ~ @as (usize , SIMD_WIDTH - 1 );
448+
449+ // Zero output
450+ @memset (output , 0.0 );
451+
452+ // Process each timestep
453+ for (0.. seq_len ) | t | {
454+ const score = scores [t ];
455+ const score_vec : Vec8f = @splat (score );
456+ const v_offset = t * kv_stride ;
457+
458+ // SIMD loop
459+ var i : usize = 0 ;
460+ while (i < aligned_dim ) : (i += SIMD_WIDTH ) {
461+ const v_vec : Vec8f = v_cache [v_offset + i .. ][0.. SIMD_WIDTH ].* ;
462+ const out_vec : Vec8f = output [i .. ][0.. SIMD_WIDTH ].* ;
463+ output [i .. ][0.. SIMD_WIDTH ].* = out_vec + score_vec * v_vec ;
464+ }
465+
466+ // Scalar tail
467+ while (i < head_dim ) : (i += 1 ) {
468+ output [i ] += score * v_cache [v_offset + i ];
469+ }
470+ }
471+ }
472+
473+ // ═══════════════════════════════════════════════════════════════════════════════
474+ // SIMD SwiGLU ACTIVATION (OPT-002 Enhancement)
475+ // ═══════════════════════════════════════════════════════════════════════════════
476+
477+ /// Fast SiLU approximation using polynomial
478+ /// silu(x) ≈ x * sigmoid(x) ≈ x * (0.5 + 0.5 * tanh(x * 0.7978845608))
479+ /// For better accuracy, we use: x / (1 + exp(-x))
480+ fn siluApprox (x : f32 ) f32 {
481+ // Fast sigmoid approximation
482+ const neg_x = - x ;
483+ const exp_neg = @exp (neg_x );
484+ return x / (1.0 + exp_neg );
485+ }
486+
487+ /// SIMD-optimized SwiGLU activation
488+ /// output[i] = silu(gate[i]) * up[i]
489+ pub fn simdSwiGLU (output : []f32 , gate : []const f32 , up : []const f32 ) void {
490+ const len = @min (gate .len , up .len );
491+ const aligned_len = len & ~ @as (usize , SIMD_WIDTH - 1 );
492+
493+ // SIMD loop - process 8 elements at a time
494+ // Note: @exp is not vectorized in Zig, so we process element-wise but with better cache usage
495+ var i : usize = 0 ;
496+ while (i < aligned_len ) : (i += SIMD_WIDTH ) {
497+ // Load gate and up values
498+ const gate_vec : Vec8f = gate [i .. ][0.. SIMD_WIDTH ].* ;
499+ const up_vec : Vec8f = up [i .. ][0.. SIMD_WIDTH ].* ;
500+
501+ // Apply SiLU to gate (element-wise due to exp)
502+ var silu_arr : [SIMD_WIDTH ]f32 = undefined ;
503+ const gate_arr : [SIMD_WIDTH ]f32 = gate_vec ;
504+ inline for (0.. SIMD_WIDTH ) | j | {
505+ silu_arr [j ] = siluApprox (gate_arr [j ]);
506+ }
507+ const silu_vec : Vec8f = silu_arr ;
508+
509+ // Multiply with up
510+ output [i .. ][0.. SIMD_WIDTH ].* = silu_vec * up_vec ;
511+ }
512+
513+ // Scalar tail
514+ while (i < len ) : (i += 1 ) {
515+ output [i ] = siluApprox (gate [i ]) * up [i ];
516+ }
517+ }
518+
519+ // ═══════════════════════════════════════════════════════════════════════════════
520+ // SIMD RESIDUAL ADD (Common operation)
521+ // ═══════════════════════════════════════════════════════════════════════════════
522+
523+ /// SIMD-optimized residual addition: output[i] = a[i] + b[i]
524+ /// In-place version: a[i] += b[i]
525+ pub fn simdResidualAdd (output : []f32 , residual : []const f32 ) void {
526+ const len = @min (output .len , residual .len );
527+ const aligned_len = len & ~ @as (usize , SIMD_WIDTH - 1 );
528+
529+ var i : usize = 0 ;
530+ while (i < aligned_len ) : (i += SIMD_WIDTH ) {
531+ const out_vec : Vec8f = output [i .. ][0.. SIMD_WIDTH ].* ;
532+ const res_vec : Vec8f = residual [i .. ][0.. SIMD_WIDTH ].* ;
533+ output [i .. ][0.. SIMD_WIDTH ].* = out_vec + res_vec ;
534+ }
535+
536+ while (i < len ) : (i += 1 ) {
537+ output [i ] += residual [i ];
538+ }
539+ }
540+
541+ test "simd_swiglu" {
542+ const gate = [_ ]f32 { 1.0 , 2.0 , 3.0 , 4.0 , 5.0 , 6.0 , 7.0 , 8.0 };
543+ const up = [_ ]f32 { 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 };
544+ var output : [8 ]f32 = undefined ;
545+
546+ simdSwiGLU (& output , & gate , & up );
547+
548+ // silu(1) * 1 ≈ 0.731
549+ try std .testing .expect (output [0 ] > 0.7 and output [0 ] < 0.8 );
550+ }
551+
552+ test "simd_attention_weighted_sum" {
553+ const scores = [_ ]f32 { 0.5 , 0.5 };
554+ const v_cache = [_ ]f32 { 1.0 , 2.0 , 3.0 , 4.0 , 5.0 , 6.0 , 7.0 , 8.0 }; // 2 timesteps, 4 dim
555+ var output : [4 ]f32 = undefined ;
556+
557+ simdAttentionWeightedSum (& output , & scores , & v_cache , 2 , 4 , 4 );
558+
559+ // output[0] = 0.5 * 1.0 + 0.5 * 5.0 = 3.0
560+ try std .testing .expectApproxEqAbs (output [0 ], 3.0 , 0.001 );
561+ }
0 commit comments