@@ -624,6 +624,82 @@ impl U8x64 {
624624 Self ( unsafe { _mm512_subs_epu8 ( self . 0 , other. 0 ) } )
625625 }
626626
627+ // ── Tier 1: seismon rasterizer primitives ─────────────────────────
628+
629+ /// Pairwise unsigned byte average: (a[i] + b[i] + 1) >> 1 per byte.
630+ /// Core op for 4×4 mipmap downsample (vpavgb + horizontal pair = 2 ops).
631+ #[ inline( always) ]
632+ pub fn pairwise_avg ( self , other : Self ) -> Self {
633+ // SAFETY: AVX-512BW instruction, operates on all 64 bytes.
634+ Self ( unsafe { _mm512_avg_epu8 ( self . 0 , other. 0 ) } )
635+ }
636+
637+ /// Byte-wise unsigned greater-than comparison. Returns 64-bit mask:
638+ /// bit i set if self[i] > other[i]. Symmetric to `cmpeq_mask`.
639+ /// Used for threshold density fields, depth/Z-test, hit-tests.
640+ #[ inline( always) ]
641+ pub fn cmpgt_mask ( self , other : Self ) -> u64 {
642+ // SAFETY: AVX-512BW instruction. Unsigned compare via _epu8.
643+ unsafe { _mm512_cmpgt_epu8_mask ( self . 0 , other. 0 ) }
644+ }
645+
646+ /// Masked blend: for each bit in `mask`, select from `b` if set, else `a`.
647+ /// Sprite alpha blit: write atlas pixel where mask bit set, keep framebuffer otherwise.
648+ #[ inline( always) ]
649+ pub fn mask_blend ( mask : u64 , a : Self , b : Self ) -> Self {
650+ // SAFETY: AVX-512BW instruction. mask selects between a and b per byte.
651+ Self ( unsafe { _mm512_mask_blend_epi8 ( mask, a. 0 , b. 0 ) } )
652+ }
653+
654+ /// Shift left each 16-bit lane by immediate bits (nibble write: place high nibble).
655+ /// Completes the nibble shift pair with `shr_epi16`.
656+ #[ inline( always) ]
657+ pub fn shl_epi16 ( self , imm : u32 ) -> Self {
658+ Self ( unsafe { match imm {
659+ 1 => _mm512_slli_epi16 ( self . 0 , 1 ) ,
660+ 2 => _mm512_slli_epi16 ( self . 0 , 2 ) ,
661+ 3 => _mm512_slli_epi16 ( self . 0 , 3 ) ,
662+ 4 => _mm512_slli_epi16 ( self . 0 , 4 ) ,
663+ 5 => _mm512_slli_epi16 ( self . 0 , 5 ) ,
664+ 6 => _mm512_slli_epi16 ( self . 0 , 6 ) ,
665+ 7 => _mm512_slli_epi16 ( self . 0 , 7 ) ,
666+ 8 => _mm512_slli_epi16 ( self . 0 , 8 ) ,
667+ _ => _mm512_setzero_si512 ( ) ,
668+ } } )
669+ }
670+
671+ // ── Tier 2: sprite blit + palette LUT + cross-lane shuffle ────────
672+
673+ /// Masked store: write only bytes where mask bit is set.
674+ /// Partial-tile writes at framebuffer edges without scalar fallback.
675+ ///
676+ /// # Safety
677+ /// `ptr` must point to at least 64 writable bytes (may be unaligned).
678+ #[ inline( always) ]
679+ pub unsafe fn mask_store ( self , ptr : * mut u8 , mask : u64 ) {
680+ // SAFETY: AVX-512BW masked store. Caller guarantees ptr validity.
681+ _mm512_mask_storeu_epi8 ( ptr as * mut i8 , mask, self . 0 ) ;
682+ }
683+
684+ /// Saturating unsigned addition: min(a + b, 255) per byte.
685+ /// Additive blend without overflow wrap. Symmetric to `saturating_sub`.
686+ #[ inline( always) ]
687+ pub fn saturating_add ( self , other : Self ) -> Self {
688+ // SAFETY: AVX-512BW instruction.
689+ Self ( unsafe { _mm512_adds_epu8 ( self . 0 , other. 0 ) } )
690+ }
691+
692+ /// Cross-lane byte permute: rearrange all 64 bytes by index vector.
693+ /// `idx[i]` selects which byte of `self` appears at position `i`.
694+ /// Unlike `shuffle_bytes` (within-lane), this crosses 128-bit lane boundaries.
695+ /// Needed for sprite atlas reorder and palette remap > 16 entries.
696+ #[ inline( always) ]
697+ pub fn permute_bytes ( self , idx : Self ) -> Self {
698+ // SAFETY: AVX-512VBMI instruction (_mm512_permutexvar_epi8).
699+ // Falls back to multi-shuffle on CPUs without VBMI.
700+ Self ( unsafe { _mm512_permutexvar_epi8 ( idx. 0 , self . 0 ) } )
701+ }
702+
627703 /// Interleave low bytes: [a0,b0,a1,b1,...] from lower halves.
628704 #[ inline( always) ]
629705 pub fn unpack_lo_epi8 ( self , other : Self ) -> Self {
@@ -2728,3 +2804,119 @@ mod f16_tests {
27282804 }
27292805 }
27302806}
2807+
2808+ #[ cfg( test) ]
2809+ mod u8x64_rasterizer_tests {
2810+ use super :: U8x64 ;
2811+
2812+ #[ test]
2813+ fn pairwise_avg_basic ( ) {
2814+ let a = U8x64 :: splat ( 10 ) ;
2815+ let b = U8x64 :: splat ( 20 ) ;
2816+ let avg = a. pairwise_avg ( b) ;
2817+ let mut out = [ 0u8 ; 64 ] ;
2818+ avg. copy_to_slice ( & mut out) ;
2819+ // (10 + 20 + 1) >> 1 = 15
2820+ assert ! ( out. iter( ) . all( |& v| v == 15 ) ) ;
2821+ }
2822+
2823+ #[ test]
2824+ fn pairwise_avg_rounding ( ) {
2825+ let a = U8x64 :: splat ( 1 ) ;
2826+ let b = U8x64 :: splat ( 2 ) ;
2827+ let avg = a. pairwise_avg ( b) ;
2828+ let mut out = [ 0u8 ; 64 ] ;
2829+ avg. copy_to_slice ( & mut out) ;
2830+ // (1 + 2 + 1) >> 1 = 2 (rounds up)
2831+ assert ! ( out. iter( ) . all( |& v| v == 2 ) ) ;
2832+ }
2833+
2834+ #[ test]
2835+ fn cmpgt_mask_basic ( ) {
2836+ let a = U8x64 :: splat ( 10 ) ;
2837+ let b = U8x64 :: splat ( 5 ) ;
2838+ assert_eq ! ( a. cmpgt_mask( b) , u64 :: MAX ) ; // all greater
2839+ assert_eq ! ( b. cmpgt_mask( a) , 0 ) ; // none greater
2840+ assert_eq ! ( a. cmpgt_mask( a) , 0 ) ; // equal = not greater
2841+ }
2842+
2843+ #[ test]
2844+ fn mask_blend_selects_correctly ( ) {
2845+ let a = U8x64 :: splat ( 10 ) ;
2846+ let b = U8x64 :: splat ( 20 ) ;
2847+ // mask = 0: all from a
2848+ let r0 = U8x64 :: mask_blend ( 0 , a, b) ;
2849+ let mut out = [ 0u8 ; 64 ] ;
2850+ r0. copy_to_slice ( & mut out) ;
2851+ assert ! ( out. iter( ) . all( |& v| v == 10 ) ) ;
2852+ // mask = all 1s: all from b
2853+ let r1 = U8x64 :: mask_blend ( u64:: MAX , a, b) ;
2854+ r1. copy_to_slice ( & mut out) ;
2855+ assert ! ( out. iter( ) . all( |& v| v == 20 ) ) ;
2856+ // mask = bit 0 only: first byte from b, rest from a
2857+ let r2 = U8x64 :: mask_blend ( 1 , a, b) ;
2858+ r2. copy_to_slice ( & mut out) ;
2859+ assert_eq ! ( out[ 0 ] , 20 ) ;
2860+ assert_eq ! ( out[ 1 ] , 10 ) ;
2861+ }
2862+
2863+ #[ test]
2864+ fn shl_epi16_shift_4 ( ) {
2865+ let mut data = [ 0u8 ; 64 ] ;
2866+ data[ 0 ] = 0x0F ; data[ 1 ] = 0x00 ; // u16 = 0x000F
2867+ let v = U8x64 :: from_slice ( & data) ;
2868+ let shifted = v. shl_epi16 ( 4 ) ;
2869+ let mut out = [ 0u8 ; 64 ] ;
2870+ shifted. copy_to_slice ( & mut out) ;
2871+ let result = u16:: from_le_bytes ( [ out[ 0 ] , out[ 1 ] ] ) ;
2872+ assert_eq ! ( result, 0x00F0 ) ;
2873+ }
2874+
2875+ #[ test]
2876+ fn saturating_add_clamps_at_255 ( ) {
2877+ let a = U8x64 :: splat ( 200 ) ;
2878+ let b = U8x64 :: splat ( 100 ) ;
2879+ let sum = a. saturating_add ( b) ;
2880+ let mut out = [ 0u8 ; 64 ] ;
2881+ sum. copy_to_slice ( & mut out) ;
2882+ assert ! ( out. iter( ) . all( |& v| v == 255 ) ) ;
2883+ }
2884+
2885+ #[ test]
2886+ fn saturating_add_no_overflow ( ) {
2887+ let a = U8x64 :: splat ( 10 ) ;
2888+ let b = U8x64 :: splat ( 20 ) ;
2889+ let sum = a. saturating_add ( b) ;
2890+ let mut out = [ 0u8 ; 64 ] ;
2891+ sum. copy_to_slice ( & mut out) ;
2892+ assert ! ( out. iter( ) . all( |& v| v == 30 ) ) ;
2893+ }
2894+
2895+ #[ test]
2896+ fn permute_bytes_identity ( ) {
2897+ let mut data = [ 0u8 ; 64 ] ;
2898+ for i in 0 ..64 { data[ i] = i as u8 ; }
2899+ let v = U8x64 :: from_slice ( & data) ;
2900+ // Identity permutation
2901+ let mut idx = [ 0u8 ; 64 ] ;
2902+ for i in 0 ..64 { idx[ i] = i as u8 ; }
2903+ let perm = v. permute_bytes ( U8x64 :: from_slice ( & idx) ) ;
2904+ let mut out = [ 0u8 ; 64 ] ;
2905+ perm. copy_to_slice ( & mut out) ;
2906+ assert_eq ! ( out, data) ;
2907+ }
2908+
2909+ #[ test]
2910+ fn permute_bytes_reverse ( ) {
2911+ let mut data = [ 0u8 ; 64 ] ;
2912+ for i in 0 ..64 { data[ i] = i as u8 ; }
2913+ let v = U8x64 :: from_slice ( & data) ;
2914+ // Reverse permutation
2915+ let mut idx = [ 0u8 ; 64 ] ;
2916+ for i in 0 ..64 { idx[ i] = ( 63 - i) as u8 ; }
2917+ let perm = v. permute_bytes ( U8x64 :: from_slice ( & idx) ) ;
2918+ let mut out = [ 0u8 ; 64 ] ;
2919+ perm. copy_to_slice ( & mut out) ;
2920+ for i in 0 ..64 { assert_eq ! ( out[ i] , ( 63 - i) as u8 ) ; }
2921+ }
2922+ }
0 commit comments