Skip to content

Commit 1f224ba

Browse files
committed
feat(simd): Tier 1+2 U8x64 rasterizer intrinsics for seismon framebuffer
Per seismon session wishlist — 8 new methods on U8x64 across all three SIMD backends (AVX-512 native / AVX2 scalar / scalar fallback): Tier 1 (rasterizer core): pairwise_avg → _mm512_avg_epu8 — mipmap 4x4 downsample in 2 ops cmpgt_mask → _mm512_cmpgt_epu8_mask — threshold/Z-test/hit-test mask_blend → _mm512_mask_blend_epi8 — sprite alpha blit shl_epi16 → _mm512_slli_epi16 — nibble write (completes shr pair) Tier 2 (sprite blit + palette): mask_store → _mm512_mask_storeu_epi8 — partial-tile edge writes saturating_add → _mm512_adds_epu8 — additive blend (completes sub pair) permute_bytes → _mm512_permutexvar_epi8 — cross-lane byte shuffle All methods have matching scalar fallbacks in simd.rs and simd_avx2.rs for NEON/non-AVX512 targets. Consumer writes crate::simd::U8x64 — the polyfill picks the path. Tests: 9 new u8x64_rasterizer_tests (pairwise_avg ×2, cmpgt_mask, mask_blend, shl_epi16, saturating_add ×2, permute_bytes ×2). All pass. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh
1 parent f6deff8 commit 1f224ba

3 files changed

Lines changed: 265 additions & 0 deletions

File tree

src/simd.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,42 @@ mod scalar {
805805
pub fn saturating_sub(self, other: Self) -> Self {
806806
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
807807
}
808+
// ── Tier 1: seismon rasterizer primitives (scalar fallbacks) ──
809+
#[inline(always)]
810+
pub fn pairwise_avg(self, other: Self) -> Self {
811+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
812+
}
813+
#[inline(always)]
814+
pub fn cmpgt_mask(self, other: Self) -> u64 {
815+
let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
816+
}
817+
#[inline(always)]
818+
pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
819+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
820+
}
821+
#[inline(always)]
822+
pub fn shl_epi16(self, imm: u32) -> Self {
823+
let mut out = [0u8; 64];
824+
for i in (0..64).step_by(2) {
825+
let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
826+
let s = if imm < 16 { v << imm } else { 0 };
827+
let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
828+
}
829+
Self(out)
830+
}
831+
// ── Tier 2: sprite blit + palette remap (scalar fallbacks) ──
832+
#[inline(always)]
833+
pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
834+
for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
835+
}
836+
#[inline(always)]
837+
pub fn saturating_add(self, other: Self) -> Self {
838+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
839+
}
840+
#[inline(always)]
841+
pub fn permute_bytes(self, idx: Self) -> Self {
842+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
843+
}
808844
#[inline(always)]
809845
pub fn unpack_lo_epi8(self, other: Self) -> Self {
810846
let mut out = [0u8; 64];

src/simd_avx2.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,43 @@ impl U8x64 {
806806
Self(out)
807807
}
808808

809+
// ── Tier 1+2: seismon rasterizer primitives (AVX2 scalar fallbacks) ──
810+
811+
#[inline(always)]
812+
pub fn pairwise_avg(self, other: Self) -> Self {
813+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
814+
}
815+
#[inline(always)]
816+
pub fn cmpgt_mask(self, other: Self) -> u64 {
817+
let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
818+
}
819+
#[inline(always)]
820+
pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
821+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
822+
}
823+
#[inline(always)]
824+
pub fn shl_epi16(self, imm: u32) -> Self {
825+
let mut out = [0u8; 64];
826+
for i in (0..64).step_by(2) {
827+
let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
828+
let s = if imm < 16 { v << imm } else { 0 };
829+
let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
830+
}
831+
Self(out)
832+
}
833+
#[inline(always)]
834+
pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
835+
for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
836+
}
837+
#[inline(always)]
838+
pub fn saturating_add(self, other: Self) -> Self {
839+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
840+
}
841+
#[inline(always)]
842+
pub fn permute_bytes(self, idx: Self) -> Self {
843+
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
844+
}
845+
809846
/// Interleave low bytes within each 128-bit lane.
810847
#[inline(always)]
811848
pub fn unpack_lo_epi8(self, other: Self) -> Self {

src/simd_avx512.rs

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,82 @@ impl U8x64 {
624624
Self(unsafe { _mm512_subs_epu8(self.0, other.0) })
625625
}
626626

627+
// ── Tier 1: seismon rasterizer primitives ─────────────────────────
628+
629+
/// Pairwise unsigned byte average: (a[i] + b[i] + 1) >> 1 per byte.
630+
/// Core op for 4×4 mipmap downsample (vpavgb + horizontal pair = 2 ops).
631+
#[inline(always)]
632+
pub fn pairwise_avg(self, other: Self) -> Self {
633+
// SAFETY: AVX-512BW instruction, operates on all 64 bytes.
634+
Self(unsafe { _mm512_avg_epu8(self.0, other.0) })
635+
}
636+
637+
/// Byte-wise unsigned greater-than comparison. Returns 64-bit mask:
638+
/// bit i set if self[i] > other[i]. Symmetric to `cmpeq_mask`.
639+
/// Used for threshold density fields, depth/Z-test, hit-tests.
640+
#[inline(always)]
641+
pub fn cmpgt_mask(self, other: Self) -> u64 {
642+
// SAFETY: AVX-512BW instruction. Unsigned compare via _epu8.
643+
unsafe { _mm512_cmpgt_epu8_mask(self.0, other.0) }
644+
}
645+
646+
/// Masked blend: for each bit in `mask`, select from `b` if set, else `a`.
647+
/// Sprite alpha blit: write atlas pixel where mask bit set, keep framebuffer otherwise.
648+
#[inline(always)]
649+
pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
650+
// SAFETY: AVX-512BW instruction. mask selects between a and b per byte.
651+
Self(unsafe { _mm512_mask_blend_epi8(mask, a.0, b.0) })
652+
}
653+
654+
/// Shift left each 16-bit lane by immediate bits (nibble write: place high nibble).
655+
/// Completes the nibble shift pair with `shr_epi16`.
656+
#[inline(always)]
657+
pub fn shl_epi16(self, imm: u32) -> Self {
658+
Self(unsafe { match imm {
659+
1 => _mm512_slli_epi16(self.0, 1),
660+
2 => _mm512_slli_epi16(self.0, 2),
661+
3 => _mm512_slli_epi16(self.0, 3),
662+
4 => _mm512_slli_epi16(self.0, 4),
663+
5 => _mm512_slli_epi16(self.0, 5),
664+
6 => _mm512_slli_epi16(self.0, 6),
665+
7 => _mm512_slli_epi16(self.0, 7),
666+
8 => _mm512_slli_epi16(self.0, 8),
667+
_ => _mm512_setzero_si512(),
668+
}})
669+
}
670+
671+
// ── Tier 2: sprite blit + palette LUT + cross-lane shuffle ────────
672+
673+
/// Masked store: write only bytes where mask bit is set.
674+
/// Partial-tile writes at framebuffer edges without scalar fallback.
675+
///
676+
/// # Safety
677+
/// `ptr` must point to at least 64 writable bytes (may be unaligned).
678+
#[inline(always)]
679+
pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
680+
// SAFETY: AVX-512BW masked store. Caller guarantees ptr validity.
681+
_mm512_mask_storeu_epi8(ptr as *mut i8, mask, self.0);
682+
}
683+
684+
/// Saturating unsigned addition: min(a + b, 255) per byte.
685+
/// Additive blend without overflow wrap. Symmetric to `saturating_sub`.
686+
#[inline(always)]
687+
pub fn saturating_add(self, other: Self) -> Self {
688+
// SAFETY: AVX-512BW instruction.
689+
Self(unsafe { _mm512_adds_epu8(self.0, other.0) })
690+
}
691+
692+
/// Cross-lane byte permute: rearrange all 64 bytes by index vector.
693+
/// `idx[i]` selects which byte of `self` appears at position `i`.
694+
/// Unlike `shuffle_bytes` (within-lane), this crosses 128-bit lane boundaries.
695+
/// Needed for sprite atlas reorder and palette remap > 16 entries.
696+
#[inline(always)]
697+
pub fn permute_bytes(self, idx: Self) -> Self {
698+
// SAFETY: AVX-512VBMI instruction (_mm512_permutexvar_epi8).
699+
// Falls back to multi-shuffle on CPUs without VBMI.
700+
Self(unsafe { _mm512_permutexvar_epi8(idx.0, self.0) })
701+
}
702+
627703
/// Interleave low bytes: [a0,b0,a1,b1,...] from lower halves.
628704
#[inline(always)]
629705
pub fn unpack_lo_epi8(self, other: Self) -> Self {
@@ -2728,3 +2804,119 @@ mod f16_tests {
27282804
}
27292805
}
27302806
}
2807+
2808+
#[cfg(test)]
2809+
mod u8x64_rasterizer_tests {
2810+
use super::U8x64;
2811+
2812+
#[test]
2813+
fn pairwise_avg_basic() {
2814+
let a = U8x64::splat(10);
2815+
let b = U8x64::splat(20);
2816+
let avg = a.pairwise_avg(b);
2817+
let mut out = [0u8; 64];
2818+
avg.copy_to_slice(&mut out);
2819+
// (10 + 20 + 1) >> 1 = 15
2820+
assert!(out.iter().all(|&v| v == 15));
2821+
}
2822+
2823+
#[test]
2824+
fn pairwise_avg_rounding() {
2825+
let a = U8x64::splat(1);
2826+
let b = U8x64::splat(2);
2827+
let avg = a.pairwise_avg(b);
2828+
let mut out = [0u8; 64];
2829+
avg.copy_to_slice(&mut out);
2830+
// (1 + 2 + 1) >> 1 = 2 (rounds up)
2831+
assert!(out.iter().all(|&v| v == 2));
2832+
}
2833+
2834+
#[test]
2835+
fn cmpgt_mask_basic() {
2836+
let a = U8x64::splat(10);
2837+
let b = U8x64::splat(5);
2838+
assert_eq!(a.cmpgt_mask(b), u64::MAX); // all greater
2839+
assert_eq!(b.cmpgt_mask(a), 0); // none greater
2840+
assert_eq!(a.cmpgt_mask(a), 0); // equal = not greater
2841+
}
2842+
2843+
#[test]
2844+
fn mask_blend_selects_correctly() {
2845+
let a = U8x64::splat(10);
2846+
let b = U8x64::splat(20);
2847+
// mask = 0: all from a
2848+
let r0 = U8x64::mask_blend(0, a, b);
2849+
let mut out = [0u8; 64];
2850+
r0.copy_to_slice(&mut out);
2851+
assert!(out.iter().all(|&v| v == 10));
2852+
// mask = all 1s: all from b
2853+
let r1 = U8x64::mask_blend(u64::MAX, a, b);
2854+
r1.copy_to_slice(&mut out);
2855+
assert!(out.iter().all(|&v| v == 20));
2856+
// mask = bit 0 only: first byte from b, rest from a
2857+
let r2 = U8x64::mask_blend(1, a, b);
2858+
r2.copy_to_slice(&mut out);
2859+
assert_eq!(out[0], 20);
2860+
assert_eq!(out[1], 10);
2861+
}
2862+
2863+
#[test]
2864+
fn shl_epi16_shift_4() {
2865+
let mut data = [0u8; 64];
2866+
data[0] = 0x0F; data[1] = 0x00; // u16 = 0x000F
2867+
let v = U8x64::from_slice(&data);
2868+
let shifted = v.shl_epi16(4);
2869+
let mut out = [0u8; 64];
2870+
shifted.copy_to_slice(&mut out);
2871+
let result = u16::from_le_bytes([out[0], out[1]]);
2872+
assert_eq!(result, 0x00F0);
2873+
}
2874+
2875+
#[test]
2876+
fn saturating_add_clamps_at_255() {
2877+
let a = U8x64::splat(200);
2878+
let b = U8x64::splat(100);
2879+
let sum = a.saturating_add(b);
2880+
let mut out = [0u8; 64];
2881+
sum.copy_to_slice(&mut out);
2882+
assert!(out.iter().all(|&v| v == 255));
2883+
}
2884+
2885+
#[test]
2886+
fn saturating_add_no_overflow() {
2887+
let a = U8x64::splat(10);
2888+
let b = U8x64::splat(20);
2889+
let sum = a.saturating_add(b);
2890+
let mut out = [0u8; 64];
2891+
sum.copy_to_slice(&mut out);
2892+
assert!(out.iter().all(|&v| v == 30));
2893+
}
2894+
2895+
#[test]
2896+
fn permute_bytes_identity() {
2897+
let mut data = [0u8; 64];
2898+
for i in 0..64 { data[i] = i as u8; }
2899+
let v = U8x64::from_slice(&data);
2900+
// Identity permutation
2901+
let mut idx = [0u8; 64];
2902+
for i in 0..64 { idx[i] = i as u8; }
2903+
let perm = v.permute_bytes(U8x64::from_slice(&idx));
2904+
let mut out = [0u8; 64];
2905+
perm.copy_to_slice(&mut out);
2906+
assert_eq!(out, data);
2907+
}
2908+
2909+
#[test]
2910+
fn permute_bytes_reverse() {
2911+
let mut data = [0u8; 64];
2912+
for i in 0..64 { data[i] = i as u8; }
2913+
let v = U8x64::from_slice(&data);
2914+
// Reverse permutation
2915+
let mut idx = [0u8; 64];
2916+
for i in 0..64 { idx[i] = (63 - i) as u8; }
2917+
let perm = v.permute_bytes(U8x64::from_slice(&idx));
2918+
let mut out = [0u8; 64];
2919+
perm.copy_to_slice(&mut out);
2920+
for i in 0..64 { assert_eq!(out[i], (63 - i) as u8); }
2921+
}
2922+
}

0 commit comments

Comments
 (0)