@@ -13,10 +13,18 @@ use std::sync::LazyLock;
1313
1414/// Detected SIMD capabilities, frozen at first access.
1515///
16- /// This is a `Copy` type: 8 bools packed into 8 bytes. Passed by value,
16+ /// This is a `Copy` type: bools packed into bytes. Passed by value,
1717/// lives in registers after the first `LazyLock` deref.
18+ ///
19+ /// x86_64 fields detect via `is_x86_feature_detected!`.
20+ /// aarch64 fields detect via `is_aarch64_feature_detected!`.
21+ /// NEON is mandatory on aarch64 — the sub-features distinguish Pi models:
22+ /// Pi Zero 2 W / Pi 3 (A53, v8.0): neon only
23+ /// Pi 4 (A72, v8.0): neon only (but 2× throughput)
24+ /// Pi 5 (A76, v8.2): neon + dotprod + fp16 + aes + sha2
1825#[ derive( Debug , Clone , Copy ) ]
1926pub struct SimdCaps {
27+ // ── x86_64 ──
2028 /// AVX2 (256-bit integer/FP SIMD).
2129 pub avx2 : bool ,
2230 /// AVX-512 Foundation (512-bit).
@@ -33,6 +41,22 @@ pub struct SimdCaps {
3341 pub sse2 : bool ,
3442 /// FMA (fused multiply-add).
3543 pub fma : bool ,
44+
45+ // ── aarch64 (ARM) ──
46+ /// NEON 128-bit SIMD (mandatory on aarch64, always true).
47+ pub neon : bool ,
48+ /// ASIMD dot product (ARMv8.2+: Pi 5 A76, NOT Pi 4 A72).
49+ /// Enables `vdotq_s32` — 4× throughput for int8 dot products.
50+ pub asimd_dotprod : bool ,
51+ /// FP16 half-precision arithmetic (ARMv8.2+: Pi 5).
52+ /// Enables `vcvt_f16_f32` and native f16 math.
53+ pub fp16 : bool ,
54+ /// AES hardware acceleration (optional crypto extension, detected at runtime: Pi 5; per the struct docs, Pi Zero 2 W / Pi 3 / Pi 4 are "neon only").
55+ pub aes : bool ,
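56+ /// SHA-2 hardware acceleration (optional crypto extension, detected at runtime: Pi 5; per the struct docs, Pi Zero 2 W / Pi 3 / Pi 4 are "neon only").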
57+ pub sha2 : bool ,
58+ /// CRC32 instructions (Pi 3+).
59+ pub crc32 : bool ,
3660}
3761
3862/// Global singleton — detected once at first access via `LazyLock`.
@@ -58,13 +82,23 @@ impl SimdCaps {
5882 sse41 : is_x86_feature_detected ! ( "sse4.1" ) ,
5983 sse2 : is_x86_feature_detected ! ( "sse2" ) ,
6084 fma : is_x86_feature_detected ! ( "fma" ) ,
85+ // ARM fields: all false on x86
86+ neon : false ,
87+ asimd_dotprod : false ,
88+ fp16 : false ,
89+ aes : false ,
90+ sha2 : false ,
91+ crc32 : false ,
6192 }
6293 }
6394
64- /// Non-x86: all false.
65- #[ cfg( not( target_arch = "x86_64" ) ) ]
95+ /// AArch64: detect NEON sub-features via `is_aarch64_feature_detected!`.
96+ /// NEON itself is mandatory (always true). The sub-features distinguish
97+ /// Pi Zero 2 W / Pi 3 (A53) from Pi 4 (A72) from Pi 5 (A76).
98+ #[ cfg( target_arch = "aarch64" ) ]
6699 fn detect ( ) -> Self {
67100 Self {
101+ // x86 fields: all false on ARM
68102 avx2 : false ,
69103 avx512f : false ,
70104 avx512bw : false ,
@@ -73,6 +107,34 @@ impl SimdCaps {
73107 sse41 : false ,
74108 sse2 : false ,
75109 fma : false ,
110+ // ARM fields: runtime detection
111+ neon : true , // mandatory on aarch64
112+ asimd_dotprod : std:: arch:: is_aarch64_feature_detected!( "dotprod" ) ,
113+ fp16 : std:: arch:: is_aarch64_feature_detected!( "fp16" ) ,
114+ aes : std:: arch:: is_aarch64_feature_detected!( "aes" ) ,
115+ sha2 : std:: arch:: is_aarch64_feature_detected!( "sha2" ) ,
116+ crc32 : std:: arch:: is_aarch64_feature_detected!( "crc" ) ,
117+ }
118+ }
119+
/// Fallback detection for targets that are neither x86_64 nor aarch64
/// (wasm, riscv, etc).
///
/// No runtime probing is attempted on these targets: every capability
/// flag, x86 and ARM alike, is reported as unavailable.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn detect() -> Self {
    // One named value makes the "nothing is available here" intent explicit.
    const ABSENT: bool = false;

    Self {
        // aarch64 capabilities
        neon: ABSENT,
        asimd_dotprod: ABSENT,
        fp16: ABSENT,
        aes: ABSENT,
        sha2: ABSENT,
        crc32: ABSENT,
        // x86_64 capabilities
        sse2: ABSENT,
        sse41: ABSENT,
        fma: ABSENT,
        avx2: ABSENT,
        avx512f: ABSENT,
        avx512bw: ABSENT,
        avx512vl: ABSENT,
        avx512vpopcntdq: ABSENT,
    }
}
78140
@@ -87,6 +149,121 @@ impl SimdCaps {
87149 pub fn has_avx512_bw_popcnt ( self ) -> bool {
88150 self . avx512bw && self . avx512vpopcntdq
89151 }
152+
153+ // ── ARM convenience methods ──
154+
155+ /// True if running on aarch64 with NEON (always true on aarch64).
156+ #[ inline( always) ]
157+ pub fn has_neon ( self ) -> bool {
158+ self . neon
159+ }
160+
161+ /// True if ASIMD dot product is available (ARMv8.2+: Pi 5, Orange Pi 5).
162+ /// Enables `vdotq_s32` for 4× int8 dot product throughput.
163+ #[ inline( always) ]
164+ pub fn has_dotprod ( self ) -> bool {
165+ self . neon && self . asimd_dotprod
166+ }
167+
168+ /// True if FP16 arithmetic is available (ARMv8.2+: Pi 5, Orange Pi 5).
169+ #[ inline( always) ]
170+ pub fn has_fp16 ( self ) -> bool {
171+ self . neon && self . fp16
172+ }
173+
174+ /// True if AES + SHA2 crypto extensions are available (Pi 3+, Orange Pi 4+).
175+ #[ inline( always) ]
176+ pub fn has_crypto ( self ) -> bool {
177+ self . aes && self . sha2
178+ }
179+
180+ /// Identify the ARM SBC profile based on detected features.
181+ ///
182+ /// This is heuristic — detects the *capability tier*, not the exact board.
183+ /// Boards with the same SoC tier share the same SIMD capabilities:
184+ ///
185+ /// | Profile | SoC | Boards |
186+ /// |---------|-----|--------|
187+ /// | `A53Baseline` | Cortex-A53 v8.0 | Pi Zero 2 W, Pi 3B+ |
188+ /// | `A72Fast` | Cortex-A72 v8.0 | Pi 4, Orange Pi 4 LTS |
189+ /// | `A76DotProd` | Cortex-A76 v8.2 | Pi 5, Orange Pi 5 |
190+ /// | `Unknown` | Anything else | Other aarch64 SBCs |
191+ #[ inline]
192+ pub fn arm_profile ( self ) -> ArmProfile {
193+ if !self . neon {
194+ return ArmProfile :: NotArm ;
195+ }
196+ if self . asimd_dotprod {
197+ // ARMv8.2+: Pi 5 (A76), Orange Pi 5 (RK3588/A76+A55)
198+ ArmProfile :: A76DotProd
199+ } else if self . aes {
200+ // ARMv8.0 with crypto: could be A53 or A72.
201+ // Can't distinguish purely from features — both have
202+ // NEON + AES + SHA2 but NOT dotprod.
203+ // A72 has 2× NEON throughput but that's microarch, not features.
204+ // We report A72-tier since most deployments target Pi 4.
205+ ArmProfile :: A72Fast
206+ } else {
207+ // NEON but no crypto — unusual for Pi, but possible on
208+ // older aarch64 SoCs or QEMU without extensions.
209+ ArmProfile :: A53Baseline
210+ }
211+ }
212+ }
213+
/// Capability tier of an ARM single-board computer.
///
/// The tier is derived heuristically from detected SIMD feature flags, so
/// boards built around the same SoC family map to the same variant. It is
/// used for codebook kernel selection and throughput estimation in the
/// ada-brain cascade.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArmProfile {
    /// The build target is not ARM (x86, wasm, etc.).
    NotArm,
    /// Cortex-A53 (ARMv8.0) tier: Pi Zero 2 W, Pi 3B+. Baseline NEON only,
    /// roughly one NEON pipeline at a lower clock.
    /// Codebook estimate: 50-500 tok/s.
    A53Baseline,
    /// Cortex-A72 (ARMv8.0) tier: Pi 4, Orange Pi 4 LTS. NEON plus crypto,
    /// about twice the NEON throughput at a higher clock.
    /// Codebook estimate: 500-5K tok/s.
    A72Fast,
    /// Cortex-A76 (ARMv8.2) tier: Pi 5, Orange Pi 5. NEON plus dotprod and
    /// fp16; dotprod gives about 4x int8 throughput.
    /// Codebook estimate: 2K-10K tok/s.
    A76DotProd,
}

impl ArmProfile {
    /// Returns a human-readable label for the tier, including example boards.
    pub const fn name(self) -> &'static str {
        match self {
            Self::A76DotProd => "A76-dotprod (Pi 5 / Orange Pi 5)",
            Self::A72Fast => "A72-fast (Pi 4 / Orange Pi 4)",
            Self::A53Baseline => "A53-baseline (Pi Zero 2W / Pi 3)",
            Self::NotArm => "not-arm",
        }
    }

    /// Returns a single point estimate of codebook tokens per second.
    ///
    /// Each value sits inside the range quoted on the matching variant;
    /// `NotArm` yields 0.
    pub const fn estimated_tok_per_sec(self) -> u32 {
        match self {
            Self::A76DotProd => 5_000,
            Self::A72Fast => 2_000,
            Self::A53Baseline => 200,
            Self::NotArm => 0,
        }
    }

    /// Returns the effective number of f32 NEON lanes, counting pipelines.
    ///
    /// * A53: one pipeline, so 4 lanes.
    /// * A72: two pipelines (two NEON ops per cycle), so 8 lanes.
    /// * A76: two pipelines, so 8 lanes; the dotprod int8 boost is not
    ///   reflected in this f32 figure.
    /// * `NotArm`: 1.
    pub const fn effective_f32_lanes(self) -> usize {
        // A 128-bit NEON register holds four 32-bit floats.
        const LANES_PER_PIPELINE: usize = 4;

        match self {
            Self::NotArm => 1,
            Self::A53Baseline => LANES_PER_PIPELINE,
            Self::A72Fast | Self::A76DotProd => 2 * LANES_PER_PIPELINE,
        }
    }
}
91268
92269#[ cfg( test) ]
@@ -99,6 +276,7 @@ mod tests {
99276 // On any platform, simd_caps() should succeed.
100277 let _ = caps. avx2 ;
101278 let _ = caps. avx512f ;
279+ let _ = caps. neon ;
102280 }
103281
104282 #[ test]
@@ -108,6 +286,7 @@ mod tests {
108286 let c = a; // Still valid
109287 assert_eq ! ( a. avx2, b. avx2) ;
110288 assert_eq ! ( b. avx512f, c. avx512f) ;
289+ assert_eq ! ( a. neon, c. neon) ;
111290 }
112291
113292 #[ test]
@@ -119,6 +298,8 @@ mod tests {
119298 assert_eq ! ( a. avx512bw, b. avx512bw) ;
120299 assert_eq ! ( a. avx512vpopcntdq, b. avx512vpopcntdq) ;
121300 assert_eq ! ( a. sse41, b. sse41) ;
301+ assert_eq ! ( a. neon, b. neon) ;
302+ assert_eq ! ( a. asimd_dotprod, b. asimd_dotprod) ;
122303 }
123304
124305 #[ test]
@@ -127,5 +308,24 @@ mod tests {
127308 // Just verify these don't panic and return consistent values.
128309 let _ = caps. has_avx512_popcnt ( ) ;
129310 let _ = caps. has_avx512_bw_popcnt ( ) ;
311+ let _ = caps. has_neon ( ) ;
312+ let _ = caps. has_dotprod ( ) ;
313+ let _ = caps. has_fp16 ( ) ;
314+ let _ = caps. has_crypto ( ) ;
315+ }
316+
/// Checks that the ARM profile derived from the global capabilities is
/// stable across calls, agrees with the build target, and that its
/// accessor methods return coherent values.
#[test]
fn arm_profile_consistent() {
    let caps = simd_caps();
    let profile = caps.arm_profile();

    // The capabilities are frozen, so classifying twice must agree.
    assert_eq!(profile, simd_caps().arm_profile());

    // Only the aarch64 detection path sets `neon`; every other target
    // (x86_64 and the all-false fallback) must classify as `NotArm`.
    if cfg!(target_arch = "aarch64") {
        assert_ne!(profile, ArmProfile::NotArm);
    } else {
        assert_eq!(profile, ArmProfile::NotArm);
    }

    // Accessors must be usable and mutually coherent for whatever tier
    // was detected.
    assert!(!profile.name().is_empty());
    assert!(profile.effective_f32_lanes() >= 1);
    assert_eq!(
        profile.estimated_tok_per_sec() == 0,
        profile == ArmProfile::NotArm
    );
}
131331}
0 commit comments