Skip to content

Commit 60e7f49

Browse files
authored
Merge pull request #89 from AdaWorldAPI/claude/setup-rust-smart-home-SOPAY
Add ARM NEON SIMD support for Raspberry Pi (3/4/5)
2 parents b921e88 + e41ea81 commit 60e7f49

5 files changed

Lines changed: 1144 additions & 198 deletions

File tree

src/hpc/simd_caps.rs

Lines changed: 203 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,18 @@ use std::sync::LazyLock;
1313

1414
/// Detected SIMD capabilities, frozen at first access.
1515
///
16-
/// This is a `Copy` type: 8 bools packed into 8 bytes. Passed by value,
16+
/// This is a `Copy` type: bools packed into bytes. Passed by value,
1717
/// lives in registers after the first `LazyLock` deref.
18+
///
19+
/// x86_64 fields detect via `is_x86_feature_detected!`.
20+
/// aarch64 fields detect via `is_aarch64_feature_detected!`.
21+
/// NEON is mandatory on aarch64 — the sub-features distinguish Pi models:
22+
/// Pi Zero 2 W / Pi 3 (A53, v8.0): neon only
23+
/// Pi 4 (A72, v8.0): neon only (but 2× throughput)
24+
/// Pi 5 (A76, v8.2): neon + dotprod + fp16 + aes + sha2
1825
#[derive(Debug, Clone, Copy)]
1926
pub struct SimdCaps {
27+
// ── x86_64 ──
2028
/// AVX2 (256-bit integer/FP SIMD).
2129
pub avx2: bool,
2230
/// AVX-512 Foundation (512-bit).
@@ -33,6 +41,22 @@ pub struct SimdCaps {
3341
pub sse2: bool,
3442
/// FMA (fused multiply-add).
3543
pub fma: bool,
44+
45+
// ── aarch64 (ARM) ──
46+
/// NEON 128-bit SIMD (mandatory on aarch64, always true).
47+
pub neon: bool,
48+
/// ASIMD dot product (ARMv8.2+: Pi 5 A76, NOT Pi 4 A72).
49+
/// Enables `vdotq_s32` — 4× throughput for int8 dot products.
50+
pub asimd_dotprod: bool,
51+
/// FP16 half-precision arithmetic (ARMv8.2+: Pi 5).
52+
/// Enables `vcvt_f16_f32` and native f16 math.
53+
pub fp16: bool,
54+
/// AES hardware acceleration (ARMv8 Crypto Extensions; Pi 5 — Pi 3 / Pi 4 are NEON-only per the table above).
55+
pub aes: bool,
56+
/// SHA-2 hardware acceleration (ARMv8 Crypto Extensions; Pi 5).
57+
pub sha2: bool,
58+
/// CRC32 instructions (Pi 3+).
59+
pub crc32: bool,
3660
}
3761

3862
/// Global singleton — detected once at first access via `LazyLock`.
@@ -58,13 +82,23 @@ impl SimdCaps {
5882
sse41: is_x86_feature_detected!("sse4.1"),
5983
sse2: is_x86_feature_detected!("sse2"),
6084
fma: is_x86_feature_detected!("fma"),
85+
// ARM fields: all false on x86
86+
neon: false,
87+
asimd_dotprod: false,
88+
fp16: false,
89+
aes: false,
90+
sha2: false,
91+
crc32: false,
6192
}
6293
}
6394

64-
/// Non-x86: all false.
65-
#[cfg(not(target_arch = "x86_64"))]
95+
/// AArch64: detect NEON sub-features via `is_aarch64_feature_detected!`.
96+
/// NEON itself is mandatory (always true). The sub-features distinguish
97+
/// Pi Zero 2 W / Pi 3 (A53) from Pi 4 (A72) from Pi 5 (A76).
98+
#[cfg(target_arch = "aarch64")]
6699
fn detect() -> Self {
67100
Self {
101+
// x86 fields: all false on ARM
68102
avx2: false,
69103
avx512f: false,
70104
avx512bw: false,
@@ -73,6 +107,34 @@ impl SimdCaps {
73107
sse41: false,
74108
sse2: false,
75109
fma: false,
110+
// ARM fields: runtime detection
111+
neon: true, // mandatory on aarch64
112+
asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
113+
fp16: std::arch::is_aarch64_feature_detected!("fp16"),
114+
aes: std::arch::is_aarch64_feature_detected!("aes"),
115+
sha2: std::arch::is_aarch64_feature_detected!("sha2"),
116+
crc32: std::arch::is_aarch64_feature_detected!("crc"),
117+
}
118+
}
119+
120+
/// Fallback for targets that are neither x86_64 nor aarch64 (wasm, riscv, etc.).
///
/// No runtime detection is performed here: every capability flag, x86 and
/// ARM alike, is reported as `false`.
121+
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
122+
fn detect() -> Self {
123+
Self {
124+
avx2: false,
125+
avx512f: false,
126+
avx512bw: false,
127+
avx512vl: false,
128+
avx512vpopcntdq: false,
129+
sse41: false,
130+
sse2: false,
131+
fma: false,
132+
neon: false,
133+
asimd_dotprod: false,
134+
fp16: false,
135+
aes: false,
136+
sha2: false,
137+
crc32: false,
76138
}
77139
}
78140

@@ -87,6 +149,121 @@ impl SimdCaps {
87149
pub fn has_avx512_bw_popcnt(self) -> bool {
88150
self.avx512bw && self.avx512vpopcntdq
89151
}
152+
153+
// ── ARM convenience methods ──
154+
155+
/// True if NEON is available.
///
/// Always true on aarch64, where NEON is mandatory; the x86_64 and
/// fallback detectors set the flag to `false`.
156+
#[inline(always)]
157+
pub fn has_neon(self) -> bool {
158+
self.neon
159+
}
160+
161+
/// True if the ASIMD dot-product extension is usable (ARMv8.2+: Pi 5, Orange Pi 5).
162+
/// Enables `vdotq_s32` for 4× int8 dot product throughput.
/// Returns true only when both the `neon` and `asimd_dotprod` flags are set.
163+
#[inline(always)]
164+
pub fn has_dotprod(self) -> bool {
165+
self.neon && self.asimd_dotprod
166+
}
167+
168+
/// True if FP16 arithmetic is usable (ARMv8.2+: Pi 5, Orange Pi 5).
/// Returns true only when both the `neon` and `fp16` flags are set.
169+
#[inline(always)]
170+
pub fn has_fp16(self) -> bool {
171+
self.neon && self.fp16
172+
}
173+
174+
/// True if both the AES and SHA-2 crypto extensions were detected.
///
/// NOTE(review): the `SimdCaps` type docs list AES/SHA2 only for Pi 5 and
/// describe Pi 3 / Pi 4 as "neon only", so the earlier "Pi 3+" claim on this
/// method looks wrong — confirm on hardware before relying on it.
175+
#[inline(always)]
176+
pub fn has_crypto(self) -> bool {
177+
self.aes && self.sha2
178+
}
179+
180+
/// Identify the ARM SBC profile based on detected features.
181+
///
182+
/// This is heuristic — detects the *capability tier*, not the exact board.
183+
/// Boards with the same SoC tier share the same SIMD capabilities:
184+
///
185+
/// | Profile | SoC | Boards |
186+
/// |---------|-----|--------|
187+
/// | `A53Baseline` | Cortex-A53 v8.0 | Pi Zero 2 W, Pi 3B+ |
188+
/// | `A72Fast` | Cortex-A72 v8.0 | Pi 4, Orange Pi 4 LTS |
189+
/// | `A76DotProd` | Cortex-A76 v8.2 | Pi 5, Orange Pi 5 |
190+
/// | `Unknown` | Anything else | Other aarch64 SBCs |
191+
#[inline]
192+
pub fn arm_profile(self) -> ArmProfile {
193+
if !self.neon {
194+
return ArmProfile::NotArm;
195+
}
196+
if self.asimd_dotprod {
197+
// ARMv8.2+: Pi 5 (A76), Orange Pi 5 (RK3588/A76+A55)
198+
ArmProfile::A76DotProd
199+
} else if self.aes {
200+
// ARMv8.0 with crypto: could be A53 or A72.
201+
// Can't distinguish purely from features — both have
202+
// NEON + AES + SHA2 but NOT dotprod.
203+
// A72 has 2× NEON throughput but that's microarch, not features.
204+
// We report A72-tier since most deployments target Pi 4.
205+
ArmProfile::A72Fast
206+
} else {
207+
// NEON but no crypto — unusual for Pi, but possible on
208+
// older aarch64 SoCs or QEMU without extensions.
209+
ArmProfile::A53Baseline
210+
}
211+
}
212+
}
213+
214+
/// ARM single-board computer capability tier.
///
/// A heuristic bucket derived from detected SIMD features, not an exact
/// board identification: boards whose SoCs expose the same features share a
/// tier. Used for codebook kernel selection and throughput estimation in the
/// ada-brain cascade.
///
/// Tiers are assigned by [`SimdCaps::arm_profile`] from feature flags alone,
/// so the core names below label the *nominal* member of each tier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ArmProfile {
    /// Not an ARM target (x86, wasm, etc.).
    NotArm,
    /// NEON baseline only: neither AES nor dotprod detected.
    ///
    /// Nominally Cortex-A53 v8.0 (Pi Zero 2 W, Pi 3B+): ~1 NEON pipeline,
    /// lower clock. Codebook: 50-500 tok/s.
    A53Baseline,
    /// NEON with AES detected, but no dotprod.
    ///
    /// Nominally Cortex-A72 v8.0 (Pi 4, Orange Pi 4 LTS): 2× NEON throughput,
    /// higher clock. Codebook: 500-5K tok/s.
    ///
    /// NOTE(review): the `SimdCaps` type docs list Pi 4 as "neon only"; a
    /// board without AES is classified as `A53Baseline` instead — confirm.
    A72Fast,
    /// NEON with dotprod detected.
    ///
    /// Nominally Cortex-A76 v8.2 (Pi 5, Orange Pi 5): dotprod enables 4× int8
    /// throughput. Codebook: 2K-10K tok/s.
    A76DotProd,
}

impl ArmProfile {
    /// Human-readable name of the tier, including its nominal boards.
    pub const fn name(self) -> &'static str {
        match self {
            Self::NotArm => "not-arm",
            Self::A53Baseline => "A53-baseline (Pi Zero 2W / Pi 3)",
            Self::A72Fast => "A72-fast (Pi 4 / Orange Pi 4)",
            Self::A76DotProd => "A76-dotprod (Pi 5 / Orange Pi 5)",
        }
    }

    /// Estimated codebook tokens/second for this profile.
    ///
    /// A single representative figure per tier; `NotArm` reports 0.
    pub const fn estimated_tok_per_sec(self) -> u32 {
        match self {
            Self::NotArm => 0,
            Self::A53Baseline => 200,
            Self::A72Fast => 2_000,
            Self::A76DotProd => 5_000,
        }
    }

    /// Number of effective f32 NEON lanes (accounting for pipeline width).
    ///
    /// - A53: 1 pipeline = 4 lanes effective.
    /// - A72: 2 pipelines = 8 lanes effective (can issue 2 NEON ops/cycle).
    /// - A76: 2 pipelines + dotprod = 8 lanes + int8 boost.
    /// - `NotArm`: 1 (no NEON lanes to count).
    pub const fn effective_f32_lanes(self) -> usize {
        match self {
            Self::NotArm => 1,
            Self::A53Baseline => 4,
            Self::A72Fast | Self::A76DotProd => 8,
        }
    }
}

/// Formats the profile as its human-readable [`ArmProfile::name`].
impl std::fmt::Display for ArmProfile {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.name())
    }
}
91268

92269
#[cfg(test)]
@@ -99,6 +276,7 @@ mod tests {
99276
// On any platform, simd_caps() should succeed.
100277
let _ = caps.avx2;
101278
let _ = caps.avx512f;
279+
let _ = caps.neon;
102280
}
103281

104282
#[test]
@@ -108,6 +286,7 @@ mod tests {
108286
let c = a; // Still valid
109287
assert_eq!(a.avx2, b.avx2);
110288
assert_eq!(b.avx512f, c.avx512f);
289+
assert_eq!(a.neon, c.neon);
111290
}
112291

113292
#[test]
@@ -119,6 +298,8 @@ mod tests {
119298
assert_eq!(a.avx512bw, b.avx512bw);
120299
assert_eq!(a.avx512vpopcntdq, b.avx512vpopcntdq);
121300
assert_eq!(a.sse41, b.sse41);
301+
assert_eq!(a.neon, b.neon);
302+
assert_eq!(a.asimd_dotprod, b.asimd_dotprod);
122303
}
123304

124305
#[test]
@@ -127,5 +308,24 @@ mod tests {
127308
// Just verify these don't panic and return consistent values.
128309
let _ = caps.has_avx512_popcnt();
129310
let _ = caps.has_avx512_bw_popcnt();
311+
let _ = caps.has_neon();
312+
let _ = caps.has_dotprod();
313+
let _ = caps.has_fp16();
314+
let _ = caps.has_crypto();
315+
}
316+
317+
/// The detected profile must agree with the raw capability flags on every
/// target, not only on x86_64.
#[test]
fn arm_profile_consistent() {
    let caps = simd_caps();
    let profile = caps.arm_profile();

    // The accessors must be callable for whatever profile was detected.
    let _ = profile.name();
    let _ = profile.estimated_tok_per_sec();
    let _ = profile.effective_f32_lanes();

    if cfg!(target_arch = "aarch64") {
        // NEON is mandatory on aarch64, so some ARM tier must be reported,
        // and the dotprod tier must track the dotprod flag exactly.
        assert_ne!(profile, ArmProfile::NotArm);
        assert_eq!(profile == ArmProfile::A76DotProd, caps.asimd_dotprod);
    } else {
        // x86_64 and the fallback detector both leave `neon` false.
        assert_eq!(profile, ArmProfile::NotArm);
    }
}
131331
}

src/hpc/simd_dispatch.rs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ pub enum SimdTier {
3535
Avx2,
3636
/// SSE2 (128-bit, 4 × f32). Baseline on x86_64.
3737
Sse2,
38+
/// NEON with dotprod (128-bit, 4 × f32 + int8 dot product).
39+
/// ARMv8.2+: Pi 5 (A76), Orange Pi 5.
40+
NeonDotProd,
41+
/// NEON baseline (128-bit, 4 × f32).
42+
/// ARMv8.0: Pi Zero 2 W (A53), Pi 3 (A53), Pi 4 (A72).
43+
Neon,
3844
/// Scalar fallback (1 lane).
3945
Scalar,
4046
/// WebAssembly SIMD (128-bit, 4 × f32). Future tier.
@@ -48,7 +54,7 @@ impl SimdTier {
4854
match self {
4955
Self::Avx512 => 16,
5056
Self::Avx2 => 8,
51-
Self::Sse2 | Self::WasmSimd128 => 4,
57+
Self::Sse2 | Self::WasmSimd128 | Self::NeonDotProd | Self::Neon => 4,
5258
Self::Scalar => 1,
5359
}
5460
}
@@ -59,6 +65,8 @@ impl SimdTier {
5965
Self::Avx512 => "AVX-512",
6066
Self::Avx2 => "AVX2",
6167
Self::Sse2 => "SSE2",
68+
Self::NeonDotProd => "NEON+dotprod (Pi 5 / A76)",
69+
Self::Neon => "NEON (Pi 3/4 / A53/A72)",
6270
Self::Scalar => "Scalar",
6371
Self::WasmSimd128 => "WASM SIMD128",
6472
}
@@ -139,7 +147,25 @@ impl SimdDispatch {
139147
}
140148
}
141149

142-
#[cfg(not(target_arch = "x86_64"))]
150+
/// AArch64 detection: pick the NEON tier from the runtime capability flags.
///
/// Only the reported tier is NEON-specific. The kernels stay on the scalar
/// wrappers, which share the NEON signatures, until the `simd_neon.rs` types
/// are activated and the intrinsics are wired in. Scalar code auto-vectorizes
/// well on aarch64 with `-C target-feature=+neon` (mandatory on aarch64).
#[cfg(target_arch = "aarch64")]
fn detect() -> Self {
    let has_dotprod = simd_caps().asimd_dotprod;

    // Start from the scalar dispatch and override nothing but the tier.
    let mut dispatch = Self::scalar();
    dispatch.tier = if has_dotprod {
        SimdTier::NeonDotProd
    } else {
        SimdTier::Neon
    };
    dispatch
}
167+
168+
/// Targets that are neither x86_64 nor aarch64 get `Self::scalar()` unchanged.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
143169
fn detect() -> Self {
144170
Self::scalar()
145171
}

0 commit comments

Comments
 (0)