Skip to content

Commit 6bc000b

Browse files
Antigravity Agent and claude committed
feat(hslm): add adaptive SIMD width with CPU feature detection
Patch #2 of 7 for ternary SIMD optimization. Key features: - Comptime CPU feature detection (AVX2, SSE2, NEON) - Adaptive vector types: VecF16, VecF32, VecI8 - Architecture-specific optimal width selection - Runtime SIMD info and speedup estimation Supports: - x86_64: AVX2 (256-bit) → SSE2 (128-bit) → fallback - aarch64: NEON (128-bit) - wasm32/64: SIMD128 (128-bit) - Generic: safe fallback 9 tests passed. Related: ziglang/zig#352 (code coverage) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c859ae6 commit 6bc000b

1 file changed

Lines changed: 339 additions & 0 deletions

File tree

src/hslm/simd_config.zig

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
// @origin(spec:simd_config.tri) @regen(manual-impl)
2+
// Adaptive SIMD Width — Comptime CPU Feature Detection
3+
// Selects optimal vector width based on available SIMD extensions
4+
//
5+
// Architecture support:
6+
// - x86_64: AVX2 (256-bit) → SSE2 (128-bit) → fallback (64-bit)
7+
// - aarch64: NEON (128-bit)
8+
// - wasm32/wasm64: 128-bit vectors
// - Default: 64-bit fallback (4×f16, 2×f32, 8×i8)
9+
//
10+
// φ² + 1/φ² = 3 | TRINITY
11+
12+
const std = @import("std");
13+
const builtin = @import("builtin");
14+
15+
// ═══════════════════════════════════════════════════════════════════════════════
16+
// CPU FEATURE DETECTION
17+
// ═══════════════════════════════════════════════════════════════════════════════
18+
19+
/// SIMD feature flags and preferred lane counts for one target.
///
/// The four `has_*` flags default to `false`; the three lane counts and
/// `arch_name` have no default and must be supplied by whoever builds the value.
pub const SimdCapabilities = struct {
    // --- feature flags -----------------------------------------------------
    /// AVX2 is enabled (256-bit vectors, x86_64).
    has_avx2: bool = false,
    /// SSE2 is enabled (128-bit vectors, x86_64).
    has_sse2: bool = false,
    /// NEON is enabled (128-bit vectors, ARM).
    has_neon: bool = false,
    /// ARM SVE (scalable vectors) is enabled.
    has_sve: bool = false,

    // --- preferred lane counts ---------------------------------------------
    /// Number of `f16` lanes in the preferred vector.
    optimal_f16_width: usize,
    /// Number of `f32` lanes in the preferred vector.
    optimal_f32_width: usize,
    /// Number of `i8` lanes in the preferred vector.
    optimal_i8_width: usize,

    /// Short name of the CPU architecture family.
    arch_name: []const u8,
};
45+
46+
/// Derive the SIMD capabilities of the compilation target.
///
/// Only `builtin.cpu` is consulted, i.e. the architecture and feature set the
/// code is being compiled for; nothing is probed on the machine that runs it.
///
/// Vector width chosen per architecture:
/// - x86_64: 256-bit with AVX2, 128-bit with SSE2, otherwise 64-bit
/// - aarch64 / aarch64_be: 128-bit; `has_neon` is always set and `has_sve`
///   mirrors the target's `sve` feature
/// - wasm32 / wasm64: 128-bit (the `simd128` feature is not checked here)
/// - anything else: 64-bit fallback
///
/// Lane counts are the vector width divided by the element size in bits.
pub fn detectSimdCapabilities() SimdCapabilities {
    const features = builtin.cpu.features;

    return switch (builtin.cpu.arch) {
        .x86_64 => blk: {
            const has_avx2 = std.Target.x86.featureSetHas(features, .avx2);
            const has_sse2 = std.Target.x86.featureSetHas(features, .sse2);

            // Widest vector width (in bits) among the detected extensions.
            const vector_bits: usize = if (has_avx2) 256 else if (has_sse2) 128 else 64;

            break :blk .{
                .has_avx2 = has_avx2,
                .has_sse2 = has_sse2,
                .optimal_f16_width = vector_bits / 16,
                .optimal_f32_width = vector_bits / 32,
                .optimal_i8_width = vector_bits / 8,
                .arch_name = "x86_64",
            };
        },
        .aarch64, .aarch64_be => .{
            // ARM NEON is always available on aarch64
            .has_neon = true,
            // Previously left at its default, so SVE targets reported `false`.
            // The widths stay at 128 bits: `@Vector` types are fixed-length.
            .has_sve = std.Target.aarch64.featureSetHas(features, .sve),
            .optimal_f16_width = 8, // 128-bit / 16-bit
            .optimal_f32_width = 4, // 128-bit / 32-bit
            .optimal_i8_width = 16, // 128-bit / 8-bit
            .arch_name = "aarch64",
        },
        .wasm32, .wasm64 => .{
            // WASM SIMD128 provides 128-bit vectors
            .optimal_f16_width = 8,
            .optimal_f32_width = 4,
            .optimal_i8_width = 16,
            .arch_name = "wasm",
        },
        else => .{
            // Safe fallback for unknown architectures (64-bit worth of lanes)
            .optimal_f16_width = 4,
            .optimal_f32_width = 2,
            .optimal_i8_width = 8,
            .arch_name = "generic",
        },
    };
}
107+
108+
/// Capabilities of the compilation target, computed once when this
/// container-level constant is initialised.
pub const capabilities: SimdCapabilities = detectSimdCapabilities();
110+
111+
// ═══════════════════════════════════════════════════════════════════════════════
112+
// ADAPTIVE VECTOR TYPES
113+
// ═══════════════════════════════════════════════════════════════════════════════
114+
115+
/// `f16` vector with `capabilities.optimal_f16_width` lanes.
pub const VecF16 = @Vector(capabilities.optimal_f16_width, f16);

/// `f32` vector with `capabilities.optimal_f32_width` lanes.
pub const VecF32 = @Vector(capabilities.optimal_f32_width, f32);

/// `i8` vector with `capabilities.optimal_i8_width` lanes.
pub const VecI8 = @Vector(capabilities.optimal_i8_width, i8);
123+
124+
/// Return a `VecF16` with every lane set to 0.0.
pub inline fn zeroVecF16() VecF16 {
    const zero: f16 = 0.0;
    return @splat(zero);
}
128+
129+
/// Return a `VecF32` with every lane set to 0.0.
pub inline fn zeroVecF32() VecF32 {
    const zero: f32 = 0.0;
    return @splat(zero);
}
133+
134+
/// Return a `VecI8` with every lane set to 0.
pub inline fn zeroVecI8() VecI8 {
    const zero: i8 = 0;
    return @splat(zero);
}
138+
139+
// ═══════════════════════════════════════════════════════════════════════════════
140+
// RUNTIME INFO
141+
// ═══════════════════════════════════════════════════════════════════════════════
142+
143+
/// Return a short, static, human-readable label for the SIMD tier in use.
///
/// Precedence follows `capabilities`: AVX2, then SSE2, then NEON. On wasm
/// targets the label is "SIMD128 (128-bit)" when the `simd128` target feature
/// is enabled. Everything else yields "Scalar (fallback)".
pub fn simdInfoString() []const u8 {
    if (capabilities.has_avx2) return "AVX2 (256-bit)";
    if (capabilities.has_sse2) return "SSE2 (128-bit)";
    if (capabilities.has_neon) return "NEON (128-bit)";

    // `SimdCapabilities` carries no wasm flag, so the target feature set is
    // consulted directly. Without this, wasm builds with SIMD128 enabled were
    // labelled "Scalar" even though 128-bit vector widths are selected.
    switch (builtin.cpu.arch) {
        .wasm32, .wasm64 => {
            if (std.Target.wasm.featureSetHas(builtin.cpu.features, .simd128)) {
                return "SIMD128 (128-bit)";
            }
        },
        else => {},
    }

    return "Scalar (fallback)";
}
155+
156+
/// Write the detected SIMD configuration to standard output.
///
/// Output is best-effort: the first failed write ends the report silently.
/// Note (from the original author): only works in executables, not in test mode.
pub fn printSimdConfig() void {
    writeSimdConfig(std.io.getStdOut().writer()) catch return;
}

/// Emit the configuration report line by line, propagating the first write error.
fn writeSimdConfig(writer: anytype) !void {
    try writer.print("SIMD Configuration:\n", .{});
    try writer.print("  Architecture: {s}\n", .{capabilities.arch_name});
    try writer.print("  f16 width: {d}\n", .{capabilities.optimal_f16_width});
    try writer.print("  f32 width: {d}\n", .{capabilities.optimal_f32_width});
    try writer.print("  i8 width: {d}\n", .{capabilities.optimal_i8_width});
    try writer.print("  Detection: {s}\n", .{simdInfoString()});

    // At most one extensions line is printed, for the highest detected tier.
    if (capabilities.has_avx2) {
        try writer.print("  Extensions: AVX2, SSE2\n", .{});
    } else if (capabilities.has_sse2) {
        try writer.print("  Extensions: SSE2\n", .{});
    } else if (capabilities.has_neon) {
        try writer.print("  Extensions: NEON\n", .{});
    }
}
176+
177+
// ═══════════════════════════════════════════════════════════════════════════════
178+
// COMPATIBILITY HELPERS
179+
// ═══════════════════════════════════════════════════════════════════════════════
180+
181+
/// Report whether the target's preferred `f32` lane count is at least `min_width`.
///
/// Only `capabilities.optimal_f32_width` is compared; the f16 and i8 widths
/// are not considered.
pub fn supportsMinWidth(min_width: usize) bool {
    const available = capabilities.optimal_f32_width;
    return min_width <= available;
}
185+
186+
/// Theoretical speedup relative to an 8-lane baseline.
///
/// Computed as `capabilities.optimal_f32_width / 8`; this is a lane-count
/// ratio only (actual may vary due to memory bandwidth).
pub fn expectedSpeedupVsBaseline() f64 {
    const lanes: f64 = @floatFromInt(capabilities.optimal_f32_width);
    const baseline_lanes: f64 = 8;
    return lanes / baseline_lanes;
}
193+
194+
// ═══════════════════════════════════════════════════════════════════════════════
195+
// TESTS
196+
// ═══════════════════════════════════════════════════════════════════════════════
197+
198+
test "detect simd capabilities" {
    const caps = detectSimdCapabilities();

    // Should have detected some architecture
    try std.testing.expect(caps.arch_name.len > 0);

    // Widths should be within the range the detector can produce...
    try std.testing.expect(caps.optimal_f16_width >= 4 and caps.optimal_f16_width <= 32);
    try std.testing.expect(caps.optimal_f32_width >= 2 and caps.optimal_f32_width <= 8);
    try std.testing.expect(caps.optimal_i8_width >= 8 and caps.optimal_i8_width <= 32);

    // ...and powers of two (previously promised by the comment but never checked).
    // The range checks above guarantee the values are non-zero.
    try std.testing.expect(std.math.isPowerOfTwo(caps.optimal_f16_width));
    try std.testing.expect(std.math.isPowerOfTwo(caps.optimal_f32_width));
    try std.testing.expect(std.math.isPowerOfTwo(caps.optimal_i8_width));

    // f16 width should be 2× f32 width (same total bits)
    try std.testing.expectEqual(caps.optimal_f32_width * 2, caps.optimal_f16_width);

    // i8 width should be 4× f32 width (same total bits)
    try std.testing.expectEqual(caps.optimal_f32_width * 4, caps.optimal_i8_width);
}
215+
216+
test "vector types are correctly sized" {
    // Coercing a vector to an array only compiles when lane count and element
    // type match, so these declarations check the vector types themselves
    // rather than just re-reading `capabilities`.
    const f16_vec: VecF16 = @splat(@as(f16, 0.0));
    const f32_vec: VecF32 = @splat(@as(f32, 0.0));
    const i8_vec: VecI8 = @splat(@as(i8, 0));

    const f16_lanes: [capabilities.optimal_f16_width]f16 = f16_vec;
    const f32_lanes: [capabilities.optimal_f32_width]f32 = f32_vec;
    const i8_lanes: [capabilities.optimal_i8_width]i8 = i8_vec;

    // Minimum lane counts (64-bit fallback)
    try std.testing.expect(f16_lanes.len >= 4);
    try std.testing.expect(f32_lanes.len >= 2);
    try std.testing.expect(i8_lanes.len >= 8);

    // All three vector types must span the same number of bits
    try std.testing.expectEqual(f32_lanes.len * 32, f16_lanes.len * 16);
    try std.testing.expectEqual(f32_lanes.len * 32, i8_lanes.len * 8);
}
228+
229+
test "zero vectors" {
    // Check every lane individually: a summed check would also accept lanes
    // whose non-zero values cancel out. Expected value goes first so that
    // failure messages read correctly.
    {
        const lanes: [capabilities.optimal_f16_width]f16 = zeroVecF16();
        for (lanes) |lane| try std.testing.expectEqual(@as(f16, 0.0), lane);
    }

    {
        const lanes: [capabilities.optimal_f32_width]f32 = zeroVecF32();
        for (lanes) |lane| try std.testing.expectEqual(@as(f32, 0.0), lane);
    }

    {
        const lanes: [capabilities.optimal_i8_width]i8 = zeroVecI8();
        for (lanes) |lane| try std.testing.expectEqual(@as(i8, 0), lane);
    }
}
258+
259+
test "simd info string is valid" {
    // Every branch of simdInfoString returns a non-empty label.
    try std.testing.expect(simdInfoString().len != 0);
}
263+
264+
test "supports min width" {
    const width = capabilities.optimal_f32_width;

    // 2 is the smallest f32 width any detection branch selects. The previous
    // unconditional `supportsMinWidth(4)` failed on 64-bit-fallback targets.
    try std.testing.expect(supportsMinWidth(2));

    // Boundary: exactly the detected width is supported, one more is not
    try std.testing.expect(supportsMinWidth(width));
    try std.testing.expect(!supportsMinWidth(width + 1));

    // Fixed thresholds must agree with the detected width
    try std.testing.expectEqual(width >= 4, supportsMinWidth(4));
    try std.testing.expectEqual(width >= 8, supportsMinWidth(8));
}
271+
272+
test "expected speedup is reasonable" {
    // The ratio against the 8-lane baseline must lie within [0.25, 4.0].
    const ratio = expectedSpeedupVsBaseline();

    try std.testing.expect(ratio >= 0.25);
    try std.testing.expect(ratio <= 4.0);
}
278+
279+
280+
test "x86_64 avx2 detection" {
    if (builtin.cpu.arch == .x86_64) {
        // On x86_64, should have at least SSE2
        try std.testing.expect(capabilities.has_sse2 or capabilities.optimal_f32_width >= 4);

        // The widths must match the detected tier (the test previously never
        // looked at `has_avx2` despite its name).
        if (capabilities.has_avx2) {
            // 256-bit vectors
            try std.testing.expectEqual(@as(usize, 16), capabilities.optimal_f16_width);
            try std.testing.expectEqual(@as(usize, 8), capabilities.optimal_f32_width);
            try std.testing.expectEqual(@as(usize, 32), capabilities.optimal_i8_width);
        } else if (capabilities.has_sse2) {
            // 128-bit vectors
            try std.testing.expectEqual(@as(usize, 8), capabilities.optimal_f16_width);
            try std.testing.expectEqual(@as(usize, 4), capabilities.optimal_f32_width);
            try std.testing.expectEqual(@as(usize, 16), capabilities.optimal_i8_width);
        }
    }
}
286+
287+
test "aarch64 neon detection" {
    // Cover both endiannesses: the detector handles `.aarch64` and
    // `.aarch64_be` in the same branch, but only `.aarch64` was tested.
    switch (builtin.cpu.arch) {
        .aarch64, .aarch64_be => {
            // On ARM64, should have NEON with 128-bit lane counts
            try std.testing.expect(capabilities.has_neon);
            try std.testing.expectEqual(@as(usize, 8), capabilities.optimal_f16_width);
            try std.testing.expectEqual(@as(usize, 4), capabilities.optimal_f32_width);
            try std.testing.expectEqual(@as(usize, 16), capabilities.optimal_i8_width);
        },
        else => {},
    }
}
294+
295+
test "vector types support common operations" {
    // Element-wise addition on each adaptive vector type. All lanes of the
    // result are folded into one scalar and compared with lanes × per-lane sum.

    // f16: 1.5 + 2.5 = 4.0 per lane
    {
        const lhs: VecF16 = @splat(1.5);
        const rhs: VecF16 = @splat(2.5);
        const lanes: [capabilities.optimal_f16_width]f16 = lhs + rhs;

        var total: f64 = 0;
        for (lanes) |lane| total += @as(f64, lane);

        const want = 4.0 * @as(f64, @floatFromInt(capabilities.optimal_f16_width));
        try std.testing.expectApproxEqAbs(want, total, 0.1);
    }

    // f32: 1.5 + 2.5 = 4.0 per lane
    {
        const lhs: VecF32 = @splat(1.5);
        const rhs: VecF32 = @splat(2.5);
        const lanes: [capabilities.optimal_f32_width]f32 = lhs + rhs;

        var total: f64 = 0;
        for (lanes) |lane| total += @as(f64, lane);

        const want = 4.0 * @as(f64, @floatFromInt(capabilities.optimal_f32_width));
        try std.testing.expectApproxEqAbs(want, total, 0.1);
    }

    // i8: 1 + 2 = 3 per lane
    {
        const lhs: VecI8 = @splat(@as(i8, 1));
        const rhs: VecI8 = @splat(@as(i8, 2));
        const lanes: [capabilities.optimal_i8_width]i8 = lhs + rhs;

        var total: i64 = 0;
        for (lanes) |lane| total += lane;

        const want: i64 = 3 * @as(i64, @intCast(capabilities.optimal_i8_width));
        try std.testing.expectEqual(want, total);
    }
}
338+
339+
// φ² + 1/φ² = 3 | TRINITY

0 commit comments

Comments
 (0)