44//! The dispatch! macro's LazyLock tier check ensures these are only called
55//! on AVX-512 CPUs.
66
7- #![ allow( missing_docs, clippy:: missing_safety_doc) ]
87//!
98//! BLAS-1 and element-wise functions use `F32x16`/`F64x8` from `crate::simd`.
109//! GEMM microkernels retain raw intrinsics for masked stores and broadcast patterns.
@@ -23,6 +22,8 @@ use crate::simd::{F32x16, F64x8};
2322// ═══════════════════════════════════════════════════════════════════
2423
2524/// Dot product: sum(x[i] * y[i]) using 4x-unrolled FMA.
25+ /// # Safety
26+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
2627#[ cfg( target_arch = "x86_64" ) ]
2728#[ target_feature( enable = "avx512f" ) ]
2829pub fn dot_f32 ( x : & [ f32 ] , y : & [ f32 ] ) -> f32 {
@@ -54,6 +55,8 @@ pub fn dot_f32(x: &[f32], y: &[f32]) -> f32 {
5455}
5556
5657/// Dot product f64: 4x-unrolled FMA (8 doubles each).
58+ /// # Safety
59+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
5760#[ cfg( target_arch = "x86_64" ) ]
5861#[ target_feature( enable = "avx512f" ) ]
5962pub fn dot_f64 ( x : & [ f64 ] , y : & [ f64 ] ) -> f64 {
@@ -85,6 +88,8 @@ pub fn dot_f64(x: &[f64], y: &[f64]) -> f64 {
8588}
8689
8790/// AXPY: y = alpha * x + y (f32, 16-wide FMA).
91+ /// # Safety
92+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
8893#[ cfg( target_arch = "x86_64" ) ]
8994#[ target_feature( enable = "avx512f" ) ]
9095pub fn axpy_f32 ( alpha : f32 , x : & [ f32 ] , y : & mut [ f32 ] ) {
@@ -104,6 +109,8 @@ pub fn axpy_f32(alpha: f32, x: &[f32], y: &mut [f32]) {
104109}
105110
106111/// AXPY: y = alpha * x + y (f64, 8-wide FMA).
112+ /// # Safety
113+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
107114#[ cfg( target_arch = "x86_64" ) ]
108115#[ target_feature( enable = "avx512f" ) ]
109116pub fn axpy_f64 ( alpha : f64 , x : & [ f64 ] , y : & mut [ f64 ] ) {
@@ -123,6 +130,8 @@ pub fn axpy_f64(alpha: f64, x: &[f64], y: &mut [f64]) {
123130}
124131
125132/// Scale: x = alpha * x (f32, 16-wide).
133+ /// # Safety
134+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
126135#[ cfg( target_arch = "x86_64" ) ]
127136#[ target_feature( enable = "avx512f" ) ]
128137pub fn scal_f32 ( alpha : f32 , x : & mut [ f32 ] ) {
@@ -140,6 +149,8 @@ pub fn scal_f32(alpha: f32, x: &mut [f32]) {
140149}
141150
142151/// Scale: x = alpha * x (f64, 8-wide).
152+ /// # Safety
153+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
143154#[ cfg( target_arch = "x86_64" ) ]
144155#[ target_feature( enable = "avx512f" ) ]
145156pub fn scal_f64 ( alpha : f64 , x : & mut [ f64 ] ) {
@@ -157,6 +168,8 @@ pub fn scal_f64(alpha: f64, x: &mut [f64]) {
157168}
158169
159170/// L1 norm: sum(|x[i]|) (f32, 16-wide).
171+ /// # Safety
172+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
160173#[ cfg( target_arch = "x86_64" ) ]
161174#[ target_feature( enable = "avx512f" ) ]
162175pub fn asum_f32 ( x : & [ f32 ] ) -> f32 {
@@ -172,6 +185,8 @@ pub fn asum_f32(x: &[f32]) -> f32 {
172185}
173186
174187/// L1 norm: sum(|x[i]|) (f64, 8-wide).
188+ /// # Safety
189+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
175190#[ cfg( target_arch = "x86_64" ) ]
176191#[ target_feature( enable = "avx512f" ) ]
177192pub fn asum_f64 ( x : & [ f64 ] ) -> f64 {
@@ -187,6 +202,8 @@ pub fn asum_f64(x: &[f64]) -> f64 {
187202}
188203
189204/// L2 norm: sqrt(sum(x[i]^2)) (f32, 16-wide FMA).
205+ /// # Safety
206+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
190207#[ cfg( target_arch = "x86_64" ) ]
191208#[ target_feature( enable = "avx512f" ) ]
192209pub fn nrm2_f32 ( x : & [ f32 ] ) -> f32 {
@@ -207,6 +224,8 @@ pub fn nrm2_f32(x: &[f32]) -> f32 {
207224}
208225
209226/// L2 norm: sqrt(sum(x[i]^2)) (f64, 8-wide FMA).
227+ /// # Safety
228+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
210229#[ cfg( target_arch = "x86_64" ) ]
211230#[ target_feature( enable = "avx512f" ) ]
212231pub fn nrm2_f64 ( x : & [ f64 ] ) -> f64 {
@@ -227,6 +246,8 @@ pub fn nrm2_f64(x: &[f64]) -> f64 {
227246}
228247
229248/// Index of max absolute value (f32). Scalar — no AVX-512 specialization.
249+ /// # Safety
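250+ /// Although documented as scalar, this function is compiled with `#[target_feature(enable = "avx512f")]`, so the caller must still ensure AVX-512F is available (`simd_caps().avx512f`).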
230251#[ cfg( target_arch = "x86_64" ) ]
231252#[ target_feature( enable = "avx512f" ) ]
232253pub fn iamax_f32 ( x : & [ f32 ] ) -> ( usize , f32 ) {
@@ -241,6 +262,8 @@ pub fn iamax_f32(x: &[f32]) -> (usize, f32) {
241262}
242263
243264/// Index of max absolute value (f64). Scalar — no AVX-512 specialization.
265+ /// # Safety
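266+ /// Although documented as scalar, this function is compiled with `#[target_feature(enable = "avx512f")]`, so the caller must still ensure AVX-512F is available (`simd_caps().avx512f`).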
244267#[ cfg( target_arch = "x86_64" ) ]
245268#[ target_feature( enable = "avx512f" ) ]
246269pub fn iamax_f64 ( x : & [ f64 ] ) -> ( usize , f64 ) {
@@ -258,37 +281,53 @@ pub fn iamax_f64(x: &[f64]) -> (usize, f64) {
258281// Element-wise f32 — 8 functions (16-wide, compat types)
259282// ═══════════════════════════════════════════════════════════════════
260283
284+ /// Elementwise `out[i] = a[i] + scalar` (AVX-512 F32x16 kernel).
285+ /// # Safety
286+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
261287#[ cfg( target_arch = "x86_64" ) ]
262- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
263288#[ target_feature( enable = "avx512f" ) ]
264289pub fn add_f32_scalar ( a : & [ f32 ] , scalar : f32 ) -> Vec < f32 > { ew_f32_s ( a, scalar, EwOp :: Add ) }
290+ /// Elementwise `out[i] = a[i] - scalar`.
291+ /// # Safety
292+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
265293#[ cfg( target_arch = "x86_64" ) ]
266- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
267294#[ target_feature( enable = "avx512f" ) ]
268295pub fn sub_f32_scalar ( a : & [ f32 ] , scalar : f32 ) -> Vec < f32 > { ew_f32_s ( a, scalar, EwOp :: Sub ) }
296+ /// Elementwise `out[i] = a[i] * scalar`.
297+ /// # Safety
298+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
269299#[ cfg( target_arch = "x86_64" ) ]
270- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
271300#[ target_feature( enable = "avx512f" ) ]
272301pub fn mul_f32_scalar ( a : & [ f32 ] , scalar : f32 ) -> Vec < f32 > { ew_f32_s ( a, scalar, EwOp :: Mul ) }
302+ /// Elementwise `out[i] = a[i] / scalar`.
303+ /// # Safety
304+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
273305#[ cfg( target_arch = "x86_64" ) ]
274- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
275306#[ target_feature( enable = "avx512f" ) ]
276307pub fn div_f32_scalar ( a : & [ f32 ] , scalar : f32 ) -> Vec < f32 > { ew_f32_s ( a, scalar, EwOp :: Div ) }
277308
309+ /// Elementwise `out[i] = a[i] + b[i]` (AVX-512 F32x16 kernel).
310+ /// # Safety
311+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
278312#[ cfg( target_arch = "x86_64" ) ]
279- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
280313#[ target_feature( enable = "avx512f" ) ]
281314pub fn add_f32_vec ( a : & [ f32 ] , b : & [ f32 ] ) -> Vec < f32 > { ew_f32_v ( a, b, EwOp :: Add ) }
315+ /// Elementwise `out[i] = a[i] - b[i]`.
316+ /// # Safety
317+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
282318#[ cfg( target_arch = "x86_64" ) ]
283- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
284319#[ target_feature( enable = "avx512f" ) ]
285320pub fn sub_f32_vec ( a : & [ f32 ] , b : & [ f32 ] ) -> Vec < f32 > { ew_f32_v ( a, b, EwOp :: Sub ) }
321+ /// Elementwise `out[i] = a[i] * b[i]`.
322+ /// # Safety
323+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
286324#[ cfg( target_arch = "x86_64" ) ]
287- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
288325#[ target_feature( enable = "avx512f" ) ]
289326pub fn mul_f32_vec ( a : & [ f32 ] , b : & [ f32 ] ) -> Vec < f32 > { ew_f32_v ( a, b, EwOp :: Mul ) }
327+ /// Elementwise `out[i] = a[i] / b[i]`.
328+ /// # Safety
329+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
290330#[ cfg( target_arch = "x86_64" ) ]
291- #[ allow( missing_docs, clippy:: missing_safety_doc) ]
292331#[ target_feature( enable = "avx512f" ) ]
293332pub fn div_f32_vec ( a : & [ f32 ] , b : & [ f32 ] ) -> Vec < f32 > { ew_f32_v ( a, b, EwOp :: Div ) }
294333
@@ -514,6 +553,8 @@ fn pack_b_f32(b: &[f32], ldb: usize, kc: usize, nc: usize, k_start: usize, j_sta
514553/// AVX-512 microkernel: C[MR×NR] += A_packed[MR×kc] * B_packed[kc×NR]
515554///
516555/// Uses raw intrinsics for broadcast-FMA and masked store patterns.
556+ /// # Safety
557+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
517558#[ cfg( target_arch = "x86_64" ) ]
518559#[ target_feature( enable = "avx512f" ) ]
519560unsafe fn sgemm_ukernel_6x16 (
@@ -569,6 +610,8 @@ unsafe fn sgemm_ukernel_6x16(
569610}
570611
571612/// Goto BLAS style blocked SGEMM with packing and AVX-512 microkernel.
613+ /// # Safety
614+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
572615#[ cfg( target_arch = "x86_64" ) ]
573616#[ target_feature( enable = "avx512f" ) ]
574617pub fn sgemm_blocked (
@@ -677,6 +720,8 @@ fn pack_b_f64(b: &[f64], ldb: usize, kc: usize, nc: usize, k_start: usize, j_sta
677720}
678721
679722/// AVX-512 microkernel: C[6×8] += A_packed[6×kc] * B_packed[kc×8] (f64)
723+ /// # Safety
724+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
680725#[ cfg( target_arch = "x86_64" ) ]
681726#[ target_feature( enable = "avx512f" ) ]
682727unsafe fn dgemm_ukernel_6x8 (
@@ -732,6 +777,8 @@ unsafe fn dgemm_ukernel_6x8(
732777}
733778
734779/// Goto BLAS style blocked DGEMM with packing and AVX-512 microkernel.
780+ /// # Safety
781+ /// Caller must ensure AVX-512F is available (`simd_caps().avx512f`).
735782#[ cfg( target_arch = "x86_64" ) ]
736783#[ target_feature( enable = "avx512f" ) ]
737784pub fn dgemm_blocked (
@@ -845,6 +892,8 @@ pub fn popcount(a: &[u8]) -> u64 {
845892}
846893
847894/// Int8 dot product (scalar — no AVX-512 VNNI specialization yet).
895+ /// # Safety
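896+ /// Although documented as scalar, this function is compiled with `#[target_feature(enable = "avx512f")]`, so the caller must still ensure AVX-512F is available (`simd_caps().avx512f`).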
848897#[ cfg( target_arch = "x86_64" ) ]
849898#[ target_feature( enable = "avx512f" ) ]
850899pub fn dot_i8 ( a : & [ u8 ] , b : & [ u8 ] ) -> i64 {
0 commit comments