Skip to content

Commit 00b6ee5

Browse files
committed
feat(backend): re-export all slice-level ops through ndarray::backend::*
Consumers now reach every SIMD-dispatched op from one module:

    use ndarray::backend::{
        // BLAS L1
        dot_f32, axpy_f32, scal_f32, nrm2_f32, asum_f32,
        // GEMM (f32/f64/i8/bf16)
        gemm_f32, gemm_f64, gemm_i8, gemm_bf16,
        cblas_sgemm, cblas_dgemm, cblas_gemm_s8s8s32, cblas_gemm_bf16bf16f32,
        // Elementwise (f32 vec×vec / vec×scalar)
        add_f32_vec, sub_f32_vec, mul_f32_vec, div_f32_vec,
        add_f32_scalar, sub_f32_scalar, mul_f32_scalar, div_f32_scalar,
        // Integer slice ops
        add_i8, sub_i8, add_i16, dot_i8, dot_i16, min_i8, max_i8,
        // Half-precision slice ops
        add_bf16_inplace, mul_bf16_inplace, add_f16_inplace, mul_f16_inplace,
        cast_bf16_to_f32_batch, cast_f16_to_f32_batch,
        cast_f32_to_bf16_batch, cast_f32_to_f16_batch,
        // Reductions
        sum_f32, sum_f64, mean_f32, mean_f64,
        max_f32, min_f32, argmax_f32, argmin_f32,
    };

Previously these ops were scattered across backend/kernels_avx512.rs (pub(crate)), simd_int_ops.rs, simd_half.rs, and hpc/reductions.rs — none reachable from a single import path. Now they are all unified.

https://claude.ai/code/session_01NYGrxVopyszZYgLBxe4hgj
1 parent 2cd3d8b commit 00b6ee5

1 file changed

Lines changed: 44 additions & 0 deletions

File tree

src/backend/mod.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,3 +278,47 @@ pub fn cblas_gemm_bf16bf16f32(
278278
) {
279279
gemm_bf16(a, b, c, m, n, k)
280280
}
281+
282+
// ─── Elementwise ops (SIMD-dispatched) ───────────────────────────
283+
//
284+
// Slice-level add/sub/mul/div for f32, dispatched through the AVX-512
285+
// kernel with AVX2/scalar fallback. Both vec×vec and vec×scalar forms.
// iamax_f32 and iamax_f64 are re-exported from the same module alongside them.
286+
//
287+
// Usage (x86_64 targets only; the re-export below is gated on target_arch):
288+
// use ndarray::backend::{add_f32_vec, mul_f32_scalar};
289+
// let c = add_f32_vec(&a, &b); // c[i] = a[i] + b[i]
290+
// let d = mul_f32_scalar(&a, 2.0); // d[i] = a[i] * 2.0
291+
292+
#[cfg(target_arch = "x86_64")]
293+
pub use kernels_avx512::{
294+
add_f32_vec, sub_f32_vec, mul_f32_vec, div_f32_vec,
295+
add_f32_scalar, sub_f32_scalar, mul_f32_scalar, div_f32_scalar,
296+
iamax_f32, iamax_f64,
297+
};
298+
299+
// ─── Slice-level ops by dtype (unified re-exports) ──────────────
300+
//
301+
// All the SIMD-dispatched slice ops in one place.
302+
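// Integer: simd_int_ops. Half-precision: simd_half. Float: kernels_avx512 + hpc::reductions.
// The simd_int_ops, simd_half and hpc::reductions re-exports require the "std" feature.
// hpc::reductions::nrm2_f32 is exposed as nrm2_f32_simd (presumably to avoid
// clashing with the backend's own nrm2_f32 — TODO confirm).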
303+
304+
#[cfg(feature = "std")]
305+
pub use crate::simd_int_ops::{
306+
add_i8, sub_i8, add_i16,
307+
dot_i8, dot_i16,
308+
min_i8, max_i8,
309+
};
310+
311+
#[cfg(feature = "std")]
312+
pub use crate::simd_half::{
313+
add_bf16_inplace, mul_bf16_inplace,
314+
add_f16_inplace, mul_f16_inplace,
315+
cast_bf16_to_f32_batch, cast_f16_to_f32_batch,
316+
cast_f32_to_bf16_batch, cast_f32_to_f16_batch,
317+
};
318+
319+
#[cfg(feature = "std")]
320+
pub use crate::hpc::reductions::{
321+
sum_f32, sum_f64, mean_f32, mean_f64,
322+
max_f32, min_f32, argmax_f32, argmin_f32,
323+
nrm2_f32 as nrm2_f32_simd,
324+
};

0 commit comments

Comments
 (0)