feat: burn-adaworld crate skeleton — burn Backend powered by ndarray SIMD

claude · claude · commit 7b7dcb45da8f · 2026-03-29T07:54:44.000Z
New crate: crates/burn-adaworld/ Depends on upstream burn-backend + burn-tensor (0.21.0-pre.2) + adaworldapi/ndarray (path) for SIMD-accelerated tensor ops. Architecture: Tensor<AdaWorld, D> → Backend trait → crate::simd F32x16 with optional AttentionTable O(1) compiled attention. Compiles clean. Backend trait impl is 5-session plan. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
diff --git a/crates/burn-adaworld/Cargo.toml b/crates/burn-adaworld/Cargo.toml
@@ -0,0 +1,42 @@
+[package]
+name = "burn-adaworld"
+version = "0.1.0"
+edition = "2021"
+license = "MIT OR Apache-2.0"
+publish = false
+description = """
+Burn backend powered by adaworldapi/ndarray with:
+- crate::simd F32x16 via LazyLock dispatch (AVX-512 → AVX2 → scalar)
+- bgz-tensor AttentionTable for O(1) compiled attention (optional)
+- CAM-PQ product quantization for 170× compression (optional)
+- SimilarityTable as BF16-precision cosine replacement (256 levels, O(1))
+
+The consumer sees burn's Tensor<B, D> API. Behind it:
+matmul() → checks for compiled AttentionTable → falls through to BLAS.
+All SIMD via crate::simd only. Consumer never sees hardware.
+"""
+
+[dependencies]
+# Upstream burn — Backend trait + tensor API
+burn-backend = "0.21.0-pre.2"
+burn-tensor = "0.21.0-pre.2"
+
+# Our ndarray with SIMD + HPC extensions
+ndarray = { path = "../..", features = ["std"] }
+
+# Standard deps
+serde = { version = "1", features = ["derive"] }
+half = { version = "2", features = ["num-traits"] }
+num-traits = "0.2"
+rand = "0.8"
+
+[dev-dependencies]
+burn-tensor-testgen = "0.21.0-pre.2"
+
+[features]
+default = ["std"]
+std = []
+# Enable bgz-tensor AttentionTable path for compiled attention
+attention-table = []
+# Enable multi-threaded execution via rayon
+multi-threads = ["ndarray/rayon"]
diff --git a/crates/burn-adaworld/src/backend.rs b/crates/burn-adaworld/src/backend.rs
@@ -0,0 +1,60 @@
+//! AdaWorld backend: implements burn's Backend trait.
+//!
+//! Delegates all tensor operations to ndarray + crate::simd.
+//! This is the entry point — every burn model compiled with `Backend = AdaWorld`
+//! runs on our SIMD dispatch with optional AttentionTable compiled attention.
+//!
+//! # Implementation Status
+//!
+//! The Backend trait requires ~200+ methods across 7 op traits.
+//! Implementation strategy: core ops first (what Whisper/Llama need),
+//! then expand coverage guided by burn-backend-tests.
+//!
+//! Required traits:
+//!   FloatTensorOps  — 84 required methods (+ ~36 with defaults)
+//!   IntTensorOps    — ~50 required methods
+//!   BoolTensorOps   — ~30 required methods
+//!   ModuleOps       — conv, pool, embedding, etc.
+//!   ActivationOps   — relu, sigmoid, gelu (most have defaults)
+//!   QTensorOps      — quantized tensor ops
+//!   TransactionOps  — batch execution
+//!
+//! # Architecture
+//!
+//! ```text
+//! burn::Tensor<AdaWorld, D>
+//!   ↓ (burn dispatches via Backend trait)
+//! AdaWorld::float_matmul(lhs, rhs)
+//!   ↓ (check for compiled attention table)
+//!   ├── AttentionTable[q_idx][k_idx]  → O(1)  (if compiled)
+//!   └── ndarray general_mat_mul()     → O(d)  (fallback to BLAS)
+//!         ↓ (ndarray delegates to BLAS or matrixmultiply)
+//!         crate::simd::F32x16         → AVX-512 / AVX2 via LazyLock dispatch
+//! ```
+
+use crate::tensor::AdaTensor;
+
+/// The AdaWorld backend.
+///
+/// CPU-only. Intended to run on adaworldapi/ndarray with crate::simd SIMD dispatch (no `Backend` impl is attached yet).
+/// Feature `attention-table` is meant to enable the bgz-tensor compiled attention path.
+#[derive(Clone, Default, Debug)]
+pub struct AdaWorld;
+
+/// CPU device (unit struct — there is only one CPU device, so it carries no state).
+#[derive(Clone, Default, Debug, PartialEq, Eq, Hash)]
+pub struct CpuDevice;
+
+// NOTE: Full Backend trait implementation requires ~200+ methods across 7 traits.
+// This is tracked as a multi-session effort:
+//
+// Session 1 (current): Crate skeleton + architecture + tensor primitive
+// Session 2: FloatTensorOps core (from_data, matmul, add, mul, exp, reshape, transpose)
+// Session 3: IntTensorOps + BoolTensorOps
+// Session 4: ModuleOps (conv, embedding) + ActivationOps
+// Session 5: QTensorOps + TransactionOps + burn-backend-tests
+//
+// The implementation follows burn-ndarray's pattern but uses:
+//   - crate::simd::F32x16 for element-wise ops (not macerator)
+//   - LazyLock<SimdDispatch> for runtime tier selection (not compile-time features)
+//   - Optional AttentionTable for compiled attention (unique to this backend)
diff --git a/crates/burn-adaworld/src/element.rs b/crates/burn-adaworld/src/element.rs
@@ -0,0 +1,47 @@
+//! Element types supported by the AdaWorld backend.
+//!
+//! Maps burn's element traits to ndarray-compatible types.
+
+use burn_backend::Element;
+use burn_tensor::{DType, ElementConversion};
+use num_traits::ToPrimitive;
+
+/// Float element usable with our ndarray backend, convertible to and from `f32`.
+pub trait AdaElement: Element + ndarray::LinalgScalar + ndarray::ScalarOperand + Default + 'static {
+    fn to_f32(self) -> f32;
+    fn from_f32(val: f32) -> Self;
+}
+
+impl AdaElement for f32 {
+    #[inline(always)]
+    fn to_f32(self) -> f32 { self } // already `f32`: identity
+    #[inline(always)]
+    fn from_f32(val: f32) -> Self { val } // already `f32`: identity
+}
+
+impl AdaElement for f64 {
+    #[inline(always)]
+    fn to_f32(self) -> f32 { self as f32 } // narrowing cast: rounds to the nearest `f32`
+    #[inline(always)]
+    fn from_f32(val: f32) -> Self { f64::from(val) } // lossless widening
+}
+
+/// Integer element usable with our ndarray backend, convertible to and from `i64`.
+pub trait AdaIntElement: Element + ndarray::LinalgScalar + ndarray::ScalarOperand + Default + 'static {
+    fn to_i64(self) -> i64;
+    fn from_i64(val: i64) -> Self;
+}
+
+impl AdaIntElement for i32 {
+    #[inline(always)]
+    fn to_i64(self) -> i64 { i64::from(self) } // lossless widening
+    #[inline(always)]
+    fn from_i64(val: i64) -> Self { val as i32 } // truncating cast: keeps only the low 32 bits
+}
+
+impl AdaIntElement for i64 {
+    #[inline(always)]
+    fn to_i64(self) -> i64 { self } // already `i64`: identity
+    #[inline(always)]
+    fn from_i64(val: i64) -> Self { val } // already `i64`: identity
+}
diff --git a/crates/burn-adaworld/src/lib.rs b/crates/burn-adaworld/src/lib.rs
@@ -0,0 +1,24 @@
+//! burn-adaworld: Burn backend powered by adaworldapi/ndarray SIMD.
+//!
+//! Skeleton for an implementation of burn's `Backend` trait (the trait impl itself is not written yet), built on:
+//! - `crate::simd::F32x16` via `LazyLock<SimdDispatch>` (AVX-512 → AVX2 → scalar)
+//! - Optional `AttentionTable` for O(1) compiled attention (bgz-tensor)
+//! - `SimilarityTable` as BF16-precision cosine replacement (256 levels)
+//!
+//! # Usage
+//!
+//! ```ignore
+//! use burn_adaworld::AdaWorld;
+//! use burn_tensor::Tensor;
+//!
+//! let a = Tensor::<AdaWorld, 2>::ones([3, 4], &Default::default());
+//! let b = Tensor::<AdaWorld, 2>::ones([4, 5], &Default::default());
+//! let c = a.matmul(b); // Uses crate::simd BLAS, or AttentionTable if compiled
+//! ```
+
+pub mod backend;
+pub mod element;
+pub mod tensor;
+pub mod ops;
+
+pub use backend::AdaWorld;
diff --git a/crates/burn-adaworld/src/ops.rs b/crates/burn-adaworld/src/ops.rs
@@ -0,0 +1,8 @@
+//! Tensor operations for the AdaWorld backend.
+//!
+//! Implements burn's FloatTensorOps, IntTensorOps, BoolTensorOps by delegating
+//! to ndarray operations accelerated by crate::simd.
+
+pub mod float_ops;
+pub mod int_ops;
+pub mod bool_ops;
diff --git a/crates/burn-adaworld/src/ops/bool_ops.rs b/crates/burn-adaworld/src/ops/bool_ops.rs
@@ -0,0 +1,2 @@
+//! BoolTensorOps for AdaWorld backend.
+//! Placeholder — to be implemented in session 3.
diff --git a/crates/burn-adaworld/src/ops/float_ops.rs b/crates/burn-adaworld/src/ops/float_ops.rs
@@ -0,0 +1,23 @@
+//! FloatTensorOps for AdaWorld backend.
+//!
+//! 84 required methods + ~36 with defaults = ~120 total.
+//! Delegates to ndarray operations with crate::simd acceleration.
+//!
+//! # Implementation Priority
+//!
+//! P0 (Whisper minimal): from_data, into_data, matmul, add, mul, div, exp,
+//!     reshape, transpose, swap_dims, device, to_device, shape, empty, zeros, ones
+//!
+//! P1 (full inference): softmax, log, sqrt, neg, recip, gather, select, slice,
+//!     mask_where, cat, sum, mean, max, min, argmax, argmin, equal
+//!
+//! P2 (training): backward-compatible with burn-autodiff (future)
+
+// Implementation will follow burn-ndarray's pattern:
+// https://github.com/tracel-ai/burn/tree/main/crates/burn-ndarray/src/ops
+//
+// Key differences from burn-ndarray:
+//   1. Uses crate::simd::F32x16 instead of macerator
+//   2. Uses LazyLock<SimdDispatch> for tier selection
+//   3. Optional AttentionTable for compiled matmul
+//   4. SimilarityTable for BF16-equivalent scoring
diff --git a/crates/burn-adaworld/src/ops/int_ops.rs b/crates/burn-adaworld/src/ops/int_ops.rs
@@ -0,0 +1,2 @@
+//! IntTensorOps for AdaWorld backend.
+//! Placeholder — to be implemented in session 3.
diff --git a/crates/burn-adaworld/src/tensor.rs b/crates/burn-adaworld/src/tensor.rs
@@ -0,0 +1,70 @@
+//! Tensor primitive: wraps ndarray::ArcArray for burn's Backend trait.
+
+use ndarray::{ArcArray, IxDyn};
+use std::sync::Arc;
+
+/// The tensor primitive for the AdaWorld backend.
+///
+/// Wraps ndarray's `ArcArray<E, IxDyn>`: reference-counted shared storage with
+/// copy-on-write semantics, so clones share one buffer until one of them is mutated.
+#[derive(Debug, Clone)]
+pub struct AdaTensor<E: Clone + 'static> {
+    /// The underlying shared array; its rank is dynamic (`IxDyn`).
+    pub array: ArcArray<E, IxDyn>,
+}
+
+impl<E: Clone + Default + 'static> AdaTensor<E> {
+    /// Take ownership of `array` and move it into shared (`ArcArray`) storage.
+    pub fn new(array: ndarray::Array<E, IxDyn>) -> Self {
+        Self::from_shared(array.into_shared())
+    }
+
+    /// Wrap an already-shared ndarray as-is (no data is copied).
+    pub fn from_shared(array: ArcArray<E, IxDyn>) -> Self {
+        Self { array }
+    }
+
+    /// Axis lengths, as reported by the underlying array.
+    pub fn shape(&self) -> &[usize] {
+        self.array.shape()
+    }
+
+    /// Total number of elements.
+    pub fn len(&self) -> usize {
+        self.array.len()
+    }
+
+    /// Number of dimensions.
+    pub fn ndim(&self) -> usize {
+        self.array.ndim()
+    }
+
+    /// Borrow the elements as one flat slice; `None` unless the array is contiguous and in standard order.
+    pub fn as_slice(&self) -> Option<&[E]> {
+        self.array.as_slice()
+    }
+
+    /// Create a tensor of the given shape with every element set to zero.
+    pub fn zeros(shape: &[usize]) -> Self
+    where
+        E: num_traits::Zero,
+    {
+        Self::new(ndarray::Array::zeros(IxDyn(shape)))
+    }
+
+    /// Create a tensor of the given shape with every element set to one.
+    pub fn ones(shape: &[usize]) -> Self
+    where
+        E: num_traits::One,
+    {
+        Self::new(ndarray::Array::ones(IxDyn(shape)))
+    }
+
+    /// Reshape to `shape` (row-major). Reuses the buffer when the current layout
+    /// allows it and copies the elements otherwise (e.g. after a transpose).
+    /// Panics if `shape` does not have the same number of elements.
+    pub fn reshape(self, shape: &[usize]) -> Self {
+        let reshaped = self.array.into_shape_clone(IxDyn(shape));
+        Self::from_shared(reshaped.expect("reshape: incompatible shape"))
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+//! BoolTensorOps for AdaWorld backend.`
	`2`	`+//! Placeholder — to be implemented in session 3.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+//! IntTensorOps for AdaWorld backend.`
	`2`	`+//! Placeholder — to be implemented in session 3.`