AdaWorldAPI
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 367 additions & 210 deletions
diff --git a/crates/burn-adaworld/Cargo.toml b/crates/burn-adaworld/Cargo.toml
Lines changed: 61 additions & 28 deletions
diff --git a/crates/burn-adaworld/src/backend.rs b/crates/burn-adaworld/src/backend.rs
Lines changed: 221 additions & 59 deletions
@@ -1,42 +1,75 @@
 [package]
 name = "burn-adaworld"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 publish = false
 description = """
-Burn backend powered by adaworldapi/ndarray with:
-- crate::simd F32x16 via LazyLock dispatch (AVX-512 → AVX2 → scalar)
-- bgz-tensor AttentionTable for O(1) compiled attention (optional)
-- CAM-PQ product quantization for 170× compression (optional)
-- SimilarityTable as BF16-precision cosine replacement (256 levels, O(1))
-
-The consumer sees burn's Tensor<B, D> API. Behind it:
-matmul() → checks for compiled AttentionTable → falls through to BLAS.
-All SIMD via crate::simd only. Consumer never sees hardware.
+Burn ndarray backend forked into adaworldapi/ndarray for SIMD augmentation.
+Source: upstream burn-ndarray (tracel-ai/burn, v0.21.0-pre.2).
+Goal: replace macerator SIMD with crate::simd F32x16 + LazyLock dispatch,
+add bgz-tensor AttentionTable compiled attention path.
 """
 
+[features]
+default = ["std", "simd", "multi-threads"]
+multi-threads = ["rayon", "ndarray/rayon", "matrixmultiply/threading"]
+simd = ["macerator", "bytemuck", "seq-macro", "itertools"]
+std = [
+    "burn-autodiff",
+    "burn-std/std",
+    "burn-backend/std",
+    "burn-ir/std",
+    "ndarray/std",
+    "matrixmultiply/std",
+    "rand/std",
+    "rand/std_rng",
+    "num-traits/std",
+    "macerator/std",
+]
+blas-openblas = ["blas-src/openblas", "ndarray/blas", "openblas-src"]
+blas-openblas-system = ["blas-src/openblas", "ndarray/blas", "openblas-src/system"]
+blas-netlib = ["blas-src/netlib", "ndarray/blas"]
+export_tests = []
+
 [dependencies]
-# Upstream burn — Backend trait + tensor API
-burn-backend = "0.21.0-pre.2"
-burn-tensor = "0.21.0-pre.2"
+# Upstream burn crates (tracked from git main with no rev/tag pin — matches the source code we copied)
+burn-autodiff = { git = "https://github.com/tracel-ai/burn.git", default-features = false, optional = true }
+burn-std = { git = "https://github.com/tracel-ai/burn.git", default-features = false }
+burn-ir = { git = "https://github.com/tracel-ai/burn.git", default-features = false }
+burn-backend = { git = "https://github.com/tracel-ai/burn.git", default-features = false }
+
+# ndarray — uses our workspace root (adaworldapi/ndarray with SIMD + HPC)
+ndarray = { path = "../..", default-features = false }
+
+# Matrix multiply
+matrixmultiply = { version = "0.3", default-features = false }
+
+# Element traits
+num-traits = { version = "0.2", default-features = false }
+libm = "0.2"
+atomic_float = "1"
+const-random = "0.1"
+paste = "1"
 
-# Our ndarray with SIMD + HPC extensions
-ndarray = { path = "../..", features = ["std"] }
+# Random
+rand = { version = "0.10", default-features = false, features = ["std_rng"] }
 
-# Standard deps
+# Serialization
 serde = { version = "1", features = ["derive"] }
-half = { version = "2", features = ["num-traits"] }
-num-traits = "0.2"
-rand = "0.8"
 
-[dev-dependencies]
-burn-tensor-testgen = "0.21.0-pre.2"
+# SIMD (macerator — upstream burn's choice, will augment with crate::simd)
+macerator = { version = "0.3", default-features = false, optional = true }
+bytemuck = { version = "1", optional = true }
+seq-macro = { version = "0.3", optional = true }
+itertools = { version = "0.14", optional = true }
 
-[features]
-default = ["std"]
-std = []
-# Enable bgz-tensor AttentionTable path for compiled attention
-attention-table = []
-# Enable multi-threaded execution via rayon
-multi-threads = ["ndarray/rayon"]
+# Parallel
+rayon = { version = "1", optional = true }
+
+# BLAS (optional)
+blas-src = { version = "0.10", default-features = false, optional = true }
+openblas-src = { version = "0.10", optional = true }
+
+[dev-dependencies]
+bytes = "1"
@@ -1,60 +1,222 @@
-//! AdaWorld backend: implements burn's Backend trait.
-//!
-//! Delegates all tensor operations to ndarray + crate::simd.
-//! This is the entry point — every burn model compiled with `Backend = AdaWorld`
-//! runs on our SIMD dispatch with optional AttentionTable compiled attention.
-//!
-//! # Implementation Status
-//!
-//! The Backend trait requires ~200+ methods across 7 op traits.
-//! Implementation strategy: core ops first (what Whisper/Llama need),
-//! then expand coverage guided by burn-backend-tests.
-//!
-//! Required traits:
-//!   FloatTensorOps  — 84 required methods (+ ~36 with defaults)
-//!   IntTensorOps    — ~50 required methods
-//!   BoolTensorOps   — ~30 required methods
-//!   ModuleOps       — conv, pool, embedding, etc.
-//!   ActivationOps   — relu, sigmoid, gelu (most have defaults)
-//!   QTensorOps      — quantized tensor ops
-//!   TransactionOps  — batch execution
-//!
-//! # Architecture
-//!
-//! ```text
-//! burn::Tensor<AdaWorld, D>
-//!   ↓ (burn dispatches via Backend trait)
-//! AdaWorld::float_matmul(lhs, rhs)
-//!   ↓ (check for compiled attention table)
-//!   ├── AttentionTable[q_idx][k_idx]  → O(1)  (if compiled)
-//!   └── ndarray general_mat_mul()     → O(d)  (fallback to BLAS)
-//!         ↓ (ndarray delegates to BLAS or matrixmultiply)
-//!         crate::simd::F32x16         → AVX-512 / AVX2 via LazyLock dispatch
-//! ```
-
-use crate::tensor::AdaTensor;
-
-/// The AdaWorld backend.
+use crate::rand::NdArrayRng;
+use crate::{NdArrayQTensor, NdArrayTensor};
+use crate::{
+    SharedArray,
+    element::{FloatNdArrayElement, IntNdArrayElement, QuantElement},
+};
+use alloc::string::String;
+use burn_backend::quantization::{QuantLevel, QuantMode, QuantScheme, QuantStore, QuantValue};
+use burn_backend::tensor::{BoolTensor, FloatTensor, IntTensor, QuantizedTensor};
+use burn_backend::{Backend, DType, DeviceId, DeviceOps};
+use burn_ir::{BackendIr, HandleKind, TensorHandle};
+use burn_std::BoolStore;
+use burn_std::stub::Mutex;
+use core::marker::PhantomData;
+use rand::SeedableRng;
+
+pub(crate) static SEED: Mutex<Option<NdArrayRng>> = Mutex::new(None); // shared RNG slot: `None` until `Backend::seed` stores a seeded generator
+
+/// Device handle for the ndarray backend; `Cpu` is the only variant and the `Default`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+pub enum NdArrayDevice {
+    /// The CPU device (selected by `Default` via `#[default]`).
+    #[default]
+    Cpu,
+}
+
+impl DeviceOps for NdArrayDevice {} // empty impl body: nothing is defined or overridden here
+
+impl burn_backend::Device for NdArrayDevice { // id <-> device mapping for the single CPU device
+    fn from_id(_device_id: DeviceId) -> Self { // the id is ignored; every id yields `Cpu`
+        Self::Cpu
+    }
+
+    fn to_id(&self) -> DeviceId { // always reports type_id 0 and index_id 0
+        DeviceId {
+            type_id: 0,
+            index_id: 0,
+        }
+    }
+}
+
+/// Tensor backend that uses the [ndarray](ndarray) crate for executing tensor operations.
 ///
-/// CPU-only. Uses adaworldapi/ndarray with crate::simd SIMD dispatch.
-/// Feature `attention-table` enables bgz-tensor compiled attention path.
-#[derive(Clone, Default, Debug)]
-pub struct AdaWorld;
-
-/// CPU device (unit type — there's only one CPU).
-#[derive(Clone, Default, Debug, PartialEq, Eq, Hash)]
-pub struct CpuDevice;
-
-// NOTE: Full Backend trait implementation requires ~200+ methods across 7 traits.
-// This is tracked as a multi-session effort:
-//
-// Session 1 (current): Crate skeleton + architecture + tensor primitive
-// Session 2: FloatTensorOps core (from_data, matmul, add, mul, exp, reshape, transpose)
-// Session 3: IntTensorOps + BoolTensorOps
-// Session 4: ModuleOps (conv, embedding) + ActivationOps
-// Session 5: QTensorOps + TransactionOps + burn-backend-tests
-//
-// The implementation follows burn-ndarray's pattern but uses:
-//   - crate::simd::F32x16 for element-wise ops (not macerator)
-//   - LazyLock<SimdDispatch> for runtime tier selection (not compile-time features)
-//   - Optional AttentionTable for compiled attention (unique to this backend)
+/// This backend is compatible with CPUs and can be compiled for almost any platform, including
+/// `wasm`, `arm`, and `x86`.
+#[derive(Clone, Copy, Default, Debug)]
+pub struct NdArray<E = f32, I = i64, Q = i8>
+where
+    NdArrayTensor: From<SharedArray<E>>,
+    NdArrayTensor: From<SharedArray<I>>,
+{ // no runtime data: the type parameters are carried only as `PhantomData` markers
+    _e: PhantomData<E>, // float element type (used as `Backend::FloatElem`)
+    _i: PhantomData<I>, // int element type (used as `Backend::IntElem`)
+    _q: PhantomData<Q>, // quantized element type marker (bounded by `QuantElement` in the impls)
+}
+
+impl<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantElement> Backend for NdArray<E, I, Q>
+where
+    NdArrayTensor: From<SharedArray<E>>,
+    NdArrayTensor: From<SharedArray<I>>,
+{ // wires the ndarray tensor primitives into burn's `Backend` trait
+    type Device = NdArrayDevice;
+
+    type FloatTensorPrimitive = NdArrayTensor; // float, int and bool tensors all share `NdArrayTensor`
+    type FloatElem = E;
+
+    type IntTensorPrimitive = NdArrayTensor;
+    type IntElem = I;
+
+    type BoolTensorPrimitive = NdArrayTensor;
+    type BoolElem = bool;
+
+    type QuantizedTensorPrimitive = NdArrayQTensor;
+
+    fn ad_enabled(_device: &Self::Device) -> bool { // always false for this backend
+        false
+    }
+
+    fn name(_device: &Self::Device) -> String { // backend name is the fixed string "ndarray"
+        String::from("ndarray")
+    }
+
+    fn seed(_device: &Self::Device, seed: u64) { // replaces the global RNG with one seeded from `seed`
+        let rng = NdArrayRng::seed_from_u64(seed);
+        let mut seed = SEED.lock().unwrap(); // shadows the u64 `seed` with the lock guard; panics if `lock()` returns Err
+        *seed = Some(rng);
+    }
+
+    fn dtype_usage(_device: &Self::Device, dtype: DType) -> burn_backend::DTypeUsageSet { // `general()` for the listed dtypes, `empty()` for the rest
+        match dtype {
+            DType::F64
+            | DType::F32
+            | DType::Flex32
+            | DType::I64
+            | DType::I32
+            | DType::I16
+            | DType::I8
+            | DType::U64
+            | DType::U32
+            | DType::U16
+            | DType::U8
+            | DType::Bool(BoolStore::Native) => burn_backend::DTypeUsage::general(),
+            DType::F16 | DType::BF16 | DType::Bool(_) => burn_backend::DTypeUsageSet::empty(), // `Bool(_)` here catches every non-native bool store
+            DType::QFloat(scheme) => {
+                match scheme {
+                    QuantScheme {
+                        level: QuantLevel::Tensor | QuantLevel::Block(_),
+                        mode: QuantMode::Symmetric,
+                        #[cfg(not(feature = "export_tests"))]
+                            value: QuantValue::Q8F | QuantValue::Q8S,
+                        // With `export_tests` enabled, the 4-bit and 2-bit values are accepted as well,
+                        // as a reference for value equality; values are stored as i8 regardless.
+                        #[cfg(feature = "export_tests")]
+                            value:
+                            QuantValue::Q8F
+                            | QuantValue::Q8S
+                            | QuantValue::Q4F
+                            | QuantValue::Q4S
+                            | QuantValue::Q2F
+                            | QuantValue::Q2S,
+                        store: QuantStore::Native,
+                        ..
+                    } => burn_backend::DTypeUsage::general(),
+                    _scheme => burn_backend::DTypeUsageSet::empty(), // any other quantization scheme
+                }
+            }
+        }
+    }
+
+    fn device_count(_: u16) -> usize { // always 1; the argument is ignored
+        1
+    }
+}
+
+impl<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantElement> BackendIr for NdArray<E, I, Q>
+where
+    NdArrayTensor: From<SharedArray<E>>,
+    NdArrayTensor: From<SharedArray<I>>,
+{ // conversions between `HandleKind` handles and the typed tensor primitives
+    type Handle = HandleKind<Self>;
+
+    fn float_tensor(handle: TensorHandle<Self::Handle>) -> FloatTensor<Self> { // unwraps a float handle; panics on any other kind
+        match handle.handle {
+            HandleKind::Float(handle) => handle,
+            _ => panic!("Expected float handle, got {}", handle.handle.name()),
+        }
+    }
+
+    fn int_tensor(handle: TensorHandle<Self::Handle>) -> IntTensor<Self> { // unwraps an int handle; panics on any other kind
+        match handle.handle {
+            HandleKind::Int(handle) => handle,
+            _ => panic!("Expected int handle, got {}", handle.handle.name()),
+        }
+    }
+
+    fn bool_tensor(handle: TensorHandle<Self::Handle>) -> BoolTensor<Self> { // unwraps a bool handle; panics on any other kind
+        match handle.handle {
+            HandleKind::Bool(handle) => handle,
+            _ => panic!("Expected bool handle, got {}", handle.handle.name()),
+        }
+    }
+
+    fn quantized_tensor(handle: TensorHandle<Self::Handle>) -> QuantizedTensor<Self> { // unwraps a quantized handle; panics on any other kind
+        match handle.handle {
+            HandleKind::Quantized(handle) => handle,
+            _ => panic!("Expected quantized handle, got {}", handle.handle.name()),
+        }
+    }
+
+    fn float_tensor_handle(tensor: FloatTensor<Self>) -> Self::Handle { // wraps a float tensor as a handle
+        HandleKind::Float(tensor)
+    }
+
+    fn int_tensor_handle(tensor: IntTensor<Self>) -> Self::Handle { // wraps an int tensor as a handle
+        HandleKind::Int(tensor)
+    }
+
+    fn bool_tensor_handle(tensor: BoolTensor<Self>) -> Self::Handle { // wraps a bool tensor as a handle
+        HandleKind::Bool(tensor)
+    }
+
+    fn quantized_tensor_handle(tensor: QuantizedTensor<Self>) -> Self::Handle { // wraps a quantized tensor as a handle
+        HandleKind::Quantized(tensor)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use burn_backend::QTensorPrimitive; // presumably provides `default_scheme()` used below — confirm
+
+    #[test]
+    fn should_support_dtypes() { // checks `supports_dtype` for each accepted and each rejected dtype
+        type B = NdArray<f32>;
+        let device = Default::default();
+
+        assert!(B::supports_dtype(&device, DType::F64));
+        assert!(B::supports_dtype(&device, DType::F32));
+        assert!(B::supports_dtype(&device, DType::Flex32));
+        assert!(B::supports_dtype(&device, DType::I64));
+        assert!(B::supports_dtype(&device, DType::I32));
+        assert!(B::supports_dtype(&device, DType::I16));
+        assert!(B::supports_dtype(&device, DType::I8));
+        assert!(B::supports_dtype(&device, DType::U64));
+        assert!(B::supports_dtype(&device, DType::U32));
+        assert!(B::supports_dtype(&device, DType::U16));
+        assert!(B::supports_dtype(&device, DType::U8));
+        assert!(B::supports_dtype(&device, DType::Bool(BoolStore::Native)));
+        assert!(B::supports_dtype(
+            &device,
+            DType::QFloat(NdArrayQTensor::default_scheme())
+        ));
+
+        assert!(!B::supports_dtype(&device, DType::F16));
+        assert!(!B::supports_dtype(&device, DType::BF16));
+        // `QuantScheme::default()` must be rejected (original note: QuantStore::U32 not supported)
+        assert!(!B::supports_dtype(
+            &device,
+            DType::QFloat(QuantScheme::default())
+        ));
+    }
+}