lukefleed
diff --git a/‎Cargo.lock‎
Lines changed: 11 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 11 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎benches/fixed/bench_atomic.rs‎
Lines changed: 0 additions & 41 deletions b/‎benches/fixed/bench_atomic.rs‎
Lines changed: 0 additions & 41 deletions
diff --git a/‎benches/fixed/bench_random_access.rs‎
Lines changed: 3 additions & 3 deletions b/‎benches/fixed/bench_random_access.rs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎benches/fixed/benchmark_lock_free_access.rs‎
Lines changed: 128 additions & 0 deletions b/‎benches/fixed/benchmark_lock_free_access.rs‎
Lines changed: 128 additions & 0 deletions
diff --git a/‎benches/fixed/benchmark_locked_access.rs‎
Lines changed: 128 additions & 0 deletions b/‎benches/fixed/benchmark_locked_access.rs‎
Lines changed: 128 additions & 0 deletions
diff --git a/‎images/atomic_scaling_lock_free_16bit.svg‎
Lines changed: 1 addition & 0 deletions b/‎images/atomic_scaling_lock_free_16bit.svg‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎images/atomic_scaling_lock_free_diffuse.svg‎
Lines changed: 1 addition & 0 deletions b/‎images/atomic_scaling_lock_free_diffuse.svg‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎images/atomic_scaling_locked_21bit.svg‎
Lines changed: 1 addition & 0 deletions b/‎images/atomic_scaling_locked_21bit.svg‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎images/atomic_scaling_locked_path_diffuse.svg‎
Lines changed: 1 addition & 0 deletions b/‎images/atomic_scaling_locked_path_diffuse.svg‎
Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ common_traits = "0.12.0"
 dsi-bitstream = {version = "0.5.0", features = ["mem_dbg"]}
 mem_dbg = "0.3.0"
 num-traits = "0.2.19"
+num_cpus = "1.17.0"
 parking_lot = "0.12.4"
 rayon = { version = "1.10.0", optional = true }
 serde = { version = "1.0.219", features = ["derive"], optional = true }
@@ -97,6 +98,16 @@ name = "bench_size"
 harness = false
 path = "benches/bench_size.rs"
 
+[[bench]]
+name = "benchmark_lock_free_access"
+harness = false
+path = "benches/fixed/benchmark_lock_free_access.rs"
+
+[[bench]]
+name = "benchmark_locked_access"
+harness = false
+path = "benches/fixed/benchmark_locked_access.rs"
+
 
 [features]
 default = ["parallel"]
 
@@ -1,44 +1,3 @@
-//! # Comprehensive Benchmark Suite for Atomic Operations
-//!
-//! This suite provides an exhaustive performance analysis of `AtomicFixedVec`,
-//! comparing it against `sux::bits::AtomicBitFieldVec` and a `Vec<AtomicU64>`
-//! baseline under a wide range of conditions.
-//!
-//! ## Methodology
-//!
-//! To provide a complete performance picture, the benchmarks are structured
-//! along several key dimensions:
-//!
-//! 1.  **Bit Width**:
-//!     -   **16-bit**: Tests the highly optimized, lock-free path for power-of-two
-//!         widths where elements are guaranteed to fit within a single `u64`.
-//!     -   **21-bit**: Tests the more complex (but correct) hybrid path for
-//!         non-power-of-two widths, which uses 128-bit atomics for values
-//!         that span word boundaries.
-//!
-//! 2.  **Concurrency Level (Scalability)**:
-//!     -   **Single-Thread**: Establishes a baseline for raw, uncontended throughput.
-//!     -   **Multi-Thread (2, 4, 8 threads)**: Measures performance scaling as
-//!         the number of concurrent threads increases.
-//!
-//! 3.  **Contention Pattern**:
-//!     -   **Random Access (Diffuse Contention)**: Simulates a workload where
-//!         threads access random, uniformly distributed locations. This is a
-//!         common case with low probability of multiple threads hitting the same
-//!         atomic word simultaneously.
-//!     -   **High Contention**: A stress test where all threads repeatedly target
-//!         the *exact same* memory location. This is the worst-case scenario and
-//!         is critical for evaluating the efficiency of the underlying
-//_compare-and-swap_
-//!         loops and cache coherency protocols.
-//!
-//! 4.  **Memory Ordering**:
-//!     -   **`Ordering::SeqCst`**: The strongest, most expensive ordering, which
-//!         guarantees a single global order of operations.
-//!     -   **`Ordering::Relaxed`**: The weakest, fastest ordering, which provides
-//!         no ordering guarantees between threads but ensures atomicity. This is
-//!         common in algorithms like counters where only the final atomic value matters.
-
 use compressed_intvec::fixed::atomic::UAtomicFixedVec;
 use compressed_intvec::fixed::BitWidth;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
@@ -162,9 +162,9 @@ fn benchmark_random_access(c: &mut Criterion) {
 criterion_group! {
     name = benches;
     config = Criterion::default()
-        .sample_size(10)
-        .warm_up_time(Duration::from_millis(100))
-        .measurement_time(Duration::from_secs(2));
+        .sample_size(50)
+        .warm_up_time(Duration::from_millis(500))
+        .measurement_time(Duration::from_secs(10));
 
     targets = benchmark_random_access
 }
 
@@ -0,0 +1,128 @@
+use compressed_intvec::fixed::atomic::UAtomicFixedVec;
+use compressed_intvec::fixed::BitWidth;
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use rand::{rngs::SmallRng, Rng, SeedableRng};
+use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
+use std::sync::{Arc, Barrier};
+use std::thread;
+use std::time::Duration;
+use sux::prelude::{AtomicBitFieldSlice, AtomicBitFieldVec};
+
+/// Number of elements held by every vector under test; random indices are drawn from `0..VECTOR_SIZE`.
+const VECTOR_SIZE: usize = 10_000;
+/// Number of store operations each worker thread performs per benchmark iteration.
+const OPS_PER_THREAD: usize = 100_000;
+/// Bit width of each element in the packed vectors.
+const BIT_WIDTH: usize = 16; // Power of two for the lock-free path
+
+/// Benchmarks how concurrent `store` throughput scales with the number of
+/// threads when every thread writes to uniformly random positions.
+///
+/// The thread counts exercised are every power of two up to
+/// `num_cpus::get()`, plus the CPU count itself. One Criterion group is
+/// created per thread count and compares three containers of
+/// `VECTOR_SIZE` elements: a plain `Vec<AtomicU16>`, a `UAtomicFixedVec<u64>`
+/// with `BIT_WIDTH`-bit elements, and a `sux` `AtomicBitFieldVec` over raw
+/// `AtomicU64` words.
+fn benchmark_lock_free_scaling(c: &mut Criterion) {
+    // Powers of two are produced in ascending order, so the list is already
+    // sorted and unique; only the CPU count itself may still be missing.
+    let max_threads = num_cpus::get();
+    let mut thread_counts: Vec<usize> =
+        std::iter::successors(Some(1usize), |&n| n.checked_mul(2))
+            .take_while(|&n| n <= max_threads)
+            .collect();
+    if thread_counts.last() != Some(&max_threads) {
+        thread_counts.push(max_threads);
+    }
+
+    for num_threads in thread_counts {
+        let op_count = OPS_PER_THREAD * num_threads;
+
+        let mut group = c.benchmark_group(format!("LockFreeScaling_Diffuse/{}Threads", num_threads));
+        group.throughput(Throughput::Elements(op_count as u64));
+
+        // One fixed-seed index stream per configuration, shared by all three
+        // containers so they replay the same access pattern.
+        let mut rng = SmallRng::seed_from_u64(42);
+        let mut access_indices: Vec<usize> = Vec::with_capacity(op_count);
+        for _ in 0..op_count {
+            access_indices.push(rng.random_range(0..VECTOR_SIZE));
+        }
+
+        // Containers are built once per configuration and reused across iterations.
+        let baseline_u16 = Arc::new(
+            std::iter::repeat_with(|| AtomicU16::new(0))
+                .take(VECTOR_SIZE)
+                .collect::<Vec<_>>(),
+        );
+        let afv_16bit = Arc::new(
+            UAtomicFixedVec::<u64>::builder()
+                .bit_width(BitWidth::Explicit(BIT_WIDTH))
+                .build(&vec![0; VECTOR_SIZE])
+                .unwrap(),
+        );
+        // Words needed for the packed payload, plus two extra words.
+        let sux_word_count = (VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2;
+        let sux_storage_16bit: Arc<Vec<AtomicU64>> = Arc::new(
+            std::iter::repeat_with(|| AtomicU64::new(0))
+                .take(sux_word_count)
+                .collect(),
+        );
+
+        group.bench_function("Baseline_Vec<AtomicU16>/store", |b| {
+            b.iter(|| run_store_on_atomic_u16(&baseline_u16, num_threads, &access_indices));
+        });
+        group.bench_function("AtomicFixedVec/store", |b| {
+            b.iter(|| run_store_on_atomic_fixed_vec(&afv_16bit, num_threads, &access_indices));
+        });
+        group.bench_function("sux::AtomicBitFieldVec/store", |b| {
+            b.iter(|| run_store_on_sux_vec(&sux_storage_16bit, num_threads, &access_indices));
+        });
+
+        group.finish();
+    }
+}
+
+/// Replays `indices` as `SeqCst` stores into a `Vec<AtomicU16>`, using one
+/// scoped worker thread per `OPS_PER_THREAD`-sized chunk of `indices`.
+///
+/// * `vec` - the shared baseline vector; every entry of `indices` must be a
+///   valid position in it (indexing panics otherwise).
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///
+/// Every worker stores its own worker number (truncated to `u16`) and all
+/// workers are released together by a barrier so they run concurrently.
+fn run_store_on_atomic_u16(vec: &Arc<Vec<AtomicU16>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let (slots, barrier) = (vec.as_slice(), &barrier);
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                barrier.wait();
+                for &index in chunk {
+                    slots[index].store(thread_id as u16, Ordering::SeqCst);
+                }
+            });
+        }
+    });
+}
+
+/// Replays `indices` as `SeqCst` stores into a `UAtomicFixedVec<u64>`, using
+/// one scoped worker thread per `OPS_PER_THREAD`-sized chunk of `indices`.
+///
+/// * `vec` - the shared vector under test.
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///
+/// Every worker stores its own worker number as the value and all workers are
+/// released together by a barrier so they run concurrently.
+fn run_store_on_atomic_fixed_vec(vec: &Arc<UAtomicFixedVec<u64>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    let barrier = &barrier;
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let fixed_vec: &UAtomicFixedVec<u64> = vec;
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                barrier.wait();
+                for &index in chunk {
+                    fixed_vec.store(index, thread_id as u64, Ordering::SeqCst);
+                }
+            });
+        }
+    });
+}
+
+/// Replays `indices` as `SeqCst` stores into a `sux` `AtomicBitFieldVec` view
+/// over `storage`, using one scoped worker thread per `OPS_PER_THREAD`-sized
+/// chunk of `indices`.
+///
+/// * `storage` - the raw `AtomicU64` words backing the bit-field vector. Each
+///   worker wraps them in its own `AtomicBitFieldVec` of `VECTOR_SIZE`
+///   elements of `BIT_WIDTH` bits.
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///   They are written without bounds checks, so each must be below
+///   `VECTOR_SIZE`.
+///
+/// Every worker stores its own worker number as the value and all workers are
+/// released together by a barrier so they run concurrently.
+fn run_store_on_sux_vec(storage: &Arc<Vec<AtomicU64>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    let barrier = &barrier;
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let words: &[AtomicU64] = storage.as_slice();
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                // SAFETY: NOTE(review) - relies on `storage` holding enough words for
+                // `VECTOR_SIZE` elements of `BIT_WIDTH` bits; the benchmark setup
+                // allocates `(VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2` words.
+                // Confirm against the `from_raw_parts` contract.
+                let sux_vec = unsafe { AtomicBitFieldVec::<u64, _>::from_raw_parts(words, BIT_WIDTH, VECTOR_SIZE) };
+                barrier.wait();
+                for &index in chunk {
+                    // SAFETY: NOTE(review) - relies on the caller supplying indices
+                    // below `VECTOR_SIZE` and on the worker number fitting in
+                    // `BIT_WIDTH` bits; neither is checked here.
+                    unsafe {
+                        sux_vec.set_atomic_unchecked(index, thread_id as u64, Ordering::SeqCst);
+                    }
+                }
+            });
+        }
+    });
+}
+
+// Criterion harness for this file: 50 samples per benchmark, a 500 ms warm-up
+// and a 3 s measurement window, running `benchmark_lock_free_scaling`.
+criterion_group! {
+    name = benches;
+    config = Criterion::default()
+        .sample_size(50)
+        .warm_up_time(Duration::from_millis(500))
+        .measurement_time(Duration::from_secs(3));
+    targets = benchmark_lock_free_scaling
+}
+criterion_main!(benches);
@@ -0,0 +1,128 @@
+use compressed_intvec::fixed::atomic::UAtomicFixedVec;
+use compressed_intvec::fixed::BitWidth;
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use rand::{rngs::SmallRng, Rng, SeedableRng};
+use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
+use std::sync::{Arc, Barrier};
+use std::thread;
+use std::time::Duration;
+use sux::prelude::{AtomicBitFieldSlice, AtomicBitFieldVec};
+
+/// Number of elements held by every vector under test; random indices are drawn from `0..VECTOR_SIZE`.
+const VECTOR_SIZE: usize = 10_000;
+/// Number of store operations each worker thread performs per benchmark iteration.
+const OPS_PER_THREAD: usize = 100_000;
+/// Bit width of each element in the packed vectors.
+const BIT_WIDTH: usize = 15; // Non-power of two to force the locked path
+
+/// Benchmarks how concurrent `store` throughput scales with the number of
+/// threads when every thread writes to uniformly random positions.
+///
+/// The thread counts exercised are every power of two up to
+/// `num_cpus::get()`, plus the CPU count itself. One Criterion group is
+/// created per thread count and compares three containers of
+/// `VECTOR_SIZE` elements: a plain `Vec<AtomicU16>`, a `UAtomicFixedVec<u64>`
+/// with `BIT_WIDTH`-bit elements, and a `sux` `AtomicBitFieldVec` over raw
+/// `AtomicU64` words.
+fn benchmark_locked_scaling(c: &mut Criterion) {
+    // Powers of two are produced in ascending order, so the list is already
+    // sorted and unique; only the CPU count itself may still be missing.
+    let max_threads = num_cpus::get();
+    let mut thread_counts: Vec<usize> =
+        std::iter::successors(Some(1usize), |&n| n.checked_mul(2))
+            .take_while(|&n| n <= max_threads)
+            .collect();
+    if thread_counts.last() != Some(&max_threads) {
+        thread_counts.push(max_threads);
+    }
+
+    for num_threads in thread_counts {
+        let op_count = OPS_PER_THREAD * num_threads;
+
+        let mut group = c.benchmark_group(format!("LockedScaling_Diffuse/{}Threads", num_threads));
+        group.throughput(Throughput::Elements(op_count as u64));
+
+        // One fixed-seed index stream per configuration, shared by all three
+        // containers so they replay the same access pattern.
+        let mut rng = SmallRng::seed_from_u64(42);
+        let mut access_indices: Vec<usize> = Vec::with_capacity(op_count);
+        for _ in 0..op_count {
+            access_indices.push(rng.random_range(0..VECTOR_SIZE));
+        }
+
+        // Containers are built once per configuration and reused across iterations.
+        let baseline_u16 = Arc::new(
+            std::iter::repeat_with(|| AtomicU16::new(0))
+                .take(VECTOR_SIZE)
+                .collect::<Vec<_>>(),
+        );
+        let afv_15bit = Arc::new(
+            UAtomicFixedVec::<u64>::builder()
+                .bit_width(BitWidth::Explicit(BIT_WIDTH))
+                .build(&vec![0; VECTOR_SIZE])
+                .unwrap(),
+        );
+        // Words needed for the packed payload, plus two extra words.
+        let sux_word_count = (VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2;
+        let sux_storage_15bit: Arc<Vec<AtomicU64>> = Arc::new(
+            std::iter::repeat_with(|| AtomicU64::new(0))
+                .take(sux_word_count)
+                .collect(),
+        );
+
+        group.bench_function("Baseline_Vec<AtomicU16>/store", |b| {
+            b.iter(|| run_store_on_atomic_u16(&baseline_u16, num_threads, &access_indices));
+        });
+        group.bench_function("AtomicFixedVec/store", |b| {
+            b.iter(|| run_store_on_atomic_fixed_vec(&afv_15bit, num_threads, &access_indices));
+        });
+        group.bench_function("sux::AtomicBitFieldVec/store", |b| {
+            b.iter(|| run_store_on_sux_vec(&sux_storage_15bit, num_threads, &access_indices));
+        });
+
+        group.finish();
+    }
+}
+
+/// Replays `indices` as `SeqCst` stores into a `Vec<AtomicU16>`, using one
+/// scoped worker thread per `OPS_PER_THREAD`-sized chunk of `indices`.
+///
+/// * `vec` - the shared baseline vector; every entry of `indices` must be a
+///   valid position in it (indexing panics otherwise).
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///
+/// Every worker stores its own worker number (truncated to `u16`) and all
+/// workers are released together by a barrier so they run concurrently.
+fn run_store_on_atomic_u16(vec: &Arc<Vec<AtomicU16>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let (slots, barrier) = (vec.as_slice(), &barrier);
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                barrier.wait();
+                for &index in chunk {
+                    slots[index].store(thread_id as u16, Ordering::SeqCst);
+                }
+            });
+        }
+    });
+}
+
+/// Replays `indices` as `SeqCst` stores into a `UAtomicFixedVec<u64>`, using
+/// one scoped worker thread per `OPS_PER_THREAD`-sized chunk of `indices`.
+///
+/// * `vec` - the shared vector under test.
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///
+/// Every worker stores its own worker number as the value and all workers are
+/// released together by a barrier so they run concurrently.
+fn run_store_on_atomic_fixed_vec(vec: &Arc<UAtomicFixedVec<u64>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    let barrier = &barrier;
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let fixed_vec: &UAtomicFixedVec<u64> = vec;
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                barrier.wait();
+                for &index in chunk {
+                    fixed_vec.store(index, thread_id as u64, Ordering::SeqCst);
+                }
+            });
+        }
+    });
+}
+
+/// Replays `indices` as `SeqCst` stores into a `sux` `AtomicBitFieldVec` view
+/// over `storage`, using one scoped worker thread per `OPS_PER_THREAD`-sized
+/// chunk of `indices`.
+///
+/// * `storage` - the raw `AtomicU64` words backing the bit-field vector. Each
+///   worker wraps them in its own `AtomicBitFieldVec` of `VECTOR_SIZE`
+///   elements of `BIT_WIDTH` bits.
+/// * `num_threads` - the number of workers the caller expects; it must equal
+///   the number of chunks, i.e. `indices.len()` divided by `OPS_PER_THREAD`
+///   rounded up (checked in debug builds).
+/// * `indices` - the positions to write, split between the workers in order.
+///   They are written without bounds checks, so each must be below
+///   `VECTOR_SIZE`.
+///
+/// Every worker stores its own worker number as the value and all workers are
+/// released together by a barrier so they run concurrently.
+fn run_store_on_sux_vec(storage: &Arc<Vec<AtomicU64>>, num_threads: usize, indices: &[usize]) {
+    let chunks: Vec<&[usize]> = indices.chunks(OPS_PER_THREAD).collect();
+    debug_assert_eq!(
+        chunks.len(),
+        num_threads,
+        "indices must split into exactly one chunk per thread"
+    );
+
+    // Size the barrier from the threads actually spawned: a barrier expecting
+    // more participants than exist would block every worker forever.
+    let barrier = Barrier::new(chunks.len());
+    let barrier = &barrier;
+    // Scoped threads may borrow from this stack frame, so no per-thread `Arc`
+    // clones are required.
+    let words: &[AtomicU64] = storage.as_slice();
+
+    thread::scope(|s| {
+        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
+            s.spawn(move || {
+                // SAFETY: NOTE(review) - relies on `storage` holding enough words for
+                // `VECTOR_SIZE` elements of `BIT_WIDTH` bits; the benchmark setup
+                // allocates `(VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2` words.
+                // Confirm against the `from_raw_parts` contract.
+                let sux_vec = unsafe { AtomicBitFieldVec::<u64, _>::from_raw_parts(words, BIT_WIDTH, VECTOR_SIZE) };
+                barrier.wait();
+                for &index in chunk {
+                    // SAFETY: NOTE(review) - relies on the caller supplying indices
+                    // below `VECTOR_SIZE` and on the worker number fitting in
+                    // `BIT_WIDTH` bits; neither is checked here.
+                    unsafe {
+                        sux_vec.set_atomic_unchecked(index, thread_id as u64, Ordering::SeqCst);
+                    }
+                }
+            });
+        }
+    });
+}
+
+// Criterion harness for this file: 50 samples per benchmark, a 500 ms warm-up
+// and a 3 s measurement window, running `benchmark_locked_scaling`.
+criterion_group! {
+    name = benches;
+    config = Criterion::default()
+        .sample_size(50)
+        .warm_up_time(Duration::from_millis(500))
+        .measurement_time(Duration::from_secs(3));
+    targets = benchmark_locked_scaling
+}
+criterion_main!(benches);