Skip to content

Commit 1ec41b6

Browse files
committed
update benches
1 parent d92a9e4 commit 1ec41b6

20 files changed

Lines changed: 532 additions & 139 deletions

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ common_traits = "0.12.0"
2424
dsi-bitstream = {version = "0.5.0", features = ["mem_dbg"]}
2525
mem_dbg = "0.3.0"
2626
num-traits = "0.2.19"
27+
num_cpus = "1.17.0"
2728
parking_lot = "0.12.4"
2829
rayon = { version = "1.10.0", optional = true }
2930
serde = { version = "1.0.219", features = ["derive"], optional = true }
@@ -97,6 +98,16 @@ name = "bench_size"
9798
harness = false
9899
path = "benches/bench_size.rs"
99100

101+
[[bench]]
102+
name = "benchmark_lock_free_access"
103+
harness = false
104+
path = "benches/fixed/benchmark_lock_free_access.rs"
105+
106+
[[bench]]
107+
name = "benchmark_locked_access"
108+
harness = false
109+
path = "benches/fixed/benchmark_locked_access.rs"
110+
100111

101112
[features]
102113
default = ["parallel"]

benches/fixed/bench_atomic.rs

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,3 @@
1-
//! # Comprehensive Benchmark Suite for Atomic Operations
2-
//!
3-
//! This suite provides an exhaustive performance analysis of `AtomicFixedVec`,
4-
//! comparing it against `sux::bits::AtomicBitFieldVec` and a `Vec<AtomicU64>`
5-
//! baseline under a wide range of conditions.
6-
//!
7-
//! ## Methodology
8-
//!
9-
//! To provide a complete performance picture, the benchmarks are structured
10-
//! along several key dimensions:
11-
//!
12-
//! 1. **Bit Width**:
13-
//! - **16-bit**: Tests the highly optimized, lock-free path for power-of-two
14-
//! widths where elements are guaranteed to fit within a single `u64`.
15-
//! - **21-bit**: Tests the more complex (but correct) hybrid path for
16-
//! non-power-of-two widths, which uses 128-bit atomics for values
17-
//! that span word boundaries.
18-
//!
19-
//! 2. **Concurrency Level (Scalability)**:
20-
//! - **Single-Thread**: Establishes a baseline for raw, uncontended throughput.
21-
//! - **Multi-Thread (2, 4, 8 threads)**: Measures performance scaling as
22-
//! the number of concurrent threads increases.
23-
//!
24-
//! 3. **Contention Pattern**:
25-
//! - **Random Access (Diffuse Contention)**: Simulates a workload where
26-
//! threads access random, uniformly distributed locations. This is a
27-
//! common case with low probability of multiple threads hitting the same
28-
//! atomic word simultaneously.
29-
//! - **High Contention**: A stress test where all threads repeatedly target
30-
//! the *exact same* memory location. This is the worst-case scenario and
31-
//! is critical for evaluating the efficiency of the underlying
32-
//! _compare-and-swap_
33-
//! loops and cache coherency protocols.
34-
//!
35-
//! 4. **Memory Ordering**:
36-
//! - **`Ordering::SeqCst`**: The strongest, most expensive ordering, which
37-
//! guarantees a single global order of operations.
38-
//! - **`Ordering::Relaxed`**: The weakest, fastest ordering, which provides
39-
//! no ordering guarantees between threads but ensures atomicity. This is
40-
//! common in algorithms like counters where only the final atomic value matters.
41-
421
use compressed_intvec::fixed::atomic::UAtomicFixedVec;
432
use compressed_intvec::fixed::BitWidth;
443
use criterion::{black_box, criterion_group, criterion_main, Criterion};

benches/fixed/bench_random_access.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,9 @@ fn benchmark_random_access(c: &mut Criterion) {
162162
criterion_group! {
163163
name = benches;
164164
config = Criterion::default()
165-
.sample_size(10)
166-
.warm_up_time(Duration::from_millis(100))
167-
.measurement_time(Duration::from_secs(2));
165+
.sample_size(50)
166+
.warm_up_time(Duration::from_millis(500))
167+
.measurement_time(Duration::from_secs(10));
168168

169169
targets = benchmark_random_access
170170
}
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
use compressed_intvec::fixed::atomic::UAtomicFixedVec;
2+
use compressed_intvec::fixed::BitWidth;
3+
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
4+
use rand::{rngs::SmallRng, Rng, SeedableRng};
5+
use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
6+
use std::sync::{Arc, Barrier};
7+
use std::thread;
8+
use std::time::Duration;
9+
use sux::prelude::{AtomicBitFieldSlice, AtomicBitFieldVec};
10+
11+
/// Number of elements held by every container under test.
const VECTOR_SIZE: usize = 10_000;

/// Number of store operations each worker thread performs per benchmark iteration.
const OPS_PER_THREAD: usize = 100_000;

/// Bit width of each packed element.
/// 16 is a power of two, which selects the lock-free path.
const BIT_WIDTH: usize = 16;
14+
15+
fn benchmark_lock_free_scaling(c: &mut Criterion) {
16+
let mut thread_counts: Vec<usize> = (1..=num_cpus::get())
17+
.filter(|n| n.is_power_of_two())
18+
.collect();
19+
if !thread_counts.contains(&num_cpus::get()) {
20+
thread_counts.push(num_cpus::get());
21+
}
22+
thread_counts.sort_unstable();
23+
thread_counts.dedup();
24+
25+
for &num_threads in &thread_counts {
26+
let total_ops = (OPS_PER_THREAD * num_threads) as u64;
27+
let mut group = c.benchmark_group(format!("LockFreeScaling_Diffuse/{}Threads", num_threads));
28+
group.throughput(Throughput::Elements(total_ops));
29+
30+
// Pre-generate a single set of random indices for this benchmark configuration.
31+
let mut rng = SmallRng::seed_from_u64(42);
32+
let access_indices: Vec<usize> = (0..total_ops as usize)
33+
.map(|_| rng.random_range(0..VECTOR_SIZE))
34+
.collect();
35+
36+
// --- Setup Data Structures Once ---
37+
let baseline_u16 = Arc::new((0..VECTOR_SIZE).map(|_| AtomicU16::new(0)).collect::<Vec<_>>());
38+
let afv_16bit = Arc::new(
39+
UAtomicFixedVec::<u64>::builder()
40+
.bit_width(BitWidth::Explicit(BIT_WIDTH))
41+
.build(&vec![0; VECTOR_SIZE])
42+
.unwrap(),
43+
);
44+
let sux_storage_16bit = Arc::new((0..(VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2).map(|_| AtomicU64::new(0)).collect());
45+
46+
// --- Benchmark Runs ---
47+
group.bench_function("Baseline_Vec<AtomicU16>/store", |b| {
48+
b.iter(|| run_store_on_atomic_u16(&baseline_u16, num_threads, &access_indices));
49+
});
50+
51+
group.bench_function("AtomicFixedVec/store", |b| {
52+
b.iter(|| run_store_on_atomic_fixed_vec(&afv_16bit, num_threads, &access_indices));
53+
});
54+
55+
group.bench_function("sux::AtomicBitFieldVec/store", |b| {
56+
b.iter(|| run_store_on_sux_vec(&sux_storage_16bit, num_threads, &access_indices));
57+
});
58+
59+
group.finish();
60+
}
61+
}
62+
63+
/// Performs the stores listed in `indices` on `vec`, split across worker threads.
///
/// `indices` is divided into at most `num_threads` contiguous chunks of equal
/// length (the last one may be shorter); one scoped thread is spawned per
/// chunk. Each worker stores its own zero-based thread id, truncated to `u16`,
/// at every index of its chunk using `Ordering::SeqCst`.
///
/// The start barrier is sized from the number of chunks actually produced, so
/// the function cannot deadlock when `indices.len()` is not an exact multiple
/// of the per-thread share. A `num_threads` of zero is treated as one. An
/// empty `indices` slice is a no-op.
///
/// # Panics
///
/// Panics if any index is out of bounds for `vec`.
fn run_store_on_atomic_u16(vec: &Arc<Vec<AtomicU16>>, num_threads: usize, indices: &[usize]) {
    if indices.is_empty() {
        return;
    }

    let chunk_len = indices.len().div_ceil(num_threads.max(1));
    let chunks: Vec<&[usize]> = indices.chunks(chunk_len).collect();
    // One barrier slot per spawned worker, never per *requested* worker.
    let barrier = Barrier::new(chunks.len());

    thread::scope(|s| {
        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
            // Scoped threads may borrow locals, so no `Arc` clones are needed.
            let barrier = &barrier;
            s.spawn(move || {
                // Line all workers up so the stores genuinely overlap.
                barrier.wait();
                for &index in chunk {
                    vec[index].store(thread_id as u16, Ordering::SeqCst);
                }
            });
        }
    });
}
80+
81+
fn run_store_on_atomic_fixed_vec(vec: &Arc<UAtomicFixedVec<u64>>, num_threads: usize, indices: &[usize]) {
82+
let barrier = Arc::new(Barrier::new(num_threads));
83+
let chunks: Vec<_> = indices.chunks(OPS_PER_THREAD).collect();
84+
85+
thread::scope(|s| {
86+
for (thread_id, chunk) in chunks.iter().enumerate() {
87+
let vec_clone = Arc::clone(vec);
88+
let barrier_clone = Arc::clone(&barrier);
89+
s.spawn(move || {
90+
barrier_clone.wait();
91+
for &index in *chunk {
92+
vec_clone.store(index, thread_id as u64, Ordering::SeqCst);
93+
}
94+
});
95+
}
96+
});
97+
}
98+
99+
fn run_store_on_sux_vec(storage: &Arc<Vec<AtomicU64>>, num_threads: usize, indices: &[usize]) {
100+
let barrier = Arc::new(Barrier::new(num_threads));
101+
let chunks: Vec<_> = indices.chunks(OPS_PER_THREAD).collect();
102+
103+
thread::scope(|s| {
104+
for (thread_id, chunk) in chunks.iter().enumerate() {
105+
let storage_clone = Arc::clone(storage);
106+
let barrier_clone = Arc::clone(&barrier);
107+
s.spawn(move || {
108+
let sux_vec = unsafe { AtomicBitFieldVec::<u64, _>::from_raw_parts(storage_clone.as_slice(), BIT_WIDTH, VECTOR_SIZE) };
109+
barrier_clone.wait();
110+
for &index in *chunk {
111+
unsafe {
112+
sux_vec.set_atomic_unchecked(index, thread_id as u64, Ordering::SeqCst);
113+
}
114+
}
115+
});
116+
}
117+
});
118+
}
119+
120+
criterion_group! {
121+
name = benches;
122+
config = Criterion::default()
123+
.sample_size(50)
124+
.warm_up_time(Duration::from_millis(500))
125+
.measurement_time(Duration::from_secs(3));
126+
targets = benchmark_lock_free_scaling
127+
}
128+
criterion_main!(benches);
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
use compressed_intvec::fixed::atomic::UAtomicFixedVec;
2+
use compressed_intvec::fixed::BitWidth;
3+
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
4+
use rand::{rngs::SmallRng, Rng, SeedableRng};
5+
use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
6+
use std::sync::{Arc, Barrier};
7+
use std::thread;
8+
use std::time::Duration;
9+
use sux::prelude::{AtomicBitFieldSlice, AtomicBitFieldVec};
10+
11+
/// Number of elements held by every container under test.
const VECTOR_SIZE: usize = 10_000;

/// Number of store operations each worker thread performs per benchmark iteration.
const OPS_PER_THREAD: usize = 100_000;

/// Bit width of each packed element.
/// 15 is not a power of two, which forces the locked path.
const BIT_WIDTH: usize = 15;
14+
15+
fn benchmark_locked_scaling(c: &mut Criterion) {
16+
let mut thread_counts: Vec<usize> = (1..=num_cpus::get())
17+
.filter(|n| n.is_power_of_two())
18+
.collect();
19+
if !thread_counts.contains(&num_cpus::get()) {
20+
thread_counts.push(num_cpus::get());
21+
}
22+
thread_counts.sort_unstable();
23+
thread_counts.dedup();
24+
25+
for &num_threads in &thread_counts {
26+
let total_ops = (OPS_PER_THREAD * num_threads) as u64;
27+
let mut group = c.benchmark_group(format!("LockedScaling_Diffuse/{}Threads", num_threads));
28+
group.throughput(Throughput::Elements(total_ops));
29+
30+
// Pre-generate a single set of random indices for this benchmark configuration.
31+
let mut rng = SmallRng::seed_from_u64(42);
32+
let access_indices: Vec<usize> = (0..total_ops as usize)
33+
.map(|_| rng.random_range(0..VECTOR_SIZE))
34+
.collect();
35+
36+
// --- Setup Data Structures Once ---
37+
let baseline_u16 = Arc::new((0..VECTOR_SIZE).map(|_| AtomicU16::new(0)).collect::<Vec<_>>());
38+
let afv_15bit = Arc::new(
39+
UAtomicFixedVec::<u64>::builder()
40+
.bit_width(BitWidth::Explicit(BIT_WIDTH))
41+
.build(&vec![0; VECTOR_SIZE])
42+
.unwrap(),
43+
);
44+
let sux_storage_15bit = Arc::new((0..(VECTOR_SIZE * BIT_WIDTH).div_ceil(64) + 2).map(|_| AtomicU64::new(0)).collect());
45+
46+
// --- Benchmark Runs ---
47+
group.bench_function("Baseline_Vec<AtomicU16>/store", |b| {
48+
b.iter(|| run_store_on_atomic_u16(&baseline_u16, num_threads, &access_indices));
49+
});
50+
51+
group.bench_function("AtomicFixedVec/store", |b| {
52+
b.iter(|| run_store_on_atomic_fixed_vec(&afv_15bit, num_threads, &access_indices));
53+
});
54+
55+
group.bench_function("sux::AtomicBitFieldVec/store", |b| {
56+
b.iter(|| run_store_on_sux_vec(&sux_storage_15bit, num_threads, &access_indices));
57+
});
58+
59+
group.finish();
60+
}
61+
}
62+
63+
/// Performs the stores listed in `indices` on `vec`, split across worker threads.
///
/// `indices` is divided into at most `num_threads` contiguous chunks of equal
/// length (the last one may be shorter); one scoped thread is spawned per
/// chunk. Each worker stores its own zero-based thread id, truncated to `u16`,
/// at every index of its chunk using `Ordering::SeqCst`.
///
/// The start barrier is sized from the number of chunks actually produced, so
/// the function cannot deadlock when `indices.len()` is not an exact multiple
/// of the per-thread share. A `num_threads` of zero is treated as one. An
/// empty `indices` slice is a no-op.
///
/// # Panics
///
/// Panics if any index is out of bounds for `vec`.
fn run_store_on_atomic_u16(vec: &Arc<Vec<AtomicU16>>, num_threads: usize, indices: &[usize]) {
    if indices.is_empty() {
        return;
    }

    let chunk_len = indices.len().div_ceil(num_threads.max(1));
    let chunks: Vec<&[usize]> = indices.chunks(chunk_len).collect();
    // One barrier slot per spawned worker, never per *requested* worker.
    let barrier = Barrier::new(chunks.len());

    thread::scope(|s| {
        for (thread_id, chunk) in chunks.iter().copied().enumerate() {
            // Scoped threads may borrow locals, so no `Arc` clones are needed.
            let barrier = &barrier;
            s.spawn(move || {
                // Line all workers up so the stores genuinely overlap.
                barrier.wait();
                for &index in chunk {
                    vec[index].store(thread_id as u16, Ordering::SeqCst);
                }
            });
        }
    });
}
80+
81+
fn run_store_on_atomic_fixed_vec(vec: &Arc<UAtomicFixedVec<u64>>, num_threads: usize, indices: &[usize]) {
82+
let barrier = Arc::new(Barrier::new(num_threads));
83+
let chunks: Vec<_> = indices.chunks(OPS_PER_THREAD).collect();
84+
85+
thread::scope(|s| {
86+
for (thread_id, chunk) in chunks.iter().enumerate() {
87+
let vec_clone = Arc::clone(vec);
88+
let barrier_clone = Arc::clone(&barrier);
89+
s.spawn(move || {
90+
barrier_clone.wait();
91+
for &index in *chunk {
92+
vec_clone.store(index, thread_id as u64, Ordering::SeqCst);
93+
}
94+
});
95+
}
96+
});
97+
}
98+
99+
fn run_store_on_sux_vec(storage: &Arc<Vec<AtomicU64>>, num_threads: usize, indices: &[usize]) {
100+
let barrier = Arc::new(Barrier::new(num_threads));
101+
let chunks: Vec<_> = indices.chunks(OPS_PER_THREAD).collect();
102+
103+
thread::scope(|s| {
104+
for (thread_id, chunk) in chunks.iter().enumerate() {
105+
let storage_clone = Arc::clone(storage);
106+
let barrier_clone = Arc::clone(&barrier);
107+
s.spawn(move || {
108+
let sux_vec = unsafe { AtomicBitFieldVec::<u64, _>::from_raw_parts(storage_clone.as_slice(), BIT_WIDTH, VECTOR_SIZE) };
109+
barrier_clone.wait();
110+
for &index in *chunk {
111+
unsafe {
112+
sux_vec.set_atomic_unchecked(index, thread_id as u64, Ordering::SeqCst);
113+
}
114+
}
115+
});
116+
}
117+
});
118+
}
119+
120+
criterion_group! {
121+
name = benches;
122+
config = Criterion::default()
123+
.sample_size(50)
124+
.warm_up_time(Duration::from_millis(500))
125+
.measurement_time(Duration::from_secs(3));
126+
targets = benchmark_locked_scaling
127+
}
128+
criterion_main!(benches);

images/atomic_scaling_lock_free_16bit.svg

Lines changed: 1 addition & 0 deletions
Loading

images/atomic_scaling_lock_free_diffuse.svg

Lines changed: 1 addition & 0 deletions
Loading

images/atomic_scaling_locked_21bit.svg

Lines changed: 1 addition & 0 deletions
Loading

images/atomic_scaling_locked_path_diffuse.svg

Lines changed: 1 addition & 0 deletions
Loading

0 commit comments

Comments
 (0)