Initial commit: Cache patterns benchmark example

art049 · art049 · commit ffbd2e363e1a · 2026-01-20T12:00:30.000+01:00
Add particle simulation demonstrating Array of Structures (AoS) vs
Structure of Arrays (SoA) for cache-friendly data layouts.

- Implement AoS and SoA particle systems
- Add benchmarks for position updates, kinetic energy, and gravity
- Include documentation on cache-friendly patterns
- Pin Rust toolchain to 1.92.0
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+# Rust build artifacts
+/target/
+Cargo.lock
+
+# IDE files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS files
+.DS_Store
+Thumbs.db
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "cache-patterns"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+
+[dev-dependencies]
+divan = "0.1"
+
+[[bench]]
+name = "particle_simulation"
+harness = false
diff --git a/README.md b/README.md
@@ -0,0 +1,95 @@
+# Cache Patterns Benchmark
+
+This crate demonstrates the performance impact of different data layouts on CPU cache utilization through a particle physics simulation.
+
+## Initial Assumption
+
+**Hypothesis**: Data layout significantly impacts CPU cache behavior. Specifically, organizing data as a Structure of Arrays (SoA) should show measurably better cache performance than Array of Structures (AoS) when operations only access a subset of fields.
+
+This benchmark is designed to validate this hypothesis using CodSpeed's walltime instrument, which provides hardware performance counters including cache hit/miss rates, memory bandwidth, and IPC (instructions per cycle).
+
+## The Problem: Array of Structures (AoS) vs Structure of Arrays (SoA)
+
+### Array of Structures (AoS) - Cache Unfriendly
+```rust
+struct Particle {
+    position: Vec3,  // 12 bytes
+    velocity: Vec3,  // 12 bytes
+    mass: f32,       // 4 bytes
+}                    // = 28 bytes per particle (no padding: every field is a 4-byte-aligned f32)
+
+particles: Vec<Particle>
+```
+
+**Memory layout**: `[pos0, vel0, mass0, pos1, vel1, mass1, pos2, vel2, mass2, ...]`
+
+When an operation needs only some of the fields (for example, applying gravity touches only velocity), every cache line we load also carries the position and mass data that we don't use, wasting bandwidth and cache space.
+
+### Structure of Arrays (SoA) - Cache Friendly
+```rust
+struct ParticleSystem {
+    positions: Vec<Vec3>,
+    velocities: Vec<Vec3>,
+    masses: Vec<f32>,
+}
+```
+
+**Memory layout**:
+- `positions: [pos0, pos1, pos2, ...]`
+- `velocities: [vel0, vel1, vel2, ...]`
+- `masses: [mass0, mass1, mass2, ...]`
+
+When we update positions, every byte in the cache line is useful data, maximizing cache efficiency.
+
+## Expected Performance Characteristics
+
+### AoS (Cache Unfriendly)
+- Higher L1/L2/L3 cache miss rates
+- Lower memory bandwidth utilization
+- More stalls waiting for memory
+
+### SoA (Cache Friendly)
+- Lower cache miss rates (better spatial locality)
+- Higher effective memory bandwidth
+- Better prefetcher efficiency
+
+## Running the Benchmarks
+
+```bash
+# Run with standard benchmarking
+cargo bench
+
+# Run with CodSpeed profiling to see cache counters
+# (requires CodSpeed setup with walltime instrument)
+codspeed run cargo bench
+```
+
+## What to Look For in CodSpeed Profiling
+
+When comparing AoS vs SoA versions with CodSpeed's walltime instrument, you should see:
+
+1. **Cache Misses**: SoA should show significantly fewer L1/L2/L3 cache misses
+2. **Memory Operations**: Better cache line utilization in SoA version
+3. **Instructions Per Cycle (IPC)**: Higher IPC in SoA due to fewer memory stalls
+4. **Wall Time**: SoA should be faster, especially with larger datasets
+
+## Benchmark Operations
+
+Each version implements three operations:
+
+1. **update_positions**: `position = position + velocity * dt`
+   - Tests spatial locality when accessing two arrays
+
+2. **compute_kinetic_energy**: `sum(0.5 * mass * velocity²)`
+   - Tests cache behavior when skipping position data
+
+3. **apply_gravity**: `velocity = velocity + gravity * dt`
+   - Tests cache behavior when accessing only one field
+
+## Dataset Sizes
+
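+- **Small**: 1,000 particles (~28 KB in either layout)
+- **Medium**: 10,000 particles (~280 KB in either layout)
+- **Large**: 100,000 particles (~2.8 MB in either layout)
+
+Both layouts store the same 28 bytes per particle (two `Vec3` of 12 bytes plus a 4-byte mass); they differ only in how those bytes are arranged in memory.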
+
+Different sizes stress different cache levels (L1/L2/L3).
diff --git a/benches/particle_simulation.rs b/benches/particle_simulation.rs
@@ -0,0 +1,67 @@
+use cache_patterns::{aos, soa, Vec3};
+
+/// Benchmark entry point.
+///
+/// The bench target is declared with `harness = false` in `Cargo.toml`, so this
+/// `main` is the whole program: it hands control to `divan::main()`, which is
+/// expected to discover and run the `#[divan::bench]` functions in this file.
+fn main() {
+    divan::main();
+}
+
+// ============================================================================
+// Array of Structures (AoS) - Cache Unfriendly
+// ============================================================================
+
+/// Benchmarks `aos::ParticleSystem::update_positions` for each particle count.
+///
+/// A fresh system is generated per iteration (generation is not timed). The
+/// mutated system is returned from the closure so that Divan treats it as the
+/// benchmark output: the position update cannot be discarded as dead code, and
+/// freeing the particle buffer is deferred until after the measurement instead
+/// of being counted in it.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn aos_update_positions(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| aos::ParticleSystem::new(count))
+        .bench_values(|mut system| {
+            system.update_positions(0.016);
+            system
+        });
+}
+
+/// Benchmarks `aos::ParticleSystem::compute_kinetic_energy` for each particle count.
+///
+/// The system is only read, so it is passed by reference (`bench_refs`): the
+/// input is then dropped by Divan outside the timed region rather than inside
+/// the closure. The computed energy is returned so Divan keeps it observable.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn aos_kinetic_energy(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| aos::ParticleSystem::new(count))
+        .bench_refs(|system| system.compute_kinetic_energy());
+}
+
+/// Benchmarks `aos::ParticleSystem::apply_gravity` for each particle count.
+///
+/// The mutated system is returned from the closure so that the velocity update
+/// stays observable to the optimizer and the system's deallocation is deferred
+/// by Divan instead of being included in the measured time.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn aos_apply_gravity(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| aos::ParticleSystem::new(count))
+        .bench_values(|mut system| {
+            system.apply_gravity(Vec3::new(0.0, -9.81, 0.0), 0.016);
+            system
+        });
+}
+
+// ============================================================================
+// Structure of Arrays - Cache Friendly
+// ============================================================================
+
+/// Benchmarks `soa::ParticleSystem::update_positions` for each particle count.
+///
+/// A fresh system is generated per iteration (generation is not timed). The
+/// mutated system is returned from the closure so that Divan treats it as the
+/// benchmark output: the position update cannot be discarded as dead code, and
+/// freeing the three arrays is deferred until after the measurement instead of
+/// being counted in it.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn soa_update_positions(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| soa::ParticleSystem::new(count))
+        .bench_values(|mut system| {
+            system.update_positions(0.016);
+            system
+        });
+}
+
+/// Benchmarks `soa::ParticleSystem::compute_kinetic_energy` for each particle count.
+///
+/// The system is only read, so it is passed by reference (`bench_refs`): the
+/// input is then dropped by Divan outside the timed region rather than inside
+/// the closure. The computed energy is returned so Divan keeps it observable.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn soa_kinetic_energy(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| soa::ParticleSystem::new(count))
+        .bench_refs(|system| system.compute_kinetic_energy());
+}
+
+/// Benchmarks `soa::ParticleSystem::apply_gravity` for each particle count.
+///
+/// The mutated system is returned from the closure so that the velocity update
+/// stays observable to the optimizer and the system's deallocation is deferred
+/// by Divan instead of being included in the measured time.
+#[divan::bench(args = [1_000, 10_000, 100_000])]
+fn soa_apply_gravity(bencher: divan::Bencher, count: usize) {
+    bencher
+        .with_inputs(|| soa::ParticleSystem::new(count))
+        .bench_values(|mut system| {
+            system.apply_gravity(Vec3::new(0.0, -9.81, 0.0), 0.016);
+            system
+        });
+}
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "1.92.0"
diff --git a/src/aos.rs b/src/aos.rs
@@ -0,0 +1,71 @@
+//! Array of Structures (AoS) - Cache Unfriendly
+//! When an operation needs only some fields (e.g. gravity only touches velocity),
+//! iterating still pulls the unused fields of every particle through the cache,
+//! leading to poor cache utilization
+
+use crate::Vec3;
+
+/// A single particle with all of its attributes stored together.
+///
+/// This is the element type of the AoS layout: position, velocity and mass of
+/// one particle sit next to each other in the same struct.
+#[derive(Clone, Debug)]
+pub struct Particle {
+    /// Current position of the particle.
+    pub position: Vec3,
+    /// Current velocity of the particle.
+    pub velocity: Vec3,
+    /// Mass of the particle.
+    pub mass: f32,
+}
+
+impl Particle {
+    /// Creates a particle from its position, velocity and mass.
+    pub fn new(position: Vec3, velocity: Vec3, mass: f32) -> Self {
+        Self {
+            position,
+            velocity,
+            mass,
+        }
+    }
+}
+
+/// Particle storage in Array-of-Structures form: one `Vec` of whole particles.
+pub struct ParticleSystem {
+    /// All particles; each element carries its own position, velocity and mass.
+    pub particles: Vec<Particle>,
+}
+
+impl ParticleSystem {
+    /// Builds `count` particles with deterministic, index-derived values.
+    ///
+    /// For index `i` (converted to `f32`): position `(i, 2i, 3i)`,
+    /// velocity `(0.1i, 0.2i, 0.3i)` and mass `1 + 0.01i`.
+    pub fn new(count: usize) -> Self {
+        let particles = (0..count)
+            .map(|i| {
+                let fi = i as f32;
+                Particle::new(
+                    Vec3::new(fi, fi * 2.0, fi * 3.0),
+                    Vec3::new(fi * 0.1, fi * 0.2, fi * 0.3),
+                    1.0 + fi * 0.01,
+                )
+            })
+            .collect();
+        Self { particles }
+    }
+
+    /// Advances every particle: `position += velocity * dt`.
+    ///
+    /// A `Particle` holds 28 bytes of field data (two `Vec3` of three `f32`
+    /// each, plus an `f32` mass). This operation uses position and velocity
+    /// (24 bytes); the mass (4 bytes) travels through the cache unused.
+    pub fn update_positions(&mut self, dt: f32) {
+        for particle in &mut self.particles {
+            particle.position = particle.position.add(&particle.velocity.scale(dt));
+        }
+    }
+
+    /// Returns the total kinetic energy, `sum(0.5 * mass * |velocity|^2)`.
+    ///
+    /// Reads velocity and mass of every particle; the position stored between
+    /// them in the same struct is loaded alongside but never used.
+    /// Accumulation is sequential in `f32`, starting from `0.0`.
+    pub fn compute_kinetic_energy(&self) -> f32 {
+        self.particles.iter().fold(0.0, |total, particle| {
+            let v = &particle.velocity;
+            let v2 = v.x * v.x + v.y * v.y + v.z * v.z;
+            total + 0.5 * particle.mass * v2
+        })
+    }
+
+    /// Applies a constant acceleration: `velocity += gravity * dt`.
+    ///
+    /// Only velocity is modified, yet it is interleaved with position and mass,
+    /// so those fields are pulled in with it.
+    pub fn apply_gravity(&mut self, gravity: Vec3, dt: f32) {
+        // The velocity change is identical for every particle: compute it once.
+        let delta = gravity.scale(dt);
+        for particle in &mut self.particles {
+            particle.velocity = particle.velocity.add(&delta);
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -0,0 +1,32 @@
+//! Particle simulation demonstrating cache-friendly vs cache-unfriendly data layouts
+pub mod aos;
+pub mod soa;
+
+/// Three-component `f32` vector used for positions, velocities and accelerations.
+#[derive(Clone, Copy, Debug)]
+pub struct Vec3 {
+    pub x: f32,
+    pub y: f32,
+    pub z: f32,
+}
+
+impl Vec3 {
+    /// Creates a vector from its three components.
+    pub fn new(x: f32, y: f32, z: f32) -> Self {
+        Vec3 { x, y, z }
+    }
+
+    /// Returns the component-wise sum of `self` and `other`.
+    pub fn add(&self, other: &Vec3) -> Vec3 {
+        let (x, y, z) = (self.x + other.x, self.y + other.y, self.z + other.z);
+        Self { x, y, z }
+    }
+
+    /// Returns a copy of `self` with every component multiplied by `factor`.
+    pub fn scale(&self, factor: f32) -> Vec3 {
+        Self::new(self.x * factor, self.y * factor, self.z * factor)
+    }
+}
diff --git a/src/soa.rs b/src/soa.rs
@@ -0,0 +1,61 @@
+//! Structure of Arrays (SoA) - Cache Friendly
+//! Data is organized so that accessing positions only touches position data,
+//! leading to excellent cache utilization
+
+use crate::Vec3;
+
+/// Particle storage in Structure-of-Arrays form: one `Vec` per attribute.
+///
+/// Index `i` in each array refers to the same particle. `new` creates the three
+/// arrays with equal length; the fields are public, so callers that resize
+/// them are responsible for keeping the lengths in sync.
+pub struct ParticleSystem {
+    /// Position of every particle.
+    pub positions: Vec<Vec3>,
+    /// Velocity of every particle.
+    pub velocities: Vec<Vec3>,
+    /// Mass of every particle.
+    pub masses: Vec<f32>,
+}
+
+impl ParticleSystem {
+    /// Builds `count` particles with deterministic, index-derived values.
+    ///
+    /// For index `i` (converted to `f32`): position `(i, 2i, 3i)`,
+    /// velocity `(0.1i, 0.2i, 0.3i)` and mass `1 + 0.01i`.
+    pub fn new(count: usize) -> Self {
+        let mut positions = Vec::with_capacity(count);
+        let mut velocities = Vec::with_capacity(count);
+        let mut masses = Vec::with_capacity(count);
+
+        for i in 0..count {
+            let fi = i as f32;
+            positions.push(Vec3::new(fi, fi * 2.0, fi * 3.0));
+            velocities.push(Vec3::new(fi * 0.1, fi * 0.2, fi * 0.3));
+            masses.push(1.0 + fi * 0.01);
+        }
+
+        Self {
+            positions,
+            velocities,
+            masses,
+        }
+    }
+
+    /// Advances every particle: `position += velocity * dt`.
+    ///
+    /// Walks the position and velocity arrays in lockstep; the mass array is
+    /// not touched. Iterating with `zip` avoids a bounds check per element and
+    /// stops at the shorter array if the lengths ever differ.
+    pub fn update_positions(&mut self, dt: f32) {
+        for (position, velocity) in self.positions.iter_mut().zip(&self.velocities) {
+            *position = position.add(&velocity.scale(dt));
+        }
+    }
+
+    /// Returns the total kinetic energy, `sum(0.5 * mass * |velocity|^2)`.
+    ///
+    /// Reads the velocity and mass arrays sequentially; positions are not
+    /// touched. Accumulation is sequential in `f32`, starting from `0.0`.
+    /// Stops at the shorter array if the lengths ever differ.
+    pub fn compute_kinetic_energy(&self) -> f32 {
+        self.velocities
+            .iter()
+            .zip(&self.masses)
+            .fold(0.0, |total, (v, &mass)| {
+                let v2 = v.x * v.x + v.y * v.y + v.z * v.z;
+                total + 0.5 * mass * v2
+            })
+    }
+
+    /// Applies a constant acceleration: `velocity += gravity * dt`.
+    ///
+    /// Only the velocity array is read and written.
+    pub fn apply_gravity(&mut self, gravity: Vec3, dt: f32) {
+        // The velocity change is identical for every particle: compute it once.
+        let delta = gravity.scale(dt);
+        for velocity in &mut self.velocities {
+            *velocity = velocity.add(&delta);
+        }
+    }
+}