// 719. SoA vs AoS: data layout for cache efficiency
//
// Shows the impact of struct-of-arrays vs array-of-structs on hot loops.
use std::time::Instant;
// โโ Array of Structures (AoS) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
/// Classic OOP layout: one struct per element.
/// Memory layout: [x0,y0,z0,mass0,vx0,vy0,vz0, x1,y1,z1,mass1,...].
/// When iterating only `x`, 75% of each cache line is wasted.
#[derive(Clone, Debug, Default)]
pub struct ParticleAoS {
pub x: f32,
pub y: f32,
pub z: f32,
pub mass: f32,
pub vx: f32,
pub vy: f32,
pub vz: f32,
_pad: f32, // pad to 32 bytes for illustration
}
const _: () = assert!(std::mem::size_of::<ParticleAoS>() == 32);
/// Sum all x-coordinates.
/// Loads 32 bytes per element; only 4 bytes (x) are used โ 87.5% wasted bandwidth.
pub fn sum_x_aos(particles: &[ParticleAoS]) -> f32 {
particles.iter().map(|p| p.x).sum()
}
/// Apply gravity to all y-velocities.
pub fn apply_gravity_aos(particles: &mut [ParticleAoS], dt: f32) {
for p in particles.iter_mut() {
p.vy -= 9.81 * p.mass * dt;
}
}
// โโ Structure of Arrays (SoA) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
/// SoA layout: separate Vec per field.
/// Memory layout: [x0,x1,x2,...] [y0,y1,...] [z0,z1,...] [mass0,mass1,...] ...
/// When iterating only `x`, the full cache line is useful data.
pub struct ParticlesSoA {
pub x: Vec<f32>,
pub y: Vec<f32>,
pub z: Vec<f32>,
pub mass: Vec<f32>,
pub vx: Vec<f32>,
pub vy: Vec<f32>,
pub vz: Vec<f32>,
}
impl ParticlesSoA {
pub fn new(n: usize) -> Self {
Self {
x: vec![0.0; n],
y: vec![0.0; n],
z: vec![0.0; n],
mass: vec![1.0; n],
vx: vec![0.0; n],
vy: vec![0.0; n],
vz: vec![0.0; n],
}
}
pub fn len(&self) -> usize { self.x.len() }
}
/// Sum all x-coordinates.
/// Only touches the `x` array โ 100% of loaded cache lines are used.
pub fn sum_x_soa(p: &ParticlesSoA) -> f32 {
p.x.iter().copied().sum()
}
/// Apply gravity to all y-velocities.
/// Touches only `vy` and `mass` โ two arrays, both hot in cache.
pub fn apply_gravity_soa(p: &mut ParticlesSoA, dt: f32) {
for i in 0..p.len() {
p.vy[i] -= 9.81 * p.mass[i] * dt;
}
}
// โโ Initialisation helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
fn make_aos(n: usize) -> Vec<ParticleAoS> {
(0..n).map(|i| ParticleAoS {
x: i as f32,
y: (i * 2) as f32,
z: 0.0,
mass: 1.0,
..Default::default()
}).collect()
}
fn make_soa(n: usize) -> ParticlesSoA {
let mut p = ParticlesSoA::new(n);
for i in 0..n {
p.x[i] = i as f32;
p.y[i] = (i * 2) as f32;
}
p
}
// โโ main โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
fn main() {
const N: usize = 2_000_000;
const ITERS: u32 = 5;
let mut aos = make_aos(N);
let mut soa = make_soa(N);
// --- sum_x benchmark ---
let t0 = Instant::now();
let mut sink = 0.0f32;
for _ in 0..ITERS { sink += sum_x_aos(&aos); }
let aos_sum_time = t0.elapsed();
let t0 = Instant::now();
let mut sink2 = 0.0f32;
for _ in 0..ITERS { sink2 += sum_x_soa(&soa); }
let soa_sum_time = t0.elapsed();
println!("sum_x (N={N}, {ITERS} iters):");
println!(" AoS: {:?} (result={})", aos_sum_time, sink);
println!(" SoA: {:?} (result={})", soa_sum_time, sink2);
let ratio = aos_sum_time.as_secs_f64() / soa_sum_time.as_secs_f64();
println!(" SoA speedup: {:.2}ร", ratio);
// --- apply_gravity benchmark ---
let t0 = Instant::now();
for _ in 0..ITERS { apply_gravity_aos(&mut aos, 0.016); }
let aos_grav_time = t0.elapsed();
let t0 = Instant::now();
for _ in 0..ITERS { apply_gravity_soa(&mut soa, 0.016); }
let soa_grav_time = t0.elapsed();
println!("\napply_gravity (N={N}, {ITERS} iters):");
println!(" AoS: {:?}", aos_grav_time);
println!(" SoA: {:?}", soa_grav_time);
let ratio2 = aos_grav_time.as_secs_f64() / soa_grav_time.as_secs_f64();
println!(" SoA speedup: {:.2}ร", ratio2);
println!("\nMemory per element:");
println!(" AoS: {} bytes/particle (struct size)", std::mem::size_of::<ParticleAoS>());
println!(" SoA: 7 ร 4 = 28 bytes/particle (7 f32 arrays)");
}
// โโ Tests โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn sum_x_equivalence() {
let n = 100;
let aos = make_aos(n);
let soa = make_soa(n);
let expected: f32 = (0..n).map(|i| i as f32).sum();
assert!((sum_x_aos(&aos) - expected).abs() < 1.0);
assert!((sum_x_soa(&soa) - expected).abs() < 1.0);
}
#[test]
fn gravity_reduces_vy() {
let n = 10;
let mut aos = make_aos(n);
apply_gravity_aos(&mut aos, 1.0);
for p in &aos {
assert!(p.vy < 0.0, "vy should be negative after gravity");
}
let mut soa = make_soa(n);
apply_gravity_soa(&mut soa, 1.0);
for &vy in &soa.vy {
assert!(vy < 0.0);
}
}
#[test]
fn particle_struct_size() {
assert_eq!(std::mem::size_of::<ParticleAoS>(), 32);
}
#[test]
fn soa_particle_count() {
let soa = make_soa(50);
assert_eq!(soa.len(), 50);
assert_eq!(soa.x.len(), 50);
assert_eq!(soa.vy.len(), 50);
}
}
(* OCaml: SoA vs AoS for cache efficiency *)
(* --- Array of Structures (AoS) --- *)
type particle = {
x : float;
y : float;
z : float;
mass : float;
}
(* Sum just the x-coordinates: must traverse all fields even though we skip y/z/mass *)
let sum_x_aos particles =
Array.fold_left (fun acc p -> acc +. p.x) 0.0 particles
(* --- Structure of Arrays (SoA) --- *)
type particles_soa = {
xs : float array;
ys : float array;
zs : float array;
masses: float array;
}
(* Sum just x: touches ONLY xs โ contiguous in memory *)
let sum_x_soa soa =
Array.fold_left (+.) 0.0 soa.xs
(* Gravity update: needs x, y, mass โ still more cache-friendly than AoS *)
let apply_gravity_soa soa dt =
Array.iteri (fun i m ->
soa.ys.(i) <- soa.ys.(i) -. 9.81 *. m *. dt
) soa.masses
let make_aos n =
Array.init n (fun i ->
{ x = float_of_int i; y = float_of_int (i * 2);
z = 0.0; mass = 1.0 })
let make_soa n =
{ xs = Array.init n float_of_int;
ys = Array.init n (fun i -> float_of_int (i * 2));
zs = Array.make n 0.0;
masses = Array.make n 1.0 }
let time_it label f =
let t0 = Sys.time () in
let result = f () in
let t1 = Sys.time () in
Printf.printf "%s: result=%.2f time=%.6fs\n" label result (t1 -. t0)
let () =
let n = 1_000_000 in
let aos = make_aos n in
let soa = make_soa n in
time_it "AoS sum_x" (fun () -> sum_x_aos aos);
time_it "SoA sum_x" (fun () -> sum_x_soa soa);
(* OCaml note: records are heap-allocated objects with GC overhead;
floats in float arrays are unboxed (OCaml optimises float arrays).
So float array SoA in OCaml is actually already cache-friendly! *)
Printf.printf "Note: OCaml float arrays are unboxed โ SoA advantage is native here.\n"