Skip to content

Commit 0575259

Browse files
committed
feat(fma): soa_scan — 1M-row SoA key-vs-value scalability PoC
Proves the OGAR canon's "the key prerenders nodes with zero value decode" at scale. A NodeRow is 512 B = key(16) + edges(16) + value(480); laid out columnar (SoA) as a contiguous key column + value column, a key-only prefix-route scan (count the skeleton subtree by classid 0x0A02 — the same routing /fma-body's skeleton button does) touches ~30x less memory than materializing the value slab, and is cache-resident at the key column's size.

Measured (best-of-7, x86-64-v4):

      rows | key-only (route)     | value (decode slab) | speedup
     64000 | 2209 M/s  35.3 GB/s  | 17 M/s  8.1 GB/s    | 130x
    256000 | 1581 M/s  25.3 GB/s  | 17 M/s  8.1 GB/s    |  94x
   1000000 | 1507 M/s  24.1 GB/s  | 17 M/s  8.1 GB/s    |  89x

The key-only scan stays ~1.5 G rows/s and flat as N grows (the 16 MB key column at 1M is cache-resident); the value scan is RAM-bandwidth-bound over 480 MB. The 30x is the pure memory floor; the ~90x adds the key column's cache residency + the per-row slab decode work. 1M is synthetic (real FMA ~1368 placed / ~75K terms — the throughput is the point), seeded by the FMA addressing distribution. Dep-free (std only), like the rest of the crate.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01GJ4NVBSjq1w5h7RmTbVafb
1 parent d75ef84 commit 0575259

2 files changed

Lines changed: 119 additions & 0 deletions

File tree

fma/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ BodyParts3D meshes ──tissue (is_a tree)──► triangle rasterizer (z-buff
4242
| `converge` | **v3**: cascading-HHTL `(place:tissue)` **canonical NodeGuid** + `connected_to` edges → `guid/{guid_converged,nodes,edges}.tsv` |
4343
| `graph` | **v3 render**: SOLID triangle surface colored by `tissue`, with a GUID **prefix** that selects the subtree (`graph … 00000a02` = skeleton) → `graph/graph_<sel>.png` |
4444
| `cockpit_bake` | bake the full body → `cockpit/public/fma_body.mesh` (SPM1, opacity = layer id) for the **`/fma-body`** cockpit page (layer toggles + transparency) |
45+
| `soa_scan` | 1M-row SoA scalability PoC: key-only **prefix-route** scan vs full **value-decode** scan (~90× at 1M — the canon's *"prerender with zero value decode"*) |
4546
| `anchor` | compression study: cascade vs raw-cartesian vs Cartesian-Skeleton hybrid |
4647

4748
## Routes (`serve`)

fma/src/bin/soa_scan.rs

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// soa_scan.rs — 1M-row SoA scalability PoC: key-only scan vs full-value scan.
2+
//
3+
// Proves the OGAR canon's "the key prerenders nodes with ZERO value decode": when the
4+
// canonical NodeRow (512 B = key 16 + edges 16 + value 480) is laid out COLUMNAR
5+
// (struct-of-arrays) — a contiguous key column + a contiguous value column — a key-only
6+
// scan (prefix-route / render-select, e.g. "draw the skeleton subtree" = classid 0x0A02)
7+
// touches ~30x less memory than materializing the value slab, and stays flat as N grows.
8+
// This is the same prefix routing the /fma-body skeleton button and `graph 00000a02` do,
9+
// measured at scale.
10+
//
11+
// 1M is SYNTHETIC — real FMA is ~1368 placed meshes / ~75K terms; the scan throughput is
12+
// the point, not data realism. Synthetic rows are seeded by the FMA addressing
13+
// distribution (~25% skeleton classid, the rest soft tissue).
14+
//
15+
// usage: soa_scan [max_n] [reps] (default 1_000_000 rows, 5 reps; lower max_n for less RAM)
16+
17+
use std::hint::black_box;
18+
use std::time::Instant;
19+
20+
const CLASSID_SOFT: u32 = 0x0000_0A01;
const CLASSID_SKELETON: u32 = 0x0000_0A02;

/// Generates `n` synthetic rows as two parallel columns: the 16-byte GUID keys and the
/// 480-byte value slabs, each in its own contiguous `Vec` (struct-of-arrays layout).
///
/// Key layout (all little-endian):
/// - bytes `0..4`   classid — skeleton for every fourth row (`i % 4 == 0`), soft tissue otherwise
/// - bytes `4..12`  the row index multiplied (wrapping) by a 64-bit odd constant
/// - bytes `12..15` low 24 bits of the row index
/// - byte  `15`     top byte of that 64-bit product
///
/// Each value slab is zero except for three bytes derived from the row index and the
/// product, so a scan over the slabs reads data that differs per row.
fn build(n: usize) -> (Vec<[u8; 16]>, Vec<[u8; 480]>) {
    let mut key_col: Vec<[u8; 16]> = Vec::with_capacity(n);
    let mut value_col: Vec<[u8; 480]> = Vec::with_capacity(n);

    for row in 0..n {
        // Spread the row index across 64 bits; the bytes are reused by key and slab.
        let spread_le = (row as u64)
            .wrapping_mul(0x9E37_79B9_7F4A_7C15)
            .to_le_bytes();
        let classid = match row % 4 {
            0 => CLASSID_SKELETON,
            _ => CLASSID_SOFT,
        };
        // Identity is a u24: mask first, then keep the three low little-endian bytes.
        let ident_le = ((row as u32) & 0x00FF_FFFF).to_le_bytes();

        let mut key = [0u8; 16];
        key[..4].copy_from_slice(&classid.to_le_bytes());
        key[4..12].copy_from_slice(&spread_le);
        key[12..15].copy_from_slice(&ident_le[..3]);
        key[15] = spread_le[7]; // most significant byte of the product
        key_col.push(key);

        let mut slab = [0u8; 480];
        slab[0] = row as u8; // low byte of the row index
        slab[239] = spread_le[0]; // low byte of the product
        slab[479] = (row >> 8) as u8; // second byte of the row index
        value_col.push(slab);
    }

    (key_col, value_col)
}
48+
49+
/// Converts a byte count moved in `secs` seconds into decimal gigabytes per second
/// (1 GB = 1e9 bytes). A zero `secs` follows IEEE-754 division (infinity or NaN).
fn gbps(bytes: usize, secs: f64) -> f64 {
    let bytes_per_sec = bytes as f64 / secs;
    bytes_per_sec / 1e9
}
52+
53+
fn main() {
54+
let a: Vec<String> = std::env::args().collect();
55+
let max_n: usize = a.get(1).and_then(|s| s.parse().ok()).unwrap_or(1_000_000);
56+
let reps: usize = a.get(2).and_then(|s| s.parse().ok()).unwrap_or(5);
57+
58+
println!("# SoA scalability: key-only (16 B/row, prefix-route) vs value (480 B/row, decode the slab)");
59+
println!("# NodeRow = 512 B = key(16) + edges(16) + value(480); columnar SoA. {reps} reps, best-of.");
60+
println!("{:>10} | {:>20} | {:>20} | {:>8}", "rows", "key-only (route)", "value (decode slab)", "speedup");
61+
println!("{:->10}-+-{:->20}-+-{:->20}-+-{:->8}", "", "", "", "");
62+
63+
let scales: Vec<usize> = [64_000usize, 256_000, max_n].into_iter().filter(|&n| n > 0).collect();
64+
for &n in &scales {
65+
let (keys, values) = build(n);
66+
67+
// key-only scan: prefix-route — count the skeleton subtree, reading only the key
68+
// column (classid at bytes 0..4; the 16-byte key column is what's streamed).
69+
let mut best_key = f64::MAX;
70+
let mut routed = 0usize;
71+
for _ in 0..reps {
72+
let t = Instant::now();
73+
let mut c = 0usize;
74+
for k in &keys {
75+
let classid = u32::from_le_bytes([k[0], k[1], k[2], k[3]]);
76+
if classid == CLASSID_SKELETON {
77+
c += 1;
78+
}
79+
}
80+
black_box(c);
81+
best_key = best_key.min(t.elapsed().as_secs_f64());
82+
routed = c;
83+
}
84+
85+
// value scan: materialize the slab — sum all 480 bytes per row (the work a value
86+
// decode / tenant read does), reading the whole value column.
87+
let mut best_val = f64::MAX;
88+
for _ in 0..reps {
89+
let t = Instant::now();
90+
let mut s = 0u64;
91+
for v in &values {
92+
let mut acc = 0u64;
93+
for &b in v.iter() {
94+
acc = acc.wrapping_add(b as u64);
95+
}
96+
s = s.wrapping_add(acc);
97+
}
98+
black_box(s);
99+
best_val = best_val.min(t.elapsed().as_secs_f64());
100+
}
101+
102+
let key_bytes = n * 16; // key column streamed
103+
let val_bytes = n * 480; // value column streamed
104+
let key_rps = n as f64 / best_key;
105+
let val_rps = n as f64 / best_val;
106+
println!(
107+
"{:>10} | {:>8.0} M/s {:>5.1} GB/s | {:>8.0} M/s {:>5.1} GB/s | {:>6.1}x",
108+
n,
109+
key_rps / 1e6,
110+
gbps(key_bytes, best_key),
111+
val_rps / 1e6,
112+
gbps(val_bytes, best_val),
113+
best_val / best_key,
114+
);
115+
let _ = routed;
116+
}
117+
println!("# key-only touches 16 B/row vs 480 B/row (30x less); routing/render-select needs NO value decode.");
118+
}

0 commit comments

Comments
 (0)