Skip to content

Commit d9e1180

Browse files
committed
perf(ssa): pool-allocate Array<@reference T> literals contiguously
When an array literal's element type is `Ptr(Struct(..))` (an `@reference` class) and every element expression is a direct `TypedExpression::Struct` literal, the lowering now allocates ONE buffer of `N * sizeof(Struct)` and hands each struct literal a pre-computed slot pointer (via a new `array_pool_placement` field on `SsaBuilder`). The struct literal lowering skips its own `Malloc` when this placement is present and stores fields directly through the slot pointer. End result: the N bodies in `[a, b, c, …]` land contiguously in memory instead of in N separate `malloc` chunks. The pointer array surface (`List<Ptr<Body>>.data`) still holds N distinct pointers — one per slot — so identity semantics and pointer comparison are preserved unchanged. Why: on cache-bound architectures (x86 GHA runners specifically), scattered allocations cost ~145ms relative to the value-type contiguous layout on `bench_nbody_ref`. The pool gives `@reference` arrays the same spatial-locality access pattern as a value-type array while keeping the heap-pointer semantics user code expects. The fall-through path (element exprs aren't all struct literals, or the element type isn't `@reference`) is unchanged. The eligibility check is conservative: both conditions must hold. Bench impact on macOS aarch64 (the cross-platform check): within noise (528-552ms range for nbody_ref-LLVM, same as before). The real test is the Linux CI run on this commit — that's where the spatial-locality lever should show up. All 248 compiler tests + 219 embed tests pass; clippy clean.
1 parent 7242a10 commit d9e1180

1 file changed

Lines changed: 129 additions & 27 deletions

File tree

crates/compiler/src/ssa.rs

Lines changed: 129 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,19 @@ pub struct SsaBuilder {
228228
/// becomes a single Cranelift `fsqrt` (or libm-backed `sin`/`cos`)
229229
/// instruction.
230230
intrinsic_alias_map: IndexMap<InternedString, crate::hir::Intrinsic>,
231+
/// When set, the next `@reference`-class struct literal that
232+
/// gets lowered uses this pointer as its allocation site INSTEAD
233+
/// of emitting a fresh `Malloc`. Used by the array-of-`@reference`
234+
/// literal lowering to pool-allocate the N bodies into one
235+
/// contiguous buffer: spatial locality on cache-bound architectures
236+
/// (x86 GHA runners) gets you the value-type access pattern while
237+
/// preserving the pointer-array semantics user code expects.
238+
///
239+
/// Always `None` outside the array-of-`@reference` literal path.
240+
/// The array lowering sets a slot pointer before each element
241+
/// expression and clears it immediately after, so nested struct
242+
/// literals don't get accidentally placed.
243+
array_pool_placement: Option<HirId>,
231244
}
232245

233246
/// Context for pattern matching
@@ -502,6 +515,7 @@ impl SsaBuilder {
502515
resume_param_names: HashSet::new(),
503516
preset_param_typed_ast_types: IndexMap::new(),
504517
intrinsic_alias_map: default_intrinsic_alias_map(),
518+
array_pool_placement: None,
505519
}
506520
}
507521

@@ -544,6 +558,7 @@ impl SsaBuilder {
544558
resume_param_names: HashSet::new(),
545559
preset_param_typed_ast_types: IndexMap::new(),
546560
intrinsic_alias_map: default_intrinsic_alias_map(),
561+
array_pool_placement: None,
547562
function,
548563
};
549564
// Pre-register all existing blocks in the definitions map
@@ -4852,33 +4867,45 @@ impl SsaBuilder {
48524867
}
48534868
let total_size = running.max(1);
48544869

4855-
// Size constant for malloc.
4856-
let size_const = self.create_value(
4857-
HirType::I64,
4858-
HirValueKind::Constant(crate::hir::HirConstant::I64(total_size as i64)),
4859-
);
4860-
4861-
// Emit `Call(Intrinsic::Malloc, [size])`. The
4862-
// backend lowers Malloc to a libc call returning a
4863-
// pointer; the SSA value is typed
4864-
// `Ptr(Struct{..})` to match the struct's HIR
4865-
// type.
4866-
let malloc_result =
4867-
self.create_value(struct_ty.clone(), HirValueKind::Instruction);
4868-
self.add_instruction(
4869-
block_id,
4870-
HirInstruction::Call {
4871-
result: Some(malloc_result),
4872-
callee: crate::hir::HirCallable::Intrinsic(
4873-
crate::hir::Intrinsic::Malloc,
4874-
),
4875-
args: vec![size_const],
4876-
type_args: vec![],
4877-
const_args: vec![],
4878-
is_tail: false,
4879-
},
4880-
);
4881-
self.add_use(size_const, malloc_result);
4870+
// Allocation site. Two modes:
4871+
// * Normal: emit `Call(Intrinsic::Malloc, [size])`
4872+
// and field-store through the returned pointer.
4873+
// * Pooled (`array_pool_placement = Some(ptr)`):
4874+
// skip the malloc entirely — the array literal
4875+
// lowering pre-allocated one big buffer for all
4876+
// N bodies and handed us a pointer to slot i.
4877+
// Field stores go directly to that slot, so the
4878+
// N bodies end up contiguous in memory. Spatial
4879+
// locality matches a value-type Array<Body> on
4880+
// cache-bound architectures (x86 GHA runners)
4881+
// while preserving the pointer-array surface
4882+
// the rest of the compiler expects.
4883+
let malloc_result = if let Some(slot_ptr) = self.array_pool_placement {
4884+
slot_ptr
4885+
} else {
4886+
let size_const = self.create_value(
4887+
HirType::I64,
4888+
HirValueKind::Constant(crate::hir::HirConstant::I64(
4889+
total_size as i64,
4890+
)),
4891+
);
4892+
let r = self.create_value(struct_ty.clone(), HirValueKind::Instruction);
4893+
self.add_instruction(
4894+
block_id,
4895+
HirInstruction::Call {
4896+
result: Some(r),
4897+
callee: crate::hir::HirCallable::Intrinsic(
4898+
crate::hir::Intrinsic::Malloc,
4899+
),
4900+
args: vec![size_const],
4901+
type_args: vec![],
4902+
const_args: vec![],
4903+
is_tail: false,
4904+
},
4905+
);
4906+
self.add_use(size_const, r);
4907+
r
4908+
};
48824909

48834910
// For each field: emit GEP (byte offset) + Store.
48844911
// GEPs use HirType::U8 + a single i64 byte-offset
@@ -5039,9 +5066,84 @@ impl SsaBuilder {
50395066
},
50405067
);
50415068

5069+
// Pool-allocation eligibility: `elem_ty` is `Ptr(Struct(..))`
5070+
// (every element is an `@reference` class) AND every
5071+
// `elem_expr` is a direct `TypedExpression::Struct`
5072+
// literal. When eligible, we allocate ONE buffer of
5073+
// `N * sizeof(Struct)` and hand each struct literal a
5074+
// pre-computed slot pointer instead of letting it emit
5075+
// its own `Malloc`. Bodies land contiguously in memory;
5076+
// the pointer-array (`data_ptr` slots) still holds
5077+
// distinct per-body pointers so identity semantics are
5078+
// preserved.
5079+
let pool_eligible = matches!(elem_ty, HirType::Ptr(ref inner) if matches!(**inner, HirType::Struct(_)))
5080+
&& elements
5081+
.iter()
5082+
.all(|e| matches!(e.node, TypedExpression::Struct(_)));
5083+
let (pool_buf_ptr, pool_slot_size) = if pool_eligible {
5084+
if let HirType::Ptr(ref inner) = elem_ty {
5085+
let slot_size = hir_ty_size(inner);
5086+
let pool_total = elements.len() * slot_size;
5087+
let size_const = self.create_value(
5088+
HirType::I64,
5089+
HirValueKind::Constant(crate::hir::HirConstant::I64(pool_total as i64)),
5090+
);
5091+
let pool = self.create_value(elem_ty.clone(), HirValueKind::Instruction);
5092+
self.add_instruction(
5093+
block_id,
5094+
HirInstruction::Call {
5095+
result: Some(pool),
5096+
callee: crate::hir::HirCallable::Intrinsic(
5097+
crate::hir::Intrinsic::Malloc,
5098+
),
5099+
args: vec![size_const],
5100+
type_args: vec![],
5101+
const_args: vec![],
5102+
is_tail: false,
5103+
},
5104+
);
5105+
self.add_use(size_const, pool);
5106+
(Some(pool), slot_size)
5107+
} else {
5108+
(None, 0)
5109+
}
5110+
} else {
5111+
(None, 0)
5112+
};
5113+
50425114
// Step 2: Store each element into the data buffer
50435115
for (i, elem_expr) in elements.iter().enumerate() {
5116+
// For pooled `@reference` arrays: compute slot_i =
5117+
// pool_buf + i*slot_size and hand it to the struct
5118+
// literal lowering via `array_pool_placement`. The
5119+
// struct literal will use this pointer instead of
5120+
// mallocing — fields land at pool_buf+i*slot_size+
5121+
// field_offset, exactly contiguous.
5122+
if let Some(pool) = pool_buf_ptr {
5123+
let offset = i * pool_slot_size;
5124+
let offset_const = self.create_value(
5125+
HirType::I64,
5126+
HirValueKind::Constant(crate::hir::HirConstant::I64(offset as i64)),
5127+
);
5128+
let slot_ptr =
5129+
self.create_value(elem_ty.clone(), HirValueKind::Instruction);
5130+
self.add_instruction(
5131+
block_id,
5132+
HirInstruction::GetElementPtr {
5133+
result: slot_ptr,
5134+
ty: HirType::U8,
5135+
ptr: pool,
5136+
indices: vec![offset_const],
5137+
},
5138+
);
5139+
self.add_use(pool, slot_ptr);
5140+
self.add_use(offset_const, slot_ptr);
5141+
self.array_pool_placement = Some(slot_ptr);
5142+
}
50445143
let elem_val = self.translate_expression(block_id, elem_expr)?;
5144+
// Clear immediately so it doesn't leak into nested
5145+
// struct literals inside subsequent elements.
5146+
self.array_pool_placement = None;
50455147

50465148
let offset = i * elem_size;
50475149
let offset_const = self.create_value(

0 commit comments

Comments
 (0)