Skip to content

Commit 988ea3b

Browse files
avrabeclaude
andauthored
fix(optimizer): optimized path declines to direct on R4-R8 exhaustion (#496, #242) (#502)
The optimized ARM path (ir_to_arm, the default non-`--relocatable` self-contained image) borrowed R12/IP as last-resort scratch when its R4-R8 pool was exhausted. R12 is the encoder's indexed-load base scratch (#212), so under real register pressure the "dest written before R12 used" assumption broke: two folds collide on R12, or an R12 value survives across a later MemLoad's `add ip,ip,#off` base math — a silent miscompile. Both silicon fixtures were victims on the default path: control_step execution-faulted (`add ip,ip,ip` = 2×base → ldrh READ_UNMAPPED) and flight_seam_flat produced wrong values. Fix: `alloc_i32_scratch` (and the i64-pair fallback) flag pool exhaustion via a shared Cell instead of silently borrowing R12; after the build, ir_to_arm returns `Err(UnsupportedInstruction)` so arm_backend retries via the direct selector, which has real spill-on-exhaustion support. This is a correctness retreat (the function loses optimized-path codegen until proper spilling lands in the VCR-RA selector), not a perf change. The declined output is byte-identical to `--no-optimize`. Gated by scripts/repro/r12_spill_496_differential.py (CI: r12-spill-496-oracle): both silicon fixtures, compiled via the DEFAULT optimized path, execute bit-identically to wasmtime (control_step → 0x00210A55, flight_algo → 0x07FDF307); pre-fix the harness faults on every vector. Frozen byte gate bit-identical (it compiles --relocatable, untouched); callee-saved-490, block-brif-483, control_step, bulk_memory_374, flight_seam oracles all green. Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent dc1d122 commit 988ea3b

3 files changed

Lines changed: 289 additions & 9 deletions

File tree

.github/workflows/ci.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,3 +418,43 @@ jobs:
418418
env:
419419
SYNTH: ./target/debug/synth
420420
run: python scripts/repro/block_brif_483_differential.py
421+
422+
r12-spill-496-oracle:
423+
name: optimized-path register-exhaustion oracle
424+
# VCR-ORACLE-001 (#242, #496): EXECUTE the two silicon fixtures
425+
# (control_step, flight_seam_flat) compiled via the DEFAULT optimized path
426+
# (no --relocatable) under unicorn (UC_ARCH_ARM / Thumb) and diff the result
427+
# vs wasmtime. Guards the #496 fix: the optimized path used to borrow R12/IP
428+
# as last-resort scratch when its R4-R8 pool was exhausted, but R12 is the
429+
# encoder's indexed-load base scratch (#212), so under real register pressure
430+
# the value collided — control_step execution-faulted (READ_UNMAPPED from
431+
# `add ip,ip,ip` = 2×base) and flight_seam_flat produced wrong values. The fix
432+
# flags exhaustion and DECLINES the function to the direct selector (which
433+
# spills). The frozen byte gate only covers the --relocatable direct path, so
434+
# nothing else exercises the default self-contained-image path's register
435+
# pressure. Isolated job: emulation deps pip-installed here ONLY.
436+
runs-on: ubuntu-latest
437+
steps:
438+
- uses: actions/checkout@v7
439+
- uses: dtolnay/rust-toolchain@stable
440+
- name: Cache Cargo dependencies
441+
uses: actions/cache@v5
442+
with:
443+
path: |
444+
~/.cargo/registry
445+
~/.cargo/git
446+
target/
447+
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
448+
restore-keys: |
449+
${{ runner.os }}-cargo-
450+
- name: Build synth
451+
run: cargo build -p synth-cli
452+
- uses: actions/setup-python@v5
453+
with:
454+
python-version: "3.x"
455+
- name: Install emulation deps
456+
run: pip install wasmtime unicorn pyelftools
457+
- name: Run register-exhaustion oracle
458+
env:
459+
SYNTH: ./target/debug/synth
460+
run: python scripts/repro/r12_spill_496_differential.py

crates/synth-synthesis/src/optimizer_bridge.rs

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2581,6 +2581,17 @@ impl OptimizerBridge {
25812581
}
25822582
};
25832583

2584+
// #496: the optimized path's last-resort scratch fallbacks below
2585+
// (`alloc_i32_scratch` → R12, `alloc_i64_pair` → (R4,R5)) are UNSAFE under
2586+
// real register pressure — R12/IP is the encoder's indexed-load base
2587+
// scratch (#212), so a fold that exhausts R4-R8 and borrows R12 collides
2588+
// with the address math of a later MemLoad (control_step: `lsl ip,r4,ip`
2589+
// clobbered by base materialization → `add ip,ip,ip`). Rather than emit a
2590+
// silent miscompile, we record that exhaustion happened and DECLINE the
2591+
// whole function to the direct selector (which spills correctly) at the end
2592+
// of the build. The Cell is shared by reference into both closures.
2593+
let r12_exhausted = std::cell::Cell::new(false);
2594+
25842595
// Allocate a CONSECUTIVE callee-saved register pair for an i64 destination.
25852596
//
25862597
// Searches `[(R4,R5), (R6,R7), (R8,R9), (R10,R11)]` for a pair where neither
@@ -2615,11 +2626,13 @@ impl OptimizerBridge {
26152626
return (lo, hi);
26162627
}
26172628
}
2618-
// Fallback — same hardcoded pair the buggy code used. Better than crashing,
2619-
// and matches existing behaviour when the caller is so pressured that
2620-
// even R8..R11 are occupied. (Empirically this never triggers for
2621-
// workloads we care about; if it does, the architectural fix is
2622-
// proper spilling, not a wider search.)
2629+
// #496: every pair is occupied — the (R4,R5) fallback would alias a
2630+
// live value. Flag exhaustion so the build declines to the direct
2631+
// selector; still return a pair so the rest of the build completes
2632+
// (its bytes are discarded). This path is unverified by a fixture
2633+
// (empirically never triggered for our workloads), but flagging it is
2634+
// free insurance against the same class as the i32 case below.
2635+
r12_exhausted.set(true);
26232636
(Reg::R4, Reg::R5)
26242637
};
26252638

@@ -2654,10 +2667,14 @@ impl OptimizerBridge {
26542667
return r;
26552668
}
26562669
}
2657-
// Pressure-relief fallback. R12 is acceptable here because
2658-
// the call sites that use this helper write the destination
2659-
// BEFORE using R12 as scratch (e.g. MemLoad emits the LDR
2660-
// last, after the address math).
2670+
// #496: every callee-saved scratch is live. Borrowing R12/IP here is
2671+
// a miscompile under real pressure — IP is the encoder's indexed-load
2672+
// base scratch, so the "dest written before R12 used" assumption fails
2673+
// when two folds both land on R12 (collide) or an R12 value survives
2674+
// across a later MemLoad's base math. Flag exhaustion so the build
2675+
// declines to the direct selector (which spills); still return R12 so
2676+
// the remaining build completes (those bytes are discarded).
2677+
r12_exhausted.set(true);
26612678
Reg::R12
26622679
};
26632680

@@ -5549,6 +5566,22 @@ impl OptimizerBridge {
55495566
// decision here (pre-realloc) would over-save. See
55505567
// `liveness::ensure_callee_saved_prologue`.
55515568

5569+
// #496: if any scratch allocation exhausted the R4-R8 pool and fell back to
5570+
// R12/IP (or the i64 (R4,R5) alias), the body we just built contains a
5571+
// latent miscompile (R12 is the encoder's indexed-load base scratch).
5572+
// Decline the whole function to the direct selector, which has real
5573+
// spill-on-exhaustion support (`instruction_selector`); `arm_backend`
5574+
// catches this `Err` and retries via `select_direct`. This is a
5575+
// correctness retreat, not a perf win — keeping these functions on the
5576+
// optimized path requires proper spilling there, the VCR-RA follow-up.
5577+
if r12_exhausted.get() {
5578+
return Err(Error::UnsupportedInstruction(
5579+
"optimized path: R4-R8 scratch pool exhausted — declining to the \
5580+
direct selector for correct spilling (#496)"
5581+
.to_string(),
5582+
));
5583+
}
5584+
55525585
Ok(arm_instrs)
55535586
}
55545587
}
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#!/usr/bin/env python3
2+
"""#496 (epic #242) — EXECUTION-validate the DEFAULT optimized-path register-
3+
exhaustion fix.
4+
5+
The optimized ARM path (`optimizer_bridge.rs::ir_to_arm`, the default
6+
non-`--relocatable` self-contained image) used **R12/IP** as a last-resort
7+
scratch when its R4-R8 pool was exhausted. R12 is the encoder's indexed-load
8+
base scratch (#212), so under real register pressure the "dest written before
9+
R12 used" assumption broke: two folds collide on R12, or an R12 value survives
10+
across a later MemLoad's `add ip,ip,#off` base math — a silent miscompile.
11+
**Both silicon fixtures were victims on the default path**: `control_step`
12+
(0x00210A55) execution-faulted (`lsl ip,r4,ip` clobbered → `add ip,ip,ip` =
13+
2×base → ldrh READ_UNMAPPED), `flight_seam_flat` produced wrong values.
14+
15+
The fix flags pool exhaustion and DECLINES the whole function to the direct
16+
selector (which spills correctly), instead of borrowing R12. This harness proves
17+
the property that matters: the **default** `synth compile` (no `--relocatable`,
18+
i.e. the path a self-contained image actually takes) now executes
19+
bit-for-bit like wasmtime on both fixtures.
20+
21+
wasmtime is ground truth; unicorn runs synth's Thumb-2 output compiled WITHOUT
22+
`--relocatable`. Symbols come from the ELF symtab (SHT_SYMTAB), not `synth
23+
disasm` text — the disasm renders host-dependently (the ARM decoder misreads
24+
Thumb bytes on a bare runner). The linear-memory base is read from the
25+
`__linear_memory_base` symbol and both R11 and the unicorn mapping use it, so
26+
absolute and R11-relative addressing both resolve.
27+
28+
NON-VACUITY: pre-fix, `control_step` faulted and `flight_seam_flat` mismatched on
29+
this exact (default-path) harness; the fix changed both functions' bytes
30+
(control_step 626→464, flat 2096→1070). A regression that reinstates the R12
31+
borrow re-breaks execution here.
32+
33+
Run (needs wasmtime + unicorn + pyelftools):
34+
SYNTH=./target/debug/synth python scripts/repro/r12_spill_496_differential.py
35+
Exits nonzero on any mismatch or fault.
36+
"""
37+
import os
38+
import struct
39+
import subprocess
40+
import sys
41+
from pathlib import Path
42+
43+
import wasmtime
44+
from elftools.elf.elffile import ELFFile
45+
from unicorn import UC_ARCH_ARM, UC_MODE_THUMB, Uc, UcError
46+
from unicorn.arm_const import (
47+
UC_ARM_REG_LR,
48+
UC_ARM_REG_R0,
49+
UC_ARM_REG_R1,
50+
UC_ARM_REG_R2,
51+
UC_ARM_REG_R3,
52+
UC_ARM_REG_R11,
53+
UC_ARM_REG_SP,
54+
)
55+
56+
REPRO = Path(__file__).parent
57+
SYNTH = os.environ.get("SYNTH", "./target/release/synth")
58+
59+
CODE, STK, RET = 0x100000, 0x900000, 0x300000
60+
LIN_SIZE = 0x20000 # cover control_step's 2-page .linear_memory
61+
62+
ARG_REGS = (UC_ARM_REG_R0, UC_ARM_REG_R1, UC_ARM_REG_R2, UC_ARM_REG_R3)
63+
64+
65+
def compile_default(wasm, elf):
66+
"""Compile via the DEFAULT optimized path — NO --relocatable. This is the
67+
self-contained-image path #496 fixes (and the one that miscompiled)."""
68+
r = subprocess.run(
69+
[SYNTH, "compile", str(wasm), "-o", elf, "--target", "cortex-m4",
70+
"--all-exports"],
71+
capture_output=True, text=True, env={"PATH": "/usr/bin:/bin"},
72+
)
73+
if r.returncode != 0:
74+
sys.exit(f"compile failed for {wasm}: {r.stderr}")
75+
76+
77+
def load(elf):
78+
f = ELFFile(open(elf, "rb"))
79+
text = f.get_section_by_name(".text")
80+
code, base = text.data(), text["sh_addr"]
81+
syms = {}
82+
for s in f.iter_sections():
83+
if s.header.sh_type == "SHT_SYMTAB":
84+
for sym in s.iter_symbols():
85+
if sym.name:
86+
syms[sym.name] = sym["st_value"]
87+
return code, base, syms
88+
89+
90+
def run_unicorn(code, base, faddr, lin_base, lin_writes, args):
91+
mu = Uc(UC_ARCH_ARM, UC_MODE_THUMB)
92+
mu.mem_map(CODE, 0x20000)
93+
mu.mem_map(lin_base, LIN_SIZE)
94+
mu.mem_map(STK - 0x8000, 0x10000)
95+
mu.mem_map(RET & ~0xFFF, 0x1000)
96+
mu.mem_write(CODE, code)
97+
for off, data in lin_writes:
98+
mu.mem_write(lin_base + off, data)
99+
mu.reg_write(UC_ARM_REG_SP, STK)
100+
mu.reg_write(UC_ARM_REG_R11, lin_base)
101+
for r in ARG_REGS:
102+
mu.reg_write(r, 0)
103+
for r, v in zip(ARG_REGS, args):
104+
mu.reg_write(r, v & 0xFFFFFFFF)
105+
mu.reg_write(UC_ARM_REG_LR, RET | 1)
106+
try:
107+
mu.emu_start((CODE + faddr - base) | 1, RET, count=200000)
108+
except UcError as e:
109+
return f"ERR:{e}"
110+
return mu.reg_read(UC_ARM_REG_R0) & 0xFFFFFFFF
111+
112+
113+
# ---- control_step: 4 args, .rodata table at linmem offset 0x10000 ----
114+
CS_VECTORS = [
115+
(3000, 50, 40, 0), # gale's reference -> 0x00210A55
116+
(0, 0, 0, 0), (1500, 0, 0, 0), (3000, 50, 40, 1), (2000, 100, 200, 5),
117+
(500, 5, 80, 1000), (65535, 127, 127, 3), (1, 1, 1, 1), (2645, 10, 20, 0),
118+
(4000, 200, 30, 7), (123, 45, 67, 89), (3000, 50, 40, 255), (0xFFFFFFFF, 0, 0, 0),
119+
]
120+
CS_RODATA_OFF, CS_RODATA_END = 0x10000, 0x20000
121+
122+
123+
def test_control_step():
124+
wasm = REPRO / "control_step.wasm"
125+
elf = "/tmp/cs496.elf"
126+
compile_default(wasm, elf)
127+
code, base, syms = load(elf)
128+
lin_base = syms["__linear_memory_base"]
129+
fa = syms.get("control_step_decide") or syms["func_0"]
130+
131+
eng = wasmtime.Engine()
132+
mod = wasmtime.Module(eng, wasm.read_bytes())
133+
134+
fails = 0
135+
for vec in CS_VECTORS:
136+
st = wasmtime.Store(eng)
137+
inst = wasmtime.Instance(st, mod, [])
138+
signed = [v - (1 << 32) if v >= 1 << 31 else v for v in vec]
139+
exp = inst.exports(st)["control_step_decide"](st, *signed) & 0xFFFFFFFF
140+
rodata = bytes(inst.exports(st)["memory"].read(st, CS_RODATA_OFF, CS_RODATA_END))
141+
got = run_unicorn(code, base, fa, lin_base,
142+
[(CS_RODATA_OFF, rodata)], vec)
143+
ok = isinstance(got, int) and got == exp
144+
fails += 0 if ok else 1
145+
shown = f"0x{got:08X}" if isinstance(got, int) else got
146+
print(f"{'OK ' if ok else 'FAIL'} control_step_decide{vec} = {shown} "
147+
f"(wasmtime 0x{exp:08X})")
148+
return fails
149+
150+
151+
# ---- flight_seam_flat: flat flight_algo(ST_OFF, S_OFF), 1-page memory ----
152+
FS_ST_OFF, FS_S_OFF = 0x1000, 0x2000
153+
154+
155+
def fs_st_init():
156+
b = bytearray(64)
157+
struct.pack_into("<i", b, 0, 1000)
158+
struct.pack_into("<i", b, 4, -500)
159+
struct.pack_into("<i", b, 8, 200)
160+
struct.pack_into("<i", b, 24, 7)
161+
return bytes(b)
162+
163+
164+
def fs_s_init():
165+
b = bytearray(32)
166+
for i, v in enumerate([100, -50, 30, 300, -200]):
167+
struct.pack_into("<h", b, i * 2, v)
168+
return bytes(b)
169+
170+
171+
def test_flight_seam_flat():
172+
wasm = REPRO / "flight_seam_flat.wasm"
173+
elf = "/tmp/fsflat496.elf"
174+
compile_default(wasm, elf)
175+
code, base, syms = load(elf)
176+
lin_base = syms["__linear_memory_base"]
177+
fa = syms.get("flight_algo") or syms["func_0"]
178+
179+
eng = wasmtime.Engine()
180+
mod = wasmtime.Module(eng, wasm.read_bytes())
181+
st = wasmtime.Store(eng)
182+
inst = wasmtime.Instance(st, mod, [])
183+
mem = inst.exports(st)["memory"]
184+
mem.write(st, fs_st_init(), FS_ST_OFF)
185+
mem.write(st, fs_s_init(), FS_S_OFF)
186+
exp = inst.exports(st)["flight_algo"](st, FS_ST_OFF, FS_S_OFF) & 0xFFFFFFFF
187+
188+
got = run_unicorn(code, base, fa, lin_base,
189+
[(FS_ST_OFF, fs_st_init()), (FS_S_OFF, fs_s_init())],
190+
(FS_ST_OFF, FS_S_OFF))
191+
ok = isinstance(got, int) and got == exp
192+
shown = f"0x{got:08X}" if isinstance(got, int) else got
193+
print(f"{'OK ' if ok else 'FAIL'} flight_algo(0x{FS_ST_OFF:x},0x{FS_S_OFF:x}) "
194+
f"= {shown} (wasmtime 0x{exp:08X})")
195+
return 0 if ok else 1
196+
197+
198+
def main():
199+
if not os.path.exists(SYNTH):
200+
sys.exit(f"{SYNTH} not found — build synth first")
201+
fails = test_control_step() + test_flight_seam_flat()
202+
print("\nORACLE:", "PASS" if fails == 0 else f"FAIL ({fails})")
203+
sys.exit(1 if fails else 0)
204+
205+
206+
if __name__ == "__main__":
207+
main()

0 commit comments

Comments
 (0)