diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 4af2897..e63ace8 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -33,7 +33,29 @@ "Bash(cargo +nightly fuzz build:*)", "Bash(cargo +nightly fuzz run:*)", "Bash(timeout 10 cargo +nightly fuzz run:*)", - "WebFetch(domain:docs.rs)" + "WebFetch(domain:docs.rs)", + "Bash(git checkout *)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo build --release)", + "Bash(git add *)", + "Bash(git commit -m ' *)", + "Bash(git push *)", + "Bash(gh pr *)", + "Bash(git fetch *)", + "Bash(git stash *)", + "Bash(git reset *)", + "Bash(git tag -a v0.5.0 1e96c68 -m ' *)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo test --release --lib)", + "Bash(wasm-tools dump *)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo test --release --lib -- test_cse)", + "Bash(awk '/test_cse_phase4_keeps_small_constants/,/^test result/' /private/tmp/claude-501/-Users-r-git-pulseengine-loom/c4560ac2-258a-4b13-acc5-e9e2a47f62e5/tasks/bng9rzwln.output)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo test --release --lib -- test_cse_phase4_keeps_small_constants --nocapture)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo test --release --lib -- test_cse_phase4 --nocapture)", + "Bash(grep -E \"Failed$|^error\" /private/tmp/claude-501/-Users-r-git-pulseengine-loom/c4560ac2-258a-4b13-acc5-e9e2a47f62e5/tasks/bafdrty57.output)", + "Bash(cargo fmt *)", + "Bash(wasm-tools print *)", + "Bash(git pull *)", + "Bash(Z3_SYS_Z3_HEADER=/opt/homebrew/include/z3.h LIBRARY_PATH=/opt/homebrew/lib cargo test --release --lib -- test_eliminate_dead_locals --nocapture)", + "Bash(git commit *)" ], "deny": [], "ask": [] diff --git a/docs/research/gale-v0.5.0/source-pattern-analysis.md 
b/docs/research/gale-v0.5.0/source-pattern-analysis.md new file mode 100644 index 0000000..fac2a96 --- /dev/null +++ b/docs/research/gale-v0.5.0/source-pattern-analysis.md @@ -0,0 +1,362 @@ +# Gale Pattern Analysis for LOOM Optimization Research + +**Source tree:** `/Users/r/git/pulseengine/z/gale/src/` (Verus-verified Rust +model of Zephyr's kernel-scheduler primitives — sem / mutex / sched / +wait_queue / timeout / event / futex / cpu_mask / atomic). + +**Method:** read-only inspection of the largest scheduler-relevant files +(`sched.rs` 916 LoC, `mutex.rs` 560, `sem.rs` 539, `wait_queue.rs` 478, +`timeout.rs` 709, `event.rs`, `futex.rs`, `cpu_mask.rs`, `priority.rs`, +`poll.rs`, `atomic.rs`, `ipi.rs`). + +**Module-wide quick facts** + +| metric | count | comment | +|---|---|---| +| `requires` clauses | 627 | trusted preconditions | +| `ensures` clauses | 980 | post-conditions LOOM can ingest | +| `decreases` clauses | 24 | bounded-loop ranking functions | +| `match` statements (8 hot files) | 29 | many are 7-arm enum dispatches | + +The repository is *unusually* dense in (a) closed-set enum dispatches, (b) +`Result<…, EINVAL>` early-return guards, and (c) bounded for-loops over +fixed-size arrays (`[Option; 64]`, `MAX_CPUS = 16`, +`MAX_PARTITIONS`, `MAX_POLL_EVENTS`). Every property below is grounded in +that structure. + +--- + +## 1. Closed-set state-machine dispatch (br_table candidate) + +**Pattern.** A 7-arm `match` on a small `#[repr(u8)]`-style enum where every +arm is either `Ok(NewState)` or `Err(EINVAL)`. The discriminant is dense +(0..6), so rustc/LLVM emits a `br_table` (jump table). Many of these +functions are then composed into outer state machines. + +**Examples (file:line).** +- `src/sched.rs:649-657` `sched_suspend` — 7 arms, 6 of them `Err(EINVAL)`. +- `src/sched.rs:669-677` `sched_resume` — 7 arms, only `Suspended` succeeds. +- `src/sched.rs:721-729` `sched_sleep` — only `Running` succeeds. 
+- `src/sched.rs:740-748` `sched_wakeup` — only `Sleeping` succeeds. +- `src/sched.rs:759-767` `sched_pend` — only `Running` succeeds. +- `src/sched.rs:779-787` `sched_unpend` — only `Pending` succeeds. + +**Two-axis pair-match (state×event).** +- `src/sched.rs:530-543` `is_valid_transition` — `match (from, to)` over + `(ThreadState, ThreadState)` (4×4 grid). +- `src/sched.rs:592-630` `sched_is_valid_transition` — `match (from, to)` + over `(SchedThreadState, SchedThreadState)` (7×7 = 49 cases, encoded as + ~25 explicit arms + wildcard). + +**Why it matters for LOOM.** +1. *Constant-folding through dispatch.* A caller invoking `sched_resume` + on a known-`Suspended` value should fold to `Ok(Ready)` — single move. + Today this requires an inter-procedural enum-discriminant tracker. +2. *Switch-table → perfect-hash.* Where >50 % of arms are + `Err(EINVAL)`, LOOM can rewrite to `if state == OnlyOk { Ok(...) } else + { Err(EINVAL) }` — kills the indirect jump entirely. Branch-prediction + wins on the sparse-success pattern that dominates here. +3. *FSM equivalence proofs.* The pair-match form (`(from, to)`) is the + canonical example LOOM/ISLE rule synthesis was designed for. Every + `_ => false` is a refutable rule the optimizer can prove sound from the + `ensures` clause directly above it (e.g. `from === Dead ==> !result`). + +**Expected payoff.** Concrete: 10-30 % cycle reduction on `sched_*` +hot paths (one less jump-table indirect, dead code eliminated). Strategic: +this is the cleanest source-level evidence I've seen for the +"trusted-axiom-driven branch elimination" idea in +`docs/research/novel-optimization-ideas.md`. + +--- + +## 2. Default-then-override (the v0.4.0/v0.5.0 hoist-guard pattern) + +**Pattern.** `let mut x = DEFAULT; ...; if cond { x = OVERRIDE }; x`. +LOOM v0.4/v0.5 added hoist guards specifically for this idiom because +LLVM frequently fails to forward-propagate when the override is conditional +on `Option::is_some()` or a state-machine match. 
+ +**Examples.** + +1. **`src/sched.rs:404-444` `next_up_smp` — the canonical case.** + ``` + let mut thread: Option = runq_best; // default + if let Some(mirqp) = cpu_state.metairq_preempted { ... thread = Some(mirqp); ... } + let candidate = match thread { Some(t) => t, None => cpu_state.idle_thread }; + let mut chosen = candidate; // default + if current_is_active { ... chosen = current; ... } + if !should_preempt(...) { chosen = current; } + ``` + Two stacked default-then-override variables (`thread`, `chosen`) feeding + into a single returned `SmpSchedOutcome`. Exactly the shape that + motivated the hoist guards. + +2. **`src/sched.rs:183-208` `RunQueue::add` insertion search.** + ``` + let mut insert_pos: u32 = self.len; // default = "append at end" + let mut i: u32 = 0; + let mut found: bool = false; + while ... { if thr_pri < entry_pri { insert_pos = i; found = true; } ... } + ``` + `insert_pos` defaults to `self.len`, overridden when the loop finds a + spot. The companion `WaitQueue::pend` at `src/wait_queue.rs:292-332` + is byte-for-byte the same pattern. + +3. **`src/cpu_mask.rs:104-143` `cpu_mask_mod`.** + ``` + if is_running { return CpuMaskResult { mask: current_mask, error: EINVAL }; } + let new_mask: u32 = (current_mask | enable) & !disable; + if new_mask == 0 { return CpuMaskResult { mask: current_mask, error: EINVAL }; } + if pin_only && (new_mask & (new_mask - 1)) != 0 { return CpuMaskResult { mask: current_mask, error: EINVAL }; } + CpuMaskResult { mask: new_mask, error: OK } + ``` + The `mask: current_mask` field is "default", `error: OK/EINVAL` is the + override. LLVM partly handles this via SROA but the `Result`-shaped + struct often forces a stack slot. + +4. **`src/futex.rs:290-344` `wake`.** + `let mut woken: u32 = 0; ... match thread { Some(t) => { woken = 1 } None => {} }` + — classic counter default + match-arm increment. + +5. **`src/ipi.rs:109-141` `compute_ipi_mask`.** + `let mut mask: u32 = 0u32; ... 
if cond { mask = mask | bit; }` — bitmap + build via OR-into-default. + +**Why it matters.** This is *the* pattern LOOM already targets; gale gives +LOOM a clean, dense, real-world benchmark suite. Every example above has +a Verus `ensures` clause that pins the final value as a function of inputs +— LOOM can use those clauses to prove the override is unconditional in +specific call sites and collapse the assignment. + +--- + +## 3. Verus-verified bounded loops (full-unroll / k-induction targets) + +24 `decreases` clauses across the kernel. All ranking functions are +`MAX_CONST - i` or `len - i` with `MAX` as a `pub const u32`. + +**Examples.** + +| file:line | bound | iterations | +|---|---|---| +| `src/wait_queue.rs:206`, `319`, `359`, `437` | `MAX_WAITERS = 64` | ≤ 64 | +| `src/sched.rs:197` | `MAX_RUNQ_SIZE = 64` | ≤ 64 | +| `src/poll.rs:558,577,598` | `MAX_POLL_EVENTS` | ≤ 32 (typical) | +| `src/mem_domain.rs:202,306,387,460,478` | `MAX_PARTITIONS` | ≤ 16 | +| `src/device_init.rs:329` | `dev.num_deps - i` | ≤ small | +| `src/ipi.rs:121` | `num_cpus - idx`, `MAX_CPUS = 16` | ≤ 16 | +| `src/futex.rs:329` | `count - i`, `count ≤ MAX_WAITERS = 64` | ≤ 64 | + +**Why it matters for LOOM.** +- *Full unroll candidates.* Every loop with `decreases MAX_CPUS - i` and + `MAX_CPUS = 16` is a guaranteed-bounded, fixed-trip loop. LOOM can fully + unroll without runtime check (the `requires` clause already proved the + bound) — converting a loop body into a sequence of 16 SIMD-friendly ops. +- *k-induction-friendly.* Loops carrying invariants like + `forall|k: int| 0 <= k < i ==> entries[k].is_some()` (e.g. + `src/wait_queue.rs:194-200`) are perfect input to bounded SMT-based + loop verification. The Verus invariants *are* the k-induction + hypothesis. 
+- *Auto-vectorization.* `runq_shift_left` / `runq_shift_and_insert` + (`src/sched.rs:80-105`, marked `#[verifier::external_body]`) are pure + array shifts on `[Option; 64]` — LOOM can replace with + `memmove` / SIMD shuffle once it ingests the bound. + +**Expected payoff.** Big — 2-5× on the wait-queue / runqueue ops if +unrolled + vectorized. These are the absolute hottest paths in any +RTOS kernel. + +--- + +## 4. Tail-call / dispatch-only `match` + +**Pattern.** `match enum { V1 => f(), V2 => g(), V3 => h(), ... }` where +every arm is a single function call with the same signature. LLVM should +emit jump-threaded tail calls but often fails when arms construct +different `Ok(...)` variants. + +**Examples.** + +- `src/sched.rs:649-657`, `669-677`, `721-729`, `740-748`, `759-767`, + `779-787` — every `sched_*` lifecycle function has the form + `match state { S1 => Ok(T1), _ => Err(EINVAL) }` where each arm is a + single-expression constructor. Six functions, six identical shapes — + the classic case for *cross-function rule synthesis* (LOOM ingests one + rule, applies to all six). + +- `src/event.rs:296-336` `wait_decide` — three-way dispatch + (`Matched` / `Pend` / `Timeout`), each arm builds a `WaitDecideResult` + struct. The struct constructions share the `matched_events: 0` tail — + LOOM's struct-field constant-folding should collapse two of three arms. + +- `src/sched.rs:318-321`, `421-424`, `510-513` — three sites with the + *exact same* idiom `match opt { Some(t) => t, None => fallback }`. + Identical AST → ideal for caching one optimization recipe. + +**Payoff.** Smaller per-site (a few cycles) but accumulates because these +helpers are called from every kernel object operation. + +--- + +## 5. 
Cross-function leaf-inlining + constant propagation + +**Tiny leaf functions called from one site.** + +- `src/sched.rs:271-278` `prio_cmp(a,b) -> i64` — three-line wrapper around + `i64` subtraction, called only from `next_up_smp` + (`src/sched.rs:430`) where one operand is `current.priority`. Inline + + fold = direct `priority` field comparison. + +- `src/sched.rs:285-300` `should_preempt` — pure function of three bools, + called twice in `next_up_smp` and once in `update_cache`. Three call + sites, each with at least one boolean known at the call site (e.g. + `chosen.is_metairq` is often statically `false` for non-MetaIRQ paths). + Specialize → branch elimination. + +- `src/sched.rs:483-494` `update_metairq_preempt` — called once. Inline. + +- `src/priority.rs:62-70` `is_higher_than` — `self.value < other.value` + wrapped in a method. Called from many sites; the `requires + self.inv()` clause guarantees the field read is in-bounds — LOOM can + drop it. + +- `src/atomic.rs:271-279`/`282-290` `inc`/`dec` — wrappers around + `add(1)`/`sub(1)`. Two-line shims; *every* increment in the kernel goes + through them. + +**Constant-argument propagation.** + +- `compute_ipi_mask` (`src/ipi.rs:88-142`) is called with + `MAX_CPUS = 16` in production. The loop bound `num_cpus <= max_cpus <= + MAX_CPUS <= 32` means the inner `1u32 << idx` shift is always in range — + LOOM can specialize the function for `max_cpus = 16` and unroll. + +**Payoff.** Each individually small, cumulatively large because these +helpers sit on every fast path. + +--- + +## 6. Bit-manipulation / mask arithmetic + +**Pattern.** Sequences of `&`/`|`/`!`/`<<` that are provably equivalent to +simpler forms once their precondition is known. + +**Examples.** + +- `src/cpu_mask.rs:66-68` spec: `(current | enable) & !disable`. The + module proves a power-of-two property + `(m & (m-1)) == 0` (line 83) — LOOM can use this as an + axiom whenever a mask flows from `cpu_pin_compute`. 
+ +- `src/event.rs:144-152` `clear`: `events &= !clear_events`. Combined + with `post`'s `events |= new_events` (line 82-ish) the module gives + *complete* monotonicity / idempotence proofs (`events.rs:208-258`). + Every Verus `by (bit_vector)` proof in this file is an axiom LOOM can + ingest: + - `(events | new_events) & events == events` (post-monotonic) + - `value & !value == 0u32` (set/clear roundtrip) + - `(events | new_events) | new_events == events | new_events` (idempotent) + +- `src/cpu_mask.rs:81-86` `validate_pin_mask`: `mask != 0 && (mask & + (mask - 1)) == 0` — power-of-two test, hardware popcount candidate. + +- `src/ipi.rs:127-134` mask construction inside an unrolled loop — + `mask |= bit; bit = 1 << idx;` is exactly the pattern that compiles to + PEXT/PDEP on x86 and `bit-build` on ARMv8.2+. + +**Payoff.** ISA-specific (popcount, PDEP, single-cycle bit-set) — but +LOOM's strategic goal of *ingesting verified bit-vector axioms as +optimization rewrite rules* lines up perfectly. Each `assert(...) by +(bit_vector)` in gale is, modulo translation, an ISLE/eqsat rewrite +rule. + +--- + +## 7. State-machine `match (state, event)` (LOOM ISLE rule-synth target) + +The premier example in the codebase: + +- `src/sched.rs:592-630` `sched_is_valid_transition(from, to)` — pure + `match (from, to) { ... }` over `(SchedThreadState, SchedThreadState)`, + 7 states each. The `ensures` clause `from === Dead ==> !result` (line + 590) is a *trusted axiom* LOOM can use: any caller path that reaches + this with statically-known `from = Dead` simplifies to `false` and + prunes the entire downstream branch. + +- `src/sched.rs:530-543` `is_valid_transition` — same shape, smaller FSM + (4 states). + +- `src/event.rs:289-336` `wait_decide(events, desired, wait_type, + is_no_wait)` — effectively `match (wait_type, condition_met, + is_no_wait)` collapsed into nested `if`s. 
The `ensures` clauses + (lines 304-310) prove `matched_events == 0` on the non-Matched paths — + LOOM can hoist that constant out. + +**Why it matters.** LOOM's Wave 2 ISLE rule verification work was about +precisely this kind of `match (s, e)` table. Gale is the test bed. + +--- + +## 8. Verus annotations as ingestable axioms + +`requires` (627), `ensures` (980), and `decreases` (24) clauses are +machine-checked by Verus + Z3. LOOM can ingest them as *trusted* axioms +without re-proving anything. + +**Highest-value annotations (sample).** + +| file:line | clause | what LOOM gains | +|---|---|---| +| `src/sched.rs:312-316` | `runq_best.is_none() ==> result === Idle` etc. | branch elimination on `next_up` calls | +| `src/sched.rs:589-590` | `from === Dead ==> !result` | dead-code on FSM paths | +| `src/sem.rs:265-273` | `wait_q.len_spec() == 0 && count < limit ==> count' == count + 1` | scalar replacement for the `count+1` path | +| `src/timeout.rs:316-339` | full `Ok((new_tick, fired))` pre/post | timeout-fast-path inlining | +| `src/cpu_mask.rs:111-122` | `error == OK ==> mask == ((current\|enable) & !disable)` | mask-arith folding | +| `src/event.rs:213,220,242,250` | bit-vector lemmas (`by (bit_vector)`) | direct rewrite-rule ingestion | +| `src/futex.rs:280-288` | `result.woken <= MAX_WAITERS` | bound propagation into callers | +| `src/atomic.rs:222-231` | CAS success/failure semantics | lock-free fast-path specialization | + +This is the largest, most concentrated source of verified pre/post- +conditions I'm aware of in the LOOM benchmark space — strictly more +machine-readable than C-level Frama-C ACSL in upstream Zephyr. + +**Payoff.** Strategic, not numeric: gale lets LOOM measure what +fraction of axioms actually trigger optimizations, and how often +LLVM-only baselines miss the same opportunities. This is the +primary research lever. 
+ +--- + +## Summary table + +| # | Pattern | Sites (approx) | Payoff | +|---|---|---|---| +| 1 | Closed-set FSM `match` (br_table → const-fold) | ~10 in sched.rs alone | high | +| 2 | Default-then-override (LOOM v0.4/v0.5 hoist) | ~7 hot sites | high (already targeted) | +| 3 | Verus-bounded loops (full-unroll + SIMD) | 24 `decreases` | very high on wait/run queues | +| 4 | Tail-call dispatch `match` | ~12 | medium | +| 5 | Leaf-inline + const-prop | ~8 leaf fns | medium-high cumulative | +| 6 | Bit-mask axiom ingestion | events/cpu_mask/ipi | ISA-dependent, strategic | +| 7 | `match (state, event)` FSM | 2 large + 1 medium | high (LOOM ISLE target) | +| 8 | Verus pre/post as trusted axioms | 1607 clauses | strategic — primary lever | + +## Recommended next steps for LOOM + +1. **Pick `sched.rs` lines 530-787 as the canary benchmark.** Six near- + identical `match`-on-enum FSM functions, all with `ensures` clauses, + all called from FFI (`pub`). It's the densest source of patterns 1, 7, + and 8 simultaneously. +2. **Wire `decreases` clauses into LOOM's loop-unrolling cost model.** + Twenty-four loops with statically-known bounds tied to `pub const`s — + minimal effort, large payoff. +3. **Treat `assert(...) by (bit_vector)` as ground truth** for the + bit-mask rewrite-rule database (pattern 6). The Verus bit-vector solver + has already discharged them. +4. **Use gale's compile output as the regression suite for hoist guards + (pattern 2)** — `next_up_smp` alone exercises three layered defaults. + +--- + +*Generated 2026-04-29 from gale source tree at +`/Users/r/git/pulseengine/z/gale/`. 
Read-only inspection; no files +modified.* diff --git a/docs/research/gale-v0.5.0/wasm-opt-gap-analysis.md b/docs/research/gale-v0.5.0/wasm-opt-gap-analysis.md new file mode 100644 index 0000000..eacc73a --- /dev/null +++ b/docs/research/gale-v0.5.0/wasm-opt-gap-analysis.md @@ -0,0 +1,396 @@ +# wasm-opt → LOOM gap analysis (kernel-scheduler workloads) + +Date: 2026-04-29 +Target: LOOM v0.5.0 vs Binaryen wasm-opt (v116) on gale-style code +(small dispatchers, br_table, bit-manip, state machines, early-exit guards). +Source: gale v0.4.0 measurement report +(`/Users/r/git/pulseengine/loom/docs/research/gale-v0.4.0/measurement-report.md`). +LOOM pipeline confirmed by reading +`/Users/r/git/pulseengine/loom/loom-core/src/lib.rs` (lines 6713-6829, 7689-7770) +and `/Users/r/git/pulseengine/loom/docs/design/{simplify-locals,cse}.md`. + +## Executive summary + +The +6.3% code-section regression on gale is dominated by two effects: + +1. **LOOM's CSE inserts `local.tee/local.get` pairs around 1-byte constants** + (`-22`, `-16`, `-1`), turning a 2-byte sequence into a 4-byte sequence + plus a function-header `(local i32)` declaration. This is exactly + backwards on a workload made of small dispatchers full of `-EINVAL` / + `-EBUSY` / `K_FOREVER` constants. +2. **LOOM's `simplify_locals` skips every function with early-exit + control flow** (`has_dataflow_unsafe_control_flow` is true for any + `BrIf`, `BrTable`, or non-tail `Return`/`Br`). Kernel scheduler code + *is* early-exit code: every function on the gale measurement that + could have benefited (`gale_k_sem_give_decide`, `func 0`, `func 13`, + `func 21`, `func 24`) was skipped. The pass had zero effect on the + workload by construction. + +So the highest-payoff work is **not** to add a brand-new pass — it is +to make the two existing passes pay off on early-exit code, plus add +two cheap shape-canonicalization passes wasm-opt has and LOOM +doesn't (`reorder-locals`, `RedundantSetElimination` proper). 
+ +## Ranked picks + +### 1. `RedundantSetElimination` (RSE) — proper, with liveness across branches + +**What it does.** Removes a `local.set` whose value is overwritten on +every path before any read, even when the path goes through a +branch/return. Binaryen's `RedundantSetElimination` (`--rse`) is a +flow-sensitive forward dataflow that propagates the *set value lattice* +through the CFG; a set is redundant iff for every successor path the +local is overwritten before being read, OR the function exits without +the local being read. + +**Why it pays on kernel-scheduler code.** This is exactly the +"dead-store elimination of init-only locals" the gale report calls out. +The pattern in gale is: + +``` +i32.const -22 +local.set 3 ;; sentinel init +... if (cond) { ...; local.set 3; ... } else { ...; local.set 3; ... } +local.get 3 +return +``` + +Both branches write before the read, so the initial sentinel set is +dead. wasm-opt removes it; LOOM's current `simplify_locals` skips the +whole function because `BrIf`/`Return` is present. The init-only-local +pattern is *universal* in Zephyr-style FFI — every function returns an +errno-style `int` and starts by initializing it to a default. + +**Why LOOM's current `simplify_locals` cannot do this.** +`has_dataflow_unsafe_control_flow` (lib.rs:6713) bails on any non-tail +`Return`/`Br`/`BrIf`/`BrTable`. The eligibility check is correct for +the *substitution* operations that pass also performs (equivalence +propagation, get/set forwarding) but it is far too coarse for +straight-up dead-set elimination, which is sound under liveness alone. + +**Concrete LOOM change.** Split `simplify_locals` into two sub-passes: + +- `simplify_locals_pure` — current code, gated by + `has_dataflow_unsafe_control_flow` (keep the safety net). +- `dead_local_set_elimination` — new, runs liveness analysis on the + CFG and removes any `local.set` whose value is not live-out. Runs + unconditionally (no early-exit gate). 
Replace the set with `drop` + so the stack stays balanced; the existing DCE pass already collapses + `i32.const N; drop` to nothing (lib.rs:7109). + +**Estimated complexity.** ~600 LOC for live-range computation over +LOOM's instruction stream + ~200 LOC plumbing + ~150 LOC tests. +1.0–1.5 weeks. The CFG already exists for `coalesce_locals` +(lib.rs:9824) — reuse that liveness builder. The hardest part is the +join at if/block/loop merge: take liveness as the union of successor +live-in sets. Loops require iterating to a fixed point over back-edges. + +**Risk / verifiability.** Low. The transformation is a textbook +liveness-driven DSE and has an obvious Z3 encoding: for every removed +set, prove `∀paths from set to any subsequent use of the local, ∃ a +preceding set on the same path`. LOOM's `TranslationValidator` already +does pre/post equivalence; for a set→drop rewrite, the post-state +differs only in one local's value, and that local must be unused on +all paths (the very property we proved). Easy Rocq lemma: +`liveness_join_over_approximates ∧ removed_local_not_in_live_out ⇒ +post_module ≅ pre_module`. No new SMT theory needed. + +### 2. `simplify-locals` "sink/tee" mode — usable on early-exit code + +**What it does.** Binaryen's `SimplifyLocals` does *more* than +substitution. Its biggest payoff is the **sinker**: when a value is +computed and stored to a local that has exactly one read, sink the +expression into the read site (deleting both the set and the get). +This is what turns +``` +local.tee 1; ...; local.get 1; i32.add +``` +into +``` +... ; (sunk_expr); i32.add +``` +when the local has exactly one writer and one reader, the writer +dominates the reader, and there are no other uses. Combined with +sinking into block/if/loop result types, +it's the pass that "moves a value through a control-flow merge so it +no longer needs a local at all." The current Binaryen pass is +~1500 lines because of the merge-point bookkeeping. 
+ +**Why it pays on kernel-scheduler code.** Dispatcher functions are +chains of `if (errno) goto exit; ... result = compute();` and the +sinker collapses the trailing `result = compute(); local.get result; +return` into a tail `return compute()`. Every gale function ends with +this pattern. + +**Why LOOM doesn't have it.** LOOM's `simplify_locals` is a +substitution-only iteration; the design doc (`simplify-locals.md` +lines 14-86) doesn't list sinking. And again: the early-exit gate +disables the whole function. + +**Estimated complexity.** ~1200–1500 LOC. This is the most +LOC-expensive pick on this list. Needs single-use/single-def +analysis, dominance, and a "can this expression be moved past these +intervening instructions" effect check (loads vs stores, calls, +local-aliasing). 3–4 weeks done carefully. + +**Risk / verifiability.** Moderate. The motion check is the +tricky bit: sinking past a store is unsafe if the value depends on +memory. Use LOOM's existing effects framework +(`crate::verify` already encodes "may-load / may-store / may-call" +at the instruction level — the CSE eligibility logic uses it). +Z3 verifier can pre/post compare for any sunk expression, but +validating *all* possible sinks per-function is expensive; the +practical rule is to gate by "expression has no may-store-affecting +load" and reject anything else. Rocq proof for the gate condition +is one moderate-difficulty lemma (~200 lines). + +### 3. `reorder-locals` — slot renumbering for compactness + +**What it does.** After all other passes, sort locals by use-count +(descending). Index 0..N-1 frequently-used locals get short LEB128 +encodings; rarely-used ones get pushed to high indices. Also drops +locals with zero uses (a free DCE). + +**Why it pays on kernel-scheduler code.** The gale report +(section 4.3) calls this out specifically: wasm-opt renames a +one-shot `local 3` to `local 1` and drops the now-empty slot, saving +a local-decl byte. 
In small dispatcher functions, the function header +is a non-trivial fraction of the bytes — every gale function has +4–8 locals declared, and removing one saves 2 bytes per declaration +group. Across 25 functions that compounds. + +**Estimated complexity.** ~250 LOC. Trivial: count `LocalGet`/`LocalSet`/ +`LocalTee` per index, build a permutation `old → new`, rewrite. The +only subtleties are (a) parameters keep their indices (params occupy +the low slots by spec), (b) update local-decl-runs (locals are +encoded as run-length groups by type — recompute the groups after +renaming). 0.5 weeks including tests. + +**Risk / verifiability.** Trivial. Pure renaming is a bijection; the +post-module is observably identical (proof: structural induction over +instructions, every `Local{Get,Set,Tee}(i)` becomes +`Local{Get,Set,Tee}(σ(i))` and `σ` is a bijection over local indices, +QED). Rocq proof is ~50 lines. + +### 4. Compare-operand canonicalization (`lt_u` ↔ `gt_u` flip + tee target reuse) + +**What it does.** Recognize the pattern +``` +A; local.tee X; B; local.get X; cmp_op +``` +where the live value through `B` is `X`. If `cmp_op` is one of the +order comparisons (`lt_u`, `lt_s`, `le_u`, `le_s`, `gt_u`, `gt_s`, +`ge_u`, `ge_s`), swap the tee target with one of the existing +locals declared in the function and flip the comparison. Goal: keep +the live value in the same physical register slot through the +intervening instructions (saves a wasm-runtime register move; on +some engines also saves a stack-spill byte in the encoding when the +new slot has a smaller LEB). + +**Why it pays on kernel-scheduler code.** Section 4.2 of the gale +report identifies this exact pattern in the wopt diff. State-machine +transitions are full of "compare current state to threshold, branch" +sequences, and every one of them goes through a tee/get pair around +a comparison. wasm-opt does this in `OptimizeInstructions`; LOOM has +no equivalent. + +**Estimated complexity.** ~400 LOC. 
Pure peephole on a 5-instruction +window with a simple slot-reuse heuristic (prefer slot whose other +uses are temporally adjacent — minimize live range). 1 week +including a use-density heuristic. + +**Risk / verifiability.** Easy. The flip +`(a < b) ≡ (b > a)` for unsigned is a one-line Z3 fact and a +two-line Rocq lemma. The slot reuse is just a renaming, same as +pick 3. Combined proof is ~80 lines of Rocq. + +### 5. Constant-CSE suppression heuristic (FIX existing CSE pass) + +**What it does.** This is *not* adding a pass — it is gating an +existing one. The current LOOM CSE pass (lib.rs CSE phase) replaces +two adjacent `i32.const -22` pushes with `i32.const -22; local.tee N; +local.get N`. This is a regression for any constant whose full +`i32.const` instruction encodes in ≤ 2 bytes (opcode plus one LEB128 +payload byte, i.e. any value in `-64..=63`), +because the `local.tee N + local.get N` replacement is 2 + 2 = 4 +bytes, plus the function-header `(local i32)` declaration. + +**The rule.** Decline to CSE an expression whose post-encoding cost +is `≤ cost_of(local.tee + local.get + local_decl_amortized)`. For +typical functions with ≥1 existing local of the right type, the +break-even is ~3 LEB bytes for the original expression. Below that, +do not CSE. + +**Why it pays on kernel-scheduler code.** Gale's section 4.4 traces +the +6.3% almost entirely to constant-CSE on `-22` / `-16` / `-1` / +`0`. Adding this single 20-line gate likely flips the sign of the +LOOM-vs-baseline delta on this whole class of workload. + +**Estimated complexity.** ~50 LOC for the cost model + gate; +~100 LOC for tests across constant width tiers. 1–2 days. By far +the highest payoff-per-LOC item on this list. + +**Risk / verifiability.** Zero — this is *removing* an unsound / +unprofitable rewrite, not adding one. The existing CSE proof +obligation is unchanged; we just don't fire on certain inputs. +No new Rocq work. + +### 6. 
`directize` — `call_indirect` → `call` + +**What it does.** When a `call_indirect` site can be proven to +reach exactly one table entry (because the function index on the +stack came from `i32.const N` with no intervening table mutation, +or because the table is immutable and indexable by a known constant), +rewrite to a direct `call $target`. Direct calls are cheaper to +encode (no type index, no table index, no runtime type-check), and +they expose the callee body to the inliner. + +**Why it pays on kernel-scheduler code.** Wasm-component fused +modules — which is exactly what gale produces — generate +`call_indirect` for every cross-module dispatch. After +`fused_optimizer.rs` devirtualizes adapters, a residue of +"resolved-but-still-indirect" call sites remains. Directize gets the +last 5–10% of indirect calls on Zephyr-style FFI shims. + +**Estimated complexity.** ~700 LOC. Need: table contents tracking +(LOOM has the table model), constant-stack-tracking dataflow over +the indirect-call site's index operand, and the rewrite. 2 weeks. + +**Risk / verifiability.** Moderate. Soundness depends on table +immutability *and* on the index operand being a constant on every +incoming path. The first is a module-level invariant +(no `table.set`, no `table.fill`, no exported table). The second +is a per-call-site dataflow query that LOOM's verifier can encode +to Z3 (the index expression's symbolic value is a singleton set). +Rocq lemma: "if table T is immutable and index expression is +provably constant N, then `call_indirect (type_of T[N])` ≡ +`call T[N]`." ~150 lines. + +### 7. `merge-locals` — pre-coalesce same-value merging + +**What it does.** Binaryen's `MergeLocals` runs *before* +`CoalesceLocals` and merges two locals that hold the same value at +some program point — even if their lifetimes overlap — by rewriting +all reads of one to reads of the other. This collapses pure copies +and pre-improves the interference graph that CoalesceLocals will +then color. 
+ +**Why it pays on kernel-scheduler code.** Dispatcher functions +load a struct field, copy it into a "current value" local, and +operate on that. The struct-field local and the working local are +copies of each other for the entire function body. MergeLocals +collapses them; LOOM's existing `coalesce_locals` cannot — it sees +the two locals as live simultaneously and refuses to merge their +slots. + +**Estimated complexity.** ~500 LOC. Requires a value-numbering +pass over locals (which LOOM has fragmentary infrastructure for in +the CSE pass). 1.5 weeks. + +**Risk / verifiability.** Moderate. The proof obligation per merge +is "for every read of L2, the value of L1 equals the value of L2 at +that program point." Discharge with Z3 forward-symbolic execution +or with a syntactic copy-chain analysis (cheaper, less complete). +Rocq: ~200 lines for the syntactic version. The flow-sensitive +version is harder but can be deferred. + +## Specific transformations from the task + +### Dead-store elimination of init-only locals + +**wasm-opt's algorithm** (from Binaryen `RedundantSetElimination` + +`SimplifyLocals` + `Vacuum` working in concert): + +1. `RedundantSetElimination` runs a forward dataflow that, at every + program point, knows for every local "what set is currently + responsible for the local's value, if any". When a `local.set` + is reached and the local already holds the value being set + (e.g. via copy chain), the set is removed. +2. `SimplifyLocals` runs a backward liveness pass: for every set, + if the local is not live-out of the set (no use on any forward + path), the set is replaced by a `drop` of the value (if it has + side effects) or removed entirely (if pure). +3. `Vacuum` cleans up the `(i32.const N) (drop)` pairs left behind. + +The init-only-local pattern (`i32.const -22; local.set 3; ... ; +local.set 3 on every path before any read`) is killed by step 2. 
+The local has *some* later set, but on every forward path from the +init set there is another set before any get — so the init set's +value is not live-out, hence dead. + +**What LOOM needs.** Pick #1 above. Specifically: + +- A **liveness analyzer** over LOOM's instruction stream. The + fragments needed are already present (`coalesce_locals` does live + ranges; the CFG builder is in `verify.rs`). Wire them into a new + `dead_local_set_elimination` function. +- The analyzer's **join function** must take the union of + successor live-in sets at every merge, and iterate to fixed + point over loop back-edges. +- The **transform** is: for every `local.set X` whose value is not + in `live_out(set_position)`, replace the set with `drop`. Run + DCE to eat the `i32.const N; drop`. +- The **gate** must NOT be `has_dataflow_unsafe_control_flow` — + liveness across branches is precisely the case we want to handle. + Use a much narrower gate: skip only functions with `Unknown` or + `CallIndirect` instructions where effects can't be modeled + (consistent with LOOM's other passes). + +### Compare-operand canonicalization for register reuse + +This is pick #4 above. The full algorithm: + +1. Find a window `[A, local.tee X, B, local.get X, cmp_op]` where + `B` does not modify `X` and `cmp_op ∈ {lt_u, lt_s, le_*, gt_*, + ge_*, eq, ne}`. +2. For each candidate target slot `Y` already declared in the + function: + - Compute the live range cost of replacing `X` with `Y` (does + `Y` have other uses that conflict?). + - Compute the encoding-byte delta (can be negative if `Y` has + a smaller LEB index). +3. If delta is negative or zero AND no live-range conflict, rewrite + to `[A, local.tee Y, B, local.get Y, flipped_cmp_op]`. 
The flip + table: + - `lt_u ↔ gt_u`, `lt_s ↔ gt_s` + - `le_u ↔ ge_u`, `le_s ↔ ge_s` + - `eq` stays `eq`, `ne` stays `ne` (operand order doesn't matter) + +The rule applies because `(a CMP b) = (b CMP_FLIPPED a)` for total +orders, and the operand order on the wasm stack is exactly what we +swapped. Pure shape canonicalization, no value semantics changed. + +## Recommended LOOM roadmap (kernel-scheduler-optimized) + +| Order | Pass | LOC | Wks | Risk | Expected gale code-section delta | +|---|---|---:|---:|---|---| +| 1 | Constant-CSE suppression gate (pick #5) | 50 | 0.3 | None | -3% to -5% (kills regression source) | +| 2 | reorder-locals (pick #3) | 250 | 0.5 | Trivial | -1% | +| 3 | RedundantSetElimination proper (pick #1) | 600 | 1.5 | Low | -2% to -3% | +| 4 | Compare canonicalization (pick #4) | 400 | 1.0 | Easy | -0.5% | +| 5 | merge-locals (pick #7) | 500 | 1.5 | Moderate | -1% | +| 6 | Directize (pick #6) | 700 | 2.0 | Moderate | -1% to -2% | +| 7 | simplify-locals sinking (pick #2) | 1500 | 4.0 | Moderate | -1% | + +Cumulative projected effect on gale code section: roughly -10% (vs +current +6.3%), i.e. crossing parity with wasm-opt -O3 and probably +a small win because the formal verifier should let us be more +aggressive than wasm-opt on a few patterns it conservatively rejects. + +The first three rows alone (1.3 weeks of work) likely flip the sign +of the gap on this entire workload class. Picks 1 and 5 should be +the immediate priority — pick 1 unlocks the gate that disables every +other simplify-locals optimization on early-exit code, and pick 5 is +a 50-LOC fix to a known regression. 
+ +## Sources + +- [Binaryen wasm-opt manpage (Debian)](https://manpages.debian.org/testing/binaryen/wasm-opt.1.en.html) +- [Binaryen pass.cpp (pass registry)](https://github.com/WebAssembly/binaryen/blob/main/src/passes/pass.cpp) +- [Binaryen SimplifyLocals.cpp](https://github.com/WebAssembly/binaryen/blob/main/src/passes/SimplifyLocals.cpp) +- [Binaryen Vacuum.cpp](https://github.com/WebAssembly/binaryen/blob/main/src/passes/Vacuum.cpp) +- [Binaryen Optimizer Cookbook (wiki)](https://github.com/WebAssembly/binaryen/wiki/Optimizer-Cookbook) +- [wasm-opt: The WebAssembly Optimizer (DeepWiki)](https://deepwiki.com/WebAssembly/binaryen/4.1-wasm-opt:-the-webassembly-optimizer) +- [Compiling to and optimizing Wasm with Binaryen (web.dev)](https://web.dev/articles/binaryen) +- [RedundantSetElimination nondeterminism issue #4524](https://github.com/WebAssembly/binaryen/issues/4524) +- gale measurement report (local): `/Users/r/git/pulseengine/loom/docs/research/gale-v0.4.0/measurement-report.md` +- LOOM pipeline source (local): `/Users/r/git/pulseengine/loom/loom-core/src/lib.rs` (lines 6713-6829, 7689-7770, 8042-8200) +- LOOM design docs (local): `/Users/r/git/pulseengine/loom/docs/design/{simplify-locals,cse,dce,vacuum,branch-simplification}.md` diff --git a/loom-cli/src/main.rs b/loom-cli/src/main.rs index 7702480..5ea5a09 100644 --- a/loom-cli/src/main.rs +++ b/loom-cli/src/main.rs @@ -50,7 +50,7 @@ enum Commands { attestation: bool, /// Select specific optimization passes (comma-separated) - /// Available: inline,precompute,constant-folding,cse,advanced,branches,dce,merge-blocks,vacuum,simplify-locals + /// Available: inline,precompute,constant-folding,cse,advanced,branches,dce,merge-blocks,vacuum,simplify-locals,dead-locals /// Example: --passes inline,constant-folding,dce /// Default: all passes #[arg(long, value_delimiter = ',')] @@ -490,6 +490,15 @@ fn optimize_command( track_pass("simplify-locals", before, after); } + if should_run("dead-locals") { + println!(" 
Running: dead-locals"); + let before = count_instructions(&module); + loom_core::optimize::eliminate_dead_locals(&mut module) + .context("Dead-local elimination failed")?; + let after = count_instructions(&module); + track_pass("dead-locals", before, after); + } + stats.optimization_time_ms = start_opt.elapsed().as_millis(); println!("✓ Optimized in {} ms", stats.optimization_time_ms); diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 992ba08..2e80fcb 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -9892,6 +9892,223 @@ pub mod optimize { Ok(()) } + /// Eliminate trivially dead locals: locals declared by a function but + /// never read by any `LocalGet` anywhere in the function body. + /// + /// This is the path-INSENSITIVE half of dead-store elimination. Unlike + /// the path-sensitive case (Pick #3, RedundantSetElimination liveness), + /// "zero reads anywhere" is a structural property of the instruction + /// tree — sound regardless of `BrIf`/`BrTable`/early-`Return` control + /// flow. So this pass DOES NOT need the `has_dataflow_unsafe_control_flow` + /// guard that prevents `simplify_locals` and `coalesce_locals` from + /// running on kernel-style early-exit code (gale). + /// + /// Targets the gale "default-then-override" pattern: rustc/LLVM + /// materializes an EINVAL default at function entry, then every + /// reachable path overwrites it before return. The default's + /// `local.set` becomes pure dead store. wasm-opt eliminates this; + /// LOOM v0.5.0 did not. + /// + /// Algorithm: + /// 1. Recursively count `LocalGet` references for each local index. + /// (Walks Block/Loop/If bodies — same recursion shape as + /// `remap_instructions` and `eliminate_redundant_sets`.) + /// 2. Identify dead locals: idx >= param_count AND read_count == 0. + /// 3. 
Neutralize writes to dead locals: + /// - `LocalSet dead_idx` → `Drop` (preserves stack consumption) + /// - `LocalTee dead_idx` → removed (Tee's stack effect is `[T]→[T]`, + /// so removing it leaves the value passing through unchanged) + /// 4. Build a packed remap: surviving locals get sequential indices + /// starting at param_count (densest LEB128 encoding). + /// 5. Apply remap via existing `remap_instructions`; rebuild + /// declarations from surviving types. + /// 6. Z3 translation-validation: revert on rejection. + pub fn eliminate_dead_locals(module: &mut Module) -> Result<()> { + use crate::stack::validation::{ValidationContext, ValidationGuard}; + use crate::verify::TranslationValidator; + use std::collections::{BTreeMap, BTreeSet}; + + let ctx = ValidationContext::from_module(module); + + for func in &mut module.functions { + if has_unknown_instructions(func) || has_unsupported_isle_instructions(func) { + continue; + } + + let param_count = func.signature.params.len() as u32; + let declared_count: u32 = func.locals.iter().map(|(c, _)| *c).sum(); + if declared_count == 0 { + continue; + } + let total_locals = param_count + declared_count; + + // Step 1: Count LocalGet references for each local across the + // entire instruction tree (including nested bodies). + let mut read_counts: BTreeMap<u32, u32> = BTreeMap::new(); + count_local_reads(&func.instructions, &mut read_counts); + + // Step 2: A local is dead iff it's declared (not a param) and + // never read anywhere. This rule is path-insensitive — no read + // means no observation, regardless of which paths the writes + // are reachable from.
+ let dead: BTreeSet<u32> = (param_count..total_locals) + .filter(|idx| read_counts.get(idx).copied().unwrap_or(0) == 0) + .collect(); + + if dead.is_empty() { + continue; + } + + let guard = ValidationGuard::with_context(func, "eliminate_dead_locals", ctx.clone()); + let translator = TranslationValidator::new(func, "eliminate_dead_locals"); + let original_instructions = func.instructions.clone(); + let original_locals = func.locals.clone(); + + // Step 3: Neutralize writes to dead locals. Done in-place via + // a tree walk (matches the recursion shape of remap_instructions). + neutralize_dead_writes(&mut func.instructions, &dead); + + // Step 4: Build packed remap. Surviving locals keep their + // relative order but get a dense, gap-free sequence of new + // indices starting at param_count. + let mut remap: BTreeMap<u32, u32> = BTreeMap::new(); + // Identity-map the parameters. + for p in 0..param_count { + remap.insert(p, p); + } + let mut next_idx = param_count; + for old_idx in param_count..total_locals { + if !dead.contains(&old_idx) { + remap.insert(old_idx, next_idx); + next_idx += 1; + } + } + + // Step 5: Walk the tree applying the remap, then rebuild the + // locals declaration from the surviving types in order. + remap_instructions(&mut func.instructions, &remap); + func.locals = pack_surviving_locals(&original_locals, param_count, &dead); + + // Step 6: Z3 verification — revert if the transformed function + // is not observationally equivalent to the original. + if guard.validate(func).is_err() || translator.verify(func).is_err() { + eprintln!("eliminate_dead_locals: reverting function (verification rejected)"); + crate::stats::record_revert("eliminate_dead_locals"); + func.instructions = original_instructions; + func.locals = original_locals; + } + } + + Ok(()) + } + + /// Count LocalGet references for every local index in an instruction + /// tree.
Recurses into Block / Loop / If bodies — matches the recursion + /// shape of `remap_instructions` (lib.rs:10106) so the two analyses + /// see the same tree. + fn count_local_reads( + instructions: &[crate::Instruction], + counts: &mut std::collections::BTreeMap<u32, u32>, + ) { + use crate::Instruction; + for instr in instructions { + match instr { + Instruction::LocalGet(idx) => { + *counts.entry(*idx).or_insert(0) += 1; + } + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + count_local_reads(body, counts); + } + Instruction::If { + then_body, + else_body, + .. + } => { + count_local_reads(then_body, counts); + count_local_reads(else_body, counts); + } + _ => {} + } + } + } + + /// Replace `LocalSet dead` with `Drop` and remove `LocalTee dead` for + /// every local in the dead set. Walks nested control-flow bodies. + /// + /// Stack-effect rationale (see insight at the call site): + /// LocalSet idx : [T] → [] → Drop (which is also [T] → []) + /// LocalTee idx : [T] → [T] → remove (the [T] passes through) + fn neutralize_dead_writes( + instructions: &mut Vec<crate::Instruction>, + dead: &std::collections::BTreeSet<u32>, + ) { + use crate::Instruction; + let mut i = 0; + while i < instructions.len() { + match &mut instructions[i] { + Instruction::LocalSet(idx) if dead.contains(idx) => { + instructions[i] = Instruction::Drop; + i += 1; + } + Instruction::LocalTee(idx) if dead.contains(idx) => { + instructions.remove(i); + // Don't increment i — re-examine the new instruction + // at this position. + } + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + neutralize_dead_writes(body, dead); + i += 1; + } + Instruction::If { + then_body, + else_body, + .. + } => { + neutralize_dead_writes(then_body, dead); + neutralize_dead_writes(else_body, dead); + i += 1; + } + _ => { + i += 1; + } + } + } + } + + /// Rebuild a locals declaration vector after removing the indices in + /// `dead`.
Preserves the original declaration order of survivors and + /// produces a `Vec<(count, type)>` in run-length form (consecutive + /// same-type survivors are merged into a single entry). + fn pack_surviving_locals( + original: &[(u32, crate::ValueType)], + param_count: u32, + dead: &std::collections::BTreeSet<u32>, + ) -> Vec<(u32, crate::ValueType)> { + let mut surviving_types: Vec<crate::ValueType> = Vec::new(); + let mut idx = param_count; + for (count, ty) in original { + for _ in 0..*count { + if !dead.contains(&idx) { + surviving_types.push(*ty); + } + idx += 1; + } + } + + // Run-length encode (count, type) pairs. + let mut out: Vec<(u32, crate::ValueType)> = Vec::new(); + for ty in surviving_types { + if let Some(last) = out.last_mut() + && last.1 == ty + { + last.0 += 1; + } else { + out.push((1, ty)); + } + } + out + } + #[derive(Debug, Clone)] struct LiveRange { local_idx: u32, @@ -10683,6 +10900,95 @@ pub mod optimize { Expr::Unary { operand, .. } => operand.is_pure(), } } + + /// Estimate the wasm-encoded byte cost of materializing this + /// expression once. + fn cost_bytes(&self) -> usize { + match self { + Expr::Unknown => 0, + Expr::Const32(v) => 1 + signed_leb128_bytes_i32(*v), + Expr::Const64(v) => 1 + signed_leb128_bytes_i64(*v), + Expr::LocalGet(idx) => 1 + unsigned_leb128_bytes(*idx as u64), + Expr::Binary { left, right, .. } => left.cost_bytes() + right.cost_bytes() + 1, + Expr::Unary { operand, .. } => operand.cost_bytes() + 1, + } + } + + /// Net byte savings from CSE-ing N occurrences of this + /// expression. Positive = win, non-positive = skip.
+ /// + /// Cost model: + /// first occurrence: keep original + add local.tee N + /// (~2 bytes for opcode + LEB128 idx, + /// assuming idx ≤ 127) + /// each later use: replace original with local.get N + /// (~2 bytes), saving (cost - 2) bytes + /// header growth: +2 bytes for new local declaration + /// (1 byte count + 1 byte type) + /// + /// So: savings = (N-1) * (cost - 2) - 2 (tee) - 2 (header) + /// + /// Examples (with this gate): + /// i32.const 42 (cost=2, N=2): savings = 1*0 - 4 = -4 → skip + /// i32.const 42 (cost=2, N=10): savings = 9*0 - 4 = -4 → skip + /// i32.add of LocalGet+Const42 (cost=5, N=2): + /// savings = 1*3 - 4 = -1 → skip + /// i32.add of LocalGet+Const42 (cost=5, N=3): + /// savings = 2*3 - 4 = 2 → keep + /// big i32.const (cost=6, N=2): savings = 1*4 - 4 = 0 → skip + /// big i32.const (cost=6, N=3): savings = 2*4 - 4 = 4 → keep + /// + /// Tuned against the gale v0.4.0 measurement (CSE-ing + /// `-EINVAL` into local.tee/local.get grew kernel-FFI code + /// section by +6.3%). + fn worth_dedup(&self, occurrences: usize) -> bool { + if occurrences < 2 { + return false; + } + let cost = self.cost_bytes() as i64; + let savings_per_later = (cost - 2).max(0); // never negative — replacing 1-byte materialization with 2-byte local.get is a regression we won't take + let net = (occurrences as i64 - 1) * savings_per_later - 4; + net > 0 + } + } + + // LEB128 byte-length helpers (matches wasm-encoder's encoding). + fn unsigned_leb128_bytes(mut v: u64) -> usize { + let mut n = 1; + while v >= 0x80 { + v >>= 7; + n += 1; + } + n + } + fn signed_leb128_bytes_i32(v: i32) -> usize { + // Signed LEB128: bits needed including sign, divided by 7 + // (round up). Small values: -64..=63 fit in 1 byte; -8192..=8191 + // in 2 bytes; etc. 
+ let mut v = v as i64; + let mut n = 0; + loop { + n += 1; + let byte = (v & 0x7f) as u8; + v >>= 7; + let sign_bit_set = byte & 0x40 != 0; + if (v == 0 && !sign_bit_set) || (v == -1 && sign_bit_set) { + return n; + } + } + } + fn signed_leb128_bytes_i64(v: i64) -> usize { + let mut v = v; + let mut n = 0; + loop { + n += 1; + let byte = (v & 0x7f) as u8; + v >>= 7; + let sign_bit_set = byte & 0x40 != 0; + if (v == 0 && !sign_bit_set) || (v == -1 && sign_bit_set) { + return n; + } + } } for func in &mut module.functions { @@ -10860,13 +11166,19 @@ pub mod optimize { let mut duplicates_to_eliminate = Vec::new(); for (hash, positions) in &hash_to_positions { - if positions.len() > 1 { - // Check if the expression is pure - if let Some((expr, _)) = expr_at_position.get(&positions[0]) { - if expr.is_pure() { - duplicates_to_eliminate.push((*hash, positions.clone())); - } - } + if positions.len() > 1 + && let Some((expr, _)) = expr_at_position.get(&positions[0]) + && expr.is_pure() + // Cost gate: deduplicating into local.tee/local.get + // adds 4 fixed bytes (tee + new local declaration) + // plus amortizes only on the (N-1) later occurrences. + // For cheap expressions (1-2 byte materialization) the + // replacement is always a regression. For 5-byte + // expressions it breaks even at N=3. Skip when the + // net savings are non-positive. + && expr.worth_dedup(positions.len()) + { + duplicates_to_eliminate.push((*hash, positions.clone())); } } @@ -13652,60 +13964,102 @@ mod tests { } #[test] - fn test_cse_phase4_duplicate_constants() { - // Test that CSE Phase 4 eliminates duplicate constants - // Use a case where the same constant is used multiple times in expressions + fn test_cse_phase4_duplicate_constants_above_cost_threshold() { + // CSE-ing duplicate constants is only a size win when the constant + // materializes in more bytes than the local.tee/local.get pair plus + // the new local-declaration overhead. 
Use a 5-byte LEB128 value + // (i32.const 0x12345678) so dedup is profitable. + // + // The earlier version of this test used `i32.const 42` (a 2-byte + // total), which CSE would dedup for instruction-count savings but + // grew code-section bytes. v0.6.0 added a cost gate (Expr::worth_dedup) + // that suppresses this regression. See gale v0.4.0 measurement: + // CSE-ing -EINVAL grew kernel-FFI code section by +6.3%. let wat = r#"(module (func $test (result i32) (local $result i32) - ;; Use the same constant multiple times - (local.set $result (i32.const 42)) - (local.set $result (i32.add (local.get $result) (i32.const 42))) - (local.set $result (i32.add (local.get $result) (i32.const 42))) + ;; Use the same large constant multiple times (5-byte LEB128) + (local.set $result (i32.const 0x12345678)) + (local.set $result (i32.add (local.get $result) (i32.const 0x12345678))) + (local.set $result (i32.add (local.get $result) (i32.const 0x12345678))) (local.get $result) ) )"#; let mut module = parse::parse_wat(wat).unwrap(); - // Count i32.const 42 instructions before CSE let count_before = module.functions[0] .instructions .iter() - .filter(|i| matches!(i, Instruction::I32Const(42))) + .filter(|i| matches!(i, Instruction::I32Const(0x1234_5678))) .count(); assert_eq!( count_before, 3, "Should have 3 duplicate constants before CSE" ); - // Apply enhanced CSE with Phase 4 optimize::eliminate_common_subexpressions_enhanced(&mut module).unwrap(); - // After CSE, should have 1 i32.const + local.tee, then 2 local.get let const_count = module.functions[0] .instructions .iter() - .filter(|i| matches!(i, Instruction::I32Const(42))) - .count(); - let local_tee_count = module.functions[0] - .instructions - .iter() - .filter(|i| matches!(i, Instruction::LocalTee(_))) + .filter(|i| matches!(i, Instruction::I32Const(0x1234_5678))) .count(); - // Should eliminate duplicate constants assert!( const_count < count_before, - "CSE should reduce number of i32.const instructions" + "CSE 
should reduce duplicates of large constants (5+ byte LEB128)" ); - if const_count == 1 && local_tee_count >= 1 { - // CSE worked - we have 1 const with tee and gets for duplicates - eprintln!("✓ CSE Phase 4 successfully eliminated duplicates"); - } + let wasm_bytes = encode::encode_wasm(&module).unwrap(); + wasmparser::validate(&wasm_bytes).expect("Generated WASM should be valid"); + } + + #[test] + fn test_cse_phase4_keeps_small_constants() { + // Mirror of the above: a 2-byte constant must NOT be deduplicated, + // because local.tee/local.get + new-local cost more than the + // original two materializations. Pin the gale regression fix: + // the CSE pass must leave cheap-constant patterns unchanged. + let wat = r#"(module + (func $test (result i32) + (local $result i32) + (local.set $result (i32.const 42)) + (local.set $result (i32.add (local.get $result) (i32.const 42))) + (local.set $result (i32.add (local.get $result) (i32.const 42))) + (local.get $result) + ) + )"#; + + let mut module = parse::parse_wat(wat).unwrap(); + let locals_before = module.functions[0].locals.clone(); + let instr_count_before = module.functions[0].instructions.len(); + + optimize::eliminate_common_subexpressions_enhanced(&mut module).unwrap(); + + // Cheap constants must survive — neither locals nor instruction + // count may grow when the cost gate refuses dedup. + assert_eq!( + module.functions[0].locals, locals_before, + "CSE on cheap constants must not add locals (would grow function header)" + ); + assert_eq!( + module.functions[0].instructions.len(), + instr_count_before, + "CSE on cheap constants must not add local.tee/local.get instructions" + ); + + let const_count = module.functions[0] + .instructions + .iter() + .filter(|i| matches!(i, Instruction::I32Const(42))) + .count(); + assert_eq!( + const_count, 3, + "Cheap constants must survive CSE — dedup would grow code size. 
\ + See docs/research/gale-v0.4.0/measurement-report.md" + ); - // Verify the optimized module is still valid let wasm_bytes = encode::encode_wasm(&module).unwrap(); wasmparser::validate(&wasm_bytes).expect("Generated WASM should be valid"); } @@ -14880,4 +15234,181 @@ mod tests { let result = terms::terms_to_instructions(&terms).expect("Failed to convert back"); assert_eq!(result, vec![Instruction::DataDrop(0)]); } + + // eliminate_dead_locals tests (v0.6.0 PR-B) + // + // Targets: gale "default-then-override" pattern where a local is + // written but never read. The pass is path-insensitive — sound + // regardless of BrIf/BrTable/early-Return control flow that gates + // simplify_locals and coalesce_locals. + + #[test] + fn test_eliminate_dead_locals_basic_write_only() { + // The exact gale pattern from gale_bitarray_alloc_validate: + // local 3 written with EINVAL default but never read. + let wat = r#"(module + (func $test (param i32) (param i32) (result i32) + (local i32) ;; local 2 — dead + i32.const -22 + local.set 2 + local.get 0 + local.get 1 + i32.add + ) + )"#; + + let mut module = parse::parse_wat(wat).expect("Failed to parse"); + let bytes_before = encode::encode_wasm(&module).expect("encode").len(); + let locals_before = module.functions[0].locals.clone(); + assert_eq!(locals_before, vec![(1, ValueType::I32)]); + + optimize::eliminate_dead_locals(&mut module).expect("eliminate_dead_locals"); + + // The dead local should be gone from the declaration. + assert_eq!( + module.functions[0].locals, + vec![], + "Dead local must be removed from declaration" + ); + + // The dead LocalSet should have been replaced by Drop. + let has_dead_set = module.functions[0] + .instructions + .iter() + .any(|i| matches!(i, Instruction::LocalSet(2))); + assert!(!has_dead_set, "Dead LocalSet must be neutralized"); + + // Output must validate as wasm. 
+ let wasm_bytes = encode::encode_wasm(&module).expect("encode after"); + wasmparser::validate(&wasm_bytes).expect("output must validate"); + assert!( + wasm_bytes.len() < bytes_before, + "Eliminating a dead local must reduce binary size" + ); + } + + #[test] + fn test_eliminate_dead_locals_preserves_used_locals() { + // A local that IS read must survive — even if some writes + // to it look redundant. Path-sensitivity is Pick #3, not this pass. + let wat = r#"(module + (func $test (param i32) (result i32) + (local i32) ;; local 1 — read in addition below + i32.const 7 + local.set 1 + local.get 0 + local.get 1 + i32.add + ) + )"#; + + let mut module = parse::parse_wat(wat).expect("parse"); + let locals_before = module.functions[0].locals.clone(); + + optimize::eliminate_dead_locals(&mut module).expect("pass"); + + assert_eq!( + module.functions[0].locals, locals_before, + "Live local must survive" + ); + } + + #[test] + fn test_eliminate_dead_locals_localtee_neutralization() { + // LocalTee writing to a dead local must be REMOVED (not replaced + // with Drop). LocalTee's stack effect is [T] -> [T], so removing + // the instruction leaves the value passing through. Replacing + // with Drop would consume the stack value and break a downstream + // consumer. + let wat = r#"(module + (func $test (param i32) (result i32) + (local i32) ;; local 1 — dead (only LocalTee writes, no LocalGet) + local.get 0 + i32.const 1 + i32.add + local.tee 1 + i32.const 2 + i32.mul + ) + )"#; + + let mut module = parse::parse_wat(wat).expect("parse"); + + optimize::eliminate_dead_locals(&mut module).expect("pass"); + + let has_localtee = module.functions[0] + .instructions + .iter() + .any(|i| matches!(i, Instruction::LocalTee(_))); + assert!(!has_localtee, "Dead LocalTee must be removed entirely"); + + // The function must still encode and validate. 
+ let wasm_bytes = encode::encode_wasm(&module).expect("encode"); + wasmparser::validate(&wasm_bytes) + .expect("LocalTee removal must preserve stack — value passes through"); + + // Local declaration is gone. + assert_eq!(module.functions[0].locals, vec![]); + } + + #[test] + fn test_eliminate_dead_locals_packs_indices() { + // After dropping a dead middle local, surviving locals must be + // packed to dense indices starting at param_count. Locals that + // were 2,3,4 with #3 dead become 2,3 (was 4 -> now 3). + let wat = r#"(module + (func $test (param i32) (param i32) (result i32) + (local i32 i32 i32) ;; locals 2,3,4 — only 4 is dead + i32.const 1 + local.set 2 + i32.const 2 + local.set 4 ;; dead + local.get 2 + local.get 3 ;; reads local 3, keeps it alive + i32.add + ) + )"#; + + let mut module = parse::parse_wat(wat).expect("parse"); + + optimize::eliminate_dead_locals(&mut module).expect("pass"); + + // Two locals survive (2 and 3, the previously-dead 4 is gone). + let total_declared: u32 = module.functions[0].locals.iter().map(|(c, _)| *c).sum(); + assert_eq!(total_declared, 2, "One of three declared locals dropped"); + + // After pack-down, no instruction references local index 4. + let has_idx_4 = module.functions[0].instructions.iter().any(|i| { + matches!( + i, + Instruction::LocalGet(4) | Instruction::LocalSet(4) | Instruction::LocalTee(4) + ) + }); + assert!(!has_idx_4, "Index 4 must be remapped or removed"); + + let wasm_bytes = encode::encode_wasm(&module).expect("encode"); + wasmparser::validate(&wasm_bytes).expect("validates"); + } + + #[test] + fn test_eliminate_dead_locals_skips_params() { + // Parameters are observable to the caller — never eliminate them + // even if a function never reads them. 
+ let wat = r#"(module + (func $test (param i32) (param i32) (result i32) + ;; param 1 (idx 1) is never read + local.get 0 + ) + )"#; + + let mut module = parse::parse_wat(wat).expect("parse"); + let sig_before = module.functions[0].signature.clone(); + + optimize::eliminate_dead_locals(&mut module).expect("pass"); + + assert_eq!( + module.functions[0].signature, sig_before, + "Function signature (params) must never change" + ); + } }