diff --git a/CHANGELOG.md b/CHANGELOG.md index 89fb0ee..5f183e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- **Rewriter instruction-level offset map** (#143 DWARF Phase 2 + increment 2, `meld-core/src/rewriter.rs`). New + `rewrite_function_body_with_offsets` returns, alongside the + rewritten `Function`, an `InstrOffsetMap` recording each input + operator's `(old, new)` byte offset at instruction boundaries + (relative to the function body's instruction stream). This is the + second anchor DWARF address remapping needs: meld's rewriter + changes operand values (function/global/etc. indices) whose LEB128 + encodings shift length, so intra-function byte offsets drift — + `.debug_line` programs can't be remapped by function-base relocation + alone. The map is collected by measuring each emitted instruction's + encoded length (identical to what `Function::instruction` appends, + via `wasm_encoder::Encode`); it is **opt-in** — the plain + `rewrite_function_body` path pays zero cost and emits byte-identical + code (pinned by `with_offsets_emits_identical_function_bytes`). + `InstrOffsetMap::translate` resolves an old instruction-stream + offset to its new offset. 4 new tests pin LEB-growth accumulation + (`call 0`→`call 200` diverges +1 per call), the identity case (no + LEB change ⇒ `new == old`), translate hit/miss, and output-byte + equivalence. Increment 3 (gimli `.debug_line`/`.debug_info` rewrite) + composes this intra-function map with the per-function base from the + v0.16.0 component-provenance v2 `code_range`. + ## [0.16.0] - 2026-05-28 ### Added diff --git a/meld-core/src/rewriter.rs b/meld-core/src/rewriter.rs index 6beb855..e5f63ee 100644 --- a/meld-core/src/rewriter.rs +++ b/meld-core/src/rewriter.rs @@ -6,11 +6,50 @@ use crate::{Error, Result}; use std::collections::HashMap; -use wasm_encoder::{BlockType, Function, Instruction, MemArg}; +use wasm_encoder::{BlockType, Encode, Function, Instruction, MemArg}; use wasmparser::{ BlockType as WpBlockType, FunctionBody, MemArg as WpMemArg, Operator, OperatorsReader, }; +/// One instruction-boundary offset pair (DWARF Phase 2 inc 2, #143). +/// +/// `old` and `new` are byte offsets relative to the start of their +/// respective function body's **instruction stream** (the byte +/// immediately after the locals-declaration vector). The rewriter +/// changes operand values (function/global/etc. indices) whose +/// LEB128 encodings can change length, so `new` drifts away from +/// `old` at every instruction past the first length change. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct InstrOffset { + pub old: u32, + pub new: u32, +} + +/// Per-function instruction offset map (DWARF Phase 2 inc 2, #143). +/// +/// One [`InstrOffset`] per input operator, in stream order. When an +/// input operator rewrites to multiple output instructions, `new` is +/// the offset of the **first** emitted instruction — the address +/// DWARF line-number programs attribute to that source operator. +/// Increment 3 composes these intra-function offsets with the +/// per-function base from the component-provenance v2 `code_range` +/// to translate DWARF code addresses from input to fused output. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct InstrOffsetMap { + pub entries: Vec, +} + +impl InstrOffsetMap { + /// Translate an old intra-function instruction-stream offset to + /// the corresponding new offset. Returns `None` if `old` does not + /// fall on a recorded instruction boundary (DWARF addresses + /// always point at instruction starts, so a miss signals either a + /// malformed address or an offset past the function end). + pub fn translate(&self, old: u32) -> Option { + self.entries.iter().find(|e| e.old == old).map(|e| e.new) + } +} + /// Index mappings for rewriting #[derive(Debug, Clone, Default)] pub struct IndexMaps { @@ -82,6 +121,32 @@ pub fn rewrite_function_body( param_count: u32, maps: &IndexMaps, ) -> Result { + Ok(rewrite_function_body_core(body, param_count, maps, false)?.0) +} + +/// Like [`rewrite_function_body`], but also returns the per-function +/// [`InstrOffsetMap`] mapping each input operator's instruction-stream +/// byte offset to its rewritten offset (DWARF Phase 2 inc 2, #143). +/// +/// The offset map is collected by measuring each emitted instruction's +/// encoded length — a hot-path cost, so this entry point is only used +/// when DWARF address remapping is requested. The plain +/// [`rewrite_function_body`] path pays nothing. +pub fn rewrite_function_body_with_offsets( + body: &FunctionBody<'_>, + param_count: u32, + maps: &IndexMaps, +) -> Result<(Function, InstrOffsetMap)> { + let (func, map) = rewrite_function_body_core(body, param_count, maps, true)?; + Ok((func, map.unwrap_or_default())) +} + +fn rewrite_function_body_core( + body: &FunctionBody<'_>, + param_count: u32, + maps: &IndexMaps, + collect_offsets: bool, +) -> Result<(Function, Option)> { let locals_reader = body.get_locals_reader()?; // Collect locals @@ -137,25 +202,55 @@ pub fn rewrite_function_body( // Get operators and rewrite them let ops_reader = body.get_operators_reader()?; - rewrite_operators(ops_reader, &maps, &mut func)?; + let offset_map = rewrite_operators(ops_reader, &maps, &mut func, collect_offsets)?; - Ok(func) + Ok((func, offset_map)) } -/// Rewrite operators in a function body +/// Rewrite operators in a function body, optionally collecting an +/// [`InstrOffsetMap`]. +/// +/// When `collect_offsets` is true, the new offset is accumulated by +/// measuring each emitted instruction's encoded byte length — +/// identical to what `Function::instruction` appends, since both +/// route through `wasm_encoder::Encode`. The map records, per input +/// operator, the (old, new) instruction-stream offsets; for an +/// operator that expands to several instructions the `new` offset is +/// captured *before* the group is emitted (the first instruction's +/// position). When false, no measurement happens and `None` is +/// returned — the zero-cost default path. fn rewrite_operators( reader: OperatorsReader<'_>, maps: &IndexMaps, func: &mut Function, -) -> Result<()> { - for op in reader { - let op = op?; + collect_offsets: bool, +) -> Result> { + let mut map = collect_offsets.then(InstrOffsetMap::default); + let mut new_offset: u32 = 0; + let mut base_old: Option = None; + + for res in reader.into_iter_with_offsets() { + let (op, old_pos) = res?; let instrs = rewrite_operator(op, maps)?; - for instr in instrs { - func.instruction(&instr); + if let Some(m) = map.as_mut() { + // First operator's absolute position defines the + // instruction-stream base; subsequent offsets are relative. + let base = *base_old.get_or_insert(old_pos); + m.entries.push(InstrOffset { + old: (old_pos - base) as u32, + new: new_offset, + }); + } + for instr in &instrs { + if map.is_some() { + let mut buf = Vec::new(); + instr.encode(&mut buf); + new_offset = new_offset.saturating_add(buf.len() as u32); + } + func.instruction(instr); } } - Ok(()) + Ok(map) } /// Convert a wasmparser operator to wasm-encoder instruction with index remapping @@ -1220,4 +1315,152 @@ mod tests { ); } } + + // ─── DWARF Phase 2 inc 2: instruction offset map (#143) ─────────── + + /// Run `rewrite_function_body_with_offsets` on function `func_idx` + /// of a wat module and return the collected offset map. + fn offsets_for_wat_func(wat_src: &str, func_idx: usize, maps: &IndexMaps) -> InstrOffsetMap { + let wasm = wat::parse_str(wat_src).expect("wat parse"); + let mut idx = 0; + for payload in wasmparser::Parser::new(0).parse_all(&wasm) { + if let wasmparser::Payload::CodeSectionEntry(body) = payload.expect("payload") { + if idx == func_idx { + let (_func, map) = rewrite_function_body_with_offsets(&body, 0, maps) + .expect("rewrite with offsets"); + return map; + } + idx += 1; + } + } + panic!("function index {func_idx} not found in code section"); + } + + /// LEB-growth tracking: remapping `call 0` → `call 200` grows each + /// call's operand LEB from 1 to 2 bytes. The offset map's `new` + /// must diverge from `old` by the *accumulated* growth — +1 after + /// the first remapped call, +2 after the second. This is the + /// property DWARF line-table remapping (inc 3) depends on. + #[test] + fn instr_offset_map_tracks_leb_growth_from_index_remap() { + let mut maps = IndexMaps::new(); + maps.functions.insert(0, 200); // 1-byte LEB → 2-byte LEB + + // func 1 body: call 0; drop; call 0; drop; i32.const 2 (+ end) + let map = offsets_for_wat_func( + r#"(module + (func (result i32) i32.const 1) + (func (result i32) + call 0 drop + call 0 drop + i32.const 2))"#, + 1, + &maps, + ); + + // Six operators: call, drop, call, drop, const, end. + assert_eq!(map.entries.len(), 6, "one entry per operator: {map:?}"); + + // The divergence (new - old) per entry must be the accumulated + // LEB growth: 0 before any remapped call completes, +1 after + // the first call, +2 after the second. Sequence: [0,1,1,2,2,2]. + let divergence: Vec = map + .entries + .iter() + .map(|e| e.new as i64 - e.old as i64) + .collect(); + assert_eq!( + divergence, + vec![0, 1, 1, 2, 2, 2], + "new offsets must diverge by accumulated LEB growth: {map:?}" + ); + + // Old and new offsets are each strictly increasing. + for w in map.entries.windows(2) { + assert!(w[0].old < w[1].old, "old offsets must increase: {map:?}"); + assert!(w[0].new < w[1].new, "new offsets must increase: {map:?}"); + } + // First instruction anchors both streams at 0. + assert_eq!(map.entries[0].old, 0); + assert_eq!(map.entries[0].new, 0); + } + + /// Identity case: when no remap changes an operand's LEB length, + /// `new` must equal `old` at every instruction boundary — the map + /// is the identity. Remapping 0→1 keeps the `call` operand a + /// single LEB byte. + #[test] + fn instr_offset_map_is_identity_when_no_leb_length_change() { + let mut maps = IndexMaps::new(); + maps.functions.insert(0, 1); // both 1-byte LEBs + + let map = offsets_for_wat_func( + r#"(module + (func (result i32) i32.const 1) + (func (result i32) call 0 drop i32.const 2))"#, + 1, + &maps, + ); + + assert!(!map.entries.is_empty()); + for e in &map.entries { + assert_eq!(e.old, e.new, "no LEB change ⇒ identity offsets: {e:?}"); + } + } + + /// `translate` resolves recorded boundaries and rejects offsets + /// that don't land on an instruction start. + #[test] + fn instr_offset_map_translate_hits_and_misses() { + let map = InstrOffsetMap { + entries: vec![ + InstrOffset { old: 0, new: 0 }, + InstrOffset { old: 2, new: 3 }, + InstrOffset { old: 5, new: 7 }, + ], + }; + assert_eq!(map.translate(0), Some(0)); + assert_eq!(map.translate(2), Some(3)); + assert_eq!(map.translate(5), Some(7)); + // Offsets mid-instruction (not a recorded boundary) miss. + assert_eq!(map.translate(1), None); + assert_eq!(map.translate(99), None); + } + + /// The plain `rewrite_function_body` path is unaffected: it still + /// returns a `Function` and pays no offset-collection cost. Verify + /// it produces byte-identical output to the with-offsets variant's + /// `Function` (the offset collection must not change the emitted + /// code). + #[test] + fn with_offsets_emits_identical_function_bytes() { + let mut maps = IndexMaps::new(); + maps.functions.insert(0, 200); + let src = r#"(module + (func (result i32) i32.const 1) + (func (result i32) call 0 drop call 0 drop i32.const 2))"#; + let wasm = wat::parse_str(src).expect("wat"); + + let mut bodies = Vec::new(); + for payload in wasmparser::Parser::new(0).parse_all(&wasm) { + if let wasmparser::Payload::CodeSectionEntry(body) = payload.expect("payload") { + bodies.push(body); + } + } + let target = &bodies[1]; + + let plain = rewrite_function_body(target, 0, &maps).expect("plain"); + let (with_off, _map) = + rewrite_function_body_with_offsets(target, 0, &maps).expect("with offsets"); + + // Encode both into a CodeSection and compare bytes. + let mut a = wasm_encoder::CodeSection::new(); + a.function(&plain); + let mut b = wasm_encoder::CodeSection::new(); + b.function(&with_off); + let (mut ab, mut bb) = (Vec::new(), Vec::new()); + a.encode(&mut ab); + b.encode(&mut bb); + assert_eq!(ab, bb, "offset collection must not change emitted code"); + } }