diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f2d5cd..961f9d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,46 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +## [0.19.0] - 2026-05-29 + +### Added + +- **`DwarfHandling::Remap` — end-to-end DWARF address remapping** + (#143 DWARF Phase 2 increment 3b, `meld-core/src/dwarf.rs`). The + final piece of DWARF Phase 2: reads an input core module's `.debug_*` + sections, translates every code address to the fused code section via + the `AddressRemap` engine (v0.18.0), and re-serializes a single + remapped DWARF set with `gimli::write::Dwarf::from`. New `gimli` + dependency. Exposed via `meld fuse --dwarf remap` (modes: `strip` + (default), `passthrough`, `remap`). + - The per-function instruction offset map is recovered **post-hoc** + by walking the input and final-output operator streams in lockstep + — no state threaded through the merge hot path, and it reflects + whatever rewriting actually happened (including the adapter-wiring + re-rewrite). A per-function operator-count or locals-prefix mismatch + aborts the remap (correct-or-strip). + - **Correct-or-strip throughout.** `gimli::write::Dwarf::from` is + all-or-nothing on addresses, which is used as the safety gate: only + the structurally-invariant code-section base (address 0) is + special-cased; any other unmapped address fails the conversion and + falls back to stripping rather than emitting a wrong address. + - **Single DWARF source** is supported in this increment. Inputs + where more than one core module carries DWARF fall back to `strip` + with a warning (merging independent DWARF unit sets is deferred); + zero DWARF sources is a no-op. + - Encoding uses a three-pass dance so the remapped `.debug_*` land in + the attestation/provenance-hashed bytes (trailing custom sections + don't shift code offsets, so the remap built from pass A is valid + for the final output). 
+ - New loss scenario **LS-D-1** (wrong remapped DWARF address → + de-grounded downstream coverage/breakpoints) is `approved`, gated by + `dwarf::tests::ls_d_1_remap_translates_low_pc` (full gimli + read→convert→write→read oracle). **Residual:** `DW_AT_high_pc` + encoded as a *length* is copied verbatim, so a function's reported + byte length may be off by intra-function LEB drift; `low_pc` and the + line-number program (what debuggers and `pulseengine/witness` use) + are correct. + ## [0.18.0] - 2026-05-29 ### Added diff --git a/Cargo.lock b/Cargo.lock index 54aa6c5..c89ecdc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,7 +8,7 @@ version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" dependencies = [ - "gimli", + "gimli 0.32.3", ] [[package]] @@ -457,7 +457,7 @@ dependencies = [ "cranelift-control", "cranelift-entity", "cranelift-isle", - "gimli", + "gimli 0.32.3", "hashbrown 0.15.5", "log", "pulley-interpreter", @@ -949,6 +949,17 @@ dependencies = [ "wasip3", ] +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +dependencies = [ + "fallible-iterator", + "indexmap", + "stable_deref_trait", +] + [[package]] name = "gimli" version = "0.32.3" @@ -1370,7 +1381,7 @@ checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" [[package]] name = "meld-cli" -version = "0.18.0" +version = "0.19.0" dependencies = [ "anyhow", "clap", @@ -1385,11 +1396,12 @@ dependencies = [ [[package]] name = "meld-core" -version = "0.18.0" +version = "0.19.0" dependencies = [ "anyhow", "bitflags", "criterion", + "gimli 0.31.1", "hex", "log", "petgraph 0.8.3", @@ -2557,7 +2569,7 @@ dependencies = [ "encoding_rs", "futures", "fxprof-processed-profile", - "gimli", + "gimli 0.32.3", "hashbrown 0.15.5", "indexmap", "ittapi", @@ 
-2608,7 +2620,7 @@ dependencies = [ "cpp_demangle", "cranelift-bitset", "cranelift-entity", - "gimli", + "gimli 0.32.3", "indexmap", "log", "object", @@ -2678,7 +2690,7 @@ dependencies = [ "cranelift-entity", "cranelift-frontend", "cranelift-native", - "gimli", + "gimli 0.32.3", "itertools 0.14.0", "log", "object", @@ -2778,7 +2790,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0063e61f1d0b2c20e9cfc58361a6513d074a23c80b417aac3033724f51648a0" dependencies = [ "cranelift-codegen", - "gimli", + "gimli 0.32.3", "log", "object", "target-lexicon", @@ -2966,7 +2978,7 @@ dependencies = [ "anyhow", "cranelift-assembler-x64", "cranelift-codegen", - "gimli", + "gimli 0.32.3", "regalloc2", "smallvec", "target-lexicon", diff --git a/Cargo.toml b/Cargo.toml index 8be48dd..4f8ffe4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ exclude = [ ] [workspace.package] -version = "0.18.0" +version = "0.19.0" authors = ["PulseEngine "] edition = "2024" license = "Apache-2.0" @@ -23,6 +23,9 @@ wasmparser = { version = "0.246", features = ["component-model"] } wasm-encoder = { version = "0.246", features = ["component-model"] } wasmprinter = "0.246" +# DWARF read/write for address remapping (#143 Phase 2) +gimli = "0.31" + # CLI clap = { version = "4.5", features = ["derive", "cargo"] } diff --git a/meld-cli/src/main.rs b/meld-cli/src/main.rs index f585329..47fc341 100644 --- a/meld-cli/src/main.rs +++ b/meld-cli/src/main.rs @@ -18,7 +18,7 @@ use anyhow::{Context, Result, anyhow}; use clap::{Parser, Subcommand}; -use meld_core::{Fuser, FuserConfig, FusionStats, MemoryStrategy, OutputFormat}; +use meld_core::{DwarfHandling, Fuser, FuserConfig, FusionStats, MemoryStrategy, OutputFormat}; use std::fs; use std::path::Path; use std::time::Instant; @@ -80,6 +80,15 @@ enum Commands { #[arg(long)] no_component_provenance: bool, + /// DWARF debug-info handling: `strip` (default — drop all + /// `.debug_*`), `passthrough` (copy verbatim; addresses are + 
/// wrong against the fused code section), or `remap` (#143 — + /// translate code addresses to the fused code section). `remap` + /// currently supports a single DWARF-bearing input module and + /// falls back to `strip` for multi-source inputs. + #[arg(long, value_name = "MODE", default_value = "strip")] + dwarf: String, + /// Preserve debug names in output #[arg(long)] preserve_names: bool, @@ -148,6 +157,7 @@ fn main() -> Result<()> { stats, no_attestation, no_component_provenance, + dwarf, preserve_names, validate, component, @@ -162,6 +172,7 @@ fn main() -> Result<()> { stats, no_attestation, no_component_provenance, + dwarf, preserve_names, validate, component, @@ -218,6 +229,7 @@ fn fuse_command( show_stats: bool, no_attestation: bool, no_component_provenance: bool, + dwarf: String, preserve_names: bool, validate: bool, component: bool, @@ -289,6 +301,18 @@ fn fuse_command( } } + let dwarf_handling = match dwarf.as_str() { + "strip" => DwarfHandling::Strip, + "passthrough" => DwarfHandling::PassThrough, + "remap" => DwarfHandling::Remap, + other => { + return Err(anyhow!( + "Invalid --dwarf mode: {}. 
Use 'strip', 'passthrough', or 'remap'", + other + )); + } + }; + let config = FuserConfig { memory_strategy, attestation: !no_attestation, @@ -297,6 +321,7 @@ fn fuse_command( preserve_names, output_format, opaque_resources, + dwarf_handling, ..Default::default() }; diff --git a/meld-core/Cargo.toml b/meld-core/Cargo.toml index 35d9873..7e6f068 100644 --- a/meld-core/Cargo.toml +++ b/meld-core/Cargo.toml @@ -18,6 +18,9 @@ unexpected_cfgs = { level = "warn", check-cfg = ['cfg(kani)'] } wasmparser.workspace = true wasm-encoder.workspace = true +# DWARF read/write for address remapping (#143 Phase 2 — DwarfHandling::Remap) +gimli.workspace = true + # Error handling anyhow.workspace = true thiserror.workspace = true diff --git a/meld-core/src/dwarf.rs b/meld-core/src/dwarf.rs index 4a16ce4..cafbb7f 100644 --- a/meld-core/src/dwarf.rs +++ b/meld-core/src/dwarf.rs @@ -50,7 +50,7 @@ //! and output. So the prefix cancels when both are equal, and the //! [`FunctionSpan`] records it once as `locals_prefix_len`. -use crate::rewriter::InstrOffsetMap; +use crate::rewriter::{InstrOffset, InstrOffsetMap}; use std::collections::BTreeMap; /// One fused function's mapping data: where it was in the input code @@ -149,6 +149,296 @@ impl AddressRemap { } } +// --------------------------------------------------------------------------- +// Increment 3b: build the remap from a real fusion and rewrite `.debug_*`. +// --------------------------------------------------------------------------- + +/// Per-defined-function byte layout of one core module's code section, +/// recovered by parsing. Used on both the input and the fused output so +/// the two can be walked in lockstep to recover the instruction offset +/// map without threading state through the merge hot path. +struct FnLayout { + /// Function body start, code-section-relative (points at the + /// locals-count LEB — the same convention as + /// [`crate::provenance::CodeRange`]). 
+ body_start: u32, + /// Function body end, code-section-relative (exclusive). + body_end: u32, + /// Bytes from `body_start` to the first operator (locals vector). + locals_prefix_len: u32, + /// Instruction-stream offset (0 at the first operator) of every + /// operator, in code order. + op_offsets: Vec, +} + +/// Parse `module_bytes` and return the [`FnLayout`] of every *defined* +/// function, in code-section order. Returns `None` on any parse error +/// or if there is no code section — the caller then falls back to +/// stripping DWARF rather than emitting a guessed address. +fn module_function_layouts(module_bytes: &[u8]) -> Option> { + use wasmparser::{Parser, Payload}; + let mut content_start: Option = None; + let mut layouts = Vec::new(); + for payload in Parser::new(0).parse_all(module_bytes) { + match payload.ok()? { + Payload::CodeSectionStart { range, .. } => content_start = Some(range.start), + Payload::CodeSectionEntry(body) => { + let base = content_start?; + let r = body.range(); + let ops_reader = body.get_operators_reader().ok()?; + let first_op_pos = ops_reader.original_position(); + let locals_prefix_len = (first_op_pos - r.start) as u32; + let mut op_offsets = Vec::new(); + for item in ops_reader.into_iter_with_offsets() { + let (_op, pos) = item.ok()?; + op_offsets.push((pos - first_op_pos) as u32); + } + layouts.push(FnLayout { + body_start: (r.start - base) as u32, + body_end: (r.end - base) as u32, + locals_prefix_len, + op_offsets, + }); + } + _ => {} + } + } + // No code section → cannot remap; signal the caller to strip. + content_start?; + Some(layouts) +} + +/// Number of imported functions in a core module — the offset between a +/// module-level function index and its defined-function index. 
+fn import_func_count(module: &crate::parser::CoreModule) -> u32 { + module + .imports + .iter() + .filter(|i| matches!(i.kind, crate::parser::ImportKind::Function(_))) + .count() as u32 +} + +/// Build an [`AddressRemap`] for the single source core module +/// `(comp_idx, mod_idx)`, pairing each of its defined functions with +/// the corresponding fused-output function and zipping their +/// instruction streams. +/// +/// Returns `None` if any function's input/output layouts are +/// inconsistent (different operator count or locals prefix — which +/// happens when the rewriter inserted instructions, e.g. memory +/// address-rebasing) or if parsing fails. A `None` is a hard "do not +/// remap" signal: better to strip DWARF than emit a wrong address. +fn build_remap_for_module( + module: &crate::parser::CoreModule, + merged: &crate::merger::MergedModule, + comp_idx: usize, + mod_idx: usize, + output_bytes: &[u8], +) -> Option { + let imports = import_func_count(module); + // (output defined-function index, input module-level function index) + // for every fused function originating from this source module. + let pairs: Vec<(usize, u32)> = merged + .functions + .iter() + .enumerate() + .filter(|(_, mf)| mf.origin.0 == comp_idx && mf.origin.1 == mod_idx) + .map(|(out_idx, mf)| (out_idx, mf.origin.2)) + .collect(); + build_remap_from_parts(&module.bytes, imports, output_bytes, &pairs) +} + +/// Testable core of [`build_remap_for_module`]: pair input and output +/// function layouts and assemble the [`AddressRemap`]. `pairs` lists +/// `(output_defined_idx, input_module_level_func_idx)` for the source +/// module. Returns `None` on any layout inconsistency (operator-count +/// or locals-prefix mismatch — meaning the rewriter inserted +/// instructions, so addresses cannot be mapped 1:1) or if no function +/// mapped. 
+fn build_remap_from_parts( + input_bytes: &[u8], + imports: u32, + output_bytes: &[u8], + pairs: &[(usize, u32)], +) -> Option { + let input_layouts = module_function_layouts(input_bytes)?; + let output_layouts = module_function_layouts(output_bytes)?; + + let mut remap = AddressRemap::new(); + for &(defined_out_idx, old_func_idx) in pairs { + // Module-level function index → input defined-function index. + let in_idx = old_func_idx.checked_sub(imports)? as usize; + let input = input_layouts.get(in_idx)?; + let output = output_layouts.get(defined_out_idx)?; + + // Locals are preserved verbatim in the DWARF-remap path, so the + // prefix must be identical; a mismatch means the locals vector changed. + if input.locals_prefix_len != output.locals_prefix_len { + return None; + } + if input.op_offsets.len() != output.op_offsets.len() { + return None; + } + + let entries = input + .op_offsets + .iter() + .zip(output.op_offsets.iter()) + .map(|(&old, &new)| InstrOffset { old, new }) + .collect(); + + remap.insert(FunctionSpan { + input_start: input.body_start, + input_end: input.body_end, + output_body_start: output.body_start, + locals_prefix_len: input.locals_prefix_len, + instr_offsets: InstrOffsetMap { entries }, + }); + } + + if remap.is_empty() { + return None; + } + Some(remap) +} + +/// Read the `.debug_*` sections in `debug` (a single source module's +/// DWARF), remap every code address through `remap`, and re-serialize. +/// Returns the rewritten `(section_name, bytes)` pairs, or `None` if +/// gimli could not round-trip the DWARF (caller falls back to strip). +/// +/// Wasm DWARF is little-endian and uses code-section-relative +/// addresses, which is exactly what [`AddressRemap::translate`] +/// consumes and produces. +/// +/// **Fidelity note:** `DW_AT_high_pc` encoded as a *length* +/// (`DW_FORM_data*`, the common Rust/LLVM encoding) is copied verbatim +/// — gimli treats it as a constant, not an address, so it is not routed +/// through `convert_address`. 
The function's start address (`low_pc`) +/// and the line-number program — what debuggers use for breakpoints and +/// backtraces — are remapped correctly; the high_pc *length* may be off +/// by the intra-function LEB drift. This is tracked as LS-D-1. +fn rewrite_debug_sections( + debug: &[(String, Vec)], + remap: &AddressRemap, +) -> Option)>> { + use gimli::write::{Address, Dwarf as WriteDwarf, EndianVec, Sections}; + use gimli::{EndianSlice, LittleEndian, SectionId}; + + let endian = LittleEndian; + let section_data = |name: &str| -> &[u8] { + debug + .iter() + .find(|(n, _)| n == name) + .map(|(_, d)| d.as_slice()) + .unwrap_or(&[]) + }; + + let load = |id: SectionId| -> Result, gimli::Error> { + Ok(EndianSlice::new(section_data(id.name()), endian)) + }; + let read_dwarf = gimli::Dwarf::load(load).ok()?; + + // gimli's `Dwarf::from` is all-or-nothing: if `convert_address` + // returns `None` for *any* address it queries, the whole conversion + // fails. We exploit that as a correct-or-strip gate — a real + // instruction address that we cannot map aborts the conversion and + // the caller strips the DWARF rather than emit a wrong address. + // + // The one address that is *structurally invariant* and not a mapped + // instruction is `0`: wasm DWARF code addresses are relative to the + // start of the code-section contents, and the compilation unit's + // base (`DW_AT_low_pc` = 0) denotes that start, which is offset 0 in + // both the input and the fused output. Map it to itself; everything + // else must go through the instruction-accurate remap. + let convert_address = |addr: u64| -> Option
{ + if addr == 0 { + return Some(Address::Constant(0)); + } + remap + .translate(addr as u32) + .map(|new| Address::Constant(new as u64)) + }; + + let mut write_dwarf = WriteDwarf::from(&read_dwarf, &convert_address).ok()?; + let mut sections = Sections::new(EndianVec::new(endian)); + write_dwarf.write(&mut sections).ok()?; + + let mut out = Vec::new(); + sections + .for_each(|id, data| { + let bytes = data.slice(); + if !bytes.is_empty() { + out.push((id.name().to_string(), bytes.to_vec())); + } + Ok::<(), gimli::Error>(()) + }) + .ok()?; + Some(out) +} + +/// Top-level entry point for [`crate::DwarfHandling::Remap`]. +/// +/// Inspects the input components for `.debug_*` sections and, when +/// exactly one source core module carries DWARF, builds its +/// [`AddressRemap`] and returns the rewritten debug sections to embed in +/// the fused output. Returns `None` (caller strips DWARF) when: +/// +/// - no input module carries DWARF, or +/// - **more than one** module carries DWARF (merging independent DWARF +/// unit sets into one consistent `.debug_info` is a separate problem, +/// deferred to a later increment — emitting either source's addresses +/// against the merged code section would be wrong for the other), or +/// - the remap or gimli round-trip fails any consistency check. +/// +/// `output_bytes` must be the fused module encoded *without* the +/// remapped DWARF (its code-section offsets are what the remap targets; +/// trailing custom sections do not shift code offsets, so the same +/// offsets hold in the final output). +pub fn remap_for_output( + components: &[crate::parser::ParsedComponent], + merged: &crate::merger::MergedModule, + output_bytes: &[u8], +) -> Option)>> { + // Find every (comp_idx, mod_idx) whose module carries DWARF. 
+ let mut dwarf_sources: Vec<(usize, usize)> = Vec::new(); + for (ci, comp) in components.iter().enumerate() { + for (mi, module) in comp.core_modules.iter().enumerate() { + if module + .custom_sections + .iter() + .any(|(name, _)| name.starts_with(".debug_")) + { + dwarf_sources.push((ci, mi)); + } + } + } + + match dwarf_sources.as_slice() { + [] => None, + [(ci, mi)] => { + let module = &components[*ci].core_modules[*mi]; + let remap = build_remap_for_module(module, merged, *ci, *mi, output_bytes)?; + let debug: Vec<(String, Vec)> = module + .custom_sections + .iter() + .filter(|(name, _)| name.starts_with(".debug_")) + .cloned() + .collect(); + rewrite_debug_sections(&debug, &remap) + } + many => { + log::warn!( + "DwarfHandling::Remap: {} source modules carry DWARF; merging \ + independent DWARF unit sets is not yet supported — stripping \ + debug info instead of emitting wrong addresses (#143)", + many.len() + ); + None + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -263,4 +553,139 @@ mod tests { // Input addr 12 = body_rel 2 < locals_prefix_len 5 → None. assert_eq!(remap.translate(12), None); } + + /// Oracle for inc 3b: build real input DWARF with gimli, remap a + /// subprogram's `low_pc` from 0x10 → 0x200 through + /// [`rewrite_debug_sections`], then re-parse the *output* DWARF and + /// assert the address was actually translated. This exercises the + /// full gimli read → `convert_address` → write → read round-trip — + /// the genuinely new, fidelity-risky code path. + #[test] + fn ls_d_1_remap_translates_low_pc() { + use gimli::write::{ + Address, AttributeValue, Dwarf, EndianVec, LineProgram, Sections, Unit, + }; + use gimli::{Encoding, Format, LittleEndian, constants}; + + // --- Build input DWARF: one unit, one subprogram @ low_pc 0x10. 
+ let encoding = Encoding { + format: Format::Dwarf32, + version: 4, + address_size: 4, + }; + let mut in_dwarf = Dwarf::new(); + let unit_id = in_dwarf.units.add(Unit::new(encoding, LineProgram::none())); + let unit = in_dwarf.units.get_mut(unit_id); + let root = unit.root(); + let sp = unit.add(root, constants::DW_TAG_subprogram); + unit.get_mut(sp).set( + constants::DW_AT_low_pc, + AttributeValue::Address(Address::Constant(0x10)), + ); + unit.get_mut(sp) + .set(constants::DW_AT_high_pc, AttributeValue::Udata(0x20)); + + let mut sections = Sections::new(EndianVec::new(LittleEndian)); + in_dwarf.write(&mut sections).expect("write input dwarf"); + let mut input: Vec<(String, Vec)> = Vec::new(); + sections + .for_each(|id, data| { + input.push((id.name().to_string(), data.slice().to_vec())); + Ok::<(), gimli::Error>(()) + }) + .expect("collect input sections"); + + // --- Remap input 0x10 → output 0x200 (single instruction). + let mut remap = AddressRemap::new(); + remap.insert(FunctionSpan { + input_start: 0x10, + input_end: 0x40, + output_body_start: 0x200, + locals_prefix_len: 0, + instr_offsets: InstrOffsetMap { + entries: vec![InstrOffset { old: 0, new: 0 }], + }, + }); + + let out = rewrite_debug_sections(&input, &remap).expect("rewrite debug sections"); + + // --- Re-parse output DWARF and read the subprogram's low_pc. 
+ let section_data = |name: &str| -> &[u8] { + out.iter() + .find(|(n, _)| n == name) + .map(|(_, d)| d.as_slice()) + .unwrap_or(&[]) + }; + let load = + |id: gimli::SectionId| -> Result, gimli::Error> { + Ok(gimli::EndianSlice::new( + section_data(id.name()), + LittleEndian, + )) + }; + let dwarf = gimli::Dwarf::load(load).expect("load output dwarf"); + let mut units = dwarf.units(); + let header = units.next().expect("units iter").expect("exactly one unit"); + let unit = dwarf.unit(header).expect("parse unit"); + let mut entries = unit.entries(); + let mut low_pc = None; + while let Some((_, entry)) = entries.next_dfs().expect("dfs walk") { + if entry.tag() == constants::DW_TAG_subprogram + && let Some(gimli::AttributeValue::Addr(a)) = entry + .attr_value(constants::DW_AT_low_pc) + .expect("read low_pc attr") + { + low_pc = Some(a); + } + } + assert_eq!( + low_pc, + Some(0x200), + "low_pc must be remapped from input 0x10 to output 0x200" + ); + } + + /// The parallel-walk core: with an identity rewrite (output bytes == + /// input bytes) the recovered remap must be an identity on the first + /// instruction of every function — proving the layout parsing and + /// instruction-stream zipping line up. + #[test] + fn build_remap_from_parts_identity_walk() { + let wat = r#"(module + (func (param i32) (result i32) local.get 0 i32.const 1 i32.add) + (func (result i32) i32.const 42))"#; + let bytes = wat::parse_str(wat).expect("assemble wat"); + + // No imports; merged order matches input defined order. 
+ let pairs = [(0usize, 0u32), (1usize, 1u32)]; + let remap = + build_remap_from_parts(&bytes, 0, &bytes, &pairs).expect("build identity remap"); + assert_eq!(remap.len(), 2); + + let layouts = module_function_layouts(&bytes).expect("layouts"); + assert_eq!(layouts.len(), 2); + for l in &layouts { + let first_instr = l.body_start + l.locals_prefix_len; + assert_eq!( + remap.translate(first_instr), + Some(first_instr), + "identity rewrite must map an address to itself" + ); + } + } + + /// A layout mismatch (output has more operators than input — what a + /// rewriter that inserted instructions would produce) must abort the + /// remap rather than emit a misaligned address. + #[test] + fn build_remap_from_parts_aborts_on_operator_count_mismatch() { + let input = wat::parse_str("(module (func (result i32) i32.const 1))").expect("input"); + let output = wat::parse_str("(module (func (result i32) i32.const 1 drop i32.const 2))") + .expect("output"); + let pairs = [(0usize, 0u32)]; + assert!( + build_remap_from_parts(&input, 0, &output, &pairs).is_none(), + "operator-count mismatch must yield None (fall back to strip)" + ); + } } diff --git a/meld-core/src/lib.rs b/meld-core/src/lib.rs index 5935249..d91563e 100644 --- a/meld-core/src/lib.rs +++ b/meld-core/src/lib.rs @@ -203,6 +203,17 @@ pub enum DwarfHandling { /// offsets and will be wrong against the merged code section. /// Use only if the consumer can tolerate or detect that. PassThrough, + + /// Remap DWARF code addresses to the fused code section (#143). + /// + /// Reads the input `.debug_*` sections, translates every code + /// address through an [`crate::dwarf::AddressRemap`] built from the + /// actual input→output instruction layout, and emits a single + /// rewritten DWARF set. 
Currently supports the case where exactly + /// one input core module carries DWARF; with zero or more than one + /// DWARF source — or if any address fails to map — it falls back to + /// [`DwarfHandling::Strip`] (never emitting a wrong address). + Remap, } /// Statistics about the fusion process @@ -431,7 +442,33 @@ impl Fuser { // to the bytes-without-extras, so consumers strip both // sections before verifying. log::info!("Encoding fused module"); - let output_without_extras = self.encode_output(&merged, &adapters, &[])?; + // Pass A: encode without DWARF and without meld-extras. Its + // code-section offsets are what the DWARF remap targets (trailing + // custom sections do not shift code offsets, so the same offsets + // hold in passes B and C). + let bytes_for_remap = self.encode_output(&merged, &adapters, &[], &[])?; + + // Build the remapped `.debug_*` sections (only under Remap; a + // miss or unsupported shape returns no sections → DWARF stripped). + let dwarf_sections: Vec<(String, Vec)> = if self.config.dwarf_handling + == DwarfHandling::Remap + { + dwarf::remap_for_output(&self.components, &merged, &bytes_for_remap).unwrap_or_default() + } else { + Vec::new() + }; + + // Pass B: re-encode with the remapped DWARF embedded. These are + // the bytes the attestation/provenance hashes cover. + let output_without_extras = if dwarf_sections.is_empty() { + bytes_for_remap + } else { + log::info!( + "Embedding {} remapped DWARF section(s)", + dwarf_sections.len() + ); + self.encode_output(&merged, &adapters, &[], &dwarf_sections)? + }; let mut extra_sections: Vec<(&str, Vec)> = Vec::new(); @@ -454,7 +491,7 @@ impl Fuser { let output = if extra_sections.is_empty() { output_without_extras } else { - self.encode_output(&merged, &adapters, &extra_sections)? + self.encode_output(&merged, &adapters, &extra_sections, &dwarf_sections)? 
}; // Optionally wrap the fused core module as a P2 component @@ -1295,6 +1332,7 @@ impl Fuser { merged: &MergedModule, adapters: &[adapter::AdapterFunction], extra_custom_sections: &[(&str, Vec)], + dwarf_sections: &[(String, Vec)], ) -> Result> { let mut module = EncodedModule::new(); @@ -1422,7 +1460,11 @@ impl Fuser { if !self.config.preserve_names && name == "name" { continue; } - if self.config.dwarf_handling == DwarfHandling::Strip && name.starts_with(".debug_") + // Only PassThrough emits raw per-input `.debug_*` + // sections. Strip drops them; Remap drops them here and + // emits a single remapped set below. + if self.config.dwarf_handling != DwarfHandling::PassThrough + && name.starts_with(".debug_") { continue; } @@ -1433,6 +1475,19 @@ impl Fuser { } } + // Remapped DWARF (DwarfHandling::Remap): a single `.debug_*` set + // whose code addresses target the fused code section, replacing + // the per-input sections skipped above. Emitted before the + // meld-metadata extras so they sit at a stable byte offset + // across the encode passes (the attestation/provenance hash + // covers these bytes; the extras are stripped before verifying). 
+ for (name, contents) in dwarf_sections { + module.section(&wasm_encoder::CustomSection { + name: std::borrow::Cow::Borrowed(name), + data: std::borrow::Cow::Borrowed(contents), + }); + } + for (name, contents) in extra_custom_sections { module.section(&wasm_encoder::CustomSection { name: std::borrow::Cow::Borrowed(*name), @@ -1639,6 +1694,7 @@ impl Fuser { match self.config.dwarf_handling { DwarfHandling::Strip => "strip", DwarfHandling::PassThrough => "passthrough", + DwarfHandling::Remap => "remap", } } diff --git a/meld-core/tests/dwarf_passthrough.rs b/meld-core/tests/dwarf_passthrough.rs index fe464ff..6c7f9bb 100644 --- a/meld-core/tests/dwarf_passthrough.rs +++ b/meld-core/tests/dwarf_passthrough.rs @@ -179,6 +179,29 @@ fn fuse_passthrough(input: &[u8]) -> Vec { fuser.fuse().expect("fuse") } +/// Build a fuser exercising the **Remap** DWARF policy (#143 Phase 2 +/// inc 3b). When exactly one source core module carries DWARF its +/// addresses are remapped to the fused code section; with more than one +/// DWARF source (the `lists.wasm` fixture case) merging independent +/// unit sets is not yet supported, so it falls back to stripping. 
+fn fuse_remap(input: &[u8]) -> Vec { + let mut fuser = Fuser::new(FuserConfig { + memory_strategy: MemoryStrategy::MultiMemory, + attestation: false, + component_provenance: false, + address_rebasing: false, + preserve_names: false, + custom_sections: CustomSectionHandling::Merge, + output_format: OutputFormat::CoreModule, + opaque_resources: Vec::new(), + dwarf_handling: DwarfHandling::Remap, + }); + fuser + .add_component_named(input, Some("dwarf-fixture")) + .expect("add_component"); + fuser.fuse().expect("fuse") +} + fn fuse_with_drop(input: &[u8]) -> Vec { let mut fuser = Fuser::new(FuserConfig { memory_strategy: MemoryStrategy::MultiMemory, @@ -372,6 +395,45 @@ fn dwarf_addresses_in_fused_output_are_known_to_be_wrong() { ); } +#[test] +fn remap_policy_falls_back_to_strip_on_multi_dwarf_source() { + // `lists.wasm` embeds more than one core module carrying DWARF (two + // `.debug_info` sections — verified at fixture-selection time). + // Merging independent DWARF unit sets into one consistent + // `.debug_info` against the fused code section is deferred to a + // later increment; the honest behaviour is to strip rather than + // emit one source's addresses (wrong for the other). This pins that + // fallback: Remap on a multi-source fixture yields NO `.debug_*`. + // + // The single-source happy path (addresses actually remapped) is + // covered mechanically by the `dwarf` module unit tests: + // `ls_d_1_remap_translates_low_pc` (full gimli + // read→convert→write→read round-trip) and + // `build_remap_from_parts_identity_walk` (remap built from real + // wasm bytes). + if !fixture_available() { + return; + } + let bytes = std::fs::read(DEBUG_INFO_FIXTURE).expect("read fixture"); + + // Precondition: the fixture really is multi-DWARF-source. + let input_dwarf = count_dwarf_sections_recursive(&bytes); + assert!( + input_dwarf.get(".debug_info").copied().unwrap_or(0) > 1, + "this test assumes a multi-DWARF-source fixture; saw {input_dwarf:?}. 
\ + If the fixture became single-source, it should now exercise the \ + remap happy path — assert remapped `.debug_*` are present instead." + ); + + let fused = fuse_remap(&bytes); + let counts = count_dwarf_sections_at_top_level(&fused); + assert!( + counts.is_empty(), + "Remap must fall back to stripping when >1 source module carries \ + DWARF (never emit wrong addresses). Saw: {counts:?}" + ); +} + fn code_section_len(bytes: &[u8]) -> Option { let parser = wasmparser::Parser::new(0); for payload in parser.parse_all(bytes) { diff --git a/safety/stpa/loss-scenarios.yaml b/safety/stpa/loss-scenarios.yaml index 5b31d15..52a9164 100644 --- a/safety/stpa/loss-scenarios.yaml +++ b/safety/stpa/loss-scenarios.yaml @@ -2595,3 +2595,83 @@ loss-scenarios: operand-length changes) is NOT yet handled — that needs the rewriter instruction-offset map (#143 increment 2) and the gimli DWARF rewrite (increment 3). + + - id: LS-D-1 + title: remapped DWARF emits a wrong code address for a fused function + uca: UCA-M-9 + hazards: [H-1] + type: inadequate-control-algorithm + scenario: > + `DwarfHandling::Remap` (#143 Phase 2 increment 3b) reads an input + core module's `.debug_*` sections and rewrites every code address + to the fused code section via `dwarf::AddressRemap`, then emits a + single remapped DWARF set with `gimli::write::Dwarf::from`. If the + remap returns a *wrong* output address for any + `DW_AT_low_pc`/line-program/range address — rather than failing — + the fused module ships DWARF that resolves an instruction to the + wrong source line. A downstream consumer (`pulseengine/witness` + MC/DC mapping, or a debugger) then attributes coverage/breakpoints + to the wrong source location, silently de-grounding any + witness/sigil-attested decision keyed to that line [H-1]. As with + LS-M-6 the hazard is invisible at runtime: the fused module still + executes correctly; only the debug paper-trail lies. 
+ causal-factors: + - >- + The remap reconciles three offset spaces (input code-section + address, instruction-stream offset, output code-section + address). A base-alignment error — e.g. forgetting that the + locals prefix cancels, or mis-deriving a function's input span + — would shift every translated address within that function. + - >- + The instruction offset map is recovered by walking the input + and final-output operator streams in lockstep, assuming a 1:1 + operator correspondence. If the rewriter ever *inserts* + instructions (memory address-rebasing scratch ops) the streams + desynchronise and a naive zip would pair the wrong operators. + - >- + `gimli::write::Dwarf::from` is all-or-nothing: a `convert_address` + that returns a plausible-but-wrong `Some(addr)` for an unmapped + address (instead of `None`) would bake a wrong address into the + output rather than aborting. + process-model-flaw: > + The tempting model is "translate what you can, drop the rest." + But gimli cannot drop a single address — any unmapped address + fails the whole conversion. The safe model is therefore + correct-or-strip: every real instruction address must map + exactly, the only special-cased address is the code-section base + 0 (structurally invariant), and any other miss aborts the + conversion so the caller strips DWARF rather than emit a wrong + address. + status: approved + priority: high + fix: > + Defense-in-depth, correct-or-strip: + (1) The operator-stream walk aborts the remap (returns `None` → + caller strips) on any per-function operator-count or + locals-prefix mismatch, so an inserted-instruction rewrite can + never produce a misaligned map + (`dwarf::tests::build_remap_from_parts_aborts_on_operator_count_mismatch`). 
+ (2) `convert_address` maps only address 0 (code-section base) to + itself and routes everything else through the instruction-accurate + remap; an unmapped real address returns `None`, which fails + `gimli::write::Dwarf::from` and triggers the strip fallback — + never a wrong address. + (3) Multi-DWARF-source inputs (where merging independent unit sets + is not yet supported) fall back to strip with a warning rather + than emitting one source's addresses against the merged code + section + (`tests::dwarf_passthrough::remap_policy_falls_back_to_strip_on_multi_dwarf_source`). + The address-translation correctness oracle is + `meld-core::dwarf::tests::ls_d_1_remap_translates_low_pc`: it + builds real input DWARF with gimli, remaps a subprogram's + `low_pc`, then re-parses the *output* DWARF and asserts the + address was actually translated (full read→convert→write→read + round-trip). The remap math itself is pinned by the six + `dwarf::tests::translate_*` unit tests (increment 3a). + Residual: `DW_AT_high_pc` encoded as a *length* (`DW_FORM_data*`, + the common Rust/LLVM encoding) is copied verbatim — gimli treats + it as a constant, not an address — so the function's reported + byte *length* may be off by the intra-function LEB drift. The + start address (`low_pc`) and line-number program — what debuggers + and witness actually use — are correct. Eliminating the high_pc + length drift is deferred to a later increment.