diff --git a/CHANGELOG.md b/CHANGELOG.md index 8992b1e..75a0fe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,28 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- **`component-provenance` section v2: per-function code-byte ranges** + (#143 DWARF Phase 2 increment 1, LS-M-6, `meld-core/src/provenance.rs`). + Each entry gains an optional `code_range { start, end }` giving the + function body's byte span in the fused code section, rebased to the + code-section content start (the WebAssembly-DWARF address + convention). This is the anchor for DWARF address remapping. New + `provenance::code_section_function_ranges` re-parses the output code + section and index-aligns bodies with `merged.functions`. The bump + to `VERSION = 2` is **additive**: `code_range` is + `#[serde(default, skip_serializing_if = "Option::is_none")]`, so v1 + consumers that check `version` first still parse the entries and + v1-shaped payloads (no `code_range` key) round-trip unchanged. 5 new + unit tests + 1 integration test pin range ordering, non-overlap, + the no-code-section path, the rebasing cross-check, and v1/v2 + backward-compat. **Scope**: this delivers accurate *current* byte + spans; DWARF `.debug_line` remapping inside rewritten functions + (meld's rewriter shifts intra-function offsets via LEB128 + operand-length changes) is deferred to #143 increment 2 (rewriter + instruction-offset map) + increment 3 (gimli DWARF rewrite). + ### Changed - **LS-M-5 status corrected to `fixed`** (`safety/stpa/loss-scenarios.yaml`, diff --git a/meld-core/src/provenance.rs b/meld-core/src/provenance.rs index 4f8a348..2066277 100644 --- a/meld-core/src/provenance.rs +++ b/meld-core/src/provenance.rs @@ -42,9 +42,38 @@ use sha2::{Digest, Sha256}; /// arbitrary custom-section names; consumers identify by this string. pub const SECTION_NAME: &str = "component-provenance"; -/// Current section format version. 
Bumped on incompatible payload -/// changes; consumers MUST check `version` before parsing the rest. -pub const VERSION: u32 = 1; +/// Current section format version. +/// +/// - **v1**: `{ fused_func_idx, component_id, originating_func_idx }` +/// per entry (issue #192). +/// - **v2** (DWARF Phase 2, issue #143): adds an optional +/// [`Entry::code_range`] giving the function body's byte span in +/// the fused module's code section. The field is the anchor for +/// DWARF address remapping. v1 consumers that check `version` +/// first will see `2` and can either upgrade or ignore the new +/// field (serde deserialization tolerates its absence via +/// `#[serde(default)]`, and its presence is additive — no v1 key +/// changed shape). +/// +/// Consumers MUST check `version` before relying on `code_range`. +pub const VERSION: u32 = 2; + +/// Byte span of a function body in the fused module's code section. +/// +/// Offsets are **relative to the start of the code section's +/// contents** (the first byte of the function-count LEB, right after +/// the section's id and size — `wasmparser::Payload::CodeSectionStart.range.start`), +/// matching the WebAssembly-DWARF "code section relative" address +/// convention. `start` is the first byte of the function body +/// (its locals-declaration vector), `end` is one past its last +/// instruction byte — i.e. the half-open span `[start, end)` that +/// `wasmparser::FunctionBody::range()` reports, rebased to the +/// section content start. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct CodeRange { + pub start: u32, + pub end: u32, +} /// One entry per defined function in the fused Core Wasm module. /// @@ -65,6 +94,13 @@ pub struct Entry { /// view (the `function_idx` field of /// `MergedFunction.origin: (comp_idx, mod_idx, func_idx)`). pub originating_func_idx: u32, + /// v2: byte span of this function's body in the fused code + /// section (see [`CodeRange`]). 
`None` when the code-offset map + /// could not be built (e.g. the output had no code section). + /// Serialized only when present so v1-shaped entries round-trip + /// unchanged. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub code_range: Option<CodeRange>, } /// Decoded provenance section. Constructed by [`build`] at fusion @@ -103,17 +139,59 @@ pub fn sha256_hex(bytes: &[u8]) -> String { hex::encode(out) } +/// Extract the code-section-relative byte span of every defined +/// function body in `module_bytes`, in code-section order. +/// +/// Returns one [`CodeRange`] per defined function body, with offsets +/// rebased to the code section content start (see [`CodeRange`]). +/// Empty if the module has no code section. The i-th entry +/// corresponds to the i-th defined function in code-section order: +/// meld writes `merged.functions` first, then adapter trampolines +/// (`meld-core/src/lib.rs` `encode_output`), so index `i` aligns +/// with `merged.functions[i]` for `i < merged.functions.len()`. +pub fn code_section_function_ranges(module_bytes: &[u8]) -> Vec<CodeRange> { + let mut ranges = Vec::new(); + let mut content_start: Option<usize> = None; + let parser = wasmparser::Parser::new(0); + for payload in parser.parse_all(module_bytes) { + match payload { + Ok(wasmparser::Payload::CodeSectionStart { range, .. }) => { + content_start = Some(range.start); + } + Ok(wasmparser::Payload::CodeSectionEntry(body)) => { + let base = content_start.unwrap_or(0); + let r = body.range(); + ranges.push(CodeRange { + start: (r.start - base) as u32, + end: (r.end - base) as u32, + }); + } + Ok(_) => {} + Err(_) => break, + } + } + ranges +} + /// Build a [`ComponentProvenance`] from the merged module + the /// component slice. The hash binds the section to the fused module /// without the section (call this with bytes that don't yet include /// the `component-provenance` or `wsc.transformation.attestation` /// custom sections — see the module-level note). 
+/// +/// v2 (#143): each entry's [`Entry::code_range`] is populated from +/// the code section of `fused_bytes_without_extras` by index-order +/// alignment with `merged.functions`. If the parsed code section has +/// fewer bodies than `merged.functions` (should not happen for a +/// well-formed output), the missing entries get `code_range: None` +/// rather than a wrong span. pub fn build( merged: &crate::merger::MergedModule, components: &[crate::parser::ParsedComponent], fused_bytes_without_extras: &[u8], ) -> ComponentProvenance { let import_count = merged.import_counts.func; + let ranges = code_section_function_ranges(fused_bytes_without_extras); let entries: Vec<Entry> = merged .functions .iter() @@ -128,6 +206,7 @@ pub fn build( fused_func_idx: import_count + defined_idx as u32, component_id, originating_func_idx: func_idx, + code_range: ranges.get(defined_idx).copied(), } }) .collect(); @@ -153,11 +232,16 @@ mod tests { fused_func_idx: 0, component_id: "auth".into(), originating_func_idx: 3, + code_range: Some(CodeRange { start: 0, end: 42 }), }, Entry { fused_func_idx: 1, component_id: "db".into(), originating_func_idx: 7, + code_range: Some(CodeRange { + start: 42, + end: 100, + }), }, ], }; @@ -166,6 +250,44 @@ mod tests { assert_eq!(original, decoded); } + #[test] + fn v1_shaped_entry_deserializes_with_none_code_range() { + // A v1 producer emits entries without `code_range`. The v2 + // Entry struct must still deserialize them (serde default), + // yielding `None`. This pins backward-compat so a v2 meld can + // read a v1 section and a v2 consumer tolerates v1 entries. 
+ let v1_json = br#"{"version":1,"fused_module_sha256":"00","entries":[ + {"fused_func_idx":0,"component_id":"auth","originating_func_idx":3} + ]}"#; + let decoded = ComponentProvenance::from_bytes(v1_json).expect("deserialize v1"); + assert_eq!(decoded.entries.len(), 1); + assert_eq!(decoded.entries[0].code_range, None); + } + + #[test] + fn code_range_omitted_from_json_when_none() { + // v1-shaped round-trip: an entry with no code_range must not + // emit a `code_range` key (skip_serializing_if), so a v2 meld + // producing a None entry is byte-compatible with v1 readers. + let cp = ComponentProvenance { + version: VERSION, + fused_module_sha256: "0".repeat(64), + entries: vec![Entry { + fused_func_idx: 0, + component_id: "x".into(), + originating_func_idx: 0, + code_range: None, + }], + }; + let json: serde_json::Value = + serde_json::from_slice(&cp.to_bytes().expect("serialize")).expect("parse json"); + assert!( + json["entries"][0].get("code_range").is_none(), + "code_range must be omitted when None; got {}", + json["entries"][0] + ); + } + #[test] fn from_bytes_rejects_malformed_json() { assert!(ComponentProvenance::from_bytes(b"{not json}").is_err()); @@ -214,4 +336,85 @@ mod tests { "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9" ); } + + #[test] + fn code_section_ranges_are_ordered_nonoverlapping_one_per_function() { + // Two functions of clearly different body sizes so the spans + // are distinguishable. wat → wasm gives a real code section. + let wasm = wat::parse_str( + r#"(module + (func (result i32) i32.const 1) + (func (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add))"#, + ) + .expect("wat parse"); + + let ranges = code_section_function_ranges(&wasm); + assert_eq!(ranges.len(), 2, "expected one range per defined function"); + + // Each span is non-empty and ordered, and consecutive spans + // do not overlap (code-section bodies are laid out in order). 
+ assert!(ranges[0].start < ranges[0].end); + assert!(ranges[1].start < ranges[1].end); + assert!( + ranges[0].end <= ranges[1].start, + "function bodies must not overlap: {ranges:?}" + ); + // The first body does NOT begin at rebased offset 0: the base + // is `CodeSectionStart.range.start`, which points at the + // section's count LEB. The first body content (locals + code, + // per wasmparser's `FunctionBody::range()`) therefore starts a + // few bytes in — past the count LEB and the body's own size + // prefix. Assert it's small but non-zero rather than pinning + // the exact constant (the precise base is cross-checked in + // `code_section_ranges_rebased_to_content_start`). + assert!( + ranges[0].start > 0 && ranges[0].start < 16, + "first body should start just past the count + size prefix; got {}", + ranges[0].start + ); + } + + #[test] + fn code_section_ranges_empty_when_no_code_section() { + // A module with only a type section (no functions) yields no + // ranges — the `None` path for Entry::code_range. + let wasm = wat::parse_str(r#"(module (type (func)))"#).expect("wat parse"); + let ranges = code_section_function_ranges(&wasm); + assert!( + ranges.is_empty(), + "no code section ⇒ no ranges; got {ranges:?}" + ); + } + + #[test] + fn code_section_ranges_rebased_to_content_start() { + // Cross-check the rebasing: independently re-parse the module, + // capture the code-section content start and each body's + // absolute range, and confirm code_section_function_ranges + // reports exactly (absolute - content_start). + let wasm = wat::parse_str(r#"(module (func nop) (func (result i32) i32.const 7))"#) + .expect("wat parse"); + + let mut content_start = None; + let mut expected = Vec::new(); + for payload in wasmparser::Parser::new(0).parse_all(&wasm) { + match payload.expect("parse") { + wasmparser::Payload::CodeSectionStart { range, .. 
} => { + content_start = Some(range.start); + } + wasmparser::Payload::CodeSectionEntry(body) => { + let base = content_start.expect("code section started"); + let r = body.range(); + expected.push(CodeRange { + start: (r.start - base) as u32, + end: (r.end - base) as u32, + }); + } + _ => {} + } + } + + assert_eq!(code_section_function_ranges(&wasm), expected); + } } diff --git a/meld-core/tests/component_provenance.rs b/meld-core/tests/component_provenance.rs index 61f3428..0cae681 100644 --- a/meld-core/tests/component_provenance.rs +++ b/meld-core/tests/component_provenance.rs @@ -152,6 +152,46 @@ fn component_provenance_round_trips() { ); } +#[test] +fn v2_code_ranges_are_populated_ordered_and_nonoverlapping() { + // DWARF Phase 2 increment 1: every entry should carry a + // `code_range`, the spans should be ordered by fused_func_idx and + // non-overlapping (function bodies are laid out sequentially in + // the code section). This is the anchor downstream DWARF + // remapping (increment 3) builds on, so the contract is pinned + // end-to-end against a real fused module. + if !fixture_available() { + return; + } + let bytes = std::fs::read(FIXTURE).expect("read fixture"); + let fused = fuse_default(&bytes, "auth"); + + let payloads = read_custom_sections(&fused, SECTION_NAME); + let payload = payloads.first().expect("section present"); + let prov = ComponentProvenance::from_bytes(payload).expect("decode JSON"); + + // Entries are emitted in defined-function order; sort by + // fused_func_idx to be robust, then check each range is valid and + // the sequence is non-overlapping. 
+ let mut entries = prov.entries.clone(); + entries.sort_by_key(|e| e.fused_func_idx); + + let mut prev_end: Option<u32> = None; + for e in &entries { + let r = e + .code_range + .unwrap_or_else(|| panic!("v2 entry missing code_range: {e:?}")); + assert!(r.start < r.end, "empty/inverted code_range: {e:?}"); + if let Some(pe) = prev_end { + assert!( + pe <= r.start, + "code ranges overlap or go backwards: prev_end={pe}, entry={e:?}" + ); + } + prev_end = Some(r.end); + } +} + #[test] fn every_entry_has_a_valid_back_pointer() { if !fixture_available() { diff --git a/safety/stpa/loss-scenarios.yaml b/safety/stpa/loss-scenarios.yaml index ac9b40b..5b31d15 100644 --- a/safety/stpa/loss-scenarios.yaml +++ b/safety/stpa/loss-scenarios.yaml @@ -2567,3 +2567,31 @@ loss-scenarios: verification recipe). Plus 5 unit tests pin the JSON round-trip, version-field stability, and SHA-256 helper canonical value. + ### + ### v2 (DWARF Phase 2 increment 1, #143) + ### + The section gained an optional `code_range { start, end }` per + entry — the function body's byte span in the fused code + section (code-section-relative, the WebAssembly-DWARF address + convention). This is a new mis-attribution surface: a wrong + `code_range` would point DWARF address remapping (and any scry + invariant keyed to it) at the wrong bytes. The range is derived + by `provenance::code_section_function_ranges` re-parsing the + output code section and index-aligning bodies with + `merged.functions` (meld emits merged functions before adapter + trampolines, so position `i` maps to `merged.functions[i]`). + Pinned by `provenance::tests::code_section_ranges_*` (ordering, + non-overlap, empty-when-no-code-section, and an independent + rebasing cross-check) plus the integration test + `component_provenance::v2_code_ranges_are_populated_ordered_and_nonoverlapping` + against a real fused fixture. 
v1→v2 is additive: `code_range` + is `#[serde(default, skip_serializing_if)]` so v1-shaped + entries round-trip unchanged + (`v1_shaped_entry_deserializes_with_none_code_range`, + `code_range_omitted_from_json_when_none`). Residual: the byte + ranges are accurate for the *current* fused output, but DWARF + line-table remapping inside rewritten function bodies (where + meld's rewriter shifts intra-function offsets via LEB128 + operand-length changes) is NOT yet handled — that needs the + rewriter instruction-offset map (#143 increment 2) and the + gimli DWARF rewrite (increment 3).