|
6 | 6 |
|
7 | 7 | use crate::{Error, Result}; |
8 | 8 | use std::collections::HashMap; |
9 | | -use wasm_encoder::{BlockType, Function, Instruction, MemArg}; |
| 9 | +use wasm_encoder::{BlockType, Encode, Function, Instruction, MemArg}; |
10 | 10 | use wasmparser::{ |
11 | 11 | BlockType as WpBlockType, FunctionBody, MemArg as WpMemArg, Operator, OperatorsReader, |
12 | 12 | }; |
13 | 13 |
|
/// One instruction-boundary offset pair (DWARF Phase 2 inc 2, #143).
///
/// `old` and `new` are byte offsets relative to the start of their
/// respective function body's **instruction stream** (the byte
/// immediately after the locals-declaration vector). The rewriter
/// changes operand values (function/global/etc. indices) whose
/// LEB128 encodings can change length, so `new` drifts away from
/// `old` at every instruction past the first length change.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InstrOffset {
    /// Offset of the operator within the input instruction stream.
    pub old: u32,
    /// Offset of the first instruction emitted for that operator within
    /// the rewritten instruction stream.
    pub new: u32,
}

/// Per-function instruction offset map (DWARF Phase 2 inc 2, #143).
///
/// One [`InstrOffset`] per input operator, in stream order. When an
/// input operator rewrites to multiple output instructions, `new` is
/// the offset of the **first** emitted instruction — the address
/// DWARF line-number programs attribute to that source operator.
/// Increment 3 composes these intra-function offsets with the
/// per-function base from the component-provenance v2 `code_range`
/// to translate DWARF code addresses from input to fused output.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct InstrOffsetMap {
    /// Recorded boundaries in stream order, i.e. strictly ascending by
    /// `old`. [`InstrOffsetMap::translate`] relies on this ordering, so
    /// hand-built maps must keep their entries sorted by `old`.
    pub entries: Vec<InstrOffset>,
}

impl InstrOffsetMap {
    /// Translate an old intra-function instruction-stream offset to
    /// the corresponding new offset.
    ///
    /// Returns `None` if `old` does not fall on a recorded instruction
    /// boundary (DWARF addresses always point at instruction starts, so
    /// a miss signals either a malformed address or an offset past the
    /// function end). An empty map always yields `None`.
    ///
    /// The lookup is a binary search over `entries`, which are kept in
    /// ascending `old` order, so translating every address of a
    /// function costs O(m log n) rather than O(m · n).
    #[must_use]
    pub fn translate(&self, old: u32) -> Option<u32> {
        self.entries
            .binary_search_by_key(&old, |entry| entry.old)
            .ok()
            .map(|idx| self.entries[idx].new)
    }
}
| 52 | + |
14 | 53 | /// Index mappings for rewriting |
15 | 54 | #[derive(Debug, Clone, Default)] |
16 | 55 | pub struct IndexMaps { |
@@ -82,6 +121,32 @@ pub fn rewrite_function_body( |
82 | 121 | param_count: u32, |
83 | 122 | maps: &IndexMaps, |
84 | 123 | ) -> Result<Function> { |
| 124 | + Ok(rewrite_function_body_core(body, param_count, maps, false)?.0) |
| 125 | +} |
| 126 | + |
| 127 | +/// Like [`rewrite_function_body`], but also returns the per-function |
| 128 | +/// [`InstrOffsetMap`] mapping each input operator's instruction-stream |
| 129 | +/// byte offset to its rewritten offset (DWARF Phase 2 inc 2, #143). |
| 130 | +/// |
| 131 | +/// The offset map is collected by measuring each emitted instruction's |
| 132 | +/// encoded length — a hot-path cost, so this entry point is only used |
| 133 | +/// when DWARF address remapping is requested. The plain |
| 134 | +/// [`rewrite_function_body`] path pays nothing. |
| 135 | +pub fn rewrite_function_body_with_offsets( |
| 136 | + body: &FunctionBody<'_>, |
| 137 | + param_count: u32, |
| 138 | + maps: &IndexMaps, |
| 139 | +) -> Result<(Function, InstrOffsetMap)> { |
| 140 | + let (func, map) = rewrite_function_body_core(body, param_count, maps, true)?; |
| 141 | + Ok((func, map.unwrap_or_default())) |
| 142 | +} |
| 143 | + |
| 144 | +fn rewrite_function_body_core( |
| 145 | + body: &FunctionBody<'_>, |
| 146 | + param_count: u32, |
| 147 | + maps: &IndexMaps, |
| 148 | + collect_offsets: bool, |
| 149 | +) -> Result<(Function, Option<InstrOffsetMap>)> { |
85 | 150 | let locals_reader = body.get_locals_reader()?; |
86 | 151 |
|
87 | 152 | // Collect locals |
@@ -137,25 +202,55 @@ pub fn rewrite_function_body( |
137 | 202 |
|
138 | 203 | // Get operators and rewrite them |
139 | 204 | let ops_reader = body.get_operators_reader()?; |
140 | | - rewrite_operators(ops_reader, &maps, &mut func)?; |
| 205 | + let offset_map = rewrite_operators(ops_reader, &maps, &mut func, collect_offsets)?; |
141 | 206 |
|
142 | | - Ok(func) |
| 207 | + Ok((func, offset_map)) |
143 | 208 | } |
144 | 209 |
|
145 | | -/// Rewrite operators in a function body |
| 210 | +/// Rewrite operators in a function body, optionally collecting an |
| 211 | +/// [`InstrOffsetMap`]. |
| 212 | +/// |
| 213 | +/// When `collect_offsets` is true, the new offset is accumulated by |
| 214 | +/// measuring each emitted instruction's encoded byte length — |
| 215 | +/// identical to what `Function::instruction` appends, since both |
| 216 | +/// route through `wasm_encoder::Encode`. The map records, per input |
| 217 | +/// operator, the (old, new) instruction-stream offsets; for an |
| 218 | +/// operator that expands to several instructions the `new` offset is |
| 219 | +/// captured *before* the group is emitted (the first instruction's |
| 220 | +/// position). When false, no measurement happens and `None` is |
| 221 | +/// returned — the zero-cost default path. |
146 | 222 | fn rewrite_operators( |
147 | 223 | reader: OperatorsReader<'_>, |
148 | 224 | maps: &IndexMaps, |
149 | 225 | func: &mut Function, |
150 | | -) -> Result<()> { |
151 | | - for op in reader { |
152 | | - let op = op?; |
| 226 | + collect_offsets: bool, |
| 227 | +) -> Result<Option<InstrOffsetMap>> { |
| 228 | + let mut map = collect_offsets.then(InstrOffsetMap::default); |
| 229 | + let mut new_offset: u32 = 0; |
| 230 | + let mut base_old: Option<usize> = None; |
| 231 | + |
| 232 | + for res in reader.into_iter_with_offsets() { |
| 233 | + let (op, old_pos) = res?; |
153 | 234 | let instrs = rewrite_operator(op, maps)?; |
154 | | - for instr in instrs { |
155 | | - func.instruction(&instr); |
| 235 | + if let Some(m) = map.as_mut() { |
| 236 | + // First operator's absolute position defines the |
| 237 | + // instruction-stream base; subsequent offsets are relative. |
| 238 | + let base = *base_old.get_or_insert(old_pos); |
| 239 | + m.entries.push(InstrOffset { |
| 240 | + old: (old_pos - base) as u32, |
| 241 | + new: new_offset, |
| 242 | + }); |
| 243 | + } |
| 244 | + for instr in &instrs { |
| 245 | + if map.is_some() { |
| 246 | + let mut buf = Vec::new(); |
| 247 | + instr.encode(&mut buf); |
| 248 | + new_offset = new_offset.saturating_add(buf.len() as u32); |
| 249 | + } |
| 250 | + func.instruction(instr); |
156 | 251 | } |
157 | 252 | } |
158 | | - Ok(()) |
| 253 | + Ok(map) |
159 | 254 | } |
160 | 255 |
|
161 | 256 | /// Convert a wasmparser operator to wasm-encoder instruction with index remapping |
@@ -1220,4 +1315,152 @@ mod tests { |
1220 | 1315 | ); |
1221 | 1316 | } |
1222 | 1317 | } |
| 1318 | + |
| 1319 | + // ─── DWARF Phase 2 inc 2: instruction offset map (#143) ─────────── |
| 1320 | + |
| 1321 | + /// Run `rewrite_function_body_with_offsets` on function `func_idx` |
| 1322 | + /// of a wat module and return the collected offset map. |
| 1323 | + fn offsets_for_wat_func(wat_src: &str, func_idx: usize, maps: &IndexMaps) -> InstrOffsetMap { |
| 1324 | + let wasm = wat::parse_str(wat_src).expect("wat parse"); |
| 1325 | + let mut idx = 0; |
| 1326 | + for payload in wasmparser::Parser::new(0).parse_all(&wasm) { |
| 1327 | + if let wasmparser::Payload::CodeSectionEntry(body) = payload.expect("payload") { |
| 1328 | + if idx == func_idx { |
| 1329 | + let (_func, map) = rewrite_function_body_with_offsets(&body, 0, maps) |
| 1330 | + .expect("rewrite with offsets"); |
| 1331 | + return map; |
| 1332 | + } |
| 1333 | + idx += 1; |
| 1334 | + } |
| 1335 | + } |
| 1336 | + panic!("function index {func_idx} not found in code section"); |
| 1337 | + } |
| 1338 | + |
/// Remapping `call 0` to `call 200` widens each call's operand LEB
/// from one byte to two. The recorded `new` offsets must therefore
/// run ahead of `old` by the growth accumulated so far: +1 once the
/// first remapped call has been emitted, +2 once the second has.
/// DWARF line-table remapping (inc 3) relies on exactly this.
#[test]
fn instr_offset_map_tracks_leb_growth_from_index_remap() {
    let mut maps = IndexMaps::new();
    maps.functions.insert(0, 200); // 1-byte LEB → 2-byte LEB

    // func 1 body: call 0; drop; call 0; drop; i32.const 2 (+ end)
    let wat_src = r#"(module
            (func (result i32) i32.const 1)
            (func (result i32)
                call 0 drop
                call 0 drop
                i32.const 2))"#;
    let map = offsets_for_wat_func(wat_src, 1, &maps);

    // call, drop, call, drop, const, end → six operators.
    assert_eq!(map.entries.len(), 6, "one entry per operator: {map:?}");

    // Per-entry divergence (new - old) is the LEB growth accumulated
    // before that entry: nothing before the first call finishes, one
    // byte after it, two bytes after the second.
    let mut divergence: Vec<i64> = Vec::with_capacity(map.entries.len());
    for entry in &map.entries {
        divergence.push(i64::from(entry.new) - i64::from(entry.old));
    }
    assert_eq!(
        divergence,
        vec![0, 1, 1, 2, 2, 2],
        "new offsets must diverge by accumulated LEB growth: {map:?}"
    );

    // Both offset columns strictly increase from one entry to the next.
    for pair in map.entries.windows(2) {
        let (prev, next) = (pair[0], pair[1]);
        assert!(prev.old < next.old, "old offsets must increase: {map:?}");
        assert!(prev.new < next.new, "new offsets must increase: {map:?}");
    }

    // The first instruction sits at offset 0 in both streams.
    let first = map.entries[0];
    assert_eq!(first.old, 0);
    assert_eq!(first.new, 0);
}
| 1387 | + |
/// If a remap leaves every operand's LEB length untouched, the map
/// must be the identity: `new == old` at each instruction boundary.
/// Here 0→1 keeps the `call` operand at a single LEB byte.
#[test]
fn instr_offset_map_is_identity_when_no_leb_length_change() {
    let mut maps = IndexMaps::new();
    maps.functions.insert(0, 1); // both 1-byte LEBs

    let wat_src = r#"(module
            (func (result i32) i32.const 1)
            (func (result i32) call 0 drop i32.const 2))"#;
    let map = offsets_for_wat_func(wat_src, 1, &maps);

    assert!(!map.entries.is_empty());
    map.entries
        .iter()
        .for_each(|e| assert_eq!(e.old, e.new, "no LEB change ⇒ identity offsets: {e:?}"));
}
| 1410 | + |
/// `translate` returns the paired `new` offset for a recorded
/// boundary and `None` for any offset that is not an instruction
/// start.
#[test]
fn instr_offset_map_translate_hits_and_misses() {
    let at = |old, new| InstrOffset { old, new };
    let map = InstrOffsetMap {
        entries: vec![at(0, 0), at(2, 3), at(5, 7)],
    };

    // (queried old offset, expected translation)
    let cases = [
        (0, Some(0)),
        (2, Some(3)),
        (5, Some(7)),
        // Mid-instruction or out-of-range offsets are not boundaries.
        (1, None),
        (99, None),
    ];
    for (old, expected) in cases {
        assert_eq!(map.translate(old), expected);
    }
}
| 1429 | + |
/// Collecting offsets must not alter the emitted code: the `Function`
/// from `rewrite_function_body_with_offsets` has to encode to exactly
/// the same bytes as the one from plain `rewrite_function_body`.
#[test]
fn with_offsets_emits_identical_function_bytes() {
    let mut maps = IndexMaps::new();
    maps.functions.insert(0, 200);
    let src = r#"(module
            (func (result i32) i32.const 1)
            (func (result i32) call 0 drop call 0 drop i32.const 2))"#;
    let wasm = wat::parse_str(src).expect("wat");

    // Gather every function body of the module; the second is the one
    // containing the remapped calls.
    let bodies: Vec<_> = wasmparser::Parser::new(0)
        .parse_all(&wasm)
        .filter_map(|payload| match payload.expect("payload") {
            wasmparser::Payload::CodeSectionEntry(body) => Some(body),
            _ => None,
        })
        .collect();
    let target = &bodies[1];

    let plain = rewrite_function_body(target, 0, &maps).expect("plain");
    let (with_off, _map) =
        rewrite_function_body_with_offsets(target, 0, &maps).expect("with offsets");

    // Wrap a function in its own CodeSection and return the encoding.
    let section_bytes = |function: &wasm_encoder::Function| {
        let mut section = wasm_encoder::CodeSection::new();
        section.function(function);
        let mut bytes = Vec::new();
        section.encode(&mut bytes);
        bytes
    };
    assert_eq!(
        section_bytes(&plain),
        section_bytes(&with_off),
        "offset collection must not change emitted code"
    );
}
1223 | 1466 | } |
0 commit comments