Skip to content

Commit dfa0401

Browse files
def-claude
andcommitted
persist: Validate full part tiling in unflatten, not just endpoints
Trace::unflatten's check that a spine batch's reconstructed parts cover its id range only verified the endpoints: the first part starts at the batch id's lower and the last ends at its upper. A crafted rollup could encode an outer id [0, 3) tiled by non-contiguous parts [0, 2) and [1, 3): the endpoint check passes, and Trace::validate also passes because it inspects the outer SpineBatch ids and descriptions, never the adjacency of a SpineBatch's parts. That malformed trace would be accepted at decode and panic later in normal maintenance instead. fueled_merge_reqs_before_ms emits a FueledMergeReq from the accepted parts, and Compactor::chunk_runs / compact_all (plus the id-range merge path in apply_merge_res_checked) call id_range on those ids, whose adjacency check is an assert_eq!. So the hardening still converted a corrupted durable rollup into a process panic for this invariant. Validate the full tiling in unflatten: require adjacent parts to be contiguous in addition to the existing endpoint check, so non-adjacent parts fail with a decode error. Adds a decode-rejection regression test for the overlapping-parts vector alongside the existing partless-spine-batch one. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 3c01721 commit dfa0401

2 files changed

Lines changed: 41 additions & 4 deletions

File tree

src/persist-client/src/internal/encoding.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2436,7 +2436,37 @@ mod tests {
24362436
});
24372437
});
24382438
let err = rollup_decode_err(proto);
2439-
assert!(err.contains("do not cover"), "{err}");
2439+
assert!(err.contains("do not tile"), "{err}");
2440+
}
2441+
2442+
/// A spine batch whose parts hit the right endpoints but overlap in the
2443+
/// middle (id `[0, 3)` tiled by parts `[0, 2)` and `[1, 3)`) passes the
2444+
/// endpoint check but is not a valid tiling. Such parts reach compaction's
2445+
/// `id_range`, whose contiguity `assert_eq!` would panic later, so decoding
2446+
/// must reject them here instead.
2447+
#[mz_ore::test]
2448+
fn rollup_proto_with_noncontiguous_spine_parts_is_rejected() {
2449+
let outer = SpineId(0, 3);
2450+
let part_ids = [SpineId(0, 2), SpineId(1, 3)];
2451+
let proto = rollup_proto_with_trace(|trace| {
2452+
trace.spine_batches.push(ProtoIdSpineBatch {
2453+
id: Some(outer.into_proto()),
2454+
batch: Some(ProtoSpineBatch {
2455+
level: 0,
2456+
desc: Some(u64_desc_proto(0, 3, 0)),
2457+
parts: part_ids.iter().map(|id| id.into_proto()).collect(),
2458+
descs: vec![],
2459+
}),
2460+
});
2461+
for id in part_ids {
2462+
trace.hollow_batches.push(ProtoIdHollowBatch {
2463+
id: Some(id.into_proto()),
2464+
batch: Some(legacy_batch_proto(0, 1, 0)),
2465+
});
2466+
}
2467+
});
2468+
let err = rollup_decode_err(proto);
2469+
assert!(err.contains("do not tile"), "{err}");
24402470
}
24412471

24422472
/// Three spine batches at one level overflow the two-batch layer, which

src/persist-client/src/internal/trace.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,13 +451,20 @@ impl<T: Timestamp + Lattice> Trace<T> {
451451
.map(|(id, desc)| pop_batch(id, desc))
452452
.collect::<Result<Vec<_>, _>>()?;
453453
// A spine batch's parts tile its id range (`SpineBatch::id`
454-
// `debug_assert`s this). Real batches always have at least one
455-
// part: an empty batch still carries an empty hollow batch.
454+
// `debug_assert`s the endpoints). Real batches always have at least
455+
// one part: an empty batch still carries an empty hollow batch.
456+
// Validate the full tiling, not just the endpoints: downstream
457+
// maintenance (`fueled_merge_reqs_before_ms` -> `id_range` in
458+
// compaction, `apply_merge_res_checked`) `assert_eq!`s that the
459+
// collected part ids are contiguous, so non-adjacent parts that
460+
// happen to hit the right endpoints would panic later instead of
461+
// here.
456462
if parts.first().map(|x| x.id.0) != Some(id.0)
457463
|| parts.last().map(|x| x.id.1) != Some(id.1)
464+
|| parts.windows(2).any(|w| w[0].id.1 != w[1].id.0)
458465
{
459466
return Err(format!(
460-
"spine batch {id:?} parts do not cover the batch's id range"
467+
"spine batch {id:?} parts do not tile the batch's id range"
461468
));
462469
}
463470
let len = parts.iter().map(|p| (*p).batch.len).sum();

0 commit comments

Comments
 (0)