Skip to content

Commit 51e31e8

Browse files
committed
tar: Preserve trailing record-padding bytes after end-of-archive
GNU tar pads archives to a "record size" (typically 20 × 512 = 10240 bytes). The archive ends with two 512-byte zero blocks that tar-core emits as ParseEvent::End, but any additional zero-padding blocks that fill out the record were silently dropped. split_async() stored only what ParseEvent::End::consumed covered (1024 bytes for the two zero blocks), leaving the remaining trailing bytes unread. When cat() later reconstructed the archive, the output was shorter than the original, causing the sha256 to differ from the layer's diff_id. This broke the checksum validation path in create_filesystem() — specifically the @digest reference path that lacks a verity proof and must re-hash the reconstructed tar. The fix reads any remaining bytes after the End event until true EOF and stores them inline, making cat() byte-for-bit faithful to the original archive. This resolves "Layer has incorrect checksum" errors for images where the tar ends with GNU-style record padding, including Ubuntu 26.04 (resolute) which uses umoci/PAX format. Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters <walters@verbum.org>
1 parent 61de438 commit 51e31e8

1 file changed

Lines changed: 101 additions & 32 deletions

File tree

  • crates/composefs-oci/src

crates/composefs-oci/src/tar.rs

Lines changed: 101 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,26 @@ pub async fn split_async<ObjectID: FsVerityHashValue>(
203203
}
204204
ParseEvent::End { consumed } => {
205205
builder.push_inline(&buf.split_to(consumed));
206+
// GNU tar pads archives to a "record size" (typically 20×512 = 10240 bytes).
207+
// After the two end-of-archive zero blocks (consumed above), there may be
208+
// additional zero-padding blocks before EOF. We must store them to reproduce
209+
// the original byte stream faithfully for diff_id checksum verification.
210+
//
211+
// Note: ideally tar-core would surface these extra bytes through
212+
// ParseEvent::End::consumed so callers don't need to know about record
213+
// granularity; this drain is a workaround until that is addressed upstream.
214+
// See https://github.com/composefs/tar-core/pull/24 which will obviate this.
215+
if !buf.is_empty() {
216+
builder.push_inline(&buf.split());
217+
}
218+
loop {
219+
buf.reserve(IO_BUF_CAPACITY);
220+
let n = tar_stream.read_buf(&mut buf).await?;
221+
if n == 0 {
222+
break;
223+
}
224+
builder.push_inline(&buf.split());
225+
}
206226
break;
207227
}
208228
ParseEvent::SparseEntry { .. } => {
@@ -582,6 +602,56 @@ mod tests {
582602
assert!(get_entry(&mut reader).unwrap().is_none());
583603
}
584604

605+
/// Verify that a tar without any trailing record padding survives a byte-exact
606+
/// roundtrip. This is the common case for tars produced by the Rust `tar` crate
607+
/// and most standard tooling; it forms a baseline paired with the padding test below.
608+
#[test]
609+
fn test_no_record_padding_roundtrip() {
610+
let mut tar_data = Vec::new();
611+
{
612+
let mut builder = Builder::new(&mut tar_data);
613+
append_file(&mut builder, "hello.txt", b"hello world").unwrap();
614+
builder.finish().unwrap();
615+
}
616+
// Confirm the Rust tar crate did not add GNU record padding.
617+
const GNU_RECORD_SIZE: usize = 20 * 512;
618+
assert_ne!(
619+
tar_data.len() % GNU_RECORD_SIZE,
620+
0,
621+
"expected tar without GNU record padding for this test"
622+
);
623+
roundtrip_tar_bytes(&tar_data);
624+
}
625+
626+
/// Verify that GNU-style record padding (zero bytes after the two end-of-archive
627+
/// blocks, filling the archive out to a 20×512 record boundary) is preserved
628+
/// byte-for-bit through split_async → cat(). Without the fix, the reconstructed
629+
/// tar was shorter than the original, causing diff_id checksum failures for images
630+
/// produced by umoci/Rockcraft (e.g. Ubuntu 26.04).
631+
#[test]
632+
fn test_gnu_record_padding_roundtrip() {
633+
const GNU_RECORD_SIZE: usize = 20 * 512; // 10240 bytes
634+
635+
let mut tar_data = Vec::new();
636+
{
637+
let mut builder = Builder::new(&mut tar_data);
638+
append_file(&mut builder, "hello.txt", b"hello world").unwrap();
639+
builder.finish().unwrap();
640+
}
641+
642+
// Simulate GNU record padding: extend to the next record boundary with zeros.
643+
let remainder = tar_data.len() % GNU_RECORD_SIZE;
644+
if remainder != 0 {
645+
tar_data.resize(tar_data.len() + (GNU_RECORD_SIZE - remainder), 0);
646+
}
647+
648+
// The tar length must now be a multiple of the record size.
649+
assert_eq!(tar_data.len() % GNU_RECORD_SIZE, 0);
650+
651+
// roundtrip_tar_bytes asserts byte-exact reproduction through the splitstream.
652+
roundtrip_tar_bytes(&tar_data);
653+
}
654+
585655
#[tokio::test]
586656
async fn test_single_small_file() {
587657
let mut tar_data = Vec::new();
@@ -1387,6 +1457,37 @@ mod tests {
13871457
assert_eq!(entries[0].path, PathBuf::from(format!("/{full_path}")));
13881458
}
13891459

1460+
/// Byte-exact roundtrip: original tar bytes -> split_async -> splitstream -> cat()
1461+
/// -> assert bytes match. Catches any corruption in either the inline or
1462+
/// external code paths, including missing padding or off-by-one errors.
1463+
fn roundtrip_tar_bytes(tar_data: &[u8]) {
1464+
let rt = tokio::runtime::Runtime::new().unwrap();
1465+
rt.block_on(async {
1466+
let repo = create_test_repository().unwrap();
1467+
let (object_id, _stats) = split_async(tar_data, repo.clone(), TAR_LAYER_CONTENT_TYPE)
1468+
.await
1469+
.unwrap();
1470+
1471+
let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
1472+
repo.open_object(&object_id).unwrap().into(),
1473+
Some(TAR_LAYER_CONTENT_TYPE),
1474+
)
1475+
.unwrap();
1476+
1477+
let mut reassembled = Vec::new();
1478+
reader.cat(&repo, &mut reassembled).unwrap();
1479+
assert_eq!(
1480+
reassembled.len(),
1481+
tar_data.len(),
1482+
"reassembled tar length mismatch"
1483+
);
1484+
assert_eq!(
1485+
reassembled, tar_data,
1486+
"reassembled tar bytes differ from original"
1487+
);
1488+
});
1489+
}
1490+
13901491
/// Property-based tests for tar path handling.
13911492
mod proptest_tests {
13921493
use super::*;
@@ -1527,38 +1628,6 @@ mod tests {
15271628
tar_data
15281629
}
15291630

1530-
/// Byte-exact roundtrip: original tar -> split_async -> splitstream -> cat()
1531-
/// -> assert bytes match. Catches any corruption in either the inline or
1532-
/// external code paths, including missing padding or off-by-one errors.
1533-
fn roundtrip_tar_bytes(tar_data: &[u8]) {
1534-
let rt = tokio::runtime::Runtime::new().unwrap();
1535-
rt.block_on(async {
1536-
let repo = create_test_repository().unwrap();
1537-
let (object_id, _stats) =
1538-
split_async(tar_data, repo.clone(), TAR_LAYER_CONTENT_TYPE)
1539-
.await
1540-
.unwrap();
1541-
1542-
let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
1543-
repo.open_object(&object_id).unwrap().into(),
1544-
Some(TAR_LAYER_CONTENT_TYPE),
1545-
)
1546-
.unwrap();
1547-
1548-
let mut reassembled = Vec::new();
1549-
reader.cat(&repo, &mut reassembled).unwrap();
1550-
assert_eq!(
1551-
reassembled.len(),
1552-
tar_data.len(),
1553-
"reassembled tar length mismatch"
1554-
);
1555-
assert_eq!(
1556-
reassembled, tar_data,
1557-
"reassembled tar bytes differ from original"
1558-
);
1559-
});
1560-
}
1561-
15621631
proptest! {
15631632
#![proptest_config(ProptestConfig::with_cases(64))]
15641633

0 commit comments

Comments
 (0)