fix: compute DiffID from uncompressed layer digest

cluster2600 · cluster2600 · commit 841f758fbc2b · 2026-04-14T09:01:03.000+02:00
Add a streaming gzip decompression method to ContentWriter that
computes the SHA256 DiffID of compressed OCI layers without loading
the full file into memory. Streams 64 KB chunks through Apple's
Compression framework with incremental SHA256 hashing and CRC32
validation against the gzip trailer.

Rebased on main, ran make fmt, and switched to ContainerizationError
per reviewer feedback.

Signed-off-by: Maxime Grenu <maxime.grenu@gmail.com>
diff --git a/Sources/ContainerizationOCI/Content/ContentWriter.swift b/Sources/ContainerizationOCI/Content/ContentWriter.swift
@@ -14,10 +14,12 @@
 // limitations under the License.
 //===----------------------------------------------------------------------===//
 
+import Compression
 import ContainerizationError
 import Crypto
 import Foundation
 import NIOCore
+import zlib
 
 /// Provides a context to write data into a directory.
 public class ContentWriter {
@@ -134,4 +136,203 @@ public class ContentWriter {
         let data = try self.encoder.encode(content)
         return try self.write(data)
     }
+
+    /// Computes the SHA256 digest of the uncompressed content of a gzip file.
+    ///
+    /// Per the OCI Image Specification, a DiffID is the SHA256 digest of the
+    /// uncompressed layer content. This method streams the compressed file in
+    /// chunks, decompresses through Apple's Compression framework, and feeds
+    /// each decompressed chunk into an incremental SHA256 hasher. Neither the
+    /// full compressed nor the full decompressed data is held in memory.
+    ///
+    /// The CRC32 and ISIZE fields are read from the last 8 bytes of the file and
+    /// checked against the decompressed output. The gzip header must end within
+    /// the first 512 bytes, and multi-member archives are not supported.
+    ///
+    /// - Parameter url: The URL of the gzip-compressed file.
+    /// - Returns: The SHA256 digest of the uncompressed content.
+    /// - Throws: `ContainerizationError` if the file is not a valid gzip archive,
+    ///   decompression fails, or the trailer does not match; file I/O errors are rethrown.
+    public static func diffID(of url: URL) throws -> SHA256.Digest {
+        let fileHandle = try FileHandle(forReadingFrom: url)
+        defer { try? fileHandle.close() }
+
+        // Use the throwing FileHandle APIs throughout: the legacy readData/seek
+        // calls raise Objective-C exceptions on I/O failure, which Swift cannot catch.
+        let headerData = try Self.read(upTo: 512, from: fileHandle)
+        guard !headerData.isEmpty else {
+            throw ContainerizationError(.internalError, message: "invalid gzip file")
+        }
+        let headerSize = try Self.gzipHeaderSize(headerData)
+
+        // The gzip trailer is CRC32 followed by ISIZE, both little-endian.
+        let trailerSize = 8
+        let fileSize = try fileHandle.seekToEnd()
+        guard fileSize >= UInt64(trailerSize) else {
+            throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
+        }
+        try fileHandle.seek(toOffset: fileSize - UInt64(trailerSize))
+        let trailerData = try Self.read(upTo: trailerSize, from: fileHandle)
+        guard trailerData.count == trailerSize else {
+            throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
+        }
+        let expectedCRC = Self.littleEndianUInt32(trailerData, at: 0)
+        let expectedSize = Self.littleEndianUInt32(trailerData, at: 4)
+
+        // Everything between the header and the trailer is the raw DEFLATE stream.
+        try fileHandle.seek(toOffset: UInt64(headerSize))
+        var compressedBytesRemaining = Int(fileSize) - headerSize - trailerSize
+        guard compressedBytesRemaining >= 0 else {
+            throw ContainerizationError(.internalError, message: "invalid gzip file")
+        }
+
+        let chunkSize = 65_536
+        let sourceBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
+        let destinationBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
+        defer {
+            sourceBuffer.deallocate()
+            destinationBuffer.deallocate()
+        }
+
+        let stream = UnsafeMutablePointer<compression_stream>.allocate(capacity: 1)
+        defer { stream.deallocate() }
+
+        var status = compression_stream_init(stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB)
+        guard status != COMPRESSION_STATUS_ERROR else {
+            throw ContainerizationError(.internalError, message: "gzip decompression failed")
+        }
+        defer { compression_stream_destroy(stream) }
+
+        stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
+        stream.pointee.src_size = 0
+        stream.pointee.dst_ptr = destinationBuffer
+        stream.pointee.dst_size = chunkSize
+
+        var hasher = SHA256()
+        var runningCRC: uLong = crc32(0, nil, 0)
+        var totalDecompressedSize: UInt64 = 0
+        var inputExhausted = false
+
+        while status != COMPRESSION_STATUS_END {
+            // Refill the source buffer only once the decoder has drained it.
+            if stream.pointee.src_size == 0 && !inputExhausted {
+                let toRead = min(chunkSize, compressedBytesRemaining)
+                let chunk = try Self.read(upTo: toRead, from: fileHandle)
+                if chunk.isEmpty {
+                    inputExhausted = true
+                } else {
+                    compressedBytesRemaining -= chunk.count
+                    chunk.copyBytes(to: sourceBuffer, count: chunk.count)
+                    stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
+                    stream.pointee.src_size = chunk.count
+                }
+            }
+
+            stream.pointee.dst_ptr = destinationBuffer
+            stream.pointee.dst_size = chunkSize
+
+            let pendingInput = stream.pointee.src_size
+            let flags: Int32 = inputExhausted ? Int32(COMPRESSION_STREAM_FINALIZE.rawValue) : 0
+            status = compression_stream_process(stream, flags)
+
+            switch status {
+            case COMPRESSION_STATUS_OK, COMPRESSION_STATUS_END:
+                let produced = chunkSize - stream.pointee.dst_size
+                let consumed = pendingInput - stream.pointee.src_size
+                if produced > 0 {
+                    let buf = UnsafeBufferPointer(start: destinationBuffer, count: produced)
+                    hasher.update(bufferPointer: UnsafeRawBufferPointer(buf))
+                    runningCRC = crc32(runningCRC, destinationBuffer, uInt(produced))
+                    totalDecompressedSize += UInt64(produced)
+                } else if status == COMPRESSION_STATUS_OK && consumed == 0 {
+                    // No input consumed and no output produced: the decoder is
+                    // stalled and the loop would otherwise never terminate.
+                    throw ContainerizationError(.internalError, message: "gzip decompression failed")
+                }
+            default:
+                throw ContainerizationError(.internalError, message: "gzip decompression failed")
+            }
+        }
+
+        // ISIZE holds the uncompressed length modulo 2^32, hence the truncation.
+        let actualCRC = UInt32(truncatingIfNeeded: runningCRC)
+        let actualSize = UInt32(truncatingIfNeeded: totalDecompressedSize)
+
+        guard expectedCRC == actualCRC, expectedSize == actualSize else {
+            throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
+        }
+
+        return hasher.finalize()
+    }
+
+    /// Reads up to `count` bytes from the current offset of `fileHandle`.
+    ///
+    /// Keeps reading until `count` bytes have been collected or a read returns no
+    /// data, so a short result means the end of the file was reached.
+    private static func read(upTo count: Int, from fileHandle: FileHandle) throws -> Data {
+        var collected = Data()
+        while collected.count < count {
+            guard let piece = try fileHandle.read(upToCount: count - collected.count),
+                !piece.isEmpty
+            else {
+                break
+            }
+            collected.append(piece)
+        }
+        return collected
+    }
+
+    /// Decodes the little-endian 32-bit value stored at `offset` bytes into `data`.
+    private static func littleEndianUInt32(_ data: Data, at offset: Int) -> UInt32 {
+        let base = data.startIndex + offset
+        var value: UInt32 = 0
+        for index in 0..<4 {
+            value |= UInt32(data[base + index]) << (8 * index)
+        }
+        return value
+    }
+
+    /// Returns the size in bytes of the gzip member header at the start of `data`.
+    ///
+    /// Validates the magic bytes and the DEFLATE compression method, then skips
+    /// the optional FEXTRA, FNAME, FCOMMENT and FHCRC fields selected by the flags byte.
+    ///
+    /// - Parameter data: The leading bytes of the gzip file.
+    /// - Returns: The offset of the first byte of compressed data.
+    /// - Throws: `ContainerizationError` if the header is malformed or does not
+    ///   end within `data`.
+    private static func gzipHeaderSize(_ data: Data) throws -> Int {
+        guard data.count >= 10,
+            data[data.startIndex] == 0x1f,
+            data[data.startIndex + 1] == 0x8b,
+            data[data.startIndex + 2] == 0x08
+        else {
+            throw ContainerizationError(.internalError, message: "invalid gzip file")
+        }
+
+        let start = data.startIndex
+        let flags = data[start + 3]
+        var offset = 10
+
+        // FEXTRA: two-byte little-endian length followed by that many bytes.
+        if flags & 0x04 != 0 {
+            guard data.count >= offset + 2 else {
+                throw ContainerizationError(.internalError, message: "invalid gzip file")
+            }
+            let extraLen = Int(data[start + offset]) | (Int(data[start + offset + 1]) << 8)
+            offset += 2 + extraLen
+        }
+        // FNAME, then FCOMMENT: each is a NUL-terminated string.
+        for stringFlag: UInt8 in [0x08, 0x10] where flags & stringFlag != 0 {
+            while offset < data.count && data[start + offset] != 0 { offset += 1 }
+            offset += 1
+        }
+        // FHCRC: two-byte header checksum, skipped without verification.
+        if flags & 0x02 != 0 { offset += 2 }
+
+        guard offset < data.count else {
+            throw ContainerizationError(.internalError, message: "invalid gzip file")
+        }
+        return offset
+    }
 }
diff --git a/Tests/ContainerizationOCITests/DiffIDTests.swift b/Tests/ContainerizationOCITests/DiffIDTests.swift
@@ -0,0 +1,167 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the Containerization project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerizationError
+import Crypto
+import Foundation
+import Testing
+
+@testable import ContainerizationOCI
+
+struct DiffIDTests {
+    /// Creates a gzip-compressed temporary file containing `content`.
+    ///
+    /// Shells out to `/usr/bin/gzip`; the caller removes the returned file.
+    private func createGzipFile(content: Data) throws -> URL {
+        let tempDir = FileManager.default.temporaryDirectory
+        let rawFile = tempDir.appendingPathComponent(UUID().uuidString)
+        let gzFile = tempDir.appendingPathComponent(UUID().uuidString + ".gz")
+        try content.write(to: rawFile)
+        defer { try? FileManager.default.removeItem(at: rawFile) }
+
+        let process = Process()
+        process.executableURL = URL(fileURLWithPath: "/usr/bin/gzip")
+        process.arguments = ["-k", "-f", rawFile.path]
+        try process.run()
+        process.waitUntilExit()
+
+        // Fail here rather than handing the tests a URL that does not exist.
+        guard process.terminationStatus == 0 else {
+            throw ContainerizationError(
+                .internalError, message: "gzip exited with status \(process.terminationStatus)")
+        }
+
+        // moveItem throws if gzip did not leave the expected output file behind.
+        let gzPath = URL(fileURLWithPath: rawFile.path + ".gz")
+        try FileManager.default.moveItem(at: gzPath, to: gzFile)
+        return gzFile
+    }
+
+    @Test func diffIDMatchesUncompressedSHA256() throws {
+        let content = Data("hello, oci layer content for diffid test".utf8)
+        let gzFile = try createGzipFile(content: content)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        let diffID = try ContentWriter.diffID(of: gzFile)
+        let expected = SHA256.hash(data: content)
+
+        #expect(diffID.digestString == expected.digestString)
+    }
+
+    @Test func diffIDIsDeterministic() throws {
+        let content = Data("deterministic diffid check".utf8)
+        let gzFile = try createGzipFile(content: content)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        let first = try ContentWriter.diffID(of: gzFile)
+        let second = try ContentWriter.diffID(of: gzFile)
+
+        #expect(first.digestString == second.digestString)
+    }
+
+    @Test func diffIDRejectsNonGzipData() throws {
+        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
+        try Data("this is not gzip".utf8).write(to: tempFile)
+        defer { try? FileManager.default.removeItem(at: tempFile) }
+
+        #expect(throws: ContainerizationError.self) {
+            try ContentWriter.diffID(of: tempFile)
+        }
+    }
+
+    @Test func diffIDRejectsEmptyFile() throws {
+        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
+        try Data().write(to: tempFile)
+        defer { try? FileManager.default.removeItem(at: tempFile) }
+
+        #expect(throws: ContainerizationError.self) {
+            try ContentWriter.diffID(of: tempFile)
+        }
+    }
+
+    @Test func diffIDHandlesLargeContent() throws {
+        // 1 MB of deterministic pseudo-random bytes. Poorly compressible input keeps
+        // the gzip file larger than one 64 KB read chunk, so the source buffer is
+        // refilled several times.
+        var state: UInt32 = 0x2545_F491
+        var large = Data(capacity: 1_048_576)
+        for _ in 0..<1_048_576 {
+            state = state &* 1_664_525 &+ 1_013_904_223
+            large.append(UInt8(truncatingIfNeeded: state >> 24))
+        }
+        let gzFile = try createGzipFile(content: large)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        let diffID = try ContentWriter.diffID(of: gzFile)
+        let expected = SHA256.hash(data: large)
+
+        #expect(diffID.digestString == expected.digestString)
+    }
+
+    @Test func diffIDRejectsTruncatedGzip() throws {
+        // Build a valid gzip file, then chop off the 8-byte trailer (CRC32 + ISIZE)
+        // to produce a structurally malformed archive.
+        let content = Data("truncated gzip trailer test".utf8)
+        let gzFile = try createGzipFile(content: content)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        var gzData = try Data(contentsOf: gzFile)
+        try #require(gzData.count > 8, "Compressed file too small to truncate")
+        gzData.removeLast(8)
+
+        let truncatedFile = FileManager.default.temporaryDirectory
+            .appendingPathComponent(UUID().uuidString + ".gz")
+        try gzData.write(to: truncatedFile)
+        defer { try? FileManager.default.removeItem(at: truncatedFile) }
+
+        #expect(throws: ContainerizationError.self) {
+            try ContentWriter.diffID(of: truncatedFile)
+        }
+    }
+
+    @Test func diffIDRejectsCorruptedCRC() throws {
+        // Flip a byte in the CRC32 field of an otherwise valid gzip file.
+        let content = Data("corrupted crc test".utf8)
+        let gzFile = try createGzipFile(content: content)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        var gzData = try Data(contentsOf: gzFile)
+        let crcOffset = gzData.count - 8
+        gzData[crcOffset] ^= 0xFF
+
+        let corruptedFile = FileManager.default.temporaryDirectory
+            .appendingPathComponent(UUID().uuidString + ".gz")
+        try gzData.write(to: corruptedFile)
+        defer { try? FileManager.default.removeItem(at: corruptedFile) }
+
+        #expect(throws: ContainerizationError.self) {
+            try ContentWriter.diffID(of: corruptedFile)
+        }
+    }
+
+    @Test func diffIDDigestStringFormat() throws {
+        let content = Data("format test".utf8)
+        let gzFile = try createGzipFile(content: content)
+        defer { try? FileManager.default.removeItem(at: gzFile) }
+
+        let diffID = try ContentWriter.diffID(of: gzFile)
+        let digestString = diffID.digestString
+
+        #expect(digestString.hasPrefix("sha256:"))
+        // sha256: prefix + 64 hex chars
+        #expect(digestString.count == 7 + 64)
+    }
+}