Skip to content

Commit 841f758

Browse files
committed
fix: compute DiffID from uncompressed layer digest
Add a streaming gzip decompression method to ContentWriter that computes the SHA256 DiffID of compressed OCI layers without loading the full file into memory. Streams 64 KB chunks through Apple's Compression framework with incremental SHA256 hashing and CRC32 validation against the gzip trailer. Rebased on main, ran make fmt, and switched to ContainerizationError per reviewer feedback. Signed-off-by: Maxime Grenu <maxime.grenu@gmail.com>
1 parent de58d1d commit 841f758

File tree

2 files changed

+326
-0
lines changed

2 files changed

+326
-0
lines changed

Sources/ContainerizationOCI/Content/ContentWriter.swift

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
// limitations under the License.
1515
//===----------------------------------------------------------------------===//
1616

17+
import Compression
1718
import ContainerizationError
1819
import Crypto
1920
import Foundation
2021
import NIOCore
22+
import zlib
2123

2224
/// Provides a context to write data into a directory.
2325
public class ContentWriter {
@@ -134,4 +136,168 @@ public class ContentWriter {
134136
let data = try self.encoder.encode(content)
135137
return try self.write(data)
136138
}
139+
140+
/// Computes the SHA256 digest of the uncompressed content of a gzip file.
///
/// Per the OCI Image Specification, a DiffID is the SHA256 digest of the
/// uncompressed layer content. This method streams the compressed file in
/// 64 KB chunks, decompresses through Apple's Compression framework, and
/// feeds each decompressed chunk into an incremental SHA256 hasher. Neither
/// the full compressed nor the full decompressed data is held in memory.
///
/// The CRC32 and size fields stored in the last eight bytes of the file are
/// compared with the values computed while decompressing. The file is treated
/// as a single gzip member: one header at the start, one compressed stream,
/// and one trailer at the very end.
///
/// - Parameter url: The URL of the gzip-compressed file.
/// - Returns: The SHA256 digest of the uncompressed content.
/// - Throws: `ContainerizationError` (`.internalError`) when the header is
///   invalid, decompression fails, or the trailer does not match; any error
///   thrown by `FileHandle` while opening, seeking, or reading the file.
public static func diffID(of url: URL) throws -> SHA256.Digest {
    let fileHandle = try FileHandle(forReadingFrom: url)
    defer { try? fileHandle.close() }

    // Only the first 512 bytes are inspected to locate the end of the header.
    let headerReadSize = 512
    guard let headerData = Self.readExactly(fileHandle: fileHandle, count: headerReadSize),
        !headerData.isEmpty
    else {
        throw ContainerizationError(.internalError, message: "invalid gzip file")
    }
    let headerSize = try Self.gzipHeaderSize(headerData)

    // The throwing seek/read variants surface I/O failures as Swift errors
    // instead of Objective-C exceptions, which Swift code cannot catch.
    let fileSize = try fileHandle.seekToEnd()
    guard fileSize >= 8 else {
        throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
    }
    try fileHandle.seek(toOffset: fileSize - 8)
    guard let trailerData = Self.readExactly(fileHandle: fileHandle, count: 8),
        trailerData.count == 8
    else {
        throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
    }

    // Trailer layout: CRC32 (4 bytes) followed by the uncompressed size
    // modulo 2^32 (4 bytes), both little-endian.
    let expectedCRC =
        UInt32(trailerData[trailerData.startIndex])
        | (UInt32(trailerData[trailerData.startIndex + 1]) << 8)
        | (UInt32(trailerData[trailerData.startIndex + 2]) << 16)
        | (UInt32(trailerData[trailerData.startIndex + 3]) << 24)
    let expectedSize =
        UInt32(trailerData[trailerData.startIndex + 4])
        | (UInt32(trailerData[trailerData.startIndex + 5]) << 8)
        | (UInt32(trailerData[trailerData.startIndex + 6]) << 16)
        | (UInt32(trailerData[trailerData.startIndex + 7]) << 24)

    // Everything between the header and the 8-byte trailer is compressed payload.
    try fileHandle.seek(toOffset: UInt64(headerSize))
    var compressedBytesRemaining = Int(fileSize) - headerSize - 8
    guard compressedBytesRemaining >= 0 else {
        throw ContainerizationError(.internalError, message: "invalid gzip file")
    }

    let chunkSize = 65_536
    let sourceBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
    let destinationBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: chunkSize)
    defer {
        sourceBuffer.deallocate()
        destinationBuffer.deallocate()
    }

    let stream = UnsafeMutablePointer<compression_stream>.allocate(capacity: 1)
    defer { stream.deallocate() }

    // The header and trailer are stripped above, so the decoder is handed
    // only the compressed payload.
    var status = compression_stream_init(stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB)
    guard status != COMPRESSION_STATUS_ERROR else {
        throw ContainerizationError(.internalError, message: "gzip decompression failed")
    }
    defer { compression_stream_destroy(stream) }

    stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
    stream.pointee.src_size = 0
    stream.pointee.dst_ptr = destinationBuffer
    stream.pointee.dst_size = chunkSize

    var hasher = SHA256()
    var runningCRC: uLong = crc32(0, nil, 0)
    var totalDecompressedSize: UInt64 = 0
    var inputExhausted = false

    while status != COMPRESSION_STATUS_END {
        // Refill the source buffer only once the decoder has consumed it.
        if stream.pointee.src_size == 0 && !inputExhausted {
            let toRead = min(chunkSize, compressedBytesRemaining)
            if toRead > 0,
                let chunk = try fileHandle.read(upToCount: toRead),
                !chunk.isEmpty
            {
                compressedBytesRemaining -= chunk.count
                chunk.copyBytes(to: sourceBuffer, count: chunk.count)
                stream.pointee.src_ptr = UnsafePointer(sourceBuffer)
                stream.pointee.src_size = chunk.count
            } else {
                inputExhausted = true
            }
        }

        // Reuse the whole destination buffer on every pass.
        stream.pointee.dst_ptr = destinationBuffer
        stream.pointee.dst_size = chunkSize

        let flags: Int32 = inputExhausted ? Int32(COMPRESSION_STREAM_FINALIZE.rawValue) : 0
        status = compression_stream_process(stream, flags)

        switch status {
        case COMPRESSION_STATUS_OK, COMPRESSION_STATUS_END:
            let produced = chunkSize - stream.pointee.dst_size
            if produced > 0 {
                let buf = UnsafeBufferPointer(start: destinationBuffer, count: produced)
                hasher.update(bufferPointer: UnsafeRawBufferPointer(buf))
                runningCRC = crc32(runningCRC, destinationBuffer, uInt(produced))
                totalDecompressedSize += UInt64(produced)
            } else if status == COMPRESSION_STATUS_OK && inputExhausted {
                // No input left and no output produced: the next pass would
                // see the identical state, so stop instead of spinning
                // forever on an incomplete stream.
                throw ContainerizationError(.internalError, message: "gzip decompression failed")
            }
        default:
            throw ContainerizationError(.internalError, message: "gzip decompression failed")
        }
    }

    // Both trailer fields are 32-bit, so the running values are truncated to match.
    let actualCRC = UInt32(truncatingIfNeeded: runningCRC)
    let actualSize = UInt32(truncatingIfNeeded: totalDecompressedSize)

    guard expectedCRC == actualCRC, expectedSize == actualSize else {
        throw ContainerizationError(.internalError, message: "gzip trailer mismatch")
    }

    return hasher.finalize()
}
262+
263+
/// Reads `count` bytes from the current offset of `fileHandle`, stopping
/// early only at end of file or when a read fails.
///
/// A read error is not propagated: the loop stops and whatever was collected
/// so far is returned, so callers must validate the length of the result.
///
/// - Parameters:
///   - fileHandle: The handle to read from; its offset advances by the
///     number of bytes returned.
///   - count: The number of bytes wanted.
/// - Returns: The bytes read (possibly fewer than `count`), or `nil` when no
///   bytes could be read at all.
private static func readExactly(fileHandle: FileHandle, count: Int) -> Data? {
    guard count > 0 else { return nil }

    var collected = Data()
    // Keep reading until the request is satisfied; a single read may return
    // fewer bytes than asked for.
    while collected.count < count {
        // `read(upToCount:)` throws on I/O failure where `readData(ofLength:)`
        // raises an Objective-C exception that Swift cannot catch.
        guard let piece = try? fileHandle.read(upToCount: count - collected.count),
            !piece.isEmpty
        else {
            break
        }
        collected.append(piece)
    }
    return collected.isEmpty ? nil : collected
}
267+
268+
/// Returns the length in bytes of the gzip header at the start of `data`.
///
/// Checks the two magic bytes and the compression-method byte, then steps
/// over the optional fields selected by the flag byte: the length-prefixed
/// extra field (0x04), two zero-terminated strings (0x08, 0x10), and a
/// two-byte field (0x02).
///
/// - Parameter data: The leading bytes of the file.
/// - Returns: The offset of the first byte after the header.
/// - Throws: `ContainerizationError` (`.internalError`, "invalid gzip file")
///   when the prefix is wrong or the header does not end inside `data`.
private static func gzipHeaderSize(_ data: Data) throws -> Int {
    func malformed() -> ContainerizationError {
        ContainerizationError(.internalError, message: "invalid gzip file")
    }

    let fixedHeaderLength = 10
    let twoByteFieldMask: UInt8 = 0x02
    let extraFieldMask: UInt8 = 0x04
    let firstStringMask: UInt8 = 0x08
    let secondStringMask: UInt8 = 0x10

    let base = data.startIndex
    let length = data.count

    let hasValidPrefix =
        length >= fixedHeaderLength
        && data[base] == 0x1f
        && data[base + 1] == 0x8b
        && data[base + 2] == 0x08
    if !hasValidPrefix {
        throw malformed()
    }

    let flagByte = data[base + 3]
    var cursor = fixedHeaderLength

    // Extra field: a 16-bit little-endian length followed by that many bytes.
    if flagByte & extraFieldMask != 0 {
        if length < cursor + 2 {
            throw malformed()
        }
        let low = Int(data[base + cursor])
        let high = Int(data[base + cursor + 1])
        cursor += 2 + (low | (high << 8))
    }

    // Each flagged string runs up to and including its zero terminator.
    for stringMask in [firstStringMask, secondStringMask] where flagByte & stringMask != 0 {
        while cursor < length, data[base + cursor] != 0 {
            cursor += 1
        }
        cursor += 1
    }

    if flagByte & twoByteFieldMask != 0 {
        cursor += 2
    }

    // The header must end strictly inside the bytes that were supplied.
    if cursor >= length {
        throw malformed()
    }
    return cursor
}
137303
}
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
//===----------------------------------------------------------------------===//
2+
// Copyright © 2026 Apple Inc. and the Containerization project authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// https://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
//===----------------------------------------------------------------------===//
16+
17+
import ContainerizationError
18+
import Crypto
19+
import Foundation
20+
import Testing
21+
22+
@testable import ContainerizationOCI
23+
24+
/// Tests for `ContentWriter.diffID(of:)`, using fixtures produced by the
/// system `gzip` binary.
struct DiffIDTests {
    /// Creates a gzip-compressed temporary file from raw data.
    ///
    /// Fails the calling test right here when `gzip` cannot be run, exits
    /// with a non-zero status, or does not produce the expected output file,
    /// rather than returning a path to a file that does not exist.
    ///
    /// - Parameter content: The uncompressed bytes to compress.
    /// - Returns: The URL of the compressed file; the caller removes it.
    private func createGzipFile(content: Data) throws -> URL {
        let tempDir = FileManager.default.temporaryDirectory
        let rawFile = tempDir.appendingPathComponent(UUID().uuidString)
        let gzFile = tempDir.appendingPathComponent(UUID().uuidString + ".gz")
        try content.write(to: rawFile)
        defer { try? FileManager.default.removeItem(at: rawFile) }

        let process = Process()
        process.executableURL = URL(fileURLWithPath: "/usr/bin/gzip")
        // -k keeps the input file, -f overwrites any existing output.
        process.arguments = ["-k", "-f", rawFile.path]
        try process.run()
        process.waitUntilExit()
        try #require(process.terminationStatus == 0, "gzip exited with a non-zero status")

        // gzip writes its output next to the input with a ".gz" suffix;
        // moveItem throws if that file was not produced.
        let gzPath = URL(fileURLWithPath: rawFile.path + ".gz")
        try FileManager.default.moveItem(at: gzPath, to: gzFile)
        return gzFile
    }

    /// The DiffID equals the SHA256 of the original, uncompressed bytes.
    @Test func diffIDMatchesUncompressedSHA256() throws {
        let content = Data("hello, oci layer content for diffid test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let expected = SHA256.hash(data: content)

        #expect(diffID.digestString == expected.digestString)
    }

    /// Hashing the same file twice yields the same digest.
    @Test func diffIDIsDeterministic() throws {
        let content = Data("deterministic diffid check".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let first = try ContentWriter.diffID(of: gzFile)
        let second = try ContentWriter.diffID(of: gzFile)

        #expect(first.digestString == second.digestString)
    }

    /// A file without the gzip magic bytes is rejected.
    @Test func diffIDRejectsNonGzipData() throws {
        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
        try Data("this is not gzip".utf8).write(to: tempFile)
        defer { try? FileManager.default.removeItem(at: tempFile) }

        #expect(throws: ContainerizationError.self) {
            try ContentWriter.diffID(of: tempFile)
        }
    }

    /// A zero-byte file is rejected.
    @Test func diffIDRejectsEmptyFile() throws {
        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
        try Data().write(to: tempFile)
        defer { try? FileManager.default.removeItem(at: tempFile) }

        #expect(throws: ContainerizationError.self) {
            try ContentWriter.diffID(of: tempFile)
        }
    }

    /// Content spanning many 64 KB chunks hashes correctly.
    @Test func diffIDHandlesLargeContent() throws {
        // 1MB of repeating data
        let pattern = Data("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345".utf8)
        var large = Data()
        for _ in 0..<(1_048_576 / pattern.count) {
            large.append(pattern)
        }
        let gzFile = try createGzipFile(content: large)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let expected = SHA256.hash(data: large)

        #expect(diffID.digestString == expected.digestString)
    }

    /// A gzip file missing its trailer is rejected.
    @Test func diffIDRejectsTruncatedGzip() throws {
        // Build a valid gzip file, then chop off the 8-byte trailer (CRC32 + ISIZE)
        // to produce a structurally malformed archive.
        let content = Data("truncated gzip trailer test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        var gzData = try Data(contentsOf: gzFile)
        try #require(gzData.count > 8, "Compressed file too small to truncate")
        gzData.removeLast(8)

        let truncatedFile = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString + ".gz")
        try gzData.write(to: truncatedFile)
        defer { try? FileManager.default.removeItem(at: truncatedFile) }

        #expect(throws: ContainerizationError.self) {
            try ContentWriter.diffID(of: truncatedFile)
        }
    }

    /// A gzip file whose stored CRC32 no longer matches its content is rejected.
    @Test func diffIDRejectsCorruptedCRC() throws {
        // Flip a byte in the CRC32 field of an otherwise valid gzip file.
        let content = Data("corrupted crc test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        var gzData = try Data(contentsOf: gzFile)
        // The CRC32 field starts eight bytes before the end of the file.
        try #require(gzData.count > 8, "Compressed file too small to contain a trailer")
        let crcOffset = gzData.count - 8
        gzData[crcOffset] ^= 0xFF

        let corruptedFile = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString + ".gz")
        try gzData.write(to: corruptedFile)
        defer { try? FileManager.default.removeItem(at: corruptedFile) }

        #expect(throws: ContainerizationError.self) {
            try ContentWriter.diffID(of: corruptedFile)
        }
    }

    /// The digest string has the `sha256:` prefix followed by 64 hex characters.
    @Test func diffIDDigestStringFormat() throws {
        let content = Data("format test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let digestString = diffID.digestString

        #expect(digestString.hasPrefix("sha256:"))
        // sha256: prefix + 64 hex chars
        #expect(digestString.count == 7 + 64)
    }
}

0 commit comments

Comments
 (0)