Skip to content

Commit af2d389

Browse files
JAORMX and claude committed
Add deterministic tar and gzip utilities for OCI skills
Add reproducible archive creation and extraction utilities to the oci/skills package. These are foundational for the skill packager. tar.go: Deterministic tar creation with sorted entries, normalized headers (UID/GID=0, PAX format, epoch-based timestamps). Extraction rejects symlinks, hardlinks, device entries, and path traversal. Per-file size limit (100MB) prevents decompression bombs. gzip.go: Deterministic gzip compression with fixed headers (OS=255, empty name/comment, BestCompression). Decompression with size limit (100MB) prevents decompression bombs. Convenience CompressTar and DecompressTar for combined tar.gz operations. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d2f369f commit af2d389

4 files changed

Lines changed: 777 additions & 0 deletions

File tree

oci/skills/gzip.go

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package skills
5+
6+
import (
7+
"bytes"
8+
"compress/gzip"
9+
"fmt"
10+
"io"
11+
"time"
12+
)
13+
14+
// gzipOSUnknown is the gzip header OS field value that RFC 1952 assigns to "unknown".
15+
// Compress writes it into every header so the output does not depend on the producing platform.
16+
const gzipOSUnknown = 255
17+
18+
// GzipOptions holds the settings Compress uses to produce reproducible gzip output.
19+
type GzipOptions struct {
20+
// Level is the compression level; Compress replaces a zero Level with gzip.BestCompression.
21+
Level int
22+
23+
// Epoch is the modification time written to the gzip header.
24+
// When it is the zero time, Compress uses the Unix epoch (1970-01-01) instead.
25+
Epoch time.Time
26+
}
27+
28+
// DefaultGzipOptions returns default options for reproducible gzip compression.
29+
func DefaultGzipOptions() GzipOptions {
30+
return GzipOptions{
31+
Level: gzip.BestCompression,
32+
Epoch: time.Unix(0, 0).UTC(),
33+
}
34+
}
35+
36+
// Compress creates a reproducible gzip compressed byte slice.
37+
// Headers are explicitly controlled for reproducibility:
38+
// - ModTime: uses opts.Epoch (defaults to Unix epoch)
39+
// - Name: empty (no filename)
40+
// - Comment: empty
41+
// - OS: 255 (unknown) for cross-platform consistency
42+
func Compress(data []byte, opts GzipOptions) ([]byte, error) {
43+
if opts.Level == 0 {
44+
opts.Level = gzip.BestCompression
45+
}
46+
47+
// Use Unix epoch if no epoch specified
48+
epoch := opts.Epoch
49+
if epoch.IsZero() {
50+
epoch = time.Unix(0, 0).UTC()
51+
}
52+
53+
var buf bytes.Buffer
54+
gw, err := gzip.NewWriterLevel(&buf, opts.Level)
55+
if err != nil {
56+
return nil, fmt.Errorf("creating gzip writer: %w", err)
57+
}
58+
59+
// Explicitly set header fields for reproducibility
60+
gw.ModTime = epoch
61+
gw.Name = ""
62+
gw.Comment = ""
63+
gw.OS = gzipOSUnknown
64+
65+
if _, err := gw.Write(data); err != nil {
66+
return nil, fmt.Errorf("writing gzip data: %w", err)
67+
}
68+
69+
if err := gw.Close(); err != nil {
70+
return nil, fmt.Errorf("closing gzip writer: %w", err)
71+
}
72+
73+
return buf.Bytes(), nil
74+
}
75+
76+
// MaxDecompressedSize is the limit, in bytes, that Decompress places on decompressed output (100MB).
77+
// Capping the output size guards against decompression bombs.
78+
const MaxDecompressedSize = 100 * 1024 * 1024
79+
80+
// Decompress decompresses gzip data.
81+
func Decompress(data []byte) ([]byte, error) {
82+
return DecompressWithLimit(data, MaxDecompressedSize)
83+
}
84+
85+
// DecompressWithLimit decompresses gzip data and returns an error instead of
// the payload when the decompressed size is greater than maxSize bytes.
//
// At most one byte beyond maxSize is ever inflated; that extra byte is what
// distinguishes "exactly maxSize" from "too large", so an oversized stream is
// abandoned without being expanded in full. A negative maxSize rejects every
// input.
//
// It returns an error if data does not begin with a valid gzip header, if
// reading the stream fails, or if the limit is exceeded.
func DecompressWithLimit(data []byte, maxSize int64) ([]byte, error) {
	gr, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil, fmt.Errorf("creating gzip reader: %w", err)
	}
	// Read errors are reported below; the result of Close is deliberately discarded.
	defer func() { _ = gr.Close() }()

	// Read one byte past the limit so an overrun is detectable. maxSize+1
	// would wrap to a negative value when maxSize is the largest int64, which
	// makes io.LimitReader return EOF immediately and the payload silently
	// vanish, so the read limit saturates instead of overflowing.
	const maxInt64 = 1<<63 - 1
	readLimit := maxSize
	if readLimit < maxInt64 {
		readLimit++
	}

	result, err := io.ReadAll(io.LimitReader(gr, readLimit))
	if err != nil {
		return nil, fmt.Errorf("reading gzip data: %w", err)
	}

	if int64(len(result)) > maxSize {
		return nil, fmt.Errorf("decompressed data exceeds maximum size of %d bytes", maxSize)
	}

	return result, nil
}
106+
107+
// CompressTar creates a reproducible .tar.gz from the given files.
108+
func CompressTar(files []FileEntry, tarOpts TarOptions, gzipOpts GzipOptions) ([]byte, error) {
109+
tarData, err := CreateTar(files, tarOpts)
110+
if err != nil {
111+
return nil, fmt.Errorf("creating tar: %w", err)
112+
}
113+
114+
gzipData, err := Compress(tarData, gzipOpts)
115+
if err != nil {
116+
return nil, fmt.Errorf("compressing tar: %w", err)
117+
}
118+
119+
return gzipData, nil
120+
}
121+
122+
// DecompressTar extracts files from a .tar.gz archive.
123+
func DecompressTar(data []byte) ([]FileEntry, error) {
124+
tarData, err := Decompress(data)
125+
if err != nil {
126+
return nil, fmt.Errorf("decompressing gzip: %w", err)
127+
}
128+
129+
files, err := ExtractTar(tarData)
130+
if err != nil {
131+
return nil, fmt.Errorf("extracting tar: %w", err)
132+
}
133+
134+
return files, nil
135+
}

oci/skills/gzip_test.go

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package skills
5+
6+
import (
7+
"bytes"
8+
"compress/gzip"
9+
"testing"
10+
"time"
11+
12+
"github.com/stretchr/testify/assert"
13+
"github.com/stretchr/testify/require"
14+
)
15+
16+
func TestCompress_Reproducible(t *testing.T) {
17+
t.Parallel()
18+
19+
data := []byte("test data for compression")
20+
opts := DefaultGzipOptions()
21+
22+
gz1, err := Compress(data, opts)
23+
require.NoError(t, err)
24+
25+
gz2, err := Compress(data, opts)
26+
require.NoError(t, err)
27+
28+
assert.Equal(t, gz1, gz2, "Compress should produce identical output for same input")
29+
}
30+
31+
func TestCompress_HeaderFieldsForReproducibility(t *testing.T) {
32+
t.Parallel()
33+
34+
data := []byte("test data")
35+
epoch := time.Unix(1234567890, 0).UTC()
36+
opts := GzipOptions{
37+
Level: gzip.BestCompression,
38+
Epoch: epoch,
39+
}
40+
41+
compressed, err := Compress(data, opts)
42+
require.NoError(t, err)
43+
44+
gr, err := gzip.NewReader(bytes.NewReader(compressed))
45+
require.NoError(t, err)
46+
defer gr.Close()
47+
48+
assert.True(t, gr.ModTime.Equal(epoch), "ModTime should match epoch")
49+
assert.Empty(t, gr.Name, "Name should be empty")
50+
assert.Empty(t, gr.Comment, "Comment should be empty")
51+
assert.Equal(t, byte(gzipOSUnknown), gr.OS, "OS should be 255 (unknown)")
52+
}
53+
54+
func TestCompress_DifferentEpochs(t *testing.T) {
55+
t.Parallel()
56+
57+
data := []byte("test data")
58+
59+
tests := []struct {
60+
name string
61+
epoch1 time.Time
62+
epoch2 time.Time
63+
wantEqual bool
64+
}{
65+
{
66+
name: "same epoch produces same output",
67+
epoch1: time.Unix(1609459200, 0).UTC(),
68+
epoch2: time.Unix(1609459200, 0).UTC(),
69+
wantEqual: true,
70+
},
71+
{
72+
name: "different epochs produce different output",
73+
epoch1: time.Unix(0, 0).UTC(),
74+
epoch2: time.Unix(1000000, 0).UTC(),
75+
wantEqual: false,
76+
},
77+
}
78+
79+
for _, tt := range tests {
80+
t.Run(tt.name, func(t *testing.T) {
81+
t.Parallel()
82+
83+
opts1 := GzipOptions{Level: gzip.BestCompression, Epoch: tt.epoch1}
84+
opts2 := GzipOptions{Level: gzip.BestCompression, Epoch: tt.epoch2}
85+
86+
gz1, err := Compress(data, opts1)
87+
require.NoError(t, err)
88+
89+
gz2, err := Compress(data, opts2)
90+
require.NoError(t, err)
91+
92+
if tt.wantEqual {
93+
assert.Equal(t, gz1, gz2)
94+
} else {
95+
assert.NotEqual(t, gz1, gz2)
96+
}
97+
})
98+
}
99+
}
100+
101+
func TestCompress_SameEpochAlwaysReproducible(t *testing.T) {
102+
t.Parallel()
103+
104+
data := []byte("test data for reproducibility check")
105+
epoch := time.Unix(1609459200, 0).UTC()
106+
opts := GzipOptions{Level: gzip.BestCompression, Epoch: epoch}
107+
108+
results := make([][]byte, 5)
109+
for i := range results {
110+
var err error
111+
results[i], err = Compress(data, opts)
112+
require.NoError(t, err)
113+
}
114+
115+
for i := 1; i < len(results); i++ {
116+
assert.Equal(t, results[0], results[i], "iteration %d should match", i)
117+
}
118+
}
119+
120+
func TestCompressDecompress_RoundTrip(t *testing.T) {
121+
t.Parallel()
122+
123+
original := []byte("test data for round trip")
124+
opts := DefaultGzipOptions()
125+
126+
compressed, err := Compress(original, opts)
127+
require.NoError(t, err)
128+
129+
decompressed, err := Decompress(compressed)
130+
require.NoError(t, err)
131+
132+
assert.Equal(t, original, decompressed)
133+
}
134+
135+
func TestDecompressWithLimit_RejectsOversized(t *testing.T) {
136+
t.Parallel()
137+
138+
// Create compressed data that exceeds the limit when decompressed
139+
data := bytes.Repeat([]byte("x"), 1024)
140+
compressed, err := Compress(data, DefaultGzipOptions())
141+
require.NoError(t, err)
142+
143+
_, err = DecompressWithLimit(compressed, 100)
144+
assert.Error(t, err)
145+
assert.Contains(t, err.Error(), "exceeds maximum size")
146+
}
147+
148+
func TestCompressTar_Reproducible(t *testing.T) {
149+
t.Parallel()
150+
151+
files := []FileEntry{
152+
{Path: "b.txt", Content: []byte("content b")},
153+
{Path: "a.txt", Content: []byte("content a")},
154+
}
155+
156+
tarOpts := DefaultTarOptions()
157+
gzipOpts := DefaultGzipOptions()
158+
159+
gz1, err := CompressTar(files, tarOpts, gzipOpts)
160+
require.NoError(t, err)
161+
162+
gz2, err := CompressTar(files, tarOpts, gzipOpts)
163+
require.NoError(t, err)
164+
165+
assert.Equal(t, gz1, gz2, "CompressTar should produce identical output")
166+
}
167+
168+
func TestCompressTar_RoundTrip(t *testing.T) {
169+
t.Parallel()
170+
171+
originalFiles := []FileEntry{
172+
{Path: "a.txt", Content: []byte("content a")},
173+
{Path: "dir/b.txt", Content: []byte("content b")},
174+
}
175+
176+
tarOpts := DefaultTarOptions()
177+
gzipOpts := DefaultGzipOptions()
178+
179+
compressed, err := CompressTar(originalFiles, tarOpts, gzipOpts)
180+
require.NoError(t, err)
181+
182+
extractedFiles, err := DecompressTar(compressed)
183+
require.NoError(t, err)
184+
185+
require.Len(t, extractedFiles, len(originalFiles))
186+
for i, f := range extractedFiles {
187+
assert.Equal(t, originalFiles[i].Path, f.Path)
188+
assert.Equal(t, originalFiles[i].Content, f.Content)
189+
}
190+
}

0 commit comments

Comments
 (0)