From af2d3893053e92c238a3fc7eb1761c301e3f05cf Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Fri, 6 Feb 2026 21:15:55 +0200 Subject: [PATCH] Add deterministic tar and gzip utilities for OCI skills Add reproducible archive creation and extraction utilities to the oci/skills package. These are foundational for the skill packager. tar.go: Deterministic tar creation with sorted entries, normalized headers (UID/GID=0, PAX format, epoch-based timestamps). Extraction rejects symlinks, hardlinks, device entries, and path traversal. Per-file size limit (100MB) prevents decompression bombs. gzip.go: Deterministic gzip compression with fixed headers (OS=255, empty name/comment, BestCompression). Decompression with size limit (100MB) prevents decompression bombs. Convenience CompressTar and DecompressTar for combined tar.gz operations. Co-Authored-By: Claude Opus 4.6 (1M context) --- oci/skills/gzip.go | 135 +++++++++++++++++++ oci/skills/gzip_test.go | 190 +++++++++++++++++++++++++++ oci/skills/tar.go | 172 ++++++++++++++++++++++++ oci/skills/tar_test.go | 280 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 777 insertions(+) create mode 100644 oci/skills/gzip.go create mode 100644 oci/skills/gzip_test.go create mode 100644 oci/skills/tar.go create mode 100644 oci/skills/tar_test.go diff --git a/oci/skills/gzip.go b/oci/skills/gzip.go new file mode 100644 index 0000000..1959158 --- /dev/null +++ b/oci/skills/gzip.go @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package skills + +import ( + "bytes" + "compress/gzip" + "fmt" + "io" + "time" +) + +// gzipOSUnknown is the OS value for "unknown" in gzip headers (RFC 1952). +// Using this value ensures cross-platform reproducibility. +const gzipOSUnknown = 255 + +// GzipOptions configures reproducible gzip compression. +type GzipOptions struct { + // Level is the compression level (defaults to gzip.BestCompression). + Level int + + // Epoch is the modification time to use in the gzip header. + // If zero, uses Unix epoch (1970-01-01) for reproducibility. + Epoch time.Time +} + +// DefaultGzipOptions returns default options for reproducible gzip compression. +func DefaultGzipOptions() GzipOptions { + return GzipOptions{ + Level: gzip.BestCompression, + Epoch: time.Unix(0, 0).UTC(), + } +} + +// Compress creates a reproducible gzip compressed byte slice. +// Headers are explicitly controlled for reproducibility: +// - ModTime: uses opts.Epoch (defaults to Unix epoch) +// - Name: empty (no filename) +// - Comment: empty +// - OS: 255 (unknown) for cross-platform consistency +func Compress(data []byte, opts GzipOptions) ([]byte, error) { + if opts.Level == 0 { + opts.Level = gzip.BestCompression + } + + // Use Unix epoch if no epoch specified + epoch := opts.Epoch + if epoch.IsZero() { + epoch = time.Unix(0, 0).UTC() + } + + var buf bytes.Buffer + gw, err := gzip.NewWriterLevel(&buf, opts.Level) + if err != nil { + return nil, fmt.Errorf("creating gzip writer: %w", err) + } + + // Explicitly set header fields for reproducibility + gw.ModTime = epoch + gw.Name = "" + gw.Comment = "" + gw.OS = gzipOSUnknown + + if _, err := gw.Write(data); err != nil { + return nil, fmt.Errorf("writing gzip data: %w", err) + } + + if err := gw.Close(); err != nil { + return nil, fmt.Errorf("closing gzip writer: %w", err) + } + + return buf.Bytes(), nil +} + +// MaxDecompressedSize is the maximum size of decompressed data (100MB). +// This prevents decompression bombs. +const MaxDecompressedSize = 100 * 1024 * 1024 + +// Decompress decompresses gzip data. +func Decompress(data []byte) ([]byte, error) { + return DecompressWithLimit(data, MaxDecompressedSize) +} + +// DecompressWithLimit decompresses gzip data with a size limit. +func DecompressWithLimit(data []byte, maxSize int64) ([]byte, error) { + gr, err := gzip.NewReader(bytes.NewReader(data)) + if err != nil { + return nil, fmt.Errorf("creating gzip reader: %w", err) + } + defer func() { _ = gr.Close() }() + + // Limit read size to prevent decompression bombs + limitedReader := io.LimitReader(gr, maxSize+1) + result, err := io.ReadAll(limitedReader) + if err != nil { + return nil, fmt.Errorf("reading gzip data: %w", err) + } + + if int64(len(result)) > maxSize { + return nil, fmt.Errorf("decompressed data exceeds maximum size of %d bytes", maxSize) + } + + return result, nil +} + +// CompressTar creates a reproducible .tar.gz from the given files. +func CompressTar(files []FileEntry, tarOpts TarOptions, gzipOpts GzipOptions) ([]byte, error) { + tarData, err := CreateTar(files, tarOpts) + if err != nil { + return nil, fmt.Errorf("creating tar: %w", err) + } + + gzipData, err := Compress(tarData, gzipOpts) + if err != nil { + return nil, fmt.Errorf("compressing tar: %w", err) + } + + return gzipData, nil +} + +// DecompressTar extracts files from a .tar.gz archive. +func DecompressTar(data []byte) ([]FileEntry, error) { + tarData, err := Decompress(data) + if err != nil { + return nil, fmt.Errorf("decompressing gzip: %w", err) + } + + files, err := ExtractTar(tarData) + if err != nil { + return nil, fmt.Errorf("extracting tar: %w", err) + } + + return files, nil +} diff --git a/oci/skills/gzip_test.go b/oci/skills/gzip_test.go new file mode 100644 index 0000000..f4c47c3 --- /dev/null +++ b/oci/skills/gzip_test.go @@ -0,0 +1,190 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package skills + +import ( + "bytes" + "compress/gzip" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCompress_Reproducible(t *testing.T) { + t.Parallel() + + data := []byte("test data for compression") + opts := DefaultGzipOptions() + + gz1, err := Compress(data, opts) + require.NoError(t, err) + + gz2, err := Compress(data, opts) + require.NoError(t, err) + + assert.Equal(t, gz1, gz2, "Compress should produce identical output for same input") +} + +func TestCompress_HeaderFieldsForReproducibility(t *testing.T) { + t.Parallel() + + data := []byte("test data") + epoch := time.Unix(1234567890, 0).UTC() + opts := GzipOptions{ + Level: gzip.BestCompression, + Epoch: epoch, + } + + compressed, err := Compress(data, opts) + require.NoError(t, err) + + gr, err := gzip.NewReader(bytes.NewReader(compressed)) + require.NoError(t, err) + defer gr.Close() + + assert.True(t, gr.ModTime.Equal(epoch), "ModTime should match epoch") + assert.Empty(t, gr.Name, "Name should be empty") + assert.Empty(t, gr.Comment, "Comment should be empty") + assert.Equal(t, byte(gzipOSUnknown), gr.OS, "OS should be 255 (unknown)") +} + +func TestCompress_DifferentEpochs(t *testing.T) { + t.Parallel() + + data := []byte("test data") + + tests := []struct { + name string + epoch1 time.Time + epoch2 time.Time + wantEqual bool + }{ + { + name: "same epoch produces same output", + epoch1: time.Unix(1609459200, 0).UTC(), + epoch2: time.Unix(1609459200, 0).UTC(), + wantEqual: true, + }, + { + name: "different epochs produce different output", + epoch1: time.Unix(0, 0).UTC(), + epoch2: time.Unix(1000000, 0).UTC(), + wantEqual: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + opts1 := GzipOptions{Level: gzip.BestCompression, Epoch: tt.epoch1} + opts2 := GzipOptions{Level: gzip.BestCompression, Epoch: tt.epoch2} + + gz1, err := Compress(data, opts1) + require.NoError(t, err) + + gz2, err := Compress(data, opts2) + require.NoError(t, err) + + if tt.wantEqual { + assert.Equal(t, gz1, gz2) + } else { + assert.NotEqual(t, gz1, gz2) + } + }) + } +} + +func TestCompress_SameEpochAlwaysReproducible(t *testing.T) { + t.Parallel() + + data := []byte("test data for reproducibility check") + epoch := time.Unix(1609459200, 0).UTC() + opts := GzipOptions{Level: gzip.BestCompression, Epoch: epoch} + + results := make([][]byte, 5) + for i := range results { + var err error + results[i], err = Compress(data, opts) + require.NoError(t, err) + } + + for i := 1; i < len(results); i++ { + assert.Equal(t, results[0], results[i], "iteration %d should match", i) + } +} + +func TestCompressDecompress_RoundTrip(t *testing.T) { + t.Parallel() + + original := []byte("test data for round trip") + opts := DefaultGzipOptions() + + compressed, err := Compress(original, opts) + require.NoError(t, err) + + decompressed, err := Decompress(compressed) + require.NoError(t, err) + + assert.Equal(t, original, decompressed) +} + +func TestDecompressWithLimit_RejectsOversized(t *testing.T) { + t.Parallel() + + // Create compressed data that exceeds the limit when decompressed + data := bytes.Repeat([]byte("x"), 1024) + compressed, err := Compress(data, DefaultGzipOptions()) + require.NoError(t, err) + + _, err = DecompressWithLimit(compressed, 100) + assert.Error(t, err) + assert.Contains(t, err.Error(), "exceeds maximum size") +} + +func TestCompressTar_Reproducible(t *testing.T) { + t.Parallel() + + files := []FileEntry{ + {Path: "b.txt", Content: []byte("content b")}, + {Path: "a.txt", Content: []byte("content a")}, + } + + tarOpts := DefaultTarOptions() + gzipOpts := DefaultGzipOptions() + + gz1, err := CompressTar(files, tarOpts, gzipOpts) + require.NoError(t, err) + + gz2, err := CompressTar(files, tarOpts, gzipOpts) + require.NoError(t, err) + + assert.Equal(t, gz1, gz2, "CompressTar should produce identical output") +} + +func TestCompressTar_RoundTrip(t *testing.T) { + t.Parallel() + + originalFiles := []FileEntry{ + {Path: "a.txt", Content: []byte("content a")}, + {Path: "dir/b.txt", Content: []byte("content b")}, + } + + tarOpts := DefaultTarOptions() + gzipOpts := DefaultGzipOptions() + + compressed, err := CompressTar(originalFiles, tarOpts, gzipOpts) + require.NoError(t, err) + + extractedFiles, err := DecompressTar(compressed) + require.NoError(t, err) + + require.Len(t, extractedFiles, len(originalFiles)) + for i, f := range extractedFiles { + assert.Equal(t, originalFiles[i].Path, f.Path) + assert.Equal(t, originalFiles[i].Content, f.Content) + } +} diff --git a/oci/skills/tar.go b/oci/skills/tar.go new file mode 100644 index 0000000..1533374 --- /dev/null +++ b/oci/skills/tar.go @@ -0,0 +1,172 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package skills + +import ( + "archive/tar" + "bytes" + "fmt" + "io" + "path" + "sort" + "strings" + "time" +) + +// TarOptions configures reproducible tar archive creation. +type TarOptions struct { + // Epoch is the timestamp to use for all files (defaults to Unix epoch). + Epoch time.Time +} + +// DefaultTarOptions returns default options for reproducible tar archives. +func DefaultTarOptions() TarOptions { + return TarOptions{ + Epoch: time.Unix(0, 0).UTC(), + } +} + +// FileEntry represents a file to include in a tar archive. +type FileEntry struct { + Path string // Path within the archive + Content []byte // File content + Mode int64 // File mode (defaults to 0644) +} + +// CreateTar creates a reproducible tar archive from the given files. +// Files are sorted alphabetically and normalized headers are used +// to ensure deterministic output. +func CreateTar(files []FileEntry, opts TarOptions) ([]byte, error) { + if opts.Epoch.IsZero() { + opts.Epoch = time.Unix(0, 0).UTC() + } + + // Sort files for deterministic ordering + sorted := make([]FileEntry, len(files)) + copy(sorted, files) + sort.Slice(sorted, func(i, j int) bool { + return sorted[i].Path < sorted[j].Path + }) + + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + for _, f := range sorted { + mode := f.Mode + if mode == 0 { + mode = 0644 + } + + hdr := &tar.Header{ + Name: f.Path, + Size: int64(len(f.Content)), + Mode: mode, + ModTime: opts.Epoch, + Uid: 0, + Gid: 0, + Uname: "", + Gname: "", + Typeflag: tar.TypeReg, + Format: tar.FormatPAX, + } + + if err := tw.WriteHeader(hdr); err != nil { + return nil, fmt.Errorf("writing tar header for %s: %w", f.Path, err) + } + + if _, err := tw.Write(f.Content); err != nil { + return nil, fmt.Errorf("writing tar content for %s: %w", f.Path, err) + } + } + + if err := tw.Close(); err != nil { + return nil, fmt.Errorf("closing tar writer: %w", err) + } + + return buf.Bytes(), nil +} + +// MaxTarFileSize is the maximum size of a single file in a tar archive (100MB). +// This prevents decompression bombs. +const MaxTarFileSize = 100 * 1024 * 1024 + +// ExtractTar extracts files from a tar archive. +func ExtractTar(data []byte) ([]FileEntry, error) { + return ExtractTarWithLimit(data, MaxTarFileSize) +} + +// ExtractTarWithLimit extracts files from a tar archive with a per-file size limit. +// It rejects symlinks, hardlinks, device entries, and paths containing traversal sequences. +func ExtractTarWithLimit(data []byte, maxFileSize int64) ([]FileEntry, error) { + tr := tar.NewReader(bytes.NewReader(data)) + var files []FileEntry + + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return nil, fmt.Errorf("reading tar header: %w", err) + } + + // Reject path traversal + if err := validateTarPath(hdr.Name); err != nil { + return nil, err + } + + // Skip directories + if hdr.Typeflag == tar.TypeDir { + continue + } + + // Reject symlinks and hardlinks + if hdr.Typeflag == tar.TypeSymlink || hdr.Typeflag == tar.TypeLink { + return nil, fmt.Errorf("archive contains disallowed link type: %s", hdr.Name) + } + + // Reject device entries and other special types + if hdr.Typeflag != tar.TypeReg { + return nil, fmt.Errorf("archive contains disallowed entry type %d: %s", hdr.Typeflag, hdr.Name) + } + + // Check declared size against limit + if hdr.Size > maxFileSize { + return nil, fmt.Errorf("file %s exceeds maximum size of %d bytes", hdr.Name, maxFileSize) + } + + // Use LimitReader to enforce the limit during reading + limitedReader := io.LimitReader(tr, maxFileSize+1) + content, err := io.ReadAll(limitedReader) + if err != nil { + return nil, fmt.Errorf("reading tar content for %s: %w", hdr.Name, err) + } + + if int64(len(content)) > maxFileSize { + return nil, fmt.Errorf("file %s exceeds maximum size of %d bytes", hdr.Name, maxFileSize) + } + + files = append(files, FileEntry{ + Path: hdr.Name, + Content: content, + Mode: hdr.Mode, + }) + } + + return files, nil +} + +// validateTarPath checks that a tar entry path is safe. +func validateTarPath(p string) error { + // path.Clean resolves all ".." segments; any remaining leading ".." + // means the path escapes the archive root. + cleaned := path.Clean(p) + if strings.HasPrefix(cleaned, "..") { + return fmt.Errorf("path traversal detected in archive: %s", p) + } + if path.IsAbs(cleaned) { + return fmt.Errorf("absolute path not allowed in archive: %s", p) + } + return nil +} diff --git a/oci/skills/tar_test.go b/oci/skills/tar_test.go new file mode 100644 index 0000000..87ab09f --- /dev/null +++ b/oci/skills/tar_test.go @@ -0,0 +1,280 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package skills + +import ( + "archive/tar" + "bytes" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCreateTar_Reproducible(t *testing.T) { + t.Parallel() + + files := []FileEntry{ + {Path: "b.txt", Content: []byte("content b")}, + {Path: "a.txt", Content: []byte("content a")}, + {Path: "c/d.txt", Content: []byte("content d")}, + } + + opts := DefaultTarOptions() + + tar1, err := CreateTar(files, opts) + require.NoError(t, err) + + tar2, err := CreateTar(files, opts) + require.NoError(t, err) + + assert.Equal(t, tar1, tar2, "CreateTar should produce identical output for same input") +} + +func TestCreateTar_DifferentOrder(t *testing.T) { + t.Parallel() + + files1 := []FileEntry{ + {Path: "b.txt", Content: []byte("b")}, + {Path: "a.txt", Content: []byte("a")}, + } + + files2 := []FileEntry{ + {Path: "a.txt", Content: []byte("a")}, + {Path: "b.txt", Content: []byte("b")}, + } + + opts := DefaultTarOptions() + + tar1, err := CreateTar(files1, opts) + require.NoError(t, err) + + tar2, err := CreateTar(files2, opts) + require.NoError(t, err) + + assert.Equal(t, tar1, tar2, "CreateTar should sort files internally") +} + +func TestCreateTar_DifferentTimestamps(t *testing.T) { + t.Parallel() + + files := []FileEntry{ + {Path: "test.txt", Content: []byte("test")}, + } + + tests := []struct { + name string + opts1 TarOptions + opts2 TarOptions + wantEqual bool + }{ + { + name: "same epoch produces same output", + opts1: TarOptions{Epoch: time.Unix(0, 0).UTC()}, + opts2: TarOptions{Epoch: time.Unix(0, 0).UTC()}, + wantEqual: true, + }, + { + name: "different epochs produce different output", + opts1: TarOptions{Epoch: time.Unix(0, 0).UTC()}, + opts2: TarOptions{Epoch: time.Unix(1000000, 0).UTC()}, + wantEqual: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + tar1, err := CreateTar(files, tt.opts1) + require.NoError(t, err) + + tar2, err := CreateTar(files, tt.opts2) + require.NoError(t, err) + + if tt.wantEqual { + assert.Equal(t, tar1, tar2) + } else { + assert.NotEqual(t, tar1, tar2) + } + }) + } +} + +func TestExtractTar_RoundTrip(t *testing.T) { + t.Parallel() + + originalFiles := []FileEntry{ + {Path: "a.txt", Content: []byte("content a")}, + {Path: "b/c.txt", Content: []byte("content c")}, + } + + tarData, err := CreateTar(originalFiles, DefaultTarOptions()) + require.NoError(t, err) + + extractedFiles, err := ExtractTar(tarData) + require.NoError(t, err) + + require.Len(t, extractedFiles, len(originalFiles)) + + for i, f := range extractedFiles { + assert.Equal(t, originalFiles[i].Path, f.Path) + assert.Equal(t, originalFiles[i].Content, f.Content) + } +} + +func TestCreateTar_EmptyFiles(t *testing.T) { + t.Parallel() + + tarData, err := CreateTar(nil, DefaultTarOptions()) + require.NoError(t, err) + + extractedFiles, err := ExtractTar(tarData) + require.NoError(t, err) + + assert.Empty(t, extractedFiles) +} + +func TestExtractTar_RejectsSymlinks(t *testing.T) { + t.Parallel() + + // Create a tar with a symlink entry + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + hdr := &tar.Header{ + Name: "malicious_link", + Typeflag: tar.TypeSymlink, + Linkname: "/etc/passwd", + } + require.NoError(t, tw.WriteHeader(hdr)) + require.NoError(t, tw.Close()) + + _, err := ExtractTar(buf.Bytes()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "disallowed link type") +} + +func TestExtractTar_RejectsHardlinks(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + hdr := &tar.Header{ + Name: "malicious_link", + Typeflag: tar.TypeLink, + Linkname: "other_file", + } + require.NoError(t, tw.WriteHeader(hdr)) + require.NoError(t, tw.Close()) + + _, err := ExtractTar(buf.Bytes()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "disallowed link type") +} + +func TestExtractTar_RejectsDeviceEntries(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + hdr := &tar.Header{ + Name: "malicious_device", + Typeflag: tar.TypeChar, + Mode: 0666, + } + require.NoError(t, tw.WriteHeader(hdr)) + require.NoError(t, tw.Close()) + + _, err := ExtractTar(buf.Bytes()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "disallowed entry type") +} + +func TestExtractTar_RejectsPathTraversal(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + path string + }{ + {name: "dotdot prefix", path: "../etc/passwd"}, + {name: "dotdot in middle", path: "foo/../../etc/passwd"}, + {name: "absolute path", path: "/etc/passwd"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + hdr := &tar.Header{ + Name: tt.path, + Size: 4, + Typeflag: tar.TypeReg, + Mode: 0644, + } + require.NoError(t, tw.WriteHeader(hdr)) + _, err := tw.Write([]byte("test")) + require.NoError(t, err) + require.NoError(t, tw.Close()) + + _, err = ExtractTar(buf.Bytes()) + assert.Error(t, err) + }) + } +} + +func TestExtractTarWithLimit_RejectsOversized(t *testing.T) { + t.Parallel() + + files := []FileEntry{ + {Path: "big.txt", Content: bytes.Repeat([]byte("x"), 1024)}, + } + + tarData, err := CreateTar(files, DefaultTarOptions()) + require.NoError(t, err) + + _, err = ExtractTarWithLimit(tarData, 100) + assert.Error(t, err) + assert.Contains(t, err.Error(), "exceeds maximum size") +} + +func TestExtractTar_SkipsDirectories(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + + // Write a directory entry + require.NoError(t, tw.WriteHeader(&tar.Header{ + Name: "mydir/", + Typeflag: tar.TypeDir, + Mode: 0755, + })) + + // Write a file inside it + content := []byte("hello") + require.NoError(t, tw.WriteHeader(&tar.Header{ + Name: "mydir/file.txt", + Size: int64(len(content)), + Typeflag: tar.TypeReg, + Mode: 0644, + })) + _, err := tw.Write(content) + require.NoError(t, err) + require.NoError(t, tw.Close()) + + files, err := ExtractTar(buf.Bytes()) + require.NoError(t, err) + + require.Len(t, files, 1) + assert.Equal(t, "mydir/file.txt", files[0].Path) + assert.Equal(t, content, files[0].Content) +}