From 794adf895ff998ca2fbc40ab6c528b72ed4c7de5 Mon Sep 17 00:00:00 2001 From: Caleb Xu Date: Tue, 28 Apr 2026 15:52:28 -0400 Subject: [PATCH 1/3] engine: use fmt.Fprintf in generateBundleHash Avoid an extra allocation from fmt.Sprintf plus WriteString. Signed-off-by: Caleb Xu Made-with: Cursor --- internal/engine/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/engine/engine.go b/internal/engine/engine.go index e92e40bf..21234d47 100644 --- a/internal/engine/engine.go +++ b/internal/engine/engine.go @@ -357,7 +357,7 @@ func generateBundleHash(ctx context.Context, bundlePath string) (string, error) slices.Sort(keys) for _, k := range keys { - hashBuffer.WriteString(fmt.Sprintf("%s %s\n", k, files[k])) + fmt.Fprintf(&hashBuffer, "%s %s\n", k, files[k]) } artifactsWriter := artifacts.WriterFromContext(ctx) From 732001d181d090d038eb166e0add8038a6a82129 Mon Sep 17 00:00:00 2001 From: Caleb Xu Date: Tue, 28 Apr 2026 15:53:03 -0400 Subject: [PATCH 2/3] engine: add link graph for tar symlink/hardlink resolution Introduce LinkGraph, alias expansion, and extraction-context helpers used by layer planning. Remove duplicate test linkType constants now defined in graph.go. Signed-off-by: Caleb Xu Made-with: Cursor --- internal/engine/graph.go | 404 ++++++++++++++++++++++++++++++++++ internal/engine/graph_test.go | 174 +++++++++++++++ internal/engine/untar_test.go | 10 +- 3 files changed, 579 insertions(+), 9 deletions(-) create mode 100644 internal/engine/graph.go create mode 100644 internal/engine/graph_test.go diff --git a/internal/engine/graph.go b/internal/engine/graph.go new file mode 100644 index 00000000..94ef4a8d --- /dev/null +++ b/internal/engine/graph.go @@ -0,0 +1,404 @@ +package engine + +import ( + "archive/tar" + "path/filepath" + "slices" + "strings" + + "github.com/go-logr/logr" + + "github.com/redhat-openshift-ecosystem/openshift-preflight/internal/log" +) + +// linkType a convenience type just to make the consuming functions more clear. +type linkType byte + +const ( + hardlink linkType = tar.TypeLink + symlink linkType = tar.TypeSymlink +) + +// String returns the string representation of the LinkType. +func (lt linkType) String() string { + if lt == tar.TypeLink { + return "hardlink" + } + return "symlink" +} + +type linkNode struct { + Name string + Deps *linkNode + OriginalLinkname string // For symlinks, the original target string before resolution + Type linkType // Type of link (symlink or hardlink) + VirtualLinkTarget string // For hardlinks pointing to symlinks, the symlink's target string +} + +func (n *linkNode) IsHardlink() bool { + return n.Type == hardlink +} + +func (n *linkNode) IsSymlink() bool { + return n.Type == symlink +} + +// ChainTypesToFollow returns the link types that should be followed in a chain +// starting from this node. Hardlinks follow both hardlink and symlink chains, +// while symlinks only follow symlink chains. +func (n *linkNode) ChainTypesToFollow() []linkType { + if n.IsHardlink() { + return []linkType{hardlink, symlink} + } + return []linkType{symlink} +} + +type LinkGraph map[string]*linkNode + +type extractionContext struct { + linkGraph LinkGraph + neededFiles *[]string + logger logr.Logger +} + +// ExpandFilePathAliases generates all possible paths to a file through directory symlinks. +// For example, for a symlink /a/b/c -> /foo/bar (directory), and a file /foo/bar/baz, this +// will return both /a/b/c/baz and /foo/bar/baz. 
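+//
+// A minimal usage sketch (hypothetical paths; keys of symlinkAliases are resolved
+// symlink targets, values are the symlink paths pointing at them):
+//
+//	aliases := map[string][]string{"foo/bar": {"a/b/c"}}
+//	var lg LinkGraph
+//	lg.ExpandFilePathAliases("foo/bar/baz", aliases)
+//	// → []string{"foo/bar/baz", "a/b/c/baz"}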
+func (lg LinkGraph) ExpandFilePathAliases(filePath string, symlinkAliases map[string][]string) []string { + expanding := make(map[string]struct{}) + return lg.expandFilePathAliasesRec(filePath, symlinkAliases, expanding) +} + +func (lg LinkGraph) expandFilePathAliasesRec(filePath string, symlinkAliases map[string][]string, expanding map[string]struct{}) []string { + results := []string{filePath} + visited := make(map[string]struct{}) + visited[filePath] = struct{}{} + + expanding[filePath] = struct{}{} + defer delete(expanding, filePath) + + // Check all parent directories to see if they have symlink aliases + dir := filePath + for { + dir = filepath.Dir(dir) + if dir == "." || dir == "/" { + break + } + + // Check if this directory has any symlinks pointing to it + if aliases, ok := symlinkAliases[dir]; ok { + for _, symlinkPath := range aliases { + // Replace the directory portion with the symlink path + // For example: /usr/share/rpm/file.db with symlink /usr/lib/sysimage/rpm -> /usr/share/rpm + // becomes /usr/lib/sysimage/rpm/file.db + relativePath, err := filepath.Rel(dir, filePath) + if err != nil { + //coverage:ignore + continue + } + aliasedPath := filepath.Join(symlinkPath, relativePath) + if _, seen := visited[aliasedPath]; !seen { + visited[aliasedPath] = struct{}{} + results = append(results, aliasedPath) + // Recursively find aliases of the aliased path. Skip recursion if aliasedPath + // is already being expanded (directory symlink cycles). + if _, onStack := expanding[aliasedPath]; !onStack { + edgeKey := dir + "\x00" + symlinkPath + if _, edgeSeen := expanding[edgeKey]; edgeSeen { + //coverage:ignore + continue + } + expanding[edgeKey] = struct{}{} + nestedAliases := lg.expandFilePathAliasesRec(aliasedPath, symlinkAliases, expanding) + delete(expanding, edgeKey) + for _, nested := range nestedAliases { + if _, seen := visited[nested]; !seen { + visited[nested] = struct{}{} + results = append(results, nested) + } + } + } + } + } + } + } + + return results +} + +// walkGraphChain walks a graph chain starting from a node, applying a visitor function +// to each node in the chain. Returns when the chain ends, visitor returns false, or a cycle is detected. +func walkGraphChain(start string, graph LinkGraph, visitor func(node string, deps *linkNode) bool) { + current := start + visited := make(map[string]struct{}) + for { + if _, seen := visited[current]; seen { + //coverage:ignore + break + } + visited[current] = struct{}{} + + node, ok := graph[current] + if !ok || node.Deps == nil { + break + } + next := node.Deps.Name + if _, seen := visited[next]; seen { + break + } + if !visitor(next, node.Deps) { + break + } + current = next + } +} + +// followLinkChain adds all links in the chain starting from the given link. +// For example, if linkA -> linkB -> directory, this adds both linkA and linkB +// to neededFiles. If filterType is provided (non-nil), only links of that type are followed. 
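+//
+// Sketch with an assumed graph: for linkGraph{"linkA" -> "linkB", "linkB" -> "dir"},
+// followLinkChain("linkA", nil) appends only "linkB" to neededFiles; "dir" has no
+// graph entry of its own, so the walk stops there and "dir" is left to
+// addTransitiveDependencies.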
+func (ec *extractionContext) followLinkChain(startLink string, filterType *linkType) { + walkGraphChain(startLink, ec.linkGraph, func(target string, deps *linkNode) bool { + // If the target is also a link in the graph + if targetNode, isTargetLink := ec.linkGraph[target]; isTargetLink { + // If filtering by type, check if target matches + if filterType != nil && targetNode.Type != *filterType { + return false + } + *ec.neededFiles = append(*ec.neededFiles, target) + linkTypeStr := targetNode.Type.String() + ec.logger.V(log.TRC).Info("adding transitive directory "+linkTypeStr, linkTypeStr, target, "via", startLink) + return true + } + return false + }) +} + +// processLink processes a link (symlink or hardlink) by adding it to neededFiles, +// following its chain, and recursively processing its target and parents. +// The logContext parameter provides context for logging (e.g., "parent directory", "target"). +func (ec *extractionContext) processLink(path string, node *linkNode, logContext string, originalFile string, visited map[string]struct{}) { + *ec.neededFiles = append(*ec.neededFiles, path) + + ec.logger.V(log.TRC).Info("adding "+logContext+" "+node.Type.String(), node.Type.String(), path, "for_file", originalFile) + + // Follow link chains based on node type + for _, chainType := range node.ChainTypesToFollow() { + ec.followLinkChain(path, &chainType) + } + + // Recursively process the target's parent directories AND the target itself + // (the target might also be a link) + if node.Deps != nil { + depName := node.Deps.Name + + // First check if the target itself is a link + if _, seen := visited[depName]; !seen { + visited[depName] = struct{}{} + if depNode, isLink := ec.linkGraph[depName]; isLink { + ec.processLink(depName, depNode, "target", originalFile, visited) + } + } + + // Then check the target's parents + ec.addParentLinks(depName, originalFile, visited) + } +} + +func (ec *extractionContext) addParentLinks(path string, originalFile string, visited map[string]struct{}) { + // Check all parent directories up to root + dir := filepath.Dir(path) + for dir != "." && dir != "/" { + if _, seen := visited[dir]; seen { + break + } + visited[dir] = struct{}{} + + // Check if this directory is a link (symlink or hardlink) + if node, isLink := ec.linkGraph[dir]; isLink { + ec.processLink(dir, node, "parent directory", originalFile, visited) + } + + dir = filepath.Dir(dir) + } +} + +// resolveRelativeLinkFrom resolves a symlink's target string the same way as planExtraction: +// POSIX-style absolute linknames (leading '/') are archive-root paths; relative linknames +// are resolved from linkPath's directory. +func resolveRelativeLinkFrom(linkPath, linkTarget string) string { + if strings.HasPrefix(linkTarget, "/") { + return filepath.Clean(strings.TrimPrefix(linkTarget, "/")) + } + return filepath.Clean(filepath.Join(filepath.Dir(linkPath), linkTarget)) +} + +// pathMayHaveSymlinkAliasPaths reports whether ExpandFilePathAliases can return +// more than a single-element slice for path. Keys of symlinkAliases are resolved +// symlink targets; ExpandFilePathAliases only adds alternate paths when some +// ancestor directory of path is such a target. +func pathMayHaveSymlinkAliasPaths(path string, symlinkAliases map[string][]string) bool { + for d := filepath.Dir(path); d != "." 
&& d != "/"; d = filepath.Dir(d) { + if _, ok := symlinkAliases[d]; ok { + return true + } + } + return false +} + +// addAliasIfNew adds an alias to the aliases map if it doesn't already exist +// Returns true if the alias was added (indicating a change) +func addAliasIfNew(aliases map[string][]string, target, alias string) bool { + if !slices.Contains(aliases[target], alias) { + aliases[target] = append(aliases[target], alias) + return true + } + return false +} + +// getLinkTarget returns the link target (original linkname) for a path, +// checking both real symlinks and virtual links +func getLinkTarget(path string, linkGraph LinkGraph) (linkTarget string, found bool) { + if linkNode, exists := linkGraph[path]; exists { + if linkNode.IsSymlink() && linkNode.OriginalLinkname != "" { + return linkNode.OriginalLinkname, true + } + if linkNode.VirtualLinkTarget != "" { + return linkNode.VirtualLinkTarget, true + } + } + return "", false +} + +// getEffectiveLinkTarget checks if a path is a symlink/virtual-symlink, +// or if it's a hardlink pointing to a symlink/virtual-symlink. +// Returns the ultimate symlink target string. +func getEffectiveLinkTarget(path string, linkGraph LinkGraph) (linkTarget string, found bool) { + // First check if path itself is a symlink/virtual-symlink + if target, ok := getLinkTarget(path, linkGraph); ok { + return target, true + } + + // If path is a hardlink, check if its target is a symlink/virtual-symlink + if node, exists := linkGraph[path]; exists && node.IsHardlink() && node.Deps != nil { + return getLinkTarget(node.Deps.Name, linkGraph) + } + + return "", false +} + +// BuildDirectoryAliasMap builds a backlink map for all links in the +// LinkGraph. The resulting map associates each link in the graph +// with its aliases. +func (lg LinkGraph) BuildDirectoryAliasMap(logger logr.Logger) map[string][]string { + // Start with basic symlink aliases + aliases := make(map[string][]string) + for linkPath, node := range lg { + if node.IsSymlink() && node.Deps != nil { + targetPath := node.Deps.Name + aliases[targetPath] = append(aliases[targetPath], linkPath) + } + } + + // Hardlink peers share the same inode: symlinks pointing at any peer apply to all peers. + for { + changed := false + for linkPath, linkNode := range lg { + if !linkNode.IsHardlink() || linkNode.Deps == nil { + continue + } + h, t := linkPath, linkNode.Deps.Name + // Propagate aliases[h] -> t, then aliases[t] -> h (including new entries on t + // from the first loop). Index-based iteration avoids slices.Clone per edge. + if hs := aliases[h]; len(hs) > 0 { + nh := len(hs) + for i := 0; i < nh; i++ { + if addAliasIfNew(aliases, t, hs[i]) { + changed = true + } + } + } + if ts := aliases[t]; len(ts) > 0 { + nt := len(ts) + for i := 0; i < nt; i++ { + if addAliasIfNew(aliases, h, ts[i]) { + changed = true + } + } + } + } + if !changed { + break + } + } + + iteration := 0 + // Monotonic fixpoint: each successful addAliasIfNew strictly increases the + // multiset of (target, alias) pairs; possible pairs are finite for a finite graph. 
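+	// Worked example (assumed entries): a hardlink "hl" whose graph target is
+	// symlink "s" (s -> "dir"). One pass records VirtualLinkTarget "dir" on "hl"
+	// and adds aliases["dir"] += "hl"; a symlink pointing at "hl" then resolves
+	// through that virtual target on the following pass.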
+ for { + changed := false + iteration++ + + for linkPath, linkNode := range lg { + if linkNode.Deps == nil { + continue + } + targetPath := linkNode.Deps.Name + + // Handle hardlinks pointing to symlinks (real or virtual) + if linkNode.IsHardlink() { + // Skip if we've already processed this hardlink as a virtual symlink + if linkNode.VirtualLinkTarget != "" { + continue + } + + // Skip if the target is not a symlink/virtual-symlink + linkTarget, found := getLinkTarget(targetPath, lg) + if !found { + continue + } + + // Create a virtual symlink at the hardlink's location: interpret the + // target symlink's linkname as relative to this hardlink's directory + // (same inode semantics users see when opening the path via the hardlink). + resolvedFromHardlink := resolveRelativeLinkFrom(linkPath, linkTarget) + + if addAliasIfNew(aliases, resolvedFromHardlink, linkPath) { + linkNode.VirtualLinkTarget = linkTarget + logger.V(log.TRC).Info("adding hardlink alias", + "from", linkPath, + "to", resolvedFromHardlink, + "via_target", targetPath, + "iteration", iteration) + changed = true + } + } + + // Handle symlinks pointing to other links (directly or via hardlink) + if linkNode.IsSymlink() && linkNode.OriginalLinkname != "" { + linkTarget, found := getEffectiveLinkTarget(targetPath, lg) + if !found { + continue + } + + resolvedFromSymlink := resolveRelativeLinkFrom(targetPath, linkTarget) + + if addAliasIfNew(aliases, resolvedFromSymlink, linkPath) { + logger.V(log.TRC).Info("adding symlink alias", + "from", linkPath, + "to", resolvedFromSymlink, + "via_target", targetPath, + "iteration", iteration) + changed = true + } + } + } + + if !changed { + break + } + } + + logger.V(log.DBG).Info("directory alias map built", "iterations", iteration, "total_aliases", len(aliases)) + + return aliases +} diff --git a/internal/engine/graph_test.go b/internal/engine/graph_test.go new file mode 100644 index 00000000..a4b2b11c --- /dev/null +++ b/internal/engine/graph_test.go @@ -0,0 +1,174 @@ +package engine + +import ( + "fmt" + "testing" + + "github.com/go-logr/logr" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("LinkGraph ExpandFilePathAliases", func() { + It("terminates with a finite set when directory symlink aliases form a cycle", func() { + // Mutual directory symlinks: link "b" points at directory "a", link "a" points at "b". + // Matches BuildDirectoryAliasMap: aliases[target] = symlink paths pointing to target. + symlinkAliases := map[string][]string{ + "a": {"b"}, + "b": {"a"}, + } + var lg LinkGraph + + out := lg.ExpandFilePathAliases("a/x", symlinkAliases) + + Expect(out).To(ConsistOf("a/x", "b/x")) + }) + + It("terminates when a directory symlink path is under its target (duplicate expansion edge guard)", func() { + // Symlink path "x/s" points at directory "x" (same subtree). Inner alias expansion can revisit + // the same (dir, symlinkPath) edge while edgeKey remains in expanding; expandFilePathAliasesRec + // must continue instead of recursing infinitely. + symlinkAliases := map[string][]string{ + "x": {"x/s"}, + } + var lg LinkGraph + + out := lg.ExpandFilePathAliases("x/f", symlinkAliases) + + Expect(out).To(ContainElements("x/f", "x/s/f")) + Expect(len(out)).To(BeNumerically("<", 20)) + }) + + It("normalizes absolute OriginalLinkname to archive-relative alias keys", func() { + // Mirrors tar symlinks whose Linkname starts with '/' (planExtraction strips the slash). 
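+		// Note: resolveRelativeLinkFrom("link/inner", "/data/share") yields
+		// "data/share", which is what Deps is hand-wired to below.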
+ shareDir := &linkNode{Name: "data/share"} + inner := &linkNode{Name: "link/inner", Deps: shareDir, OriginalLinkname: "/data/share", Type: symlink} + lg := LinkGraph{ + "data/share": shareDir, + "link/inner": inner, + "alias/out": &linkNode{Name: "alias/out", Deps: inner, OriginalLinkname: "link/inner", Type: symlink}, + } + + aliases := lg.BuildDirectoryAliasMap(logr.Discard()) + + Expect(aliases["data/share"]).To(ContainElements("link/inner", "alias/out")) + expanded := lg.ExpandFilePathAliases("data/share/doc", aliases) + Expect(expanded).To(ContainElement("alias/out/doc")) + }) + + It("maps chained symlink aliases using the inner link target directory", func() { + // Outer symlink at other/deep/here -> ../../foo/bar/baz; inner at foo/bar/baz has linkname "..". + // Resolving ".." from the outer symlink's directory would incorrectly yield "other"; + // resolution must be relative to the inner symlink path (foo/bar/baz) so ".." means "foo". + fooLeaf := &linkNode{Name: "foo"} + baz := &linkNode{Name: "foo/bar/baz", Deps: fooLeaf, OriginalLinkname: "..", Type: symlink} + lg := LinkGraph{ + "foo": fooLeaf, + "foo/bar/baz": baz, + "other/deep/here": &linkNode{Name: "other/deep/here", Deps: baz, OriginalLinkname: "../../foo/bar/baz", Type: symlink}, + } + + aliases := lg.BuildDirectoryAliasMap(logr.Discard()) + + Expect(aliases["foo"]).To(ContainElements("foo/bar/baz", "other/deep/here")) + }) + + It("converges for a deep symlink chain", func() { + const depth = 100 + lg := LinkGraph{ + "chain/0": &linkNode{Name: "chain/0"}, + } + for i := 1; i <= depth; i++ { + prev := fmt.Sprintf("chain/%d", i-1) + cur := fmt.Sprintf("chain/%d", i) + // Sibling-style linknames under chain/ so resolveRelativeLinkFrom(chain/i, name) + // yields chain/{name} (Join("chain", "../0") would incorrectly clean to "0"). + lg[cur] = &linkNode{ + Name: cur, + Deps: &linkNode{Name: prev}, + OriginalLinkname: fmt.Sprintf("%d", i-1), + Type: symlink, + } + } + + aliases := lg.BuildDirectoryAliasMap(logr.Discard()) + + // Direct child and first propagation hop onto chain/0. + Expect(aliases["chain/0"]).To(ContainElements("chain/1", "chain/2")) + + expanded := lg.ExpandFilePathAliases("chain/0/sub", aliases) + Expect(expanded).To(ContainElement("chain/100/sub")) + }) + + It("stops walking a graph chain on a symlink cycle", func() { + lg := LinkGraph{ + "a": &linkNode{Name: "a", Deps: &linkNode{Name: "b"}, Type: symlink}, + "b": &linkNode{Name: "b", Deps: &linkNode{Name: "a"}, Type: symlink}, + } + + var visited []string + walkGraphChain("a", lg, func(n string, deps *linkNode) bool { + visited = append(visited, n) + return true + }) + + Expect(visited).To(Equal([]string{"b"})) + }) + + // walkGraphChain may still contain a defensive check for "current" already in visited at the + // loop head (graph.go). With the current next/visited logic, a->b->a cycles break when "next" + // is already visited instead; that branch is likely unreachable unless the loop structure changes. 
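+	// Sketch: for a -> b -> a the walk marks "a", visits "b", and the next hop
+	// computed from "b" is "a", already in visited, so the "next seen" break
+	// fires before current could repeat.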
+}) + +var _ = Describe("LinkGraph BuildDirectoryAliasMap", func() { + It("propagates symlink aliases onto hardlink peers sharing the target inode", func() { + target := &linkNode{Name: "data/file"} + lg := LinkGraph{ + "data/file": target, + "sym": &linkNode{Name: "sym", Deps: target, OriginalLinkname: "data/file", Type: symlink}, + "peer": &linkNode{Name: "peer", Deps: target, Type: hardlink}, + } + + aliases := lg.BuildDirectoryAliasMap(logr.Discard()) + + Expect(aliases["data/file"]).To(ContainElement("sym")) + Expect(aliases["peer"]).To(ContainElement("sym")) + }) +}) + +var _ = Describe("pathMayHaveSymlinkAliasPaths", func() { + It("is false when no parent is a symlink target key", func() { + aliases := map[string][]string{ + "usr/share/rpm": {"usr/lib/sysimage/rpm"}, + } + Expect(pathMayHaveSymlinkAliasPaths("etc/passwd", aliases)).To(BeFalse()) + }) + + It("is true when a parent directory is a symlink target key", func() { + aliases := map[string][]string{ + "usr/share/rpm": {"usr/lib/sysimage/rpm"}, + } + Expect(pathMayHaveSymlinkAliasPaths("usr/share/rpm/Packages", aliases)).To(BeTrue()) + }) +}) + +// BenchmarkBuildDirectoryAliasMap exercises the hardlink peer propagation and +// symlink fixpoint loops on a synthetic dense graph (see plan: profile alias map). +func BenchmarkBuildDirectoryAliasMap_hardlinkPeers(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + target := &linkNode{Name: "data/share"} + lg := LinkGraph{ + "data/share": target, + } + for i := range 100 { + n := fmt.Sprintf("peer/%d", i) + lg[n] = &linkNode{Name: n, Deps: target, Type: hardlink} + } + for i := range 100 { + n := fmt.Sprintf("sym/%d", i) + lg[n] = &linkNode{Name: n, Deps: target, OriginalLinkname: "data/share", Type: symlink} + } + _ = lg.BuildDirectoryAliasMap(logr.Discard()) + } +} diff --git a/internal/engine/untar_test.go b/internal/engine/untar_test.go index 4875596d..e40f766d 100644 --- a/internal/engine/untar_test.go +++ b/internal/engine/untar_test.go @@ -505,14 +505,6 @@ var _ = Describe("Untar Directory Traversal Protection", func() { }) }) -// linkType a convenience type just to make the consuming functions more clear. -type linkType = byte - -const ( - hardlink linkType = tar.TypeLink - symlink linkType = tar.TypeSymlink -) - // writeTarballWithLink writes a tar archive with a regular file and a hard // link. The ability to write a regular file allows for testing happy paths. // note: this should only be used as a helper function in tests. @@ -537,7 +529,7 @@ func writeTarballWithLink(out io.Writer, linkTypeFlag linkType, contents []byte, } linkHeader := &tar.Header{ - Typeflag: linkTypeFlag, + Typeflag: byte(linkTypeFlag), Name: linkname, Linkname: linkTarget, Mode: 0o644, From 7c606c33fa8ba7b94c5aa77013f6c2b6de5f0847 Mon Sep 17 00:00:00 2001 From: Caleb Xu Date: Tue, 28 Apr 2026 15:55:56 -0400 Subject: [PATCH 3/3] engine: plan and extract layers using link graph Wire tar layer scanning to LinkGraph: planExtraction, buildExtractionPlan, and runExtraction with ordered hardlinks and expanded symlink path matching. 
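
Extraction now runs as two passes over the layer stream: planExtraction builds
the LinkGraph and the set of archive paths to pull, then runExtraction replays
the stream, writing only planned entries and deferring hardlinks whose targets
have not yet been written so they can be created afterwards in dependency
order.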
Signed-off-by: Caleb Xu Made-with: Cursor --- internal/engine/untar.go | 570 ++++++++++++++++------ internal/engine/untar_test.go | 873 +++++++++++++++++++++++++++++----- 2 files changed, 1167 insertions(+), 276 deletions(-) diff --git a/internal/engine/untar.go b/internal/engine/untar.go index dd2090f0..2b71614b 100644 --- a/internal/engine/untar.go +++ b/internal/engine/untar.go @@ -20,58 +20,176 @@ import ( "github.com/redhat-openshift-ecosystem/openshift-preflight/internal/log" ) -// expandLiteralPatternsWithDescendantGlob adds a "path/**" pattern for each pattern that has no -// glob metacharacters, so a follow-up pass that only lists a symlink's target directory (e.g. -// usr/share/licenses) still matches nested files in the tarball. -// -// Patterns containing `[` are skipped: we treat * ? [ as glob syntax; literal `[` in a path is rare. -func expandLiteralPatternsWithDescendantGlob(patterns []string) []string { - if len(patterns) == 0 { - return patterns +// finishTarReadCloser drains and closes a tar layer reader, merging drain/close errors into err. +func finishTarReadCloser(rc io.ReadCloser, err *error) { + _, drainErr := io.Copy(io.Discard, rc) + closeErr := rc.Close() + + var errs []error + if *err != nil { + errs = append(errs, *err) } - out := slices.Clone(patterns) + if drainErr != nil { + errs = append(errs, fmt.Errorf("failed to drain io reader: %w", drainErr)) + } + if closeErr != nil { + //coverage:ignore + errs = append(errs, fmt.Errorf("failed to close io reader: %w", closeErr)) + } + switch len(errs) { + case 0: + case 1: + *err = errs[0] + default: + //coverage:ignore + *err = errors.Join(errs...) + } +} + +func validateRequiredFilePatterns(patterns []string) error { for _, p := range patterns { - if p == "" || strings.ContainsAny(p, "*?[") { - continue - } - child := p + "/**" - if !slices.Contains(out, child) { - out = append(out, child) + if !doublestar.ValidatePattern(p) { + return fmt.Errorf("invalid glob pattern: %q", p) } } - return out + return nil +} + +func pathMatchesAnyValidatedPattern(path string, patterns []string) bool { + return slices.ContainsFunc(patterns, func(p string) bool { + return doublestar.MatchUnvalidated(p, path) + }) } -// clearUnresolvedLinkTargetsForExtractedPath removes entries from unresolved when an extracted -// path satisfies them: either an exact key match, or the longest key that is a strict parent -// directory prefix of extractedPath (directory symlink targets such as usr/share/licenses). -func clearUnresolvedLinkTargetsForExtractedPath(unresolved map[string]struct{}, extractedPath string) { - var longestKey string - for k := range unresolved { - if extractedPath == k || strings.HasPrefix(extractedPath, k+"/") { - if len(k) > len(longestKey) { - longestKey = k +// buildExtractionPlan decides which archive paths must be extracted given validated glob patterns. +func buildExtractionPlan( + linkGraph LinkGraph, + allFiles map[string]struct{}, + requiredFilePatterns []string, + logger logr.Logger, +) []string { + // Build an alias map: canonical targets -> symlink paths that point at them. + linkAliases := linkGraph.BuildDirectoryAliasMap(logger) + + neededFiles := make([]string, 0) + extractCtx := &extractionContext{ + linkGraph: linkGraph, + neededFiles: &neededFiles, + logger: logger, + } + + // Some paths match only via symlink aliases (e.g. pattern under /usr/lib/sysimage/rpm + // while the inode lives under /usr/share/rpm). 
When no ancestor is a symlink target, + // ExpandFilePathAliases yields only the canonical path, so we skip the expensive expansion. + for filePath := range allFiles { + var aliasedPaths []string + if pathMayHaveSymlinkAliasPaths(filePath, linkAliases) { + aliasedPaths = linkGraph.ExpandFilePathAliases(filePath, linkAliases) + } else { + aliasedPaths = []string{filePath} + } + + for _, aliasPath := range aliasedPaths { + if !pathMatchesAnyValidatedPattern(aliasPath, requiredFilePatterns) { + continue } + neededFiles = append(neededFiles, filePath) + visited := make(map[string]struct{}) + extractCtx.addParentLinks(aliasPath, filePath, visited) + for _, sym := range linkAliases[filePath] { + if _, ok := visited[sym]; ok { + //coverage:ignore + continue + } + visited[sym] = struct{}{} + if node, ok := linkGraph[sym]; ok { + extractCtx.processLink(sym, node, "symlink to extracted file", filePath, visited) + } + } + break } } - if longestKey != "" { - delete(unresolved, longestKey) - } + + filesToExtract := make(map[string]struct{}) + // neededFiles lists matched archive paths plus link parents; addTransitiveDependencies + // closes over hardlink/symlink targets so the plan is self-consistent. + addTransitiveDependencies(neededFiles, linkGraph, allFiles, filesToExtract) + + return slices.Collect(maps.Keys(filesToExtract)) } -// appendSymlinkTargetPatterns appends the resolved symlink target and target/** to filterPatterns -// when missing, so layer paths under the real directory (e.g. usr/share/licenses/...) match. -func appendSymlinkTargetPatterns(filterPatterns []string, logger logr.Logger, resolvedTargetName string) []string { - nestedGlob := resolvedTargetName + "/**" - if !slices.Contains(filterPatterns, resolvedTargetName) { - logger.V(log.TRC).Info("adding symlink target path to filter patterns", "target", resolvedTargetName) - filterPatterns = append(filterPatterns, resolvedTargetName) +func planExtraction(ctx context.Context, img v1.Image, requiredFilePatterns []string) (result []string, links LinkGraph, err error) { + logger := logr.FromContextOrDiscard(ctx) + logger.V(log.TRC).Info("planning extraction from tar stream") + + if err := validateRequiredFilePatterns(requiredFilePatterns); err != nil { + return nil, nil, fmt.Errorf("invalid required file patterns: %w", err) } - if !slices.Contains(filterPatterns, nestedGlob) { - logger.V(log.TRC).Info("adding symlink target descendant glob to filter patterns", "targetGlob", nestedGlob) - filterPatterns = append(filterPatterns, nestedGlob) + + fs := mutate.Extract(img) + defer func() { + finishTarReadCloser(fs, &err) + }() + + tr := tar.NewReader(fs) + linkGraph := make(LinkGraph, 0) + + // Track all files that exist in the tar archive (as a set for quick lookup) + allFiles := make(map[string]struct{}) + + for { + header, err := tr.Next() + + switch { + case err == io.EOF: + result = buildExtractionPlan(linkGraph, allFiles, requiredFilePatterns, logger) + return result, linkGraph, nil + case err != nil: + //coverage:ignore + return nil, nil, err + case header == nil: + //coverage:ignore + continue + } + + switch header.Typeflag { + case tar.TypeDir: + //coverage:ignore + continue + case tar.TypeReg: + allFiles[header.Name] = struct{}{} + case tar.TypeSymlink: + allFiles[header.Name] = struct{}{} + resolvedTargetName := resolveRelativeLinkFrom(header.Name, header.Linkname) + dep, ok := linkGraph[resolvedTargetName] + if !ok { + dep = &linkNode{ + Name: resolvedTargetName, + Deps: nil, + } + } + linkGraph[header.Name] = &linkNode{ + Name: 
header.Name, + Deps: dep, + OriginalLinkname: header.Linkname, + Type: tar.TypeSymlink, + } + case tar.TypeLink: + allFiles[header.Name] = struct{}{} + dep, ok := linkGraph[header.Linkname] + if !ok { + dep = &linkNode{ + Name: header.Linkname, + Deps: nil, + } + } + linkGraph[header.Name] = &linkNode{ + Name: header.Name, + Deps: dep, + Type: tar.TypeLink, + } + } } - return filterPatterns } // untar takes a destination path, a container image, and a list of files or match patterns @@ -80,197 +198,349 @@ func untar(ctx context.Context, dst string, img v1.Image, requiredFilePatterns [ logger := logr.FromContextOrDiscard(ctx) logger.V(log.DBG).Info("exporting and flattening image") - // Extract all files matching the required file patterns. - state := make(map[string]struct{}) - var err error + files, linkGraph, err := planExtraction(ctx, img, requiredFilePatterns) + if err != nil { + return fmt.Errorf("failed to build extraction plan: %w", err) + } + logger.V(log.DBG).Info("built list of files for extraction", "files", files, "count", len(files)) logger.V(log.DBG).Info("extracting container filesystem", "path", dst) - remaining := slices.Clone(requiredFilePatterns) + if err = runExtraction(ctx, dst, img, files, linkGraph); err != nil { + return fmt.Errorf("failed to extract tarball: %w", err) + } - // In the case of symlinks, the targets may not be included in the original required file - // patterns, so make additional passes through the layers as needed to find them. - // Make at least one pass to validate the tar format, even if there are no required patterns. - for { - if remaining, err = untarOnce(ctx, dst, img, remaining, state); err != nil { - return fmt.Errorf("failed to extract tarball: %w", err) + return nil +} + +// sortHardlinksByDependencies sorts deferred hardlinks so that targets are created before +// hardlinks that depend on them. Uses the link graph to determine dependencies. 
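+//
+// Sketch (hypothetical names, mirroring the tests): if deferred "hlA" targets
+// deferred "hlB", which targets a regular file outside the deferred set, Kahn's
+// algorithm yields ["hlB", "hlA"], so hlB exists on disk before hlA is linked.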
+func sortHardlinksByDependencies(hardlinks []string, graph LinkGraph) ([]string, error) { + if len(hardlinks) == 0 { + return hardlinks, nil + } + + // Build a set of hardlink names for quick lookup + hardlinkSet := make(map[string]struct{}) + for _, hlName := range hardlinks { + hardlinkSet[hlName] = struct{}{} + } + + // Build adjacency list and in-degree map for hardlinks only + inDegree := make(map[string]int) + adjList := make(map[string][]string) + + for _, hlName := range hardlinks { + inDegree[hlName] = 0 + adjList[hlName] = []string{} + } + + // Build dependency graph + for _, hlName := range hardlinks { + if node, ok := graph[hlName]; ok && node.Deps != nil { + target := node.Deps.Name + // Only add edge if target is also a deferred hardlink + if _, isHardlink := hardlinkSet[target]; isHardlink { + adjList[target] = append(adjList[target], hlName) + inDegree[hlName]++ + } } - if len(remaining) == 0 { - break + } + + // Topological sort using Kahn's algorithm + queue := []string{} + for _, hlName := range hardlinks { + if inDegree[hlName] == 0 { + queue = append(queue, hlName) } } - return nil + var sorted []string + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + sorted = append(sorted, current) + + for _, neighbor := range adjList[current] { + inDegree[neighbor]-- + if inDegree[neighbor] == 0 { + queue = append(queue, neighbor) + } + } + } + + if len(sorted) != len(hardlinks) { + sortedSet := make(map[string]struct{}, len(sorted)) + for _, s := range sorted { + //coverage:ignore + sortedSet[s] = struct{}{} + } + var remaining []string + for _, hl := range hardlinks { + if _, ok := sortedSet[hl]; !ok { + remaining = append(remaining, hl) + } + } + slices.Sort(remaining) + return nil, fmt.Errorf("cyclic hardlink dependency; could not order deferred hardlinks (remaining: %v)", remaining) + } + + return sorted, nil } -// untarOnce takes a destination path, a container image, a list of files or match patterns -// which should be extracted out of the image, and a map in which to store extraction progress/state. -// The function returns a list of files that should be extracted in another invocation of -// untarOnce along with an error if one was encountered. -// A tar reader loops over the tarfile creating the file structure at -// 'dst' along the way, and writing any files. This function uses a pre-allocated buffer to -// reduce allocations and is not goroutine-safe. +// addTransitiveDependencies follows the graph forwards from the given files and adds +// files that the given files link to (potentially recursively). +func addTransitiveDependencies(neededFiles []string, graph LinkGraph, allFiles map[string]struct{}, filesToExtract map[string]struct{}) { + // neededFiles and graph may contain virtual links (i.e. paths which do not + // exist in the tar stream, but match the filter patterns and would exist if + // the tar stream were fully extracted). The filesToExtract map should + // only contain real paths present in the tar stream, so filter out virtual + // paths before adding to the map. 
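+	// For example, a directory symlink target such as "deep/real" is reachable
+	// through the graph but has no tar entry of its own (TypeDir headers are
+	// skipped during planning), so the allFiles check below filters it out.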
+ + visited := make(map[string]struct{}) + for _, f := range neededFiles { + if _, seen := visited[f]; seen { + continue + } + visited[f] = struct{}{} + + if _, exists := allFiles[f]; exists { + filesToExtract[f] = struct{}{} + } + + // Consider what files would need to be recursively extracted in order + // for the matching file from neededFiles to resolve properly (not be + // a broken link) + walkGraphChain(f, graph, func(target string, deps *linkNode) bool { + if _, seen := visited[target]; seen { + return false + } + visited[target] = struct{}{} + + if _, exists := allFiles[target]; exists { + filesToExtract[target] = struct{}{} + } + return true + }) + } +} + +// runExtraction extracts the specified files from the container image. +// This function makes a single pass through the tar stream and handles hardlinks by +// deferring their creation until after their targets are extracted. +// The linkGraph is used to process deferred hardlinks in dependency order. // Uses os.Root to restrict extraction to dst. -func untarOnce(ctx context.Context, dst string, img v1.Image, filterPatterns []string, state map[string]struct{}) (remaining []string, err error) { +func runExtraction(ctx context.Context, dst string, img v1.Image, files []string, linkGraph LinkGraph) (err error) { logger := logr.FromContextOrDiscard(ctx) - filterPatterns = expandLiteralPatternsWithDescendantGlob(filterPatterns) - logger.V(log.TRC).Info("extracting from tar stream with filter patterns", "patterns", filterPatterns) + logger.V(log.TRC).Info("extracting from tar stream") + + // Build a set of all files to extract from the plan + filesToExtract := make(map[string]struct{}) + for _, file := range files { + filesToExtract[file] = struct{}{} + } fs := mutate.Extract(img) defer func() { - // Drain any remaining data from the reader and capture any errors - _, drainErr := io.Copy(io.Discard, fs) - if drainErr != nil { - err = fmt.Errorf("failed to drain io reader: %w", drainErr) - } - fs.Close() + finishTarReadCloser(fs, &err) }() - filesProcessedInThisPass := make(map[string]struct{}) - unresolvedLinkTargets := make(map[string]struct{}) + extractedFiles := make(map[string]struct{}) + var deferredHardlinks []string tr := tar.NewReader(fs) dst = filepath.Clean(dst) dstRoot, openErr := os.OpenRoot(dst) if openErr != nil { //coverage:ignore - return slices.Collect(maps.Keys(unresolvedLinkTargets)), fmt.Errorf("untar error, unable to open extraction destination %s: %w", dst, openErr) + return fmt.Errorf("untar error, unable to open extraction destination %s: %w", dst, openErr) } defer dstRoot.Close() + mkdirDone := make(map[string]struct{}) + mkdirOnce := func(dir string) error { + if dir == "." || dir == "/" { + return nil + } + if _, ok := mkdirDone[dir]; ok { + return nil + } + if err := dstRoot.MkdirAll(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + mkdirDone[dir] = struct{}{} + return nil + } + // Buffer for io.CopyBuffer operations to reduce allocations buf := make([]byte, 32*1024) for { header, err := tr.Next() switch { - // if no more files are found return case err == io.EOF: - logger.V(log.TRC).Info("extracted files", "files", filesProcessedInThisPass) - logger.V(log.TRC).Info("remaining files", "files", unresolvedLinkTargets) - return slices.Collect(maps.Keys(unresolvedLinkTargets)), nil + // At this point, all regular files and symlinks have been created, along with + // some hardlinks. Now create deferred hardlinks in dependency order. 
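+			// Replay sketch: with deferred ["hlA", "hlB"] where hlA's graph target
+			// is hlB, the sort yields ["hlB", "hlA"]; each link is then created only
+			// if its target is already present in extractedFiles.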
+ orderedHardlinks, sortErr := sortHardlinksByDependencies(deferredHardlinks, linkGraph) + if sortErr != nil { + //coverage:ignore + return fmt.Errorf("deferred hardlink ordering: %w", sortErr) + } + + for _, hlName := range orderedHardlinks { + // Look up the hardlink target from the graph + node, ok := linkGraph[hlName] + if !ok || node.Deps == nil { + //coverage:ignore + logger.V(log.DBG).Info("skipping deferred hardlink, no graph entry", "link", hlName) + continue + } + linkTarget := node.Deps.Name + + if _, targetExists := extractedFiles[linkTarget]; !targetExists { + //coverage:ignore + logger.V(log.DBG).Info("skipping deferred hardlink, target not extracted", "link", hlName, "target", linkTarget) + continue + } + + dirname := filepath.Dir(hlName) + if err := mkdirOnce(dirname); err != nil { + //coverage:ignore + return err + } + + if err := dstRoot.Link(linkTarget, hlName); err != nil { + //coverage:ignore + logger.V(log.DBG).Info("error creating deferred hardlink, ignoring", "link", hlName, "linkedTo", linkTarget, "reason", err.Error()) + continue + } + + extractedFiles[hlName] = struct{}{} + } + + logger.V(log.TRC).Info("extracted files", "files", extractedFiles, "count", len(extractedFiles)) + return nil - // return any other error case err != nil: //coverage:ignore - logger.V(log.TRC).Info("extracted files", "files", filesProcessedInThisPass) - logger.V(log.TRC).Info("remaining files", "files", unresolvedLinkTargets) - return slices.Collect(maps.Keys(unresolvedLinkTargets)), err + logger.V(log.TRC).Info("error reading tar stream", "extractedCount", len(extractedFiles)) + return err - // if the header is nil, just skip it (not sure how this happens) case header == nil: //coverage:ignore continue } - if _, ok := state[header.Name]; ok { - continue - } - - matches := slices.ContainsFunc(filterPatterns, func(p string) bool { - result, _ := doublestar.Match(p, header.Name) - return result - }) - if !matches { + // Skip files not in the extraction plan + if _, shouldExtract := filesToExtract[header.Name]; !shouldExtract { continue } // check the file type switch header.Typeflag { - // skip all directories, we'll only create the needed directory - // structure for the files/symlinks that need to be created case tar.TypeDir: //coverage:ignore continue - // if it's a file create it case tar.TypeReg: dirname := filepath.Dir(header.Name) - if err := dstRoot.MkdirAll(dirname, 0o755); err != nil && !os.IsExist(err) { - return slices.Collect(maps.Keys(unresolvedLinkTargets)), err + if err := mkdirOnce(dirname); err != nil { + return err } // Mask non-permission bits, which are not supported by dstRoot.OpenFile fileMode := os.FileMode(header.Mode & 0o777) - f, err := dstRoot.OpenFile(header.Name, os.O_CREATE|os.O_WRONLY, fileMode) + f, err := dstRoot.OpenFile(header.Name, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, fileMode) if err != nil { //coverage:ignore - return slices.Collect(maps.Keys(unresolvedLinkTargets)), err + return err } - // copy over contents - if _, err := io.CopyBuffer(f, tr, buf); err != nil { + _, copyErr := io.CopyBuffer(f, tr, buf) + closeErr := f.Close() + if copyErr != nil { //coverage:ignore - f.Close() - return slices.Collect(maps.Keys(unresolvedLinkTargets)), err + return copyErr + } + if closeErr != nil { + //coverage:ignore + return closeErr } - filesProcessedInThisPass[header.Name] = struct{}{} - state[header.Name] = struct{}{} - - // Drop satisfied unresolved symlink targets (exact path or longest parent-directory key). 
- clearUnresolvedLinkTargetsForExtractedPath(unresolvedLinkTargets, header.Name) - - // manually close here after each file operation; defering would cause each file close - // to wait until all operations have completed. - f.Close() + extractedFiles[header.Name] = struct{}{} - // if it's a link create it - case tar.TypeSymlink, tar.TypeLink: - // Create the new link's directory if it doesn't exist. + case tar.TypeSymlink: + // Symlinks can be created even if target doesn't exist dirname := filepath.Dir(header.Name) - if err := dstRoot.MkdirAll(dirname, 0o755); err != nil && !os.IsExist(err) { + if err := mkdirOnce(dirname); err != nil { //coverage:ignore - return slices.Collect(maps.Keys(unresolvedLinkTargets)), err + return err } - linkFn := dstRoot.Link - if header.Typeflag == tar.TypeSymlink { - // for dstRoot, basepath enforcement is not done on - // oldname when symlinking, so we'll do it here instead. - linkFn = func(oldname, newname string) error { - // resolved the oldname relative to the new name - resolvedON, _ := resolveLinkPaths(oldname, newname) - // Identify extraction root traversal with post resolution - finalOldname := filepath.Join(dstRoot.Name(), resolvedON) - if finalOldname != dstRoot.Name() && !strings.HasPrefix(finalOldname, dstRoot.Name()+string(os.PathSeparator)) { - //coverage:ignore - return errors.New("link resolves to path outside of extraction root") - } - - // otherwise, link the two. newname validation is done by dstRoot. - return dstRoot.Symlink(finalOldname, newname) + // for dstRoot, basepath enforcement is not done on + // oldname when symlinking, so we'll do it here instead. + resolvedON, _ := resolveLinkPaths(header.Linkname, header.Name) + // Identify extraction root traversal with post resolution + finalOldname := filepath.Join(dstRoot.Name(), resolvedON) + if finalOldname != dstRoot.Name() && !strings.HasPrefix(finalOldname, dstRoot.Name()+string(os.PathSeparator)) { + logger.V(log.DBG).Info("symlink resolves outside extraction root, ignoring", "link", header.Name, "target", header.Linkname) + continue + } + + // A symlink may be reachable via one or more hardlinks. The relative target is resolved + // differently depending on which hardlink path is accessed. + var symlinkTarget string + if filepath.IsAbs(header.Linkname) { + // Absolute symlinks are converted to absolute within the extraction root + symlinkTarget = finalOldname + } else { + // Canonical relative path from the symlink's directory to the vetted target (POSIX + // resolves relative symlink targets from the link's parent directory, not dst root). 
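+			// e.g. (hypothetical) a link at "a/b/ln" vetted to <dst>/p/q gets
+			// relTarget "../../p/q", keeping the extracted tree valid even if dst
+			// is later moved.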
+ linkDirAbs := filepath.Join(dstRoot.Name(), filepath.Dir(header.Name)) + cleanFinal := filepath.Clean(finalOldname) + relTarget, relErr := filepath.Rel(linkDirAbs, cleanFinal) + if relErr != nil { + //coverage:ignore + logger.V(log.DBG).Info("symlink relative path failed, ignoring", "link", header.Name, "target", header.Linkname, "reason", relErr.Error()) + continue + } + if filepath.Clean(filepath.Join(linkDirAbs, relTarget)) != cleanFinal { + //coverage:ignore + logger.V(log.DBG).Info("symlink sanitized target mismatch, ignoring", "link", header.Name, "target", header.Linkname) + continue } + symlinkTarget = relTarget } - err := linkFn(header.Linkname, header.Name) - if err != nil { + if err := dstRoot.Symlink(symlinkTarget, header.Name); err != nil { //coverage:ignore - logger.V(log.DBG).Info("error creating link, ignoring", "link", header.Name, "linkedTo", header.Linkname, "type", header.Typeflag, "reason", err.Error()) + logger.V(log.DBG).Info("error creating symlink, ignoring", "link", header.Name, "linkedTo", header.Linkname, "reason", err.Error()) continue } - filesProcessedInThisPass[header.Name] = struct{}{} - state[header.Name] = struct{}{} + extractedFiles[header.Name] = struct{}{} - clearUnresolvedLinkTargetsForExtractedPath(unresolvedLinkTargets, header.Name) + case tar.TypeLink: + // Hardlinks require the target to exist + // Check if target has been extracted already + if _, targetExists := extractedFiles[header.Linkname]; targetExists { + dirname := filepath.Dir(header.Name) + if err := mkdirOnce(dirname); err != nil { + //coverage:ignore + return err + } - resolvedTargetName := filepath.Clean(filepath.Join(filepath.Dir(header.Name), header.Linkname)) + if err := dstRoot.Link(header.Linkname, header.Name); err != nil { + //coverage:ignore + logger.V(log.DBG).Info("error creating hardlink, ignoring", "link", header.Name, "linkedTo", header.Linkname, "reason", err.Error()) + continue + } - // If the target of the symlink has already been processed (in this pass or an - // earlier pass), then no further action is needed. - if _, ok := state[resolvedTargetName]; ok { - continue + extractedFiles[header.Name] = struct{}{} + } else { + // Defer this hardlink until after we've seen all files + deferredHardlinks = append(deferredHardlinks, header.Name) } - - // Add the resolved target and target/** so nested layer paths match (e.g. /licenses -> - // /usr/share/licenses with files stored as usr/share/licenses/...). - filterPatterns = appendSymlinkTargetPatterns(filterPatterns, logger, resolvedTargetName) - - // Also add the target of this symlink to the list of unresolved link targets, - // if not already present. It's possible that we've already passed the target - // on this pass, in which case this can only be resolved on another pass. 
- unresolvedLinkTargets[resolvedTargetName] = struct{}{} } } } diff --git a/internal/engine/untar_test.go b/internal/engine/untar_test.go index e40f766d..6cc1685d 100644 --- a/internal/engine/untar_test.go +++ b/internal/engine/untar_test.go @@ -4,10 +4,13 @@ import ( "archive/tar" "bytes" "context" + "errors" "io" + "maps" "os" "path/filepath" "slices" + "strings" v1 "github.com/google/go-containerregistry/pkg/v1" "github.com/google/go-containerregistry/pkg/v1/empty" @@ -39,71 +42,247 @@ var _ = Describe("Link Path Resolution", func() { ) }) -var _ = Describe("untar pattern helpers", func() { - DescribeTable("expandLiteralPatternsWithDescendantGlob", - func(in, want []string) { - Expect(expandLiteralPatternsWithDescendantGlob(in)).To(Equal(want)) - }, - Entry("nil", nil, nil), - Entry("empty", []string{}, []string{}), - Entry("literal gets descendant glob", []string{"usr/share/licenses"}, []string{"usr/share/licenses", "usr/share/licenses/**"}), - Entry("glob patterns unchanged except clone order", []string{"licenses/**"}, []string{"licenses/**"}), - Entry("mixed literals and globs", []string{"a/**", "b"}, []string{"a/**", "b", "b/**"}), - Entry("does not duplicate child glob", []string{"x", "x/**"}, []string{"x", "x/**"}), - Entry("preserves empty string", []string{"", "y"}, []string{"", "y", "y/**"}), - ) +var _ = Describe("planExtraction symlink targets", func() { + It("stores absolute symlink linknames as archive-relative paths", func() { + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + content := []byte("elf") + regHdr := tar.Header{Name: "usr/bin/bash", Mode: 0o644, Size: int64(len(content)), Typeflag: tar.TypeReg, Format: tar.FormatPAX} + Expect(tw.WriteHeader(®Hdr)).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + symHdr := tar.Header{Name: "bin/sh", Typeflag: tar.TypeSymlink, Linkname: "/usr/bin/bash", Mode: 0o777, Format: tar.FormatPAX} + Expect(tw.WriteHeader(&symHdr)).To(Succeed()) + Expect(tw.Close()).To(Succeed()) - DescribeTable("clearUnresolvedLinkTargetsForExtractedPath", - func(initial map[string]struct{}, extracted string, wantKeys []string) { - u := mapsCloneKeys(initial) - clearUnresolvedLinkTargetsForExtractedPath(u, extracted) - keys := mapsKeys(u) - slices.Sort(keys) - Expect(keys).To(Equal(wantKeys)) - }, - Entry("exact match removes key", - map[string]struct{}{"a/b": {}}, - "a/b", - []string{}, - ), - Entry("child path removes longest parent prefix", - map[string]struct{}{"usr/share/licenses": {}}, - "usr/share/licenses/pkg/COPYING", - []string{}, - ), - Entry("longest prefix wins when multiple match", - map[string]struct{}{"usr": {}, "usr/share/licenses": {}}, - "usr/share/licenses/foo", - []string{"usr"}, - ), - Entry("unrelated keys preserved", - map[string]struct{}{"other": {}, "usr/share/licenses": {}}, - "usr/share/licenses/x", - []string{"other"}, - ), - Entry("no-op when nothing matches", - map[string]struct{}{"only/here": {}}, - "elsewhere/file", - []string{"only/here"}, - ), - ) + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + _, lg, err := planExtraction(context.Background(), img, []string{"usr/bin/bash"}) + Expect(err).ToNot(HaveOccurred()) + + sh, ok := lg["bin/sh"] + Expect(ok).To(BeTrue()) + Expect(sh.Deps).ToNot(BeNil()) + Expect(sh.Deps.Name).To(Equal("usr/bin/bash")) + }) + + It("returns an error for invalid glob patterns", func() { + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + Expect(tw.Close()).To(Succeed()) + + img, err := createImageWithLayer(buf.Bytes()) + 
Expect(err).ToNot(HaveOccurred()) + + _, _, err = planExtraction(context.Background(), img, []string{"[["}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid required file patterns")) + }) }) -func mapsCloneKeys(m map[string]struct{}) map[string]struct{} { - out := make(map[string]struct{}, len(m)) - for k := range m { - out[k] = struct{}{} - } - return out -} +var _ = Describe("planExtraction buildExtractionPlan symlink alias dedup", func() { + It("plans when a file symlink path is already visited as a parent of the matched alias path", func() { + // Directory symlink entry -> deep/real; file symlink entry/nested -> deep/real/nested/f.txt. + // addParentLinks("entry/nested/f.txt", ...) marks "entry/nested" before linkAliases[canonical file] + // lists sym "entry/nested", so buildExtractionPlan skips re-processing that sym. + content := []byte("nested alias dedup content") + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + regPath := "deep/real/nested/f.txt" + Expect(tw.WriteHeader(&tar.Header{ + Name: regPath, Typeflag: tar.TypeReg, Size: int64(len(content)), + Mode: 0o644, Format: tar.FormatPAX, + })).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + Expect(tw.WriteHeader(&tar.Header{ + Name: "entry", Typeflag: tar.TypeSymlink, Linkname: "deep/real", + Mode: 0o777, Format: tar.FormatPAX, + })).To(Succeed()) + Expect(tw.WriteHeader(&tar.Header{ + Name: "entry/nested", Typeflag: tar.TypeSymlink, Linkname: "deep/real/nested/f.txt", + Mode: 0o777, Format: tar.FormatPAX, + })).To(Succeed()) + Expect(tw.Close()).To(Succeed()) -func mapsKeys(m map[string]struct{}) []string { - keys := make([]string, 0, len(m)) - for k := range m { - keys = append(keys, k) - } - return keys -} + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + planned, _, err := planExtraction(context.Background(), img, []string{"entry/nested/f.txt"}) + Expect(err).ToNot(HaveOccurred()) + Expect(planned).To(ContainElement(regPath)) + Expect(planned).To(ContainElement("entry")) + }) +}) + +var _ = Describe("planExtraction and runExtraction TypeDir skipping", func() { + var tmpDir string + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "engine-typedir-test-*") + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + os.RemoveAll(tmpDir) + }) + + It("plans a regular file when a directory header precedes it in the stream", func() { + content := []byte("under typedir") + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + Expect(tw.WriteHeader(&tar.Header{ + Name: "typedir/case/", Typeflag: tar.TypeDir, Mode: 0o755, Format: tar.FormatPAX, + })).To(Succeed()) + Expect(tw.WriteHeader(&tar.Header{ + Name: "typedir/case/file.txt", Typeflag: tar.TypeReg, Size: int64(len(content)), + Mode: 0o644, Format: tar.FormatPAX, + })).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + Expect(tw.Close()).To(Succeed()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + planned, _, err := planExtraction(context.Background(), img, []string{"typedir/case/file.txt"}) + Expect(err).ToNot(HaveOccurred()) + Expect(planned).To(ContainElement("typedir/case/file.txt")) + }) + + It("extracts a regular file when a directory header precedes it in the stream", func() { + content := []byte("extract under typedir") + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + Expect(tw.WriteHeader(&tar.Header{ + Name: "typedir/extract/", Typeflag: tar.TypeDir, Mode: 0o755, 
Format: tar.FormatPAX, + })).To(Succeed()) + Expect(tw.WriteHeader(&tar.Header{ + Name: "typedir/extract/out.txt", Typeflag: tar.TypeReg, Size: int64(len(content)), + Mode: 0o644, Format: tar.FormatPAX, + })).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + Expect(tw.Close()).To(Succeed()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + err = untar(context.Background(), tmpDir, img, []string{"typedir/extract/out.txt"}) + Expect(err).ToNot(HaveOccurred()) + got, err := os.ReadFile(filepath.Join(tmpDir, "typedir/extract/out.txt")) + Expect(err).ToNot(HaveOccurred()) + Expect(got).To(Equal(content)) + }) +}) + +var _ = Describe("runExtraction deferred hardlink edge cases", func() { + var tmpDir string + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "engine-deferred-hl-test-*") + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + os.RemoveAll(tmpDir) + }) + + It("continues when the deferred hardlink is missing from the link graph at replay", func() { + content := []byte("deferred hl missing graph entry") + var buf bytes.Buffer + chain := []linkChainEntry{ + {name: "by-hardlink/data.bin", linkType: hardlink, target: "actual/data.bin"}, + } + err := writeTarballWithLinkChain(&buf, content, "actual/data.bin", chain, true) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + files, lg, err := planExtraction(context.Background(), img, []string{"by-hardlink/data.bin"}) + Expect(err).ToNot(HaveOccurred()) + + lgPartial := maps.Clone(lg) + delete(lgPartial, "by-hardlink/data.bin") + + err = runExtraction(context.Background(), tmpDir, img, files, lgPartial) + Expect(err).ToNot(HaveOccurred()) + + got, err := os.ReadFile(filepath.Join(tmpDir, "actual/data.bin")) + Expect(err).ToNot(HaveOccurred()) + Expect(got).To(Equal(content)) + + _, err = os.Lstat(filepath.Join(tmpDir, "by-hardlink/data.bin")) + Expect(errors.Is(err, os.ErrNotExist)).To(BeTrue()) + }) + + It("does not create the deferred hardlink when its target file is not in the extraction plan", func() { + content := []byte("deferred hl target not extracted") + var buf bytes.Buffer + chain := []linkChainEntry{ + {name: "by-hardlink/data.bin", linkType: hardlink, target: "actual/data.bin"}, + } + err := writeTarballWithLinkChain(&buf, content, "actual/data.bin", chain, true) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + _, lg, err := planExtraction(context.Background(), img, []string{"by-hardlink/data.bin"}) + Expect(err).ToNot(HaveOccurred()) + + // Omit the hardlink target from the extraction list so replay sees target not extracted. 
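+		// (planExtraction would itself have pulled in "actual/data.bin" via
+		// addTransitiveDependencies; passing only the link simulates a stale plan.)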
+ err = runExtraction(context.Background(), tmpDir, img, []string{"by-hardlink/data.bin"}, lg) + Expect(err).ToNot(HaveOccurred()) + + _, err = os.Lstat(filepath.Join(tmpDir, "by-hardlink/data.bin")) + Expect(errors.Is(err, os.ErrNotExist)).To(BeTrue()) + _, err = os.Lstat(filepath.Join(tmpDir, "actual/data.bin")) + Expect(errors.Is(err, os.ErrNotExist)).To(BeTrue()) + }) +}) + +var _ = Describe("sortHardlinksByDependencies", func() { + It("returns an error when deferred hardlinks form a cycle", func() { + g := LinkGraph{ + "a": &linkNode{Name: "a", Deps: &linkNode{Name: "b"}, Type: hardlink}, + "b": &linkNode{Name: "b", Deps: &linkNode{Name: "a"}, Type: hardlink}, + } + _, err := sortHardlinksByDependencies([]string{"a", "b"}, g) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cyclic hardlink dependency")) + Expect(err.Error()).To(And(ContainSubstring("a"), ContainSubstring("b"))) + }) + + It("returns a topological order when there is no cycle", func() { + g := LinkGraph{ + "target": &linkNode{Name: "target"}, + "a": &linkNode{Name: "a", Deps: &linkNode{Name: "target"}, Type: hardlink}, + } + sorted, err := sortHardlinksByDependencies([]string{"a"}, g) + Expect(err).ToNot(HaveOccurred()) + Expect(sorted).To(Equal([]string{"a"})) + }) + + It("orders deferred hardlinks when one deferred link targets another deferred link", func() { + // hlA -> hlB (both deferred) -> base (regular file, not in deferred set). + // Kahn: hlB has in-degree 0, hlA depends on hlB, so hlB must precede hlA. + g := LinkGraph{ + "base": &linkNode{Name: "base"}, + "hlB": &linkNode{Name: "hlB", Deps: &linkNode{Name: "base"}, Type: hardlink}, + "hlA": &linkNode{Name: "hlA", Deps: &linkNode{Name: "hlB"}, Type: hardlink}, + } + sorted, err := sortHardlinksByDependencies([]string{"hlA", "hlB"}, g) + Expect(err).ToNot(HaveOccurred()) + idxB := slices.Index(sorted, "hlB") + idxA := slices.Index(sorted, "hlA") + Expect(idxB).To(BeNumerically("<", idxA), "hardlink target must be ordered before dependent deferred hardlink") + }) +}) var _ = Describe("Untar Directory Traversal Protection", func() { var tmpDir string @@ -156,6 +335,34 @@ var _ = Describe("Untar Directory Traversal Protection", func() { Expect(fileContent).To(Equal(content)) }) + It("skips symlinks whose target resolves outside the extraction root", func() { + content := []byte("safe payload") + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + regHdr := tar.Header{Name: "safe/ok.txt", Mode: 0o644, Size: int64(len(content)), Typeflag: tar.TypeReg, Format: tar.FormatPAX} + Expect(tw.WriteHeader(®Hdr)).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + // Lexical Join(safe, linkname) cleans to a path with enough ".." to escape dst when joined with tmpDir. 
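+		// Concretely: resolveLinkPaths cleans the linkname against "safe/", leaving a
+		// leading ".." run, so Join(dstRoot.Name(), resolved) lands outside tmpDir and
+		// the HasPrefix guard in runExtraction should reject it.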
+		escapeLinkname := strings.Repeat("../", 24) + "outside_escape_marker"
+		symHdr := tar.Header{Name: "safe/evil_symlink", Typeflag: tar.TypeSymlink, Linkname: escapeLinkname, Mode: 0o777, Format: tar.FormatPAX}
+		Expect(tw.WriteHeader(&symHdr)).To(Succeed())
+		Expect(tw.Close()).To(Succeed())
+
+		img, err := createImageWithLayer(buf.Bytes())
+		Expect(err).ToNot(HaveOccurred())
+		err = untar(context.Background(), tmpDir, img, []string{"safe/ok.txt", "safe/evil_symlink"})
+		Expect(err).ToNot(HaveOccurred())
+
+		okPath := filepath.Join(tmpDir, "safe", "ok.txt")
+		got, err := os.ReadFile(okPath)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got).To(Equal(content))
+
+		_, err = os.Lstat(filepath.Join(tmpDir, "safe", "evil_symlink"))
+		Expect(os.IsNotExist(err)).To(BeTrue())
+	})
+
 	It("should extract nested files when /licenses is a symlink to usr/share/licenses (tar paths under usr/share/licenses)", func() {
 		// Mirrors UBI/RHEL images: layer stores license files under usr/share/licenses/... while
 		// /licenses is a symlink to /usr/share/licenses. Nested paths must match after resolving
@@ -330,6 +537,42 @@ var _ = Describe("Untar Directory Traversal Protection", func() {
 		Expect(linkContent).To(Equal(content))
 	})
 
+	It("sanitizes redundant relative symlink targets to a canonical path", func() {
+		const (
+			dataPath          = "p/mnt/data.txt"
+			maliciousLinkPath = "a/b/c/malicious-link"
+			redundantLinkname = "../../../p/mnt/../mnt"
+		)
+		// From a/b/c, the redundant linkname cleans to p/mnt, so the canonical
+		// relative target written to disk should be "../../../p/mnt".
+		content := []byte("data under p/mnt")
+		var buf bytes.Buffer
+		tw := tar.NewWriter(&buf)
+		regHdr := tar.Header{Name: dataPath, Mode: 0o644, Size: int64(len(content)), Typeflag: tar.TypeReg, Format: tar.FormatPAX}
+		Expect(tw.WriteHeader(&regHdr)).To(Succeed())
+		_, err := tw.Write(content)
+		Expect(err).ToNot(HaveOccurred())
+		symHdr := tar.Header{Name: maliciousLinkPath, Typeflag: tar.TypeSymlink, Linkname: redundantLinkname, Mode: 0o777, Format: tar.FormatPAX}
+		Expect(tw.WriteHeader(&symHdr)).To(Succeed())
+		Expect(tw.Close()).To(Succeed())
+
+		img, err := createImageWithLayer(buf.Bytes())
+		Expect(err).ToNot(HaveOccurred())
+		err = untar(context.Background(), tmpDir, img, []string{dataPath, maliciousLinkPath})
+		Expect(err).ToNot(HaveOccurred())
+
+		linkAbs := filepath.Join(tmpDir, maliciousLinkPath)
+		wantTarget, err := filepath.Rel(filepath.Join(tmpDir, filepath.Dir(maliciousLinkPath)), filepath.Join(tmpDir, "p", "mnt"))
+		Expect(err).ToNot(HaveOccurred())
+		Expect(redundantLinkname).ToNot(Equal(wantTarget), "fixture should use a non-canonical tar linkname")
+
+		gotTarget, err := os.Readlink(linkAbs)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(gotTarget).To(Equal(wantTarget), "symlink target should be canonical relative form, not raw tar Linkname")
+
+		throughLink, err := os.ReadFile(filepath.Join(linkAbs, "data.txt"))
+		Expect(err).ToNot(HaveOccurred())
+		Expect(throughLink).To(Equal(content))
+	})
+
 	It("should allow creation of multi-layered symlinks within the destination directory", func() {
 		// Test both normal and reverse order; symlinks may appear before or after
 		// the target file in a tar stream.
@@ -376,7 +619,7 @@ var _ = Describe("Untar Directory Traversal Protection", func() { Expect(originalContent).To(Equal(content)) // Check that the first symlink exists and points to original - // Note: untar converts relative symlinks to absolute paths for security + // Note: untar keeps relative symlinks as relative (for OSTree compatibility) link1File := filepath.Join(tmpDir, link1Name) link1Info, err := os.Lstat(link1File) Expect(err).ToNot(HaveOccurred()) @@ -384,7 +627,8 @@ var _ = Describe("Untar Directory Traversal Protection", func() { link1Target, err := os.Readlink(link1File) Expect(err).ToNot(HaveOccurred()) - Expect(link1Target).To(Equal(originalFile), "link1.txt should point to original.txt") + // The symlink target is relative (e.g., "original.txt") + Expect(link1Target).To(Equal(filepath.Base(originalName)), "link1.txt should have relative target to original.txt") // Check that the second symlink exists and points to link1 link2File := filepath.Join(tmpDir, link2Name) @@ -394,7 +638,8 @@ var _ = Describe("Untar Directory Traversal Protection", func() { link2Target, err := os.Readlink(link2File) Expect(err).ToNot(HaveOccurred()) - Expect(link2Target).To(Equal(link1File), "link2.txt should point to link1.txt") + // The symlink target is relative (e.g., "link1.txt") + Expect(link2Target).To(Equal(filepath.Base(link1Name)), "link2.txt should have relative target to link1.txt") // Check that reading through the chain of symlinks gives the correct content link1Content, err := os.ReadFile(link1File) @@ -407,7 +652,7 @@ var _ = Describe("Untar Directory Traversal Protection", func() { } }) - It("should progressively resolve multi-layered symlinks across multiple untarOnce passes", func() { + It("should resolve multi-layered symlinks in a single extraction", func() { content := []byte("test content") var buf bytes.Buffer @@ -415,93 +660,330 @@ var _ = Describe("Untar Directory Traversal Protection", func() { link1Name := "progressive-test/link1.txt" link2Name := "progressive-test/link2.txt" - // Create a tarball in the order of file first, then links. Thus, to resolve - // each layer of links, another pass is needed. 
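+		// A single pass now suffices: extraction is planned up front, so every link
+		// layer resolves without re-reading the archive.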
+ // Create a tarball with multi-layered symlinks err := writeTarballWithMultiLayerLinks(&buf, content, originalName, link1Name, link2Name, false) Expect(err).ToNot(HaveOccurred()) img, err := createImageWithLayer(buf.Bytes()) Expect(err).ToNot(HaveOccurred()) - state := make(map[string]struct{}) - - // First pass: Start with only link2.txt in the filter patterns - filterPatterns := []string{link2Name} - remaining, err := untarOnce(context.Background(), tmpDir, img, filterPatterns, state) + // Extract using the untar function with only link2 as the required file + // This should automatically resolve and extract link1 and original as well + err = untar(context.Background(), tmpDir, img, []string{link2Name}) Expect(err).ToNot(HaveOccurred()) - // After first pass: only link2 should be created (it appears in the tar) - // link2 points to link1, which hasn't been extracted yet, so link1 is in remaining - _, link1Exists := state[link1Name] - _, link2Exists := state[link2Name] - _, originalExists := state[originalName] - Expect(link2Exists).To(BeTrue(), "link2.txt should be in state after pass 1") - Expect(link1Exists).To(BeFalse(), "link1.txt should not be in state after pass 1") - Expect(originalExists).To(BeFalse(), "original.txt should not be in state after pass 1") - Expect(len(state)).To(Equal(1), "state should only contain one extracted file") - + // Verify all files were extracted link2File := filepath.Join(tmpDir, link2Name) + link1File := filepath.Join(tmpDir, link1Name) + originalFile := filepath.Join(tmpDir, originalName) + _, err = os.Lstat(link2File) - Expect(err).ToNot(HaveOccurred(), "link2.txt should exist on disk after pass 1") + Expect(err).ToNot(HaveOccurred(), "link2.txt should exist") - link1File := filepath.Join(tmpDir, link1Name) _, err = os.Lstat(link1File) - Expect(os.IsNotExist(err)).To(BeTrue(), "link1.txt should not exist on disk after pass 1") + Expect(err).ToNot(HaveOccurred(), "link1.txt should exist") - // remaining should include link1 (the unresolved target of link2) - Expect(remaining).To(ContainElement(link1Name), "link1.txt should be in remaining after pass 1") - Expect(len(remaining)).To(Equal(1), "only link1.txt should be in remaining after pass 1") + _, err = os.Stat(originalFile) + Expect(err).ToNot(HaveOccurred(), "original.txt should exist") - // Second pass: Use the remaining list from first pass - filterPatterns = remaining - remaining, err = untarOnce(context.Background(), tmpDir, img, filterPatterns, state) + // Verify the complete symlink chain works + link2Content, err := os.ReadFile(link2File) Expect(err).ToNot(HaveOccurred()) + Expect(link2Content).To(Equal(content), "reading through link2 should return original content") - // After second pass: link1 should be created (it appears earlier in tar) - // link1 points to original.txt, which hasn't been extracted yet, so original is in remaining - _, link1Exists = state[link1Name] - _, link2Exists = state[link2Name] - _, originalExists = state[originalName] - Expect(link2Exists).To(BeTrue(), "link2.txt should still be in state after pass 2") - Expect(link1Exists).To(BeTrue(), "link1.txt should be in state after pass 2") - Expect(originalExists).To(BeFalse(), "original.txt should not be in state after pass 2") - Expect(len(state)).To(Equal(2), "state should only contain two extracted files") + link1Content, err := os.ReadFile(link1File) + Expect(err).ToNot(HaveOccurred()) + Expect(link1Content).To(Equal(content), "reading through link1 should return original content") + }) - _, err = 
os.Lstat(link1File) - Expect(err).ToNot(HaveOccurred(), "link1.txt should exist on disk after pass 2") + It("should extract files when parent directory is a symlink and child matches filter", func() { + // This tests the scenario where: + // - /usr/lib/sysimage/rpm is a symlink to ../../share/rpm + // - The actual file is at /usr/share/rpm/rpmdb.sqlite + // - The filter pattern matches /usr/lib/sysimage/rpm/rpmdb.sqlite (the symlinked path) + // - The file should be extracted along with the directory symlink - originalFile := filepath.Join(tmpDir, originalName) - _, err = os.Stat(originalFile) - Expect(os.IsNotExist(err)).To(BeTrue(), "original.txt should not exist on disk after pass 2") + content := []byte("database content") + var buf bytes.Buffer - // remaining should include original.txt (the unresolved target of link1) - Expect(remaining).To(ContainElement(originalName), "original.txt should be in remaining after pass 2") - Expect(len(remaining)).To(Equal(1), "only original.txt should be in remaining after pass 2") + chain := []linkChainEntry{ + {name: "usr/lib/sysimage/rpm", linkType: symlink, target: "../../share/rpm"}, + } + err := writeTarballWithLinkChain(&buf, content, "usr/share/rpm/rpmdb.sqlite", chain, false) + Expect(err).ToNot(HaveOccurred()) - // Third pass: Use the remaining list from second pass - filterPatterns = remaining - remaining, err = untarOnce(context.Background(), tmpDir, img, filterPatterns, state) + img, err := createImageWithLayer(buf.Bytes()) Expect(err).ToNot(HaveOccurred()) - // After third pass: all files should be created and resolved - _, link1Exists = state[link1Name] - _, link2Exists = state[link2Name] - _, originalExists = state[originalName] - Expect(link2Exists).To(BeTrue(), "link2.txt should still be in state after pass 3") - Expect(link1Exists).To(BeTrue(), "link1.txt should still be in state after pass 3") - Expect(originalExists).To(BeTrue(), "original.txt should be in state after pass 3") - Expect(len(state)).To(Equal(3), "state should only contain three extracted files") + // Extract using pattern that matches through the symlink + err = untar(context.Background(), tmpDir, img, []string{testOstreeExtractPattern}) + Expect(err).ToNot(HaveOccurred()) - _, err = os.Stat(originalFile) - Expect(err).ToNot(HaveOccurred(), "original.txt should exist on disk after pass 3") + // Verify the symlink was created + verifySymlinkExists(tmpDir, testOstreeConsumerHardlink, testOstreeConsumerHardlink) - // remaining should be empty - all targets resolved - Expect(len(remaining)).To(Equal(0), "remaining should be empty after pass 3") + // Verify extraction and readability through the symlink + verifyLinkChainExtraction(tmpDir, testOstreeRPMDBPath, testOstreeExtractPattern, content) + }) - // Verify the complete symlink chain works - link2Content, err := os.ReadFile(link2File) + It("should extract files through chained directory symlinks", func() { + // This tests the scenario where: + // - usr/lib/sysimage/rpm -> ../../../foo/bar/rpm (symlink to symlink) + // - foo/bar/rpm -> ../../usr/share/rpm (symlink to directory) + // - The actual file is at usr/share/rpm/rpmdb.sqlite + // - The filter pattern matches usr/lib/sysimage/rpm/rpmdb.sqlite + // - Both symlinks and the file should be extracted + + content := []byte("chained symlink content") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "foo/bar/rpm", linkType: symlink, target: "../../usr/share/rpm"}, + {name: "usr/lib/sysimage/rpm", linkType: symlink, target: "../../../foo/bar/rpm"}, + } + 
err := writeTarballWithLinkChain(&buf, content, "usr/share/rpm/rpmdb.sqlite", chain, false) Expect(err).ToNot(HaveOccurred()) - Expect(link2Content).To(Equal(content), "reading through link2 should return original content") + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + // Extract using pattern that matches through the chained symlinks + err = untar(context.Background(), tmpDir, img, []string{testOstreeExtractPattern}) + Expect(err).ToNot(HaveOccurred()) + + // Verify both symlinks were created + verifySymlinkExists(tmpDir, "foo/bar/rpm", "foo/bar/rpm") + verifySymlinkExists(tmpDir, testOstreeConsumerHardlink, testOstreeConsumerHardlink) + + // Verify the actual file was extracted and we can read through both symlink paths + verifyLinkChainExtraction(tmpDir, testOstreeRPMDBPath, testOstreeRPMDBPath, content) + verifyLinkChainExtraction(tmpDir, testOstreeRPMDBPath, "foo/bar/rpm/rpmdb.sqlite", content) + verifyLinkChainExtraction(tmpDir, testOstreeRPMDBPath, testOstreeExtractPattern, content) + }) + + It("should extract files through hardlinks to directory symlinks (OSTree pattern)", func() { + // This tests the OSTree container image pattern where: + // - sysroot/ostree/repo/objects/HASH.file is a symlink to ../../share/rpm + // - usr/lib/sysimage/rpm is a hardlink to sysroot/ostree/repo/objects/HASH.file + // - usr/share/rpm/rpmdb.sqlite is a regular file (or hardlink to ostree object) + // - The pattern matches usr/lib/sysimage/rpm/rpmdb.sqlite + // - Both the file and the hardlink-to-symlink chain should be extracted + + content := []byte("rpm database content") + var buf bytes.Buffer + + err := writeOstreeRPMTestFixture(&buf, content) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + // Extract using pattern that matches through the hardlink-to-symlink + err = untar(context.Background(), tmpDir, img, []string{testOstreeExtractPattern}) + Expect(err).ToNot(HaveOccurred()) + + // Verify both links were extracted as symlinks (hardlink to symlink becomes symlink) + verifySymlinkExists(tmpDir, testOstreeObjectSymlinkPath, "ostree object") + verifySymlinkExists(tmpDir, testOstreeConsumerHardlink, "hardlink to symlink") + + // Verify extraction and readability through the chain + verifyLinkChainExtraction(tmpDir, testOstreeRPMDBPath, filepath.Join(testOstreeConsumerHardlink, "rpmdb.sqlite"), content) + }) + + It("should extract entry/nested when the regular file appears before the directory symlink in the tar stream", func() { + content := []byte("layer-payload") + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + regPath := "deep/real/nested/f.txt" + Expect(tw.WriteHeader(&tar.Header{Name: regPath, Typeflag: tar.TypeReg, Size: int64(len(content)), Mode: 0o644, Format: tar.FormatPAX})).To(Succeed()) + _, err := tw.Write(content) + Expect(err).ToNot(HaveOccurred()) + Expect(tw.WriteHeader(&tar.Header{Name: "entry", Typeflag: tar.TypeSymlink, Linkname: "deep/real", Mode: 0o777, Format: tar.FormatPAX})).To(Succeed()) + Expect(tw.Close()).To(Succeed()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + err = untar(context.Background(), tmpDir, img, []string{"entry/nested/f.txt"}) + Expect(err).ToNot(HaveOccurred()) + + got, err := os.ReadFile(filepath.Join(tmpDir, regPath)) + Expect(err).ToNot(HaveOccurred()) + Expect(got).To(Equal(content)) + + got, err = os.ReadFile(filepath.Join(tmpDir, "entry", "nested", "f.txt")) + 
Expect(err).ToNot(HaveOccurred()) + Expect(got).To(Equal(content)) + }) + + It("should not fail extraction when a requested symlink is dangling", func() { + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + Expect(tw.WriteHeader(&tar.Header{Name: "keep.txt", Typeflag: tar.TypeReg, Size: 1, Mode: 0o644, Format: tar.FormatPAX})).To(Succeed()) + _, err := tw.Write([]byte("y")) + Expect(err).ToNot(HaveOccurred()) + Expect(tw.WriteHeader(&tar.Header{Name: "dangle", Typeflag: tar.TypeSymlink, Linkname: "ghost/missing.txt", Mode: 0o777, Format: tar.FormatPAX})).To(Succeed()) + Expect(tw.Close()).To(Succeed()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + err = untar(context.Background(), tmpDir, img, []string{"keep.txt", "dangle"}) + Expect(err).ToNot(HaveOccurred()) + + _, err = os.ReadFile(filepath.Join(tmpDir, "keep.txt")) + Expect(err).ToNot(HaveOccurred()) + + _, err = os.Lstat(filepath.Join(tmpDir, "dangle")) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should resolve symlink filter targets from a good hardlink peer when the bad peer appears first in the tar", func() { + // Same relative Linkname on two hard-linked symlink paths; resolving from the first path's + // dirname yields a bogus prefix (bad/target/...) that is not in the archive. Pattern + // discovery must use the peer that names real members under target/. Include each symlink + // path in required patterns so both headers are extracted (target/data.txt alone does not + // mark optional symlinks for extraction). + content := []byte("hardlink symlink peer order content") + var buf bytes.Buffer + err := writeTarSymlinkHardlinkPeersBadSymlinkFirst(&buf, content) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + err = untar(context.Background(), tmpDir, img, []string{"target/data.txt", "good/sym", "bad/deep/sym"}) + Expect(err).ToNot(HaveOccurred()) + + actualFile := filepath.Join(tmpDir, "target", "data.txt") + fileContent, err := os.ReadFile(actualFile) + Expect(err).ToNot(HaveOccurred()) + Expect(fileContent).To(Equal(content)) + + goodSym := filepath.Join(tmpDir, "good", "sym") + _, err = os.Lstat(goodSym) + Expect(err).ToNot(HaveOccurred()) + raw, err := os.Readlink(goodSym) + Expect(err).ToNot(HaveOccurred()) + throughGood, err := os.ReadFile(filepath.Join(filepath.Dir(goodSym), filepath.FromSlash(raw))) + Expect(err).ToNot(HaveOccurred()) + Expect(throughGood).To(Equal(content), "read through good/sym following raw linkname") + + badJoin := filepath.Join(tmpDir, "bad", "target", "data.txt") + _, statErr := os.Lstat(badJoin) + Expect(os.IsNotExist(statErr)).To(BeTrue(), "bogus path from wrong relative resolution must not be created") + }) + + It("should extract a symlink whose target is a hardlink peer when the pattern matches only the other peer", func() { + // Pattern matches deep/path/file only; short/file is a hardlink peer; consumer is a symlink + // to short/file. The consumer symlink must still be materialized to read the file via consumer. 
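+		// Hardlink peers are aliases of the same inode, so matching deep/path/file
+		// implicitly covers short/file and, through it, the consumer symlink.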
+ content := []byte("peer alias symlink content") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "short/file", linkType: hardlink, target: "deep/path/file"}, + {name: "consumer", linkType: symlink, target: "short/file"}, + } + err := writeTarballWithLinkChain(&buf, content, "deep/path/file", chain, false) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + err = untar(context.Background(), tmpDir, img, []string{"deep/path/file"}) + Expect(err).ToNot(HaveOccurred()) + + verifyLinkChainExtraction(tmpDir, "deep/path/file", "consumer", content) + }) + + It("should extract files through hardlink -> hardlink -> symlink -> directory chain", func() { + // Test: hardlink1 -> hardlink2 -> symlink -> directory + // Pattern matches through hardlink1 + content := []byte("test content") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "storage/symlink-to-dir", linkType: symlink, target: "../actual/dir"}, + {name: "intermediate/link2", linkType: hardlink, target: "storage/symlink-to-dir"}, + {name: "alias/link1", linkType: hardlink, target: "intermediate/link2"}, + } + err := writeTarballWithLinkChain(&buf, content, "actual/dir/file.txt", chain, false) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + // Extract using pattern that matches through the hardlink chain + err = untar(context.Background(), tmpDir, img, []string{"alias/link1/file.txt"}) + Expect(err).ToNot(HaveOccurred()) + + verifyLinkChainExtraction(tmpDir, "actual/dir/file.txt", "alias/link1/file.txt", content) + }) + + It("should extract files through symlink -> hardlink -> symlink chain", func() { + // Test: symlink1 -> hardlink -> symlink2 -> directory + // Pattern matches through symlink1 + content := []byte("test content") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "intermediate/sym2", linkType: symlink, target: "../real/location"}, + {name: "middle/hlink", linkType: hardlink, target: "intermediate/sym2"}, + {name: "alias/sym1", linkType: symlink, target: "../middle/hlink"}, + } + err := writeTarballWithLinkChain(&buf, content, "real/location/file.txt", chain, false) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + // Extract using pattern that matches through the chain + err = untar(context.Background(), tmpDir, img, []string{"alias/sym1/file.txt"}) + Expect(err).ToNot(HaveOccurred()) + + verifyLinkChainExtraction(tmpDir, "real/location/file.txt", "alias/sym1/file.txt", content) + }) + + It("should extract files through alternating hardlink/symlink chain (hardlink -> symlink -> hardlink -> symlink -> directory)", func() { + // Test complex alternating chain where hardlinks point to symlinks + content := []byte("alternating chain content") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "level3/sym3", linkType: symlink, target: "../target"}, + {name: "level2/hlink2", linkType: hardlink, target: "level3/sym3"}, + {name: "level1/sym1", linkType: symlink, target: "../level2/hlink2"}, + {name: "alias/hlink1", linkType: hardlink, target: "level1/sym1"}, + } + err := writeTarballWithLinkChain(&buf, content, "target/data.db", chain, false) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + // Extract using pattern that matches through the alternating chain + err = 
untar(context.Background(), tmpDir, img, []string{"alias/hlink1/data.db"}) + Expect(err).ToNot(HaveOccurred()) + + verifyLinkChainExtraction(tmpDir, "target/data.db", "alias/hlink1/data.db", content) + }) + + It("should extract when a hardlink header appears before its target file in the tar stream", func() { + // runExtraction defers hardlinks until EOF when the target is not extracted yet; replay must still create the link. + content := []byte("deferred hardlink stream order") + var buf bytes.Buffer + + chain := []linkChainEntry{ + {name: "by-hardlink/data.bin", linkType: hardlink, target: "actual/data.bin"}, + } + err := writeTarballWithLinkChain(&buf, content, "actual/data.bin", chain, true) + Expect(err).ToNot(HaveOccurred()) + + img, err := createImageWithLayer(buf.Bytes()) + Expect(err).ToNot(HaveOccurred()) + + err = untar(context.Background(), tmpDir, img, []string{"by-hardlink/data.bin"}) + Expect(err).ToNot(HaveOccurred()) + + verifyLinkChainExtraction(tmpDir, "actual/data.bin", "by-hardlink/data.bin", content) }) }) @@ -617,3 +1099,142 @@ func writeTarballWithMultiLayerLinks(out io.Writer, contents []byte, filename st return nil } + +// linkChainEntry represents a single link in a chain of hardlinks/symlinks +type linkChainEntry struct { + name string + linkType linkType + target string +} + +// Shared OSTree-style layout: object store symlink + hardlink consumer + rpmdb under usr/share/rpm. +const ( + testOstreeObjectSymlinkPath = "sysroot/ostree/repo/objects/53/hash.file" + testOstreeRPMDBPath = "usr/share/rpm/rpmdb.sqlite" + testOstreeConsumerHardlink = "usr/lib/sysimage/rpm" + // Slash-separated path as required by filter patterns (not filepath.Join). + testOstreeExtractPattern = testOstreeConsumerHardlink + "/rpmdb.sqlite" +) + +func ostreeRPMTestFixtureChain() []linkChainEntry { + return []linkChainEntry{ + {name: testOstreeObjectSymlinkPath, linkType: symlink, target: "../../share/rpm"}, + {name: testOstreeConsumerHardlink, linkType: hardlink, target: testOstreeObjectSymlinkPath}, + } +} + +func writeOstreeRPMTestFixture(out io.Writer, contents []byte) error { + return writeTarballWithLinkChain(out, contents, testOstreeRPMDBPath, ostreeRPMTestFixtureChain(), false) +} + +// writeTarSymlinkHardlinkPeersBadSymlinkFirst writes: a symlink at bad/deep/sym with the same +// relative Linkname as good/sym (so resolving from bad/deep alone yields a nonexistent path), +// then the regular file, then good/sym, then TypeLink bad/deep/sym -> good/sym. +func writeTarSymlinkHardlinkPeersBadSymlinkFirst(out io.Writer, content []byte) error { + tw := tar.NewWriter(out) + defer tw.Close() + + raw := "../target/data.txt" + headers := []tar.Header{ + {Typeflag: tar.TypeSymlink, Name: "bad/deep/sym", Linkname: raw, Mode: 0o777, Format: tar.FormatPAX}, + {Typeflag: tar.TypeReg, Name: "target/data.txt", Size: int64(len(content)), Mode: 0o644, Format: tar.FormatPAX}, + {Typeflag: tar.TypeSymlink, Name: "good/sym", Linkname: raw, Mode: 0o777, Format: tar.FormatPAX}, + {Typeflag: tar.TypeLink, Name: "bad/deep/sym", Linkname: "good/sym", Mode: 0o777, Format: tar.FormatPAX}, + } + for i := range headers { + if err := tw.WriteHeader(&headers[i]); err != nil { + return err + } + if headers[i].Typeflag == tar.TypeReg { + if _, err := tw.Write(content); err != nil { + return err + } + } + } + return nil +} + +// writeTarballWithLinkChain writes a tar archive with a regular file and a chain of hardlinks/symlinks. 
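+// Only the regular file carries payload bytes; link headers are written with
+// Typeflag taken from each entry's linkType.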
+// The chain is specified as a slice of linkChainEntry, where each entry points to the next (or the final file). +// Example: for chain [link1->link2, link2->file], pass entries for link1 and link2. +// If linksBeforeFile is true, link headers are written before the regular file so hardlinks can +// appear before their target in the stream (deferred hardlink replay in runExtraction). +func writeTarballWithLinkChain(out io.Writer, contents []byte, filename string, chain []linkChainEntry, linksBeforeFile bool) error { + tw := tar.NewWriter(out) + defer tw.Close() + + fileHeader := &tar.Header{ + Typeflag: tar.TypeReg, + Name: filename, + Size: int64(len(contents)), + Mode: 0o644, + Format: tar.FormatPAX, + } + + writeLinks := func() error { + for _, link := range chain { + linkHeader := &tar.Header{ + Typeflag: byte(link.linkType), + Name: link.name, + Linkname: link.target, + Mode: 0o777, + Format: tar.FormatPAX, + } + if err := tw.WriteHeader(linkHeader); err != nil { + return err + } + } + return nil + } + + if linksBeforeFile { + if err := writeLinks(); err != nil { + return err + } + if err := tw.WriteHeader(fileHeader); err != nil { + return err + } + if _, err := tw.Write(contents); err != nil { + return err + } + return nil + } + + if err := tw.WriteHeader(fileHeader); err != nil { + return err + } + if _, err := tw.Write(contents); err != nil { + return err + } + return writeLinks() +} + +// verifyLinkChainExtraction verifies that files were extracted correctly through a link chain. +// It checks that the actual file exists with correct content, and that the file can be read +// through the specified chain path. +func verifyLinkChainExtraction(tmpDir string, actualFilePath, chainPath string, expectedContent []byte) { + // Verify the actual file was extracted + fullActualPath := filepath.Join(tmpDir, actualFilePath) + fileContent, err := os.ReadFile(fullActualPath) + Expect(err).ToNot(HaveOccurred()) + Expect(fileContent).To(Equal(expectedContent)) + + // Verify we can read through the chain + fullChainPath := filepath.Join(tmpDir, chainPath) + chainFileContent, err := os.ReadFile(fullChainPath) + Expect(err).ToNot(HaveOccurred()) + Expect(chainFileContent).To(Equal(expectedContent), "should be able to read file through link chain") +} + +// verifySymlinkExists verifies that a symlink exists at the given path. +// Returns the symlink target for further verification if needed. +func verifySymlinkExists(tmpDir, symlinkPath, description string) string { + fullPath := filepath.Join(tmpDir, symlinkPath) + info, err := os.Lstat(fullPath) + Expect(err).ToNot(HaveOccurred()) + Expect(info.Mode()&os.ModeSymlink).To(Equal(os.ModeSymlink), description+" should be a symlink") + + target, err := os.Readlink(fullPath) + Expect(err).ToNot(HaveOccurred()) + return target +}
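+
+// Callers that need to follow the returned target can resolve it against the link's
+// parent directory, e.g. filepath.Join(filepath.Dir(link), filepath.FromSlash(target)),
+// as the hardlink-peer spec above does for good/sym.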