Merge branch 'ajwalker/add-cachekey-package' into 'main'

saracen · GitLab · commit 91029783f745 · 2026-03-11T19:05:05.000Z
Refactor: extract cache key sanitization into a dedicated package. See merge request https://gitlab.com/gitlab-org/gitlab-runner/-/merge_requests/6509 Merged-by: Arran Walker <ajwalker@gitlab.com> Approved-by: Ashvin Sharma <ashsharma@gitlab.com> Approved-by: Axel von Bertoldi <avonbertoldi@gitlab.com> Reviewed-by: Ashvin Sharma <ashsharma@gitlab.com>
diff --git a/cache/cachekey/cachekey.go b/cache/cachekey/cachekey.go
@@ -0,0 +1,57 @@
+package cachekey
+
+import (
+	"fmt"
+	"path"
+	"strings"
+	"unicode"
+)
+
+// normaliser decodes URL-encoded slashes and dots, and converts backslashes to
+// forward slashes in a single pass.
+var normaliser = strings.NewReplacer(
+	"%2f", "/",
+	"%2F", "/",
+	"%2e", ".",
+	"%2E", ".",
+	`\`, "/",
+)
+
+// Sanitize validates and normalises a cache key.
+// Cache keys may contain path separators. The function:
+//   - decodes URL-encoded '/' (%2f) and '.' (%2e) characters
+//   - replaces all '\' with '/'
+//   - resolves path traversals (., ..) within a virtual root
+//   - strips trailing whitespace from the rightmost path segments,
+//     removing any that become empty after trimming
+//   - repeats the last two steps until the key is stable, so that a segment
+//     such as ".. " cannot turn into a traversal once its whitespace is gone
+//
+// An empty input yields an empty key and no error. An error is returned when
+// a non-empty input sanitises to nothing.
+func Sanitize(cacheKey string) (string, error) {
+	if cacheKey == "" {
+		return "", nil
+	}
+
+	// After path.Clean there are no empty segments, so trimming trailing
+	// whitespace and dropping emptied segments is the same as trimming every
+	// trailing rune that is whitespace or '/'.
+	trailing := func(r rune) bool { return r == '/' || unicode.IsSpace(r) }
+
+	key := normaliser.Replace(cacheKey)
+	for {
+		// Root the key so ".." cannot escape, then cut the added "/" off again.
+		next := strings.TrimRightFunc(path.Clean("/"+key)[1:], trailing)
+		if next == key {
+			break // fixed point; every changing pass shortens the key
+		}
+		key = next
+	}
+
+	if key == "" {
+		return "", fmt.Errorf("cache key %q could not be sanitized", cacheKey)
+	}
+
+	return key, nil
+}
diff --git a/cache/cachekey/cachekey_test.go b/cache/cachekey/cachekey_test.go
@@ -0,0 +1,167 @@
+//go:build !integration
+
+package cachekey
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestSanitize runs Sanitize over a table of raw keys and expected results.
+func TestSanitize(t *testing.T) {
+	cases := []struct {
+		input   string
+		want    string
+		mustErr bool
+	}{
+		// ── Empty / identity ────────────────────────────────────────
+		{input: ""},
+		{input: "fallback_key", want: "fallback_key"},
+		{input: "some-job/some-ref", want: "some-job/some-ref"},
+		{input: ".../....", want: ".../...."},
+		{input: "...", want: "..."},
+
+		// ── Trailing whitespace / slashes / backslashes ─────────────
+		{input: "fallback_key/", want: "fallback_key"},
+		{input: "fallback_key ", want: "fallback_key"},
+		{input: "fallback_key\\", want: "fallback_key"},
+		{input: "fallback_key/ \\", want: "fallback_key"},
+		{input: "fallback_key/ / \\  \\", want: "fallback_key"},
+		{input: "fallback_key/o", want: "fallback_key/o"},
+		{input: "fallback_key / \\o", want: "fallback_key / /o"},
+		{input: "\t foo bar \t\r", want: "\t foo bar"},
+		{input: " foo / bar ", want: " foo / bar"},
+		{input: "foo\r", want: "foo"},
+		{input: "foo\t", want: "foo"},
+		{input: "foo \t \r ", want: "foo"},
+
+		// ── Completely unsanitisable ────────────────────────────────
+		{input: "\\", mustErr: true},
+		{input: "\\.", mustErr: true},
+		{input: "/", mustErr: true},
+		{input: " ", mustErr: true},
+		{input: ".", mustErr: true},
+		{input: "..", mustErr: true},
+		{input: " / ", mustErr: true},
+		{input: "//", mustErr: true},
+		{input: `//\`, mustErr: true},
+		{input: "../.", mustErr: true},
+		{input: "foo\\bar\\..\\..", mustErr: true},
+		{input: "foo/bar/../..", mustErr: true},
+		{input: " \t\r\n", mustErr: true},
+
+		// ── URL-encoded slashes (%2f / %2F) ────────────────────────
+		{input: "something %2F something", want: "something / something"},
+		{input: "something %2f something", want: "something / something"},
+		{input: "some%2f../job/some/ref/.", want: "job/some/ref"},
+
+		// ── URL-encoded dots (%2e / %2E) ───────────────────────────
+		{input: "%2E", mustErr: true},
+		{input: "%2E%2E", mustErr: true},
+		{input: "%2E%2E%2E", want: "..."},
+		{input: "%2e", mustErr: true},
+		{input: "%2e%2E", mustErr: true},
+		{input: ".%2E", mustErr: true},
+		{input: "%2e.", mustErr: true},
+		{input: "%2E%2e%2E", want: "..."},
+
+		// %5C is left as-is (literal percent-encoded backslash is fine).
+		{input: "%5C", want: "%5C"},
+		{input: "%5c", want: "%5c"},
+
+		// ── Forward-slash path traversal ────────────────────────────
+		{input: "foo/./bar", want: "foo/bar"},
+		{input: "foo/blipp/../bar", want: "foo/bar"},
+		{input: "/foo/bar", want: "foo/bar"},
+		{input: "//foo/bar", want: "foo/bar"},
+		{input: "./foo/bar", want: "foo/bar"},
+		{input: "../foo/bar", want: "foo/bar"},
+		{input: ".../foo/bar", want: ".../foo/bar"},
+		{input: "foo/bar/..", want: "foo"},
+		{input: "foo/bar/../../../.././blerp", want: "blerp"},
+		{input: "a/b/c/../../d", want: "a/d"},
+
+		// ── Backslash path traversal ────────────────────────────────
+		{input: `job\name/git\ref`, want: "job/name/git/ref"},
+		{input: "foo\\.\\bar", want: "foo/bar"},
+		{input: "foo\\blipp\\..\\bar", want: "foo/bar"},
+		{input: "\\foo\\bar", want: "foo/bar"},
+		{input: "\\\\foo\\bar", want: "foo/bar"},
+		{input: ".\\foo\\bar", want: "foo/bar"},
+		{input: "..\\foo\\bar", want: "foo/bar"},
+		{input: "...\\foo\\bar", want: ".../foo/bar"},
+		{input: "foo\\bar\\..", want: "foo"},
+		{input: "foo\\bar\\..\\..\\..\\..\\.\\blerp", want: "blerp"},
+
+		// ── Space-only segments & misc ──────────────────────────────
+		{input: "foo/ /bar", want: "foo/ /bar"},
+		{input: "foo/ /", want: "foo"},
+		{input: "foo/ / /", want: "foo"},
+	}
+	for idx, tc := range cases {
+		t.Run(fmt.Sprintf("%d:%q", idx, tc.input), func(t *testing.T) {
+			got, err := Sanitize(tc.input)
+			switch {
+			case tc.mustErr:
+				assert.Error(t, err)
+			default:
+				assert.NoError(t, err)
+			}
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}
+
+// TestSanitizeInvariants asserts structural properties of every non-empty
+// key that Sanitize produces for a fixed set of raw inputs.
+func TestSanitizeInvariants(t *testing.T) {
+	rawKeys := []string{
+		"a", "a/b", "../a", "a/../b", "a/./b",
+		"a\\b", `a\..\\b`, "/a/b/", " a ", "...",
+		"%2e%2e/%2f", "a/b/c/../../d/e",
+	}
+	for _, rawKey := range rawKeys {
+		t.Run(rawKey, func(t *testing.T) {
+			sanitized, _ := Sanitize(rawKey)
+			if sanitized == "" {
+				return // unsanitisable, nothing to check
+			}
+			escapesRoot := sanitized == ".." || strings.HasPrefix(sanitized, "../")
+			assert.False(t, strings.HasPrefix(sanitized, "/"), "must not start with /")
+			assert.False(t, escapesRoot, "must not start with .. segment")
+			assert.False(t, strings.Contains(sanitized, `\`), "must not contain backslash")
+			assert.False(t, strings.HasSuffix(sanitized, " "), "must not end with space")
+			assert.False(t, strings.HasSuffix(sanitized, "/"), "must not end with /")
+			// Neither "." nor ".." may survive as a path segment.
+			for _, segment := range strings.Split(sanitized, "/") {
+				assert.NotEqual(t, ".", segment, "must not contain '.' segment")
+				assert.NotEqual(t, "..", segment, "must not contain '..' segment")
+			}
+		})
+	}
+}
+
+// TestSanitizeIdempotent verifies that sanitising an already-clean key
+// returns it unchanged with no error, on the first pass and on a second one.
+func TestSanitizeIdempotent(t *testing.T) {
+	inputs := []string{
+		"fallback_key",
+		"some-job/some-ref",
+		"a/b/c",
+		"...",
+		".../foo/bar",
+	}
+	for _, raw := range inputs {
+		t.Run(raw, func(t *testing.T) {
+			first, err1 := Sanitize(raw)
+			assert.NoError(t, err1)
+			assert.Equal(t, raw, first, "clean key should pass through unchanged")
+			second, err2 := Sanitize(first)
+			assert.NoError(t, err2)
+			assert.Equal(t, first, second, "sanitise should be idempotent")
+		})
+	}
+}
diff --git a/shells/abstract.go b/shells/abstract.go
@@ -14,9 +14,9 @@ import (
 	"strconv"
 	"strings"
 	"time"
-	"unicode"
 
 	"gitlab.com/gitlab-org/gitlab-runner/cache"
+	"gitlab.com/gitlab-org/gitlab-runner/cache/cachekey"
 	"gitlab.com/gitlab-org/gitlab-runner/common"
 	"gitlab.com/gitlab-org/gitlab-runner/common/spec"
 	"gitlab.com/gitlab-org/gitlab-runner/helpers"
@@ -118,24 +118,21 @@ func newCacheConfig(build *common.Build, userKey string, keyChecks ...func(strin
 		rawKey = build.GetAllVariables().ExpandValue(userKey)
 	}
 
-	// hashers per mode: nop in unhashed mode, sha256sum in hashed mode
-	hashers := map[bool]func(string) string{
-		false: func(s string) string { return s },
-		true:  func(s string) string { return fmt.Sprintf("%x", sha256.Sum256([]byte(s))) },
+	hasher := func(s string) string { return s }
+	sanitizer := cachekey.Sanitize
+	// if hash key support is enabled, we don't need to sanitize keys anymore
+	if build.IsFeatureFlagOn(featureflags.HashCacheKeys) {
+		hasher = func(s string) string { return fmt.Sprintf("%x", sha256.Sum256([]byte(s))) }
+		sanitizer = func(s string) (string, error) { return s, nil }
 	}
-	// sanitizers per mode: real sanitizer in unhashed mode, nop sanitizer in hashed mode
-	sanitizers := map[bool]func(string) (string, error){
-		false: sanitizeCacheKey,
-		true:  func(s string) (string, error) { return s, nil },
-	}
-
-	hashCacheKeys := build.IsFeatureFlagOn(featureflags.HashCacheKeys)
-	hasher, sanitizer := hashers[hashCacheKeys], sanitizers[hashCacheKeys]
 
 	var warning string
 	humanKey, err := sanitizer(rawKey)
-	if err != nil {
+	switch {
+	case err != nil:
 		warning = err.Error()
+	case humanKey != rawKey:
+		warning = fmt.Sprintf("cache key %q sanitized to %q", rawKey, humanKey)
 	}
 
 	for _, check := range keyChecks {
@@ -304,68 +301,6 @@ func (b *AbstractShell) extractCacheOrFallbackCachesWrapper(
 	})
 }
 
-// sanitizeCacheKey replicates some cache key rules from GitLab and adds additional validations for known-bad cache
-// keys.
-// It accepts that the cache keys can be paths and:
-//   - replaces the URL encoded version of `/` & `.` with their ASCII ones
-//   - replaces all `\` with `/`
-//   - ensures there are no path traversals possible "outside of the base path"
-//   - ensures the last path element is not empty or ends in a space
-func sanitizeCacheKey(cacheKey string) (sanitizedKey string, err error) {
-	if cacheKey == "" {
-		return "", nil
-	}
-
-	replaceEncodedSlashes := func(s string) string {
-		for _, slash := range []string{"%2f", "%2F"} {
-			s = strings.ReplaceAll(s, slash, "/")
-		}
-		return s
-	}
-	replaceEncodedDots := func(s string) string {
-		for _, dot := range []string{"%2e", "%2E"} {
-			s = strings.ReplaceAll(s, dot, ".")
-		}
-		return s
-	}
-	toSlash := func(s string) string {
-		return strings.ReplaceAll(s, `\`, `/`)
-	}
-	trimSpaceRight := func(s string) string {
-		return strings.TrimRightFunc(s, unicode.IsSpace)
-	}
-
-	// We root the path, so that path traversals outside of the base path, e.g. resulting in a key like `../../foo/bar`,
-	// aren't possible
-	// Note: path.Join calls path.Clean internally
-	cleaned := path.Join(
-		"/", toSlash(replaceEncodedSlashes(replaceEncodedDots(cacheKey))),
-	)
-
-	var key string
-	for cleaned != "" {
-		dir, file := path.Split(cleaned)
-		file = trimSpaceRight(file)
-
-		if file == "" {
-			cleaned = dir[:len(dir)-1] // cut off the trailing `/` from dir and continue with that
-			continue
-		}
-
-		key = path.Join(dir[1:], file) // cut off the leading `/` from dir, because we rooted the path initially
-		break
-	}
-
-	if key == "" {
-		return "", fmt.Errorf("cache key %q could not be sanitized", cacheKey)
-	}
-	if key != cacheKey {
-		return key, fmt.Errorf("cache key %q sanitized to %q", cacheKey, key)
-	}
-
-	return key, nil
-}
-
 func (b *AbstractShell) addExtractCacheCommand(
 	ctx context.Context,
 	w ShellWriter,
diff --git a/shells/abstract_test.go b/shells/abstract_test.go