Skip to content

Commit 0899036

Browse files
Frank Guo and claude committed
Add secret redaction and path anonymization to checkpoint pipeline
Introduces the scrub package that detects and redacts 20+ secret patterns (API keys, tokens, connection strings, private keys, emails, IPs) and anonymizes OS usernames in file paths before DB insertion. Applied automatically after ParseTranscript in the checkpoint flow. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e04c462 commit 0899036

6 files changed

Lines changed: 707 additions & 0 deletions

File tree

cmd/rekal/cli/checkpoint.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
"github.com/oklog/ulid/v2"
1616
"github.com/rekal-dev/rekal-cli/cmd/rekal/cli/db"
17+
"github.com/rekal-dev/rekal-cli/cmd/rekal/cli/scrub"
1718
"github.com/rekal-dev/rekal-cli/cmd/rekal/cli/session"
1819
"github.com/spf13/cobra"
1920
)
@@ -138,6 +139,9 @@ func doCheckpoint(gitRoot string, w io.Writer) error {
138139
continue
139140
}
140141

142+
// Redact secrets and anonymize paths before any DB insertion.
143+
scrub.Scrub(payload)
144+
141145
if len(payload.Turns) == 0 && len(payload.ToolCalls) == 0 {
142146
continue
143147
}

cmd/rekal/cli/scrub/paths.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package scrub
2+
3+
import (
4+
"crypto/sha256"
5+
"fmt"
6+
"os/user"
7+
"regexp"
8+
"strings"
9+
)
10+
11+
// pathAnonymizer bundles everything needed to swap one OS username for its
// hashed placeholder: the raw name, the placeholder, and the compiled regular
// expressions (built by newAnonymizer) that locate the name inside paths.
type pathAnonymizer struct {
	// username is the OS account name to hide.
	username string
	// hashedUser is the replacement token, "user_<8hex>".
	hashedUser string

	// Slash-separated home directories: /Users/<name> and /home/<name>.
	macPattern, linuxPattern *regexp.Regexp
	// Hyphen-encoded home directories: -Users-<name>- and -home-<name>-.
	hyphenMac, hyphenLinux *regexp.Regexp
	// bareUser matches the name delimited by '/', '-' or the text edges.
	bareUser *regexp.Regexp
}
21+
22+
var defaultAnonymizer *pathAnonymizer
23+
24+
func init() {
25+
u, err := user.Current()
26+
if err != nil {
27+
return
28+
}
29+
defaultAnonymizer = newAnonymizer(u.Username)
30+
}
31+
32+
func newAnonymizer(username string) *pathAnonymizer {
33+
if username == "" {
34+
return nil
35+
}
36+
h := sha256.Sum256([]byte(username))
37+
hashed := fmt.Sprintf("user_%x", h[:4])
38+
39+
escaped := regexp.QuoteMeta(username)
40+
return &pathAnonymizer{
41+
username: username,
42+
hashedUser: hashed,
43+
macPattern: regexp.MustCompile(`/Users/` + escaped + `(?:/|$)`),
44+
linuxPattern: regexp.MustCompile(`/home/` + escaped + `(?:/|$)`),
45+
hyphenMac: regexp.MustCompile(`-Users-` + escaped + `-`),
46+
hyphenLinux: regexp.MustCompile(`-home-` + escaped + `-`),
47+
bareUser: regexp.MustCompile(`(?:^|[/\-])` + escaped + `(?:$|[/\-])`),
48+
}
49+
}
50+
51+
// projectRelativeDirs are directories under the home folder where we strip
52+
// the full path and keep only the project-relative portion.
53+
var projectRelativeDirs = []string{
	// Common per-user folders.
	"Documents", "Downloads", "Desktop",
	// Typical source-tree roots.
	"Projects", "projects", "src", "repos", "code", "dev", "workspace",
}
54+
55+
// AnonymizePath replaces the username in a single file path with the hashed form.
56+
func AnonymizePath(path string) string {
57+
if defaultAnonymizer == nil {
58+
return path
59+
}
60+
return defaultAnonymizer.anonymizePath(path)
61+
}
62+
63+
func (a *pathAnonymizer) anonymizePath(path string) string {
64+
// /Users/<username>/... → /Users/<hashed>/...
65+
path = a.macPattern.ReplaceAllStringFunc(path, func(m string) string {
66+
return strings.Replace(m, a.username, a.hashedUser, 1)
67+
})
68+
// /home/<username>/... → /home/<hashed>/...
69+
path = a.linuxPattern.ReplaceAllStringFunc(path, func(m string) string {
70+
return strings.Replace(m, a.username, a.hashedUser, 1)
71+
})
72+
// Hyphen-encoded: -Users-<username>- → -Users-<hashed>-
73+
path = a.hyphenMac.ReplaceAllStringFunc(path, func(m string) string {
74+
return strings.Replace(m, a.username, a.hashedUser, 1)
75+
})
76+
path = a.hyphenLinux.ReplaceAllStringFunc(path, func(m string) string {
77+
return strings.Replace(m, a.username, a.hashedUser, 1)
78+
})
79+
return path
80+
}
81+
82+
// AnonymizeText replaces usernames in paths throughout a block of text.
83+
func AnonymizeText(text string) string {
84+
if defaultAnonymizer == nil {
85+
return text
86+
}
87+
return defaultAnonymizer.anonymizeText(text)
88+
}
89+
90+
func (a *pathAnonymizer) anonymizeText(text string) string {
91+
// Replace full paths first (most specific).
92+
text = a.macPattern.ReplaceAllStringFunc(text, func(m string) string {
93+
return strings.Replace(m, a.username, a.hashedUser, 1)
94+
})
95+
text = a.linuxPattern.ReplaceAllStringFunc(text, func(m string) string {
96+
return strings.Replace(m, a.username, a.hashedUser, 1)
97+
})
98+
// Hyphen-encoded paths.
99+
text = a.hyphenMac.ReplaceAllStringFunc(text, func(m string) string {
100+
return strings.Replace(m, a.username, a.hashedUser, 1)
101+
})
102+
text = a.hyphenLinux.ReplaceAllStringFunc(text, func(m string) string {
103+
return strings.Replace(m, a.username, a.hashedUser, 1)
104+
})
105+
return text
106+
}
107+
108+
// NewAnonymizerForUser creates an anonymizer for a specific username (for testing).
109+
func NewAnonymizerForUser(username string) *pathAnonymizer {
110+
return newAnonymizer(username)
111+
}

cmd/rekal/cli/scrub/paths_test.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package scrub
2+
3+
import (
4+
"crypto/sha256"
5+
"fmt"
6+
"strings"
7+
"testing"
8+
)
9+
10+
// hashedUser computes the placeholder expected for username: "user_" followed
// by the first four bytes of its SHA-256 digest in lowercase hex.
func hashedUser(username string) string {
	digest := sha256.Sum256([]byte(username))
	prefix := digest[:4]
	return fmt.Sprintf("user_%x", prefix)
}
14+
15+
func TestAnonymizeMacPath(t *testing.T) {
16+
t.Parallel()
17+
a := newAnonymizer("frank")
18+
input := "/Users/frank/projects/rekal/main.go"
19+
got := a.anonymizePath(input)
20+
want := "/Users/" + hashedUser("frank") + "/projects/rekal/main.go"
21+
if got != want {
22+
t.Errorf("got %q, want %q", got, want)
23+
}
24+
}
25+
26+
func TestAnonymizeLinuxPath(t *testing.T) {
27+
t.Parallel()
28+
a := newAnonymizer("alice")
29+
input := "/home/alice/src/project/file.py"
30+
got := a.anonymizePath(input)
31+
want := "/home/" + hashedUser("alice") + "/src/project/file.py"
32+
if got != want {
33+
t.Errorf("got %q, want %q", got, want)
34+
}
35+
}
36+
37+
func TestAnonymizeHyphenEncodedMac(t *testing.T) {
38+
t.Parallel()
39+
a := newAnonymizer("frank")
40+
input := "-Users-frank-projects-rekal"
41+
got := a.anonymizeText(input)
42+
want := "-Users-" + hashedUser("frank") + "-projects-rekal"
43+
if got != want {
44+
t.Errorf("got %q, want %q", got, want)
45+
}
46+
}
47+
48+
func TestAnonymizeHyphenEncodedLinux(t *testing.T) {
49+
t.Parallel()
50+
a := newAnonymizer("bob")
51+
input := "-home-bob-code-app"
52+
got := a.anonymizeText(input)
53+
want := "-home-" + hashedUser("bob") + "-code-app"
54+
if got != want {
55+
t.Errorf("got %q, want %q", got, want)
56+
}
57+
}
58+
59+
func TestAnonymizeTextMultipleOccurrences(t *testing.T) {
60+
t.Parallel()
61+
a := newAnonymizer("frank")
62+
input := "Reading /Users/frank/a.go and /Users/frank/b.go"
63+
got := a.anonymizeText(input)
64+
if strings.Contains(got, "/Users/frank/") {
65+
t.Errorf("username not fully anonymized: %s", got)
66+
}
67+
hashed := hashedUser("frank")
68+
if count := strings.Count(got, hashed); count != 2 {
69+
t.Errorf("expected 2 occurrences of hashed user, got %d: %s", count, got)
70+
}
71+
}
72+
73+
func TestAnonymizeNilAnonymizer(t *testing.T) {
74+
t.Parallel()
75+
a := newAnonymizer("")
76+
if a != nil {
77+
t.Error("expected nil anonymizer for empty username")
78+
}
79+
}
80+
81+
func TestAnonymizeNoMatchLeaveUnchanged(t *testing.T) {
82+
t.Parallel()
83+
a := newAnonymizer("frank")
84+
input := "no paths here, just text"
85+
got := a.anonymizeText(input)
86+
if got != input {
87+
t.Errorf("text should be unchanged: got %q", got)
88+
}
89+
}
90+
91+
func TestHashedUserDeterministic(t *testing.T) {
92+
t.Parallel()
93+
h1 := hashedUser("frank")
94+
h2 := hashedUser("frank")
95+
if h1 != h2 {
96+
t.Errorf("hashed user should be deterministic: %s != %s", h1, h2)
97+
}
98+
}
99+
100+
func TestHashedUserDifferentForDifferentUsers(t *testing.T) {
101+
t.Parallel()
102+
h1 := hashedUser("frank")
103+
h2 := hashedUser("alice")
104+
if h1 == h2 {
105+
t.Errorf("different users should have different hashes: %s == %s", h1, h2)
106+
}
107+
}

cmd/rekal/cli/scrub/scrub.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package scrub
2+
3+
import (
4+
"github.com/rekal-dev/rekal-cli/cmd/rekal/cli/session"
5+
)
6+
7+
// Scrub applies secret redaction and path anonymization to a SessionPayload
8+
// in place. Call this after session.ParseTranscript and before DB insertion.
9+
func Scrub(payload *session.SessionPayload) {
10+
if payload == nil {
11+
return
12+
}
13+
14+
for i := range payload.Turns {
15+
payload.Turns[i].Content = RedactText(payload.Turns[i].Content)
16+
payload.Turns[i].Content = AnonymizeText(payload.Turns[i].Content)
17+
}
18+
19+
for i := range payload.ToolCalls {
20+
payload.ToolCalls[i].Path = AnonymizePath(payload.ToolCalls[i].Path)
21+
payload.ToolCalls[i].CmdPrefix = RedactText(payload.ToolCalls[i].CmdPrefix)
22+
payload.ToolCalls[i].CmdPrefix = AnonymizeText(payload.ToolCalls[i].CmdPrefix)
23+
}
24+
}

0 commit comments

Comments
 (0)