Skip to content

Commit f908bb4

Browse files
Frank Guo and claude
committed
fix: supplement files_touched from session tool_calls
files_touched was only populated from git diff HEAD~1, missing files that spanned multiple commits or weren't committed yet. Now also collects Write/Edit/NotebookEdit paths from parsed sessions at capture time (change_type "T"), and backfills existing data via the indexer. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8adaff1 commit f908bb4

13 files changed

Lines changed: 277 additions & 121 deletions

File tree

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Your agent starts every session knowing *why* the code looks the way it does.
2525
- **Team-shared memory** — `rekal push` and `rekal sync` share session context across your entire team through git. Every developer's agent benefits from every other developer's prior sessions.
2626
- **Git-native** — No external infrastructure. Rekal data lives on standard orphan branches, syncs through your existing remote, and uses git's object store for point-in-time recovery. Every checkpoint is anchored to a commit SHA.
2727
- **DuckDB-powered** — Full-text search (BM25), LSA vector embeddings, and file co-occurrence graphs built on DuckDB. The index is local-only and rebuilt on demand from the shared data.
28-
- **Agent-first** — `rekal <query>` is the primary interface. Output is structured JSON designed for machine consumption. The agent calls `rekal` directly and gets precise memory back — no human-readable wrappers needed.
28+
- **Agent-first** — Progressive context loading. `rekal <query>` returns scored snippets and metadata — just enough for the agent to decide what matters. `rekal query --session <id>` drills into a specific session for full turns. The agent controls how much context it loads.
2929
- **Signal, not bulk** — A 2-10 MB session file becomes a ~300 byte payload. The wire format is a custom binary codec with zstd compression, string interning via varint references, and append-only framing.
3030

3131
## How It Works
@@ -79,8 +79,9 @@ When a newer release is available, the CLI prints an update notice after each co
7979
| `rekal sync [--self]` | Sync team context from remote rekal branches |
8080
| `rekal index` | Rebuild the index DB from the data DB |
8181
| `rekal log [--limit N]` | Show recent checkpoints |
82-
| `rekal query "<sql>" [--index]` | Run raw SQL against the data or index DB |
8382
| `rekal [filters...] [query]` | Recall — hybrid search (BM25 + LSA) over sessions |
83+
| `rekal query --session <id> [--full]` | Drill into a session (turns, tool calls, files) |
84+
| `rekal query "<sql>" [--index]` | Run raw SQL against the data or index DB |
8485

8586
### Recall Filters (root command)
8687

@@ -105,6 +106,8 @@ rekal log # Show recent checkpoints
105106
rekal "JWT expiry" # Recall sessions mentioning JWT
106107
rekal --file src/auth/ "token refresh" # Recall with file filter
107108
rekal --actor agent "migration" # Show only agent-initiated sessions
109+
rekal query --session 01JNQX... # Full turns for a specific session
110+
rekal query --session 01JNQX... --full # Include tool calls and files
108111
rekal query "SELECT * FROM sessions LIMIT 5"
109112
rekal clean # Remove Rekal from this repo
110113
```

cmd/rekal/cli/checkpoint.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@ func newCheckpointCmd() *cobra.Command {
2121
return &cobra.Command{
2222
Use: "checkpoint",
2323
Short: "Capture the current session after a commit",
24+
Long: `Snapshot the active AI session into the local data DB.
25+
26+
Reads session transcript files (conversation turns, tool calls, file changes)
27+
from the agent's session directory, deduplicates by content hash, and inserts
28+
into .rekal/data.db. Each checkpoint is linked to the current HEAD commit and
29+
records which files were changed.
30+
31+
Normally runs automatically via the post-commit hook installed by 'rekal init'.
32+
Run manually to capture a session without committing.`,
2433
RunE: func(cmd *cobra.Command, _ []string) error {
2534
cmd.SilenceUsage = true
2635

@@ -83,6 +92,8 @@ func doCheckpoint(gitRoot string, w io.Writer) error {
8392

8493
var sessionIDs []string
8594
var inserted int
95+
// Collect unique relative file paths from file-modifying tool_calls across all sessions.
96+
toolCallPaths := make(map[string]struct{})
8697

8798
for _, f := range files {
8899
// Incremental: check checkpoint_state to skip unchanged files.
@@ -159,6 +170,24 @@ func doCheckpoint(gitRoot string, w io.Writer) error {
159170
}
160171
}
161172

173+
// Collect file-modifying tool_call paths for files_touched supplementation.
174+
for _, tc := range payload.ToolCalls {
175+
if tc.Path == "" {
176+
continue
177+
}
178+
switch tc.Tool {
179+
case "Write", "Edit", "NotebookEdit":
180+
default:
181+
continue
182+
}
183+
rel := strings.TrimPrefix(tc.Path, gitRoot+"/")
184+
if rel == tc.Path {
185+
// Path is not under gitRoot — external file, skip.
186+
continue
187+
}
188+
toolCallPaths[rel] = struct{}{}
189+
}
190+
162191
// Update checkpoint state cache.
163192
_ = db.UpsertCheckpointState(dataDB, f, info.Size(), hash)
164193

@@ -184,17 +213,29 @@ func doCheckpoint(gitRoot string, w io.Writer) error {
184213
return fmt.Errorf("insert checkpoint: %w", err)
185214
}
186215

187-
// Insert files_touched.
216+
// Insert files_touched from git diff.
217+
gitTouchedSet := make(map[string]struct{})
188218
for _, ft := range filesTouched {
189219
parts := strings.SplitN(ft, "\t", 2)
190220
if len(parts) != 2 {
191221
continue
192222
}
223+
gitTouchedSet[parts[1]] = struct{}{}
193224
if err := db.InsertFileTouched(dataDB, newID(), checkpointID, parts[1], parts[0]); err != nil {
194225
return fmt.Errorf("insert file_touched: %w", err)
195226
}
196227
}
197228

229+
// Supplement files_touched with file-modifying tool_call paths not already covered by git diff.
230+
for p := range toolCallPaths {
231+
if _, exists := gitTouchedSet[p]; exists {
232+
continue
233+
}
234+
if err := db.InsertFileTouched(dataDB, newID(), checkpointID, p, "T"); err != nil {
235+
return fmt.Errorf("insert file_touched (tool_call): %w", err)
236+
}
237+
}
238+
198239
// Insert checkpoint_sessions junction rows.
199240
for _, sid := range sessionIDs {
200241
if err := db.InsertCheckpointSession(dataDB, checkpointID, sid); err != nil {

cmd/rekal/cli/clean.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,15 @@ func newCleanCmd() *cobra.Command {
1313
return &cobra.Command{
1414
Use: "clean",
1515
Short: "Remove Rekal setup from this repository (local only)",
16+
Long: `Remove Rekal setup from this repository. Local only — does not touch
17+
the remote branch or .gitignore.
18+
19+
Removes:
20+
.rekal/ Data DB, index DB, and all local state
21+
post-commit hook Only if it contains the rekal marker
22+
pre-push hook Only if it contains the rekal marker
23+
24+
Run 'rekal init' to reinitialize after cleaning.`,
1625
RunE: func(cmd *cobra.Command, _ []string) error {
1726
cmd.SilenceUsage = true
1827

cmd/rekal/cli/db/indexer.go

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,29 @@ func PopulateIndex(d *sql.DB, gitRoot string) error {
136136
return fmt.Errorf("populate files_index: %w", err)
137137
}
138138

139+
// files_index — supplement from file-modifying tool_calls for existing data
140+
// that was checkpointed before the capture-time fix.
141+
gitRootPrefix := gitRoot + "/"
142+
if _, err := d.Exec(`
143+
INSERT INTO files_index (checkpoint_id, session_id, file_path, change_type)
144+
SELECT DISTINCT cs.checkpoint_id, tc.session_id,
145+
replace(tc.path, $1, ''),
146+
'T'
147+
FROM data_db.tool_calls tc
148+
JOIN data_db.checkpoint_sessions cs ON cs.session_id = tc.session_id
149+
WHERE tc.tool IN ('Write', 'Edit', 'NotebookEdit')
150+
AND tc.path IS NOT NULL AND length(tc.path) > 0
151+
AND tc.path LIKE ($1 || '%')
152+
AND NOT EXISTS (
153+
SELECT 1 FROM files_index fi
154+
WHERE fi.checkpoint_id = cs.checkpoint_id
155+
AND fi.session_id = tc.session_id
156+
AND fi.file_path = replace(tc.path, $1, '')
157+
)
158+
`, gitRootPrefix); err != nil {
159+
return fmt.Errorf("populate files_index from tool_calls: %w", err)
160+
}
161+
139162
// session_facets — aggregation
140163
if _, err := d.Exec(`
141164
INSERT INTO session_facets (
@@ -186,7 +209,7 @@ func PopulateIndex(d *sql.DB, gitRoot string) error {
186209

187210
// CreateFTSIndex creates the DuckDB full-text search index on turns_ft.
188211
func CreateFTSIndex(d *sql.DB) error {
189-
_, err := d.Exec(`PRAGMA create_fts_index('turns_ft', 'id', 'content', stemmer='english', stopwords='english')`)
212+
_, err := d.Exec(`PRAGMA create_fts_index('turns_ft', 'id', 'content', stemmer='english', stopwords='english', overwrite=1)`)
190213
if err != nil {
191214
return fmt.Errorf("create fts index: %w", err)
192215
}
@@ -214,23 +237,37 @@ func WriteIndexState(d *sql.DB, key, value string) error {
214237

215238
// StoreEmbeddings bulk-inserts session embeddings into the index DB.
216239
func StoreEmbeddings(d *sql.DB, vectors map[string][]float64, model string) error {
217-
stmt, err := d.Prepare(`
218-
INSERT INTO session_embeddings (session_id, embedding, model, generated_at)
219-
VALUES ($1, $2, $3, now())
220-
`)
221-
if err != nil {
222-
return fmt.Errorf("prepare embedding insert: %w", err)
223-
}
224-
defer stmt.Close() //nolint:errcheck
225-
226240
for sessionID, vec := range vectors {
227-
if _, err := stmt.Exec(sessionID, vec, model); err != nil {
241+
// Inline the array literal because the database/sql driver cannot
242+
// bind a string to a FLOAT[] column, even with a cast.
243+
query := fmt.Sprintf(
244+
`INSERT INTO session_embeddings (session_id, embedding, model, generated_at)
245+
VALUES ($1, %s::FLOAT[], $2, now())`,
246+
float64SliceToDuckDB(vec),
247+
)
248+
if _, err := d.Exec(query, sessionID, model); err != nil {
228249
return fmt.Errorf("insert embedding for %s: %w", sessionID, err)
229250
}
230251
}
231252
return nil
232253
}
233254

255+
// float64SliceToDuckDB serializes a float64 slice as a DuckDB list literal
// (e.g. "[0.1, 0.2, 0.3]") because the database/sql driver does not support
// passing Go slices for FLOAT[] columns.
//
// Finite values are written with %g. Non-finite values are written as quoted
// casts ('NaN'::FLOAT, 'Infinity'::FLOAT, '-Infinity'::FLOAT): the bare tokens
// that %g produces for them ("NaN", "+Inf", "-Inf") are not numeric literals
// in SQL and would make the surrounding statement fail to parse or bind.
//
// A nil or empty slice yields "[]".
func float64SliceToDuckDB(v []float64) string {
	var b strings.Builder
	b.WriteByte('[')
	for i, f := range v {
		if i > 0 {
			b.WriteString(", ")
		}
		// Format first, then special-case the three spellings fmt uses for
		// non-finite floats; everything else is already a valid literal.
		switch s := fmt.Sprintf("%g", f); s {
		case "NaN":
			b.WriteString("'NaN'::FLOAT")
		case "+Inf":
			b.WriteString("'Infinity'::FLOAT")
		case "-Inf":
			b.WriteString("'-Infinity'::FLOAT")
		default:
			b.WriteString(s)
		}
	}
	b.WriteByte(']')
	return b.String()
}
270+
234271
// QuerySessionContent returns session_id → concatenated turn content for LSA.
235272
func QuerySessionContent(d *sql.DB) (map[string]string, error) {
236273
rows, err := d.Query(`

cmd/rekal/cli/index_cmd.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,17 @@ func newIndexCmd() *cobra.Command {
1313
return &cobra.Command{
1414
Use: "index",
1515
Short: "Rebuild the index DB from the data DB",
16+
Long: `Drop and rebuild the index DB (.rekal/index.db) from the data DB.
17+
18+
The index is local-only and never synced. It contains:
19+
- Full-text search index (BM25) over conversation turns
20+
- LSA vector embeddings for semantic similarity
21+
- Session facets (author, branch, actor, counts) for fast filtering
22+
- File co-occurrence graph
23+
- Tool call indexes
24+
25+
Rebuild when the index is out of date or after importing new data.
26+
'rekal sync' rebuilds the index automatically.`,
1627
RunE: func(cmd *cobra.Command, _ []string) error {
1728
cmd.SilenceUsage = true
1829

cmd/rekal/cli/init.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ func newInitCmd() *cobra.Command {
1919
cmd := &cobra.Command{
2020
Use: "init",
2121
Short: "Initialize Rekal in the current git repository",
22+
Long: `Initialize Rekal in the current git repository.
23+
24+
Creates:
25+
.rekal/ Local directory (gitignored) with data.db and index.db
26+
post-commit hook Runs 'rekal checkpoint' after each commit
27+
pre-push hook Runs 'rekal push' before each push
28+
orphan branch rekal/<email> for wire format storage
29+
agent skill .claude/skills/rekal/SKILL.md for Claude Code
30+
31+
If the remote already has data on your rekal branch, it is fetched and
32+
imported into the local data DB automatically.`,
2233
RunE: func(cmd *cobra.Command, _ []string) error {
2334
cmd.SilenceUsage = true
2435

cmd/rekal/cli/log.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ func newLogCmd() *cobra.Command {
1313
cmd := &cobra.Command{
1414
Use: "log",
1515
Short: "Show recent checkpoints",
16+
Long: `Show recent checkpoints from the data DB, newest first.
17+
18+
Each entry shows the checkpoint ID, timestamp, git commit SHA, branch,
19+
author email, and number of sessions captured. Use --limit to control
20+
how many entries are shown.`,
1621
RunE: func(cmd *cobra.Command, _ []string) error {
1722
cmd.SilenceUsage = true
1823

cmd/rekal/cli/push.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@ func newPushCmd() *cobra.Command {
1515
cmd := &cobra.Command{
1616
Use: "push",
1717
Short: "Push Rekal data to the remote branch",
18+
Long: `Export new checkpoints to the wire format and push to the remote orphan branch.
19+
20+
Unexported checkpoints in data.db are encoded into the compact binary wire
21+
format (rekal.body + dict.bin) and committed to the local orphan branch
22+
(rekal/<email>). The branch is then pushed to origin.
23+
24+
Use --force to overwrite the remote branch when it has diverged from local
25+
(e.g. after a rebuild or conflict).
26+
27+
Normally runs automatically via the pre-push hook installed by 'rekal init'.`,
1828
RunE: func(cmd *cobra.Command, _ []string) error {
1929
cmd.SilenceUsage = true
2030

cmd/rekal/cli/query.go

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,53 @@ func newQueryCmd() *cobra.Command {
2020
cmd := &cobra.Command{
2121
Use: "query [<sql> | --session <id>]",
2222
Short: "Run raw SQL or drill into a session",
23-
Args: cobra.MaximumNArgs(1),
23+
Long: `Run raw SQL against the data or index DB, or drill into a specific session.
24+
25+
Session drill-down (--session) returns the full conversation as JSON. Add --full
26+
to include tool calls and files touched.
27+
28+
Raw SQL mode accepts SELECT statements only. Output is one JSON object per row.
29+
Use --index to query the index DB instead of the data DB.
30+
31+
DATA DB SCHEMA (.rekal/data.db):
32+
33+
sessions id, parent_session_id, session_hash, captured_at, actor_type,
34+
agent_id, user_email, branch
35+
turns id, session_id, turn_index, role, content, ts
36+
tool_calls id, session_id, call_order, tool, path, cmd_prefix
37+
checkpoints id, git_sha, git_branch, user_email, ts, actor_type, agent_id,
38+
exported
39+
files_touched id, checkpoint_id, file_path, change_type
40+
checkpoint_sessions checkpoint_id, session_id
41+
42+
INDEX DB SCHEMA (.rekal/index.db):
43+
44+
turns_ft id, session_id, turn_index, role, content, ts
45+
tool_calls_index id, session_id, call_order, tool, path, cmd_prefix
46+
files_index checkpoint_id, session_id, file_path, change_type
47+
session_facets session_id, user_email, git_branch, actor_type, agent_id,
48+
captured_at, turn_count, tool_call_count, file_count,
49+
checkpoint_id, git_sha
50+
file_cooccurrence file_a, file_b, count
51+
session_embeddings session_id, embedding, model, generated_at`,
52+
Example: ` # Drill into a session (turns only)
53+
rekal query --session 01JNQX...
54+
55+
# Drill into a session (turns + tool calls + files)
56+
rekal query --session 01JNQX... --full
57+
58+
# Recent sessions
59+
rekal query "SELECT id, user_email, branch, captured_at FROM sessions ORDER BY captured_at DESC LIMIT 5"
60+
61+
# Sessions that touched a file
62+
rekal query "SELECT DISTINCT s.id, s.user_email, s.captured_at FROM tool_calls t JOIN sessions s ON t.session_id = s.id WHERE t.path LIKE '%auth%'"
63+
64+
# Most-edited files
65+
rekal query "SELECT path, count(*) as n FROM tool_calls WHERE tool IN ('Write','Edit') AND path IS NOT NULL GROUP BY path ORDER BY n DESC LIMIT 10"
66+
67+
# File co-occurrence (index DB)
68+
rekal query --index "SELECT * FROM file_cooccurrence WHERE file_a LIKE '%auth%' ORDER BY count DESC LIMIT 10"`,
69+
Args: cobra.MaximumNArgs(1),
2470
RunE: func(cmd *cobra.Command, args []string) error {
2571
cmd.SilenceUsage = true
2672

cmd/rekal/cli/root.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,18 @@ import (
1111

1212
const gettingStarted = `
1313
14+
Workflow:
15+
rekal "keyword" Search sessions (BM25 + LSA hybrid)
16+
rekal --file auth "token refresh" Filter by file path
17+
rekal query --session <id> Drill into a session (full turns)
18+
rekal query --session <id> --full Include tool calls and files
19+
rekal query "SELECT ..." Raw SQL for edge cases
20+
1421
Getting Started:
15-
rekal init Initialize Rekal in a git repository
16-
rekal checkpoint Capture the current session
17-
rekal push Share context with the team
18-
rekal sync Pull team context
19-
rekal "query" Recall sessions by keyword
22+
rekal init Initialize Rekal in a git repository
23+
rekal checkpoint Capture the current session
24+
rekal push Share context with the team
25+
rekal sync Pull team context
2026
`
2127

2228
// NewRootCmd returns the root command for the rekal CLI.

0 commit comments

Comments
 (0)