Skip to content

Commit 0eb248f

Browse files
committed
feat: fix doc extraction regression + add incremental bootstrap
- Fix doc node dedup: short summaries (<50 chars) use a 0.7 threshold to prevent "Documentation: X.md" nodes from collapsing into each other
- Add git-diff-based incremental bootstrap: stores the HEAD SHA per service in the bootstrap_state table and skips unchanged repos on subsequent runs
- Fewer than 50 changed files triggers a targeted analysis of just those files; 50 or more falls back to a full bootstrap
1 parent 1b741a7 commit 0eb248f

3 files changed

Lines changed: 86 additions & 3 deletions

File tree

internal/bootstrap/runner.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ func (r *Runner) Close() {
3535

3636
// RunOptions configures a runner execution.
3737
type RunOptions struct {
38-
Workspace string // Workspace name for monorepo support ('root' for global, service name for scoped)
38+
Workspace string // Workspace name for monorepo support ('root' for global, service name for scoped)
39+
ChangedFiles []string // If set, only analyze these files (incremental mode)
3940
}
4041

4142
// ProviderSupportsBatch returns true if the provider has a batch API with cost savings.

internal/bootstrap/service.go

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/json"
66
"fmt"
77
"os"
8+
"os/exec"
89
"path/filepath"
910
"strings"
1011
"time"
@@ -88,8 +89,41 @@ func (s *Service) RunMultiRepoAnalysis(ctx context.Context, ws *project.Workspac
8889

8990
runner := NewRunner(s.llmCfg, servicePath)
9091

92+
// Incremental mode: check if we can skip or limit analysis
93+
opts := RunOptions{Workspace: serviceName}
94+
stateKey := "bootstrap-sha-" + serviceName
95+
dbPath := filepath.Join(s.basePath, ".taskwing", "memory")
96+
if store, storeErr := memory.NewSQLiteStore(dbPath); storeErr == nil {
97+
if state, stateErr := store.GetBootstrapState(stateKey); stateErr == nil && state != nil && state.Checksum != "" {
98+
headSHA := getGitHEAD(servicePath)
99+
if headSHA != "" && headSHA == state.Checksum {
100+
if onProgress != nil {
101+
onProgress(serviceName, fmt.Sprintf("[%d/%d] no changes", i+1, len(ws.Services)))
102+
}
103+
_ = store.Close()
104+
runner.Close()
105+
continue
106+
}
107+
if headSHA != "" {
108+
changedFiles := getChangedFilesSince(servicePath, state.Checksum)
109+
if changedFiles != nil && len(changedFiles) == 0 {
110+
if onProgress != nil {
111+
onProgress(serviceName, fmt.Sprintf("[%d/%d] no changes", i+1, len(ws.Services)))
112+
}
113+
_ = store.Close()
114+
runner.Close()
115+
continue
116+
}
117+
if changedFiles != nil && len(changedFiles) < 50 {
118+
opts.ChangedFiles = changedFiles
119+
}
120+
}
121+
}
122+
_ = store.Close()
123+
}
124+
91125
// Pass workspace (service name) to the runner so agents can tag their findings
92-
results, err := runner.RunWithOptions(ctx, servicePath, RunOptions{Workspace: serviceName})
126+
results, err := runner.RunWithOptions(ctx, servicePath, opts)
93127
// Close runner immediately after use - NOT deferred in loop!
94128
runner.Close()
95129

@@ -142,6 +176,18 @@ func (s *Service) RunMultiRepoAnalysis(ctx context.Context, ws *project.Workspac
142176
allFindings = append(allFindings, findings...)
143177
allRelationships = append(allRelationships, relationships...)
144178

179+
// Save git SHA for incremental mode on next run
180+
if headSHA := getGitHEAD(servicePath); headSHA != "" {
181+
if store, storeErr := memory.NewSQLiteStore(dbPath); storeErr == nil {
182+
_ = store.SetBootstrapState(&memory.BootstrapState{
183+
Component: stateKey,
184+
Status: "completed",
185+
Checksum: headSHA,
186+
})
187+
_ = store.Close()
188+
}
189+
}
190+
145191
if onProgress != nil {
146192
onProgress(serviceName, fmt.Sprintf("[%d/%d] done (%d findings)", i+1, len(ws.Services), len(findings)))
147193
}
@@ -454,6 +500,36 @@ func generateReport(projectPath string, results []core.Output, findings []core.F
454500
return report
455501
}
456502

503+
// getGitHEAD resolves HEAD to a commit SHA by running `git rev-parse HEAD`
// with dir as the working directory. If the command cannot be run or exits
// with an error (for example, dir is not inside a git repository), the empty
// string is returned.
func getGitHEAD(dir string) string {
	revParse := exec.Command("git", "rev-parse", "HEAD")
	revParse.Dir = dir

	stdout, runErr := revParse.Output()
	if runErr != nil {
		return ""
	}

	// git terminates the SHA with a newline; strip surrounding whitespace.
	sha := strings.TrimSpace(string(stdout))
	return sha
}
513+
514+
// getChangedFilesSince returns the paths that differ between oldSHA and HEAD,
// as reported by `git diff --name-only` run with dir as the working directory.
//
// The result distinguishes three states:
//   - nil: the diff could not be computed (oldSHA is empty or looks like a
//     command-line option, or the git command failed). Callers use this as the
//     signal to fall back to a full bootstrap.
//   - empty, non-nil slice: git succeeded and reported no changed files.
//   - non-empty slice: one entry per changed path, exactly as git names it.
//
// NOTE(review): git reports paths relative to the repository root, which is
// not the same as dir when dir is a subdirectory of the repository — confirm
// that callers expect root-relative paths.
func getChangedFilesSince(dir, oldSHA string) []string {
	// A revision beginning with '-' would be parsed by git as an option
	// (e.g. --output=<file>), so refuse it rather than pass it through.
	if oldSHA == "" || strings.HasPrefix(oldSHA, "-") {
		return nil
	}

	// -z makes git emit NUL-terminated, unquoted paths. Without it, names
	// containing non-ASCII or special characters are C-quoted and no longer
	// match the files on disk.
	cmd := exec.Command("git", "diff", "--name-only", "-z", oldSHA+"..HEAD")
	cmd.Dir = dir
	out, err := cmd.Output()
	if err != nil {
		return nil
	}

	// Drop only the final terminator; whitespace may be part of a file name.
	raw := strings.TrimSuffix(string(out), "\x00")
	if raw == "" {
		return []string{} // non-nil on purpose: "nothing changed", not "unknown"
	}
	return strings.Split(raw, "\x00")
}
532+
457533
func saveReport(path string, report *core.BootstrapReport) error {
458534
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
459535
return fmt.Errorf("create report directory: %w", err)

internal/memory/sqlite.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1445,7 +1445,13 @@ func (s *SQLiteStore) UpsertNodeBySummary(n Node) error {
14451445
continue
14461446
}
14471447
sim := textSimilarity(n.Summary, existingSummary)
1448-
if sim >= textSimilarityThreshold {
1448+
// Use higher threshold for short summaries where common prefixes
1449+
// (e.g., "Documentation: X.md" vs "Documentation: Y.md") inflate Jaccard scores
1450+
threshold := textSimilarityThreshold
1451+
if len(n.Summary) < 50 || len(existingSummary) < 50 {
1452+
threshold = 0.7
1453+
}
1454+
if sim >= threshold {
14491455
_ = rows.Close() // Close before executing update
14501456
// Found a similar node - update it instead of inserting new (including evidence and debt columns)
14511457
if n.Content != similarContent {

0 commit comments

Comments
 (0)