diff --git a/cmd/engram/consolidate_grouping_test.go b/cmd/engram/consolidate_grouping_test.go new file mode 100644 index 00000000..148bc839 --- /dev/null +++ b/cmd/engram/consolidate_grouping_test.go @@ -0,0 +1,122 @@ +package main + +import ( + "strings" + "testing" + + "github.com/Gentleman-Programming/engram/internal/store" +) + +// groupContaining returns the group whose Names include target, or nil. +func groupContaining(groups []projectGroup, target string) *projectGroup { + for i := range groups { + for _, n := range groups[i].Names { + if n == target { + return &groups[i] + } + } + } + return nil +} + +// TestGroupSimilarProjects_NoisySharedDirNotGrouped reproduces issue #283 bug 4: +// many unrelated projects that merely share a common parent/root directory +// (e.g. $HOME or a session root) must NOT be unioned into one giant component. +// Before the fix, the shared-directory union step grouped all of them together +// and consolidate would propose merging them into a single canonical project — +// a catastrophic data-loss footgun. +func TestGroupSimilarProjects_NoisySharedDirNotGrouped(t *testing.T) { + // Names are deliberately long and mutually dissimilar (no substring overlap, + // Levenshtein distance well above the scaled threshold) so the ONLY possible + // grouping signal is the shared "/home/user" directory. + const sharedHome = "/home/user" + projects := []store.ProjectStats{ + {Name: "kubernetes", ObservationCount: 5, Directories: []string{"/home/user/kubernetes", sharedHome}}, + {Name: "photoshop", ObservationCount: 7, Directories: []string{"/home/user/photoshop", sharedHome}}, + {Name: "wireguard", ObservationCount: 3, Directories: []string{"/home/user/wireguard", sharedHome}}, + {Name: "blender", ObservationCount: 9, Directories: []string{"/home/user/blender", sharedHome}}, + {Name: "terraform", ObservationCount: 2, Directories: []string{"/home/user/terraform", sharedHome}}, + } + + groups := groupSimilarProjects(projects) + + // The five names are mutually dissimilar, so the ONLY thing that could group + // them is the shared "/home/user" directory. Since that directory is touched + // by more than maxSharedProjectsForDirMatch distinct projects, it must be + // treated as noise and skipped — yielding no groups at all. + if len(groups) != 0 { + t.Fatalf("expected 0 groups (noisy shared dir must be skipped), got %d: %+v", len(groups), groups) + } +} + +// TestGroupSimilarProjects_RealSharedDirStillGroups guards against over-correcting: +// when only a small number of projects share a directory (a genuine rename signal), +// they SHOULD still be grouped. +func TestGroupSimilarProjects_RealSharedDirStillGroups(t *testing.T) { + const sharedRepo = "/repos/shared-monorepo" + projects := []store.ProjectStats{ + {Name: "webapp", ObservationCount: 12, Directories: []string{sharedRepo}}, + {Name: "legacy-portal", ObservationCount: 4, Directories: []string{sharedRepo}}, + } + + groups := groupSimilarProjects(projects) + + g := groupContaining(groups, "webapp") + if g == nil { + t.Fatalf("expected webapp and legacy-portal to be grouped via shared dir, got groups: %+v", groups) + } + if len(g.Names) != 2 { + t.Fatalf("expected group of 2, got %d: %+v", len(g.Names), g.Names) + } +} + +// TestGroupSimilarProjects_NameSimilarityStillWorks confirms the legitimate +// name-based grouping path is untouched by the shared-dir fix. +func TestGroupSimilarProjects_NameSimilarityStillWorks(t *testing.T) { + // "frontend" is a substring of "frontend-app" → name-similar (no shared dir). + projects := []store.ProjectStats{ + {Name: "frontend", ObservationCount: 8, Directories: []string{"/a"}}, + {Name: "frontend-app", ObservationCount: 5, Directories: []string{"/b"}}, + {Name: "unrelated", ObservationCount: 1, Directories: []string{"/c"}}, + } + + groups := groupSimilarProjects(projects) + + g := groupContaining(groups, "frontend") + if g == nil { + t.Fatalf("expected frontend* projects grouped by name similarity, got: %+v", groups) + } + if len(g.Names) != 2 { + t.Fatalf("expected name-similar group of 2, got %d: %+v", len(g.Names), g.Names) + } + if groupContaining(groups, "unrelated") != nil { + t.Fatalf("unrelated project must not be grouped: %+v", groups) + } +} + +// TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir is a command-level +// regression test for issue #283 bug 4: `projects consolidate --all` must not +// propose merging many unrelated projects just because their sessions share a +// common directory. mustSeedSession records every session under the same "/tmp" +// directory, so seeding several distinctly-named projects reproduces the noisy +// shared-directory scenario end-to-end. +func TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir(t *testing.T) { + cfg := testConfig(t) + + for _, name := range []string{"kubernetes", "photoshop", "wireguard", "blender", "terraform"} { + mustSeedSession(t, cfg, "s-"+name, name) + } + + withArgs(t, "engram", "projects", "consolidate", "--all", "--dry-run") + stdout, stderr := captureOutput(t, func() { cmdProjectsConsolidate(cfg) }) + + if stderr != "" { + t.Fatalf("expected no stderr, got: %q", stderr) + } + if !strings.Contains(stdout, "No similar project name groups found") { + t.Fatalf("expected no groups for projects sharing only a noisy dir, got: %q", stdout) + } + if strings.Contains(stdout, "Would merge") { + t.Fatalf("must not propose a merge for noisy-shared-dir projects, got: %q", stdout) + } +} diff --git a/cmd/engram/main.go b/cmd/engram/main.go index fd8eed2d..23381f47 100644 --- a/cmd/engram/main.go +++ b/cmd/engram/main.go @@ -1836,7 +1836,7 @@ func cmdProjects(cfg store.Config) { fmt.Fprintf(os.Stderr, "unknown projects subcommand: %s\n", subCmd) fmt.Fprintln(os.Stderr, "usage: engram projects list") fmt.Fprintln(os.Stderr, " engram projects consolidate [--all] [--dry-run]") - fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run]") + fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run] [--paths-only]") exitFunc(1) } } @@ -1883,6 +1883,20 @@ type projectGroup struct { Canonical string // suggested canonical (most observations) } +// maxSharedProjectsForDirMatch caps how many distinct projects may share a single +// directory before that directory is treated as noise (e.g. $HOME or a session +// root) and excluded from shared-directory grouping. See issue #283 bug 4. +const maxSharedProjectsForDirMatch = 3 + +// distinctProjectCount returns the number of unique project indices in idxs. +func distinctProjectCount(idxs []int) int { + seen := make(map[int]struct{}, len(idxs)) + for _, i := range idxs { + seen[i] = struct{}{} + } + return len(seen) +} + // groupSimilarProjects groups projects by name similarity and shared directories. // Uses a simple union-find approach. func groupSimilarProjects(projects []store.ProjectStats) []projectGroup { @@ -1939,6 +1953,13 @@ func groupSimilarProjects(projects []store.ProjectStats) []projectGroup { } } for _, idxs := range dirToProjects { + // Skip noisy ancestor directories shared by many unrelated projects + // (issue #283 bug 4): unioning everything under $HOME or a session root + // collapses the whole store into one component and turns consolidate into + // a data-loss footgun. + if distinctProjectCount(idxs) > maxSharedProjectsForDirMatch { + continue + } for k := 1; k < len(idxs); k++ { union(idxs[0], idxs[k]) } @@ -2241,11 +2262,23 @@ func cmdProjectsConsolidate(cfg store.Config) { } } +// isPathLikeProjectName reports whether a project name is actually a filesystem +// path (contains a path separator). Mirrors the write-boundary rejection in +// normalizeExplicitWriteProject so cleanup targets exactly the names newer +// versions now refuse to create. See issue #283 bug 2/bug 4. +func isPathLikeProjectName(name string) bool { + return strings.ContainsAny(name, `/\`) +} + func cmdProjectsPrune(cfg store.Config) { dryRun := false + pathsOnly := false for i := 3; i < len(os.Args); i++ { - if os.Args[i] == "--dry-run" { + switch os.Args[i] { + case "--dry-run": dryRun = true + case "--paths-only": + pathsOnly = true } } @@ -2263,13 +2296,24 @@ func cmdProjectsPrune(cfg store.Config) { // Find projects with 0 observations var candidates []store.ProjectStats for _, ps := range allStats { - if ps.ObservationCount == 0 { - candidates = append(candidates, ps) + if ps.ObservationCount != 0 { + continue } + // --paths-only: restrict to garbage projects whose name is a filesystem + // path (issue #283 bug 4). These are leftovers from older versions that + // fell back to using cwd as the project name when detection failed. + if pathsOnly && !isPathLikeProjectName(ps.Name) { + continue + } + candidates = append(candidates, ps) } if len(candidates) == 0 { - fmt.Println("No empty projects to prune.") + if pathsOnly { + fmt.Println("No path-named projects to prune.") + } else { + fmt.Println("No empty projects to prune.") + } return } diff --git a/cmd/engram/prune_paths_test.go b/cmd/engram/prune_paths_test.go new file mode 100644 index 00000000..a7ad58e0 --- /dev/null +++ b/cmd/engram/prune_paths_test.go @@ -0,0 +1,129 @@ +package main + +import ( + "strings" + "testing" + + "github.com/Gentleman-Programming/engram/internal/store" +) + +// TestIsPathLikeProjectName covers the detector used to single out garbage +// "path-as-name" projects (issue #283 bug 4 / bug 2 leftovers). It mirrors the +// write-boundary rejection in normalizeExplicitWriteProject: a project name that +// contains a path separator is a filesystem path, not a real project name. +func TestIsPathLikeProjectName(t *testing.T) { + cases := []struct { + name string + want bool + }{ + {`c:\users\foo`, true}, + {`/home/user`, true}, + {`c:/users/foo`, true}, + {`\\server\share`, true}, + {"d:\\workspace\\sub", true}, + {"engram", false}, + {"agent-teams-lite", false}, + {"e3", false}, + {"general", false}, + {"", false}, + } + for _, tc := range cases { + if got := isPathLikeProjectName(tc.name); got != tc.want { + t.Errorf("isPathLikeProjectName(%q) = %v, want %v", tc.name, got, tc.want) + } + } +} + +// TestCmdProjectsPrunePathsOnlyDryRun verifies that `--paths-only` narrows the +// prune candidate set to path-named projects with 0 observations, leaving +// legitimate empty projects and projects with observations untouched. +func TestCmdProjectsPrunePathsOnlyDryRun(t *testing.T) { + cfg := testConfig(t) + + // Garbage: path-as-name project, 0 observations (only a session). + mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`) + // Legitimate empty project, 0 observations — must NOT be selected by --paths-only. + mustSeedSession(t, cfg, "s-legit", "legit-empty") + // Real project with observations — never a prune candidate. + mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project") + + withArgs(t, "engram", "projects", "prune", "--paths-only", "--dry-run") + stdout, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) }) + + if stderr != "" { + t.Fatalf("expected no stderr, got: %q", stderr) + } + if !strings.Contains(stdout, `c:\users\foo`) { + t.Fatalf("expected path-named project listed, got: %q", stdout) + } + if strings.Contains(stdout, "legit-empty") { + t.Fatalf("legitimate empty project must NOT be listed with --paths-only, got: %q", stdout) + } + if strings.Contains(stdout, "realproject") { + t.Fatalf("project with observations must never be a prune candidate, got: %q", stdout) + } + if !strings.Contains(stdout, "dry-run") { + t.Fatalf("expected dry-run notice, got: %q", stdout) + } +} + +// TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed verifies the actual +// (non-dry-run) deletion: `prune --paths-only` removes path-named empty projects +// while leaving legitimate empty projects and projects with observations intact. +func TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed(t *testing.T) { + cfg := testConfig(t) + + mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`) + mustSeedSession(t, cfg, "s-legit", "legit-empty") + mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project") + + // Answer "all" to prune every (path-named) candidate. + oldScan := scanInputLine + t.Cleanup(func() { scanInputLine = oldScan }) + scanInputLine = func(a ...any) (int, error) { + if ptr, ok := a[0].(*string); ok { + *ptr = "all" + } + return 1, nil + } + + withArgs(t, "engram", "projects", "prune", "--paths-only") + _, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) }) + if stderr != "" { + t.Fatalf("expected no stderr, got: %q", stderr) + } + + s, err := store.New(cfg) + if err != nil { + t.Fatalf("store.New: %v", err) + } + defer s.Close() + // Use ListProjectsWithStats: it includes 0-observation projects (derived from + // sessions), so a not-pruned empty project like "legit-empty" still shows up. + stats, err := s.ListProjectsWithStats() + if err != nil { + t.Fatalf("ListProjectsWithStats: %v", err) + } + names := make([]string, len(stats)) + for i, ps := range stats { + names[i] = ps.Name + } + + has := func(target string) bool { + for _, n := range names { + if n == target { + return true + } + } + return false + } + if has(`c:\users\foo`) { + t.Fatalf("path-named project should have been pruned, still present: %v", names) + } + if !has("legit-empty") { + t.Fatalf("legitimate empty project must NOT be pruned by --paths-only: %v", names) + } + if !has("realproject") { + t.Fatalf("project with observations must remain: %v", names) + } +}