Skip to content

Commit 053923f

Browse files
fix(cli): close #283 bug 4 — tighten consolidate grouping + prune path-named projects
Bug 4 from issue #283 has two halves, both addressed here. Grouping (data-loss footgun): groupSimilarProjects unioned every project that shared ANY directory. A noisy ancestor (e.g. $HOME or a session root) listed under dozens of projects collapsed the whole store into one component, so `projects consolidate --all` proposed merging unrelated projects into a single bucket. Now skip any directory shared by more than maxSharedProjectsForDirMatch (3) distinct projects; genuine rename signals (a directory shared by a few projects) still group. Cleanup of pre-existing garbage: add `--paths-only` to `projects prune`, restricting the 0-observation candidate set to projects whose name is a filesystem path. Mirrors the write-boundary rejection in normalizeExplicitWriteProject (ContainsAny(name, "/\\")), so cleanup targets exactly the names newer versions already refuse to create. Reuses the existing prune machinery (dry-run, interactive selection, cascade delete). Regression coverage (unit + command-level): noisy shared dir not grouped; small shared dir still groups; name similarity untouched; path-like detector table; `consolidate --all` proposes no merge for noisy-shared-dir projects; `prune --paths-only` lists and deletes only path-named empty projects.
1 parent 6bfe33d commit 053923f

3 files changed

Lines changed: 300 additions & 5 deletions

File tree

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
package main
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/Gentleman-Programming/engram/internal/store"
8+
)
9+
10+
// groupContaining returns the group whose Names include target, or nil.
11+
func groupContaining(groups []projectGroup, target string) *projectGroup {
12+
for i := range groups {
13+
for _, n := range groups[i].Names {
14+
if n == target {
15+
return &groups[i]
16+
}
17+
}
18+
}
19+
return nil
20+
}
21+
22+
// TestGroupSimilarProjects_NoisySharedDirNotGrouped reproduces issue #283 bug 4:
23+
// many unrelated projects that merely share a common parent/root directory
24+
// (e.g. $HOME or a session root) must NOT be unioned into one giant component.
25+
// Before the fix, the shared-directory union step grouped all of them together
26+
// and consolidate would propose merging them into a single canonical project —
27+
// a catastrophic data-loss footgun.
28+
func TestGroupSimilarProjects_NoisySharedDirNotGrouped(t *testing.T) {
29+
// Names are deliberately long and mutually dissimilar (no substring overlap,
30+
// Levenshtein distance well above the scaled threshold) so the ONLY possible
31+
// grouping signal is the shared "/home/user" directory.
32+
const sharedHome = "/home/user"
33+
projects := []store.ProjectStats{
34+
{Name: "kubernetes", ObservationCount: 5, Directories: []string{"/home/user/kubernetes", sharedHome}},
35+
{Name: "photoshop", ObservationCount: 7, Directories: []string{"/home/user/photoshop", sharedHome}},
36+
{Name: "wireguard", ObservationCount: 3, Directories: []string{"/home/user/wireguard", sharedHome}},
37+
{Name: "blender", ObservationCount: 9, Directories: []string{"/home/user/blender", sharedHome}},
38+
{Name: "terraform", ObservationCount: 2, Directories: []string{"/home/user/terraform", sharedHome}},
39+
}
40+
41+
groups := groupSimilarProjects(projects)
42+
43+
// The five names are mutually dissimilar, so the ONLY thing that could group
44+
// them is the shared "/home/user" directory. Since that directory is touched
45+
// by more than maxSharedProjectsForDirMatch distinct projects, it must be
46+
// treated as noise and skipped — yielding no groups at all.
47+
if len(groups) != 0 {
48+
t.Fatalf("expected 0 groups (noisy shared dir must be skipped), got %d: %+v", len(groups), groups)
49+
}
50+
}
51+
52+
// TestGroupSimilarProjects_RealSharedDirStillGroups guards against over-correcting:
53+
// when only a small number of projects share a directory (a genuine rename signal),
54+
// they SHOULD still be grouped.
55+
func TestGroupSimilarProjects_RealSharedDirStillGroups(t *testing.T) {
56+
const sharedRepo = "/repos/shared-monorepo"
57+
projects := []store.ProjectStats{
58+
{Name: "webapp", ObservationCount: 12, Directories: []string{sharedRepo}},
59+
{Name: "legacy-portal", ObservationCount: 4, Directories: []string{sharedRepo}},
60+
}
61+
62+
groups := groupSimilarProjects(projects)
63+
64+
g := groupContaining(groups, "webapp")
65+
if g == nil {
66+
t.Fatalf("expected webapp and legacy-portal to be grouped via shared dir, got groups: %+v", groups)
67+
}
68+
if len(g.Names) != 2 {
69+
t.Fatalf("expected group of 2, got %d: %+v", len(g.Names), g.Names)
70+
}
71+
}
72+
73+
// TestGroupSimilarProjects_NameSimilarityStillWorks confirms the legitimate
74+
// name-based grouping path is untouched by the shared-dir fix.
75+
func TestGroupSimilarProjects_NameSimilarityStillWorks(t *testing.T) {
76+
// "frontend" is a substring of "frontend-app" → name-similar (no shared dir).
77+
projects := []store.ProjectStats{
78+
{Name: "frontend", ObservationCount: 8, Directories: []string{"/a"}},
79+
{Name: "frontend-app", ObservationCount: 5, Directories: []string{"/b"}},
80+
{Name: "unrelated", ObservationCount: 1, Directories: []string{"/c"}},
81+
}
82+
83+
groups := groupSimilarProjects(projects)
84+
85+
g := groupContaining(groups, "frontend")
86+
if g == nil {
87+
t.Fatalf("expected frontend* projects grouped by name similarity, got: %+v", groups)
88+
}
89+
if len(g.Names) != 2 {
90+
t.Fatalf("expected name-similar group of 2, got %d: %+v", len(g.Names), g.Names)
91+
}
92+
if groupContaining(groups, "unrelated") != nil {
93+
t.Fatalf("unrelated project must not be grouped: %+v", groups)
94+
}
95+
}
96+
97+
// TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir is a command-level
98+
// regression test for issue #283 bug 4: `projects consolidate --all` must not
99+
// propose merging many unrelated projects just because their sessions share a
100+
// common directory. mustSeedSession records every session under the same "/tmp"
101+
// directory, so seeding several distinctly-named projects reproduces the noisy
102+
// shared-directory scenario end-to-end.
103+
func TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir(t *testing.T) {
104+
cfg := testConfig(t)
105+
106+
for _, name := range []string{"kubernetes", "photoshop", "wireguard", "blender", "terraform"} {
107+
mustSeedSession(t, cfg, "s-"+name, name)
108+
}
109+
110+
withArgs(t, "engram", "projects", "consolidate", "--all", "--dry-run")
111+
stdout, stderr := captureOutput(t, func() { cmdProjectsConsolidate(cfg) })
112+
113+
if stderr != "" {
114+
t.Fatalf("expected no stderr, got: %q", stderr)
115+
}
116+
if !strings.Contains(stdout, "No similar project name groups found") {
117+
t.Fatalf("expected no groups for projects sharing only a noisy dir, got: %q", stdout)
118+
}
119+
if strings.Contains(stdout, "Would merge") {
120+
t.Fatalf("must not propose a merge for noisy-shared-dir projects, got: %q", stdout)
121+
}
122+
}

cmd/engram/main.go

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1836,7 +1836,7 @@ func cmdProjects(cfg store.Config) {
18361836
fmt.Fprintf(os.Stderr, "unknown projects subcommand: %s\n", subCmd)
18371837
fmt.Fprintln(os.Stderr, "usage: engram projects list")
18381838
fmt.Fprintln(os.Stderr, " engram projects consolidate [--all] [--dry-run]")
1839-
fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run]")
1839+
fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run] [--paths-only]")
18401840
exitFunc(1)
18411841
}
18421842
}
@@ -1883,6 +1883,20 @@ type projectGroup struct {
18831883
Canonical string // suggested canonical (most observations)
18841884
}
18851885

1886+
// maxSharedProjectsForDirMatch caps how many distinct projects may share a single
1887+
// directory before that directory is treated as noise (e.g. $HOME or a session
1888+
// root) and excluded from shared-directory grouping. See issue #283 bug 4.
1889+
const maxSharedProjectsForDirMatch = 3
1890+
1891+
// distinctProjectCount returns the number of unique project indices in idxs.
1892+
func distinctProjectCount(idxs []int) int {
1893+
seen := make(map[int]struct{}, len(idxs))
1894+
for _, i := range idxs {
1895+
seen[i] = struct{}{}
1896+
}
1897+
return len(seen)
1898+
}
1899+
18861900
// groupSimilarProjects groups projects by name similarity and shared directories.
18871901
// Uses a simple union-find approach.
18881902
func groupSimilarProjects(projects []store.ProjectStats) []projectGroup {
@@ -1939,6 +1953,13 @@ func groupSimilarProjects(projects []store.ProjectStats) []projectGroup {
19391953
}
19401954
}
19411955
for _, idxs := range dirToProjects {
1956+
// Skip noisy ancestor directories shared by many unrelated projects
1957+
// (issue #283 bug 4): unioning everything under $HOME or a session root
1958+
// collapses the whole store into one component and turns consolidate into
1959+
// a data-loss footgun.
1960+
if distinctProjectCount(idxs) > maxSharedProjectsForDirMatch {
1961+
continue
1962+
}
19421963
for k := 1; k < len(idxs); k++ {
19431964
union(idxs[0], idxs[k])
19441965
}
@@ -2241,11 +2262,23 @@ func cmdProjectsConsolidate(cfg store.Config) {
22412262
}
22422263
}
22432264

2265+
// isPathLikeProjectName reports whether a project name is actually a filesystem
2266+
// path (contains a path separator). Mirrors the write-boundary rejection in
2267+
// normalizeExplicitWriteProject so cleanup targets exactly the names newer
2268+
// versions now refuse to create. See issue #283 bug 2/bug 4.
2269+
func isPathLikeProjectName(name string) bool {
2270+
return strings.ContainsAny(name, `/\`)
2271+
}
2272+
22442273
func cmdProjectsPrune(cfg store.Config) {
22452274
dryRun := false
2275+
pathsOnly := false
22462276
for i := 3; i < len(os.Args); i++ {
2247-
if os.Args[i] == "--dry-run" {
2277+
switch os.Args[i] {
2278+
case "--dry-run":
22482279
dryRun = true
2280+
case "--paths-only":
2281+
pathsOnly = true
22492282
}
22502283
}
22512284

@@ -2263,13 +2296,24 @@ func cmdProjectsPrune(cfg store.Config) {
22632296
// Find projects with 0 observations
22642297
var candidates []store.ProjectStats
22652298
for _, ps := range allStats {
2266-
if ps.ObservationCount == 0 {
2267-
candidates = append(candidates, ps)
2299+
if ps.ObservationCount != 0 {
2300+
continue
22682301
}
2302+
// --paths-only: restrict to garbage projects whose name is a filesystem
2303+
// path (issue #283 bug 4). These are leftovers from older versions that
2304+
// fell back to using cwd as the project name when detection failed.
2305+
if pathsOnly && !isPathLikeProjectName(ps.Name) {
2306+
continue
2307+
}
2308+
candidates = append(candidates, ps)
22692309
}
22702310

22712311
if len(candidates) == 0 {
2272-
fmt.Println("No empty projects to prune.")
2312+
if pathsOnly {
2313+
fmt.Println("No path-named projects to prune.")
2314+
} else {
2315+
fmt.Println("No empty projects to prune.")
2316+
}
22732317
return
22742318
}
22752319

cmd/engram/prune_paths_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package main
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/Gentleman-Programming/engram/internal/store"
8+
)
9+
10+
// TestIsPathLikeProjectName covers the detector used to single out garbage
11+
// "path-as-name" projects (issue #283 bug 4 / bug 2 leftovers). It mirrors the
12+
// write-boundary rejection in normalizeExplicitWriteProject: a project name that
13+
// contains a path separator is a filesystem path, not a real project name.
14+
func TestIsPathLikeProjectName(t *testing.T) {
15+
cases := []struct {
16+
name string
17+
want bool
18+
}{
19+
{`c:\users\foo`, true},
20+
{`/home/user`, true},
21+
{`c:/users/foo`, true},
22+
{`\\server\share`, true},
23+
{"d:\\workspace\\sub", true},
24+
{"engram", false},
25+
{"agent-teams-lite", false},
26+
{"e3", false},
27+
{"general", false},
28+
{"", false},
29+
}
30+
for _, tc := range cases {
31+
if got := isPathLikeProjectName(tc.name); got != tc.want {
32+
t.Errorf("isPathLikeProjectName(%q) = %v, want %v", tc.name, got, tc.want)
33+
}
34+
}
35+
}
36+
37+
// TestCmdProjectsPrunePathsOnlyDryRun verifies that `--paths-only` narrows the
38+
// prune candidate set to path-named projects with 0 observations, leaving
39+
// legitimate empty projects and projects with observations untouched.
40+
func TestCmdProjectsPrunePathsOnlyDryRun(t *testing.T) {
41+
cfg := testConfig(t)
42+
43+
// Garbage: path-as-name project, 0 observations (only a session).
44+
mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`)
45+
// Legitimate empty project, 0 observations — must NOT be selected by --paths-only.
46+
mustSeedSession(t, cfg, "s-legit", "legit-empty")
47+
// Real project with observations — never a prune candidate.
48+
mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project")
49+
50+
withArgs(t, "engram", "projects", "prune", "--paths-only", "--dry-run")
51+
stdout, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) })
52+
53+
if stderr != "" {
54+
t.Fatalf("expected no stderr, got: %q", stderr)
55+
}
56+
if !strings.Contains(stdout, `c:\users\foo`) {
57+
t.Fatalf("expected path-named project listed, got: %q", stdout)
58+
}
59+
if strings.Contains(stdout, "legit-empty") {
60+
t.Fatalf("legitimate empty project must NOT be listed with --paths-only, got: %q", stdout)
61+
}
62+
if strings.Contains(stdout, "realproject") {
63+
t.Fatalf("project with observations must never be a prune candidate, got: %q", stdout)
64+
}
65+
if !strings.Contains(stdout, "dry-run") {
66+
t.Fatalf("expected dry-run notice, got: %q", stdout)
67+
}
68+
}
69+
70+
// TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed verifies the actual
71+
// (non-dry-run) deletion: `prune --paths-only` removes path-named empty projects
72+
// while leaving legitimate empty projects and projects with observations intact.
73+
func TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed(t *testing.T) {
74+
cfg := testConfig(t)
75+
76+
mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`)
77+
mustSeedSession(t, cfg, "s-legit", "legit-empty")
78+
mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project")
79+
80+
// Answer "all" to prune every (path-named) candidate.
81+
oldScan := scanInputLine
82+
t.Cleanup(func() { scanInputLine = oldScan })
83+
scanInputLine = func(a ...any) (int, error) {
84+
if ptr, ok := a[0].(*string); ok {
85+
*ptr = "all"
86+
}
87+
return 1, nil
88+
}
89+
90+
withArgs(t, "engram", "projects", "prune", "--paths-only")
91+
_, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) })
92+
if stderr != "" {
93+
t.Fatalf("expected no stderr, got: %q", stderr)
94+
}
95+
96+
s, err := store.New(cfg)
97+
if err != nil {
98+
t.Fatalf("store.New: %v", err)
99+
}
100+
defer s.Close()
101+
// Use ListProjectsWithStats: it includes 0-observation projects (derived from
102+
// sessions), so a not-pruned empty project like "legit-empty" still shows up.
103+
stats, err := s.ListProjectsWithStats()
104+
if err != nil {
105+
t.Fatalf("ListProjectsWithStats: %v", err)
106+
}
107+
names := make([]string, len(stats))
108+
for i, ps := range stats {
109+
names[i] = ps.Name
110+
}
111+
112+
has := func(target string) bool {
113+
for _, n := range names {
114+
if n == target {
115+
return true
116+
}
117+
}
118+
return false
119+
}
120+
if has(`c:\users\foo`) {
121+
t.Fatalf("path-named project should have been pruned, still present: %v", names)
122+
}
123+
if !has("legit-empty") {
124+
t.Fatalf("legitimate empty project must NOT be pruned by --paths-only: %v", names)
125+
}
126+
if !has("realproject") {
127+
t.Fatalf("project with observations must remain: %v", names)
128+
}
129+
}

0 commit comments

Comments
 (0)