Skip to content

Commit fac0c57

Browse files
committed
Fix stability: SQLite lock race, stale SHM recovery, OOM guard, CPU reduction
- Fix ref counting race in ListProjects and watcher pollAll: use AcquireStore instead of ForProject to prevent evictor from closing stores mid-query (#52)
- Add recoverStaleSHM: remove stale SHM files after SIGKILL to prevent deadlock on next connection (#52)
- Increase busy_timeout from 5s to 10s for large databases (#52)
- Default GOMEMLIMIT to 2GB when not configured, preventing unbounded memory growth and OOM kills (#49, #46)
- Reduce mmap_size from 256MB to 64MB per database (#49, #46)
- Increase watcher base interval from 1s to 5s and poll interval base from 1s to 5s, reducing CPU usage by ~5x (#45)
- Normalize Windows drive letter to lowercase in ProjectNameFromPath, preventing duplicate databases for same path (#50)
1 parent 74546ed commit fac0c57

File tree

7 files changed

+74
-32
lines changed

7 files changed

+74
-32
lines changed

cmd/codebase-memory-mcp/main.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,14 @@ func main() {
6767
log.Fatalf("config err=%v", err)
6868
}
6969

70-
// Apply GOMEMLIMIT from config (e.g. "2G", "512M").
71-
if memLimit := cfg.Get(store.ConfigMemLimit, ""); memLimit != "" {
72-
if limit, parseErr := parseByteSize(memLimit); parseErr != nil {
73-
log.Printf("config: invalid mem_limit %q: %v", memLimit, parseErr)
74-
} else {
75-
debug.SetMemoryLimit(limit)
76-
}
70+
// Apply GOMEMLIMIT from config (e.g. "2G", "512M"), default 2GB.
71+
// Prevents OOM kills on systems without user-configured limits.
72+
memLimitStr := cfg.Get(store.ConfigMemLimit, "2G")
73+
if limit, parseErr := parseByteSize(memLimitStr); parseErr != nil {
74+
log.Printf("config: invalid mem_limit %q: %v", memLimitStr, parseErr)
75+
debug.SetMemoryLimit(2 << 30) // fallback: 2GB
76+
} else {
77+
debug.SetMemoryLimit(limit)
7778
}
7879

7980
srv := tools.NewServer(router, tools.WithConfig(cfg))

internal/pipeline/pipeline.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ func ProjectNameFromPath(absPath string) string {
7373
// Clean and normalize separators (backslash is not a separator on non-Windows)
7474
cleaned := filepath.ToSlash(filepath.Clean(absPath))
7575
cleaned = strings.ReplaceAll(cleaned, "\\", "/")
76+
// Normalize Windows drive letter casing: "D:/foo" → "d:/foo"
77+
// Prevents duplicate DBs for same path with different drive letter case.
78+
if len(cleaned) >= 2 && cleaned[1] == ':' {
79+
cleaned = strings.ToLower(cleaned[:1]) + cleaned[1:]
80+
}
7681
// Replace slashes and colons with dashes
7782
name := strings.ReplaceAll(cleaned, "/", "-")
7883
name = strings.ReplaceAll(name, ":", "-")

internal/pipeline/pipeline_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -793,10 +793,10 @@ func TestProjectNameFromPath(t *testing.T) {
793793
{"/Users/martin/projects/myapp", "Users-martin-projects-myapp"},
794794
{"/home/user/repo", "home-user-repo"},
795795
{"/single", "single"},
796-
// Windows paths (#20)
797-
{"C:/Users/project", "C-Users-project"},
798-
{"D:\\Projects\\myapp", "D-Projects-myapp"},
799-
{"C:\\Temp\\codebase-memory-mcp", "C-Temp-codebase-memory-mcp"},
796+
// Windows paths (#20) — drive letter normalized to lowercase (#50)
797+
{"C:/Users/project", "c-Users-project"},
798+
{"D:\\Projects\\myapp", "d-Projects-myapp"},
799+
{"C:\\Temp\\codebase-memory-mcp", "c-Temp-codebase-memory-mcp"},
800800
}
801801
for _, tt := range tests {
802802
got := ProjectNameFromPath(tt.path)

internal/store/router.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ func (r *StoreRouter) AllStores() map[string]*Store {
194194
}
195195

196196
// ListProjects scans .db files and queries each for metadata.
197+
// Uses AcquireStore to prevent the evictor from closing stores mid-query.
198+
// Individual DB failures are logged and skipped (never block the full list).
197199
func (r *StoreRouter) ListProjects() ([]*ProjectInfo, error) {
198200
entries, err := os.ReadDir(r.dir)
199201
if err != nil {
@@ -214,10 +216,12 @@ func (r *StoreRouter) ListProjects() ([]*ProjectInfo, error) {
214216
DBPath: filepath.Join(r.dir, e.Name()),
215217
}
216218

217-
// Try to get root_path from the projects table
218-
s, err := r.ForProject(name)
219-
if err == nil {
219+
// Try to get root_path from the projects table.
220+
// AcquireStore increments refs so the evictor can't close mid-query.
221+
s, release, acqErr := r.AcquireStore(name)
222+
if acqErr == nil {
220223
projects, listErr := s.ListProjects()
224+
release()
221225
if listErr == nil && len(projects) > 0 {
222226
info.RootPath = projects[0].RootPath
223227
}

internal/store/store.go

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,12 @@ func OpenInDir(dir, project string) (*Store, error) {
8181

8282
// OpenPath opens a SQLite database at the given path.
8383
func OpenPath(dbPath string) (*Store, error) {
84+
// Recover from stale SHM left by SIGKILL: if WAL is empty/missing but SHM exists,
85+
// the SHM has stale lock state that can deadlock new connections. Safe to remove.
86+
recoverStaleSHM(dbPath)
87+
8488
dsn := dbPath + "?_journal_mode=WAL" +
85-
"&_busy_timeout=5000" +
89+
"&_busy_timeout=10000" +
8690
"&_foreign_keys=1" +
8791
"&_synchronous=NORMAL" +
8892
"&_txlock=immediate"
@@ -97,7 +101,7 @@ func OpenPath(dbPath string) (*Store, error) {
97101
// PRAGMAs not supported in mattn DSN — set via Exec after Open.
98102
ctx := context.Background()
99103
_, _ = db.ExecContext(ctx, "PRAGMA temp_store = MEMORY")
100-
_, _ = db.ExecContext(ctx, "PRAGMA mmap_size = 268435456") // 256 MB
104+
_, _ = db.ExecContext(ctx, "PRAGMA mmap_size = 67108864") // 64 MB
101105

102106
// Adaptive cache: 10% of DB file size, clamped to 2-64 MB.
103107
cacheMB := adaptiveCacheMB(dbPath)
@@ -114,6 +118,32 @@ func OpenPath(dbPath string) (*Store, error) {
114118
return s, nil
115119
}
116120

121+
// recoverStaleSHM removes stale SHM files left by unclean shutdowns (SIGKILL).
122+
// If the WAL file is empty or missing but the SHM file exists, the SHM contains
123+
// stale lock state that can deadlock new connections. Removing it lets SQLite
124+
// rebuild clean shared memory on next open.
125+
func recoverStaleSHM(dbPath string) {
126+
shmPath := dbPath + "-shm"
127+
walPath := dbPath + "-wal"
128+
129+
shmInfo, shmErr := os.Stat(shmPath)
130+
if shmErr != nil {
131+
return // no SHM file — nothing to recover
132+
}
133+
134+
walInfo, walErr := os.Stat(walPath)
135+
walEmpty := walErr != nil || walInfo.Size() == 0
136+
137+
if walEmpty && shmInfo.Size() > 0 {
138+
slog.Info("store.recover_shm", "path", dbPath, "shm_bytes", shmInfo.Size())
139+
_ = os.Remove(shmPath)
140+
// Also remove empty WAL to let SQLite start fresh
141+
if walErr == nil {
142+
_ = os.Remove(walPath)
143+
}
144+
}
145+
}
146+
117147
// adaptiveCacheMB returns a cache size in MB proportional to the DB file size.
118148
// 10% of DB size, clamped to [2, 64] MB. Returns 2 for missing/small files.
119149
func adaptiveCacheMB(dbPath string) int {

internal/watcher/watcher.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func (s watchStrategy) String() string {
4242
}
4343

4444
const (
45-
baseInterval = 1 * time.Second
45+
baseInterval = 5 * time.Second
4646
maxInterval = 60 * time.Second
4747
fullSnapshotInterval = 5 // polls between forced full snapshots
4848
projectsCacheTTL = 60 * time.Second
@@ -185,25 +185,27 @@ func (w *Watcher) pollAll() {
185185

186186
now := time.Now()
187187
for _, info := range projectInfos {
188-
st, stErr := w.router.ForProject(info.Name)
188+
state, exists := w.projects[info.Name]
189+
if exists && now.Before(state.nextPoll) {
190+
continue // not due yet
191+
}
192+
193+
// AcquireStore increments refs so the evictor can't close mid-query.
194+
st, release, stErr := w.router.AcquireStore(info.Name)
189195
if stErr != nil {
190196
continue
191197
}
192198
proj, projErr := st.GetProject(info.Name)
199+
release()
193200
if projErr != nil || proj == nil {
194201
continue
195202
}
196203

197-
state, exists := w.projects[info.Name]
198204
if !exists {
199205
state = &projectState{}
200206
w.projects[info.Name] = state
201207
}
202208

203-
if exists && now.Before(state.nextPoll) {
204-
continue // not due yet
205-
}
206-
207209
w.pollProject(proj, state)
208210
}
209211
}
@@ -579,9 +581,9 @@ func snapshotsEqual(a, b map[string]fileSnapshot) bool {
579581
}
580582

581583
// pollInterval computes the adaptive interval from file count.
582-
// 1s base + 1s per 500 files, capped at 60s.
584+
// 5s base + 1s per 500 files, capped at 60s.
583585
func pollInterval(fileCount int) time.Duration {
584-
ms := 1000 + (fileCount/500)*1000
586+
ms := 5000 + (fileCount/500)*1000
585587
if ms > 60000 {
586588
ms = 60000
587589
}

internal/watcher/watcher_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -163,13 +163,13 @@ func TestPollInterval(t *testing.T) {
163163
files int
164164
expected time.Duration
165165
}{
166-
{0, 1 * time.Second},
167-
{70, 1 * time.Second},
168-
{499, 1 * time.Second},
169-
{500, 2 * time.Second},
170-
{2000, 5 * time.Second},
171-
{5000, 11 * time.Second},
172-
{10000, 21 * time.Second},
166+
{0, 5 * time.Second},
167+
{70, 5 * time.Second},
168+
{499, 5 * time.Second},
169+
{500, 6 * time.Second},
170+
{2000, 9 * time.Second},
171+
{5000, 15 * time.Second},
172+
{10000, 25 * time.Second},
173173
{50000, 60 * time.Second},
174174
{100000, 60 * time.Second},
175175
}

0 commit comments

Comments
 (0)