Skip to content

Commit f0def5a

Browse files
authored
Merge pull request #865 from nextlevelbuilder/dev
Release: vault enrich filter, stop bug, graph, tests, security fixes
2 parents fb60403 + 1ea24ef commit f0def5a

4 files changed

Lines changed: 74 additions & 17 deletions

File tree

internal/vault/enrich_classify.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"log/slog"
77
"maps"
88
"slices"
9-
"strings"
109

1110
"github.com/nextlevelbuilder/goclaw/internal/providers"
1211
"github.com/nextlevelbuilder/goclaw/internal/store"
@@ -154,8 +153,8 @@ func (w *EnrichWorker) gatherCandidates(ctx context.Context, tenantID, _ string,
154153
if n.Score < enrichSimilarityMin || n.Document.Summary == "" {
155154
continue
156155
}
157-
// Skip auto-generated media files as link targets — they create noise.
158-
if strings.HasPrefix(n.Document.PathBasename, "goclaw_gen_") {
156+
// Skip meaningless filenames as link targets — they create noise.
157+
if shouldSkipEnrichment(n.Document.PathBasename) {
159158
continue
160159
}
161160
// Bidirectional dedup: only process each pair once.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package vault
2+
3+
import (
4+
"log/slog"
5+
"regexp"
6+
"strings"
7+
)
8+
9+
// Filename patterns that carry no semantic meaning. Each one is compiled once
// at package initialization and is matched against the stem (the basename
// with its final extension removed).
var (
	// reDigitsOnly matches stems made up solely of ASCII digits.
	reDigitsOnly = regexp.MustCompile(`^[0-9]+$`)
	// reUUID matches canonical 8-4-4-4-12 hex UUIDs, case-insensitively.
	reUUID = regexp.MustCompile(`(?i)^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
	// reHexHash matches stems of eight or more hex digits, case-insensitively.
	reHexHash = regexp.MustCompile(`(?i)^[0-9a-f]{8,}$`)
	// reMixedJunk matches a generic prefix (img, tmp, scan, ...) followed by
	// "_" or "-" and a run of digits, case-insensitively.
	reMixedJunk = regexp.MustCompile(`(?i)^(img|tmp|temp|dsc|screenshot|untitled|file|pic|photo|vid|clip|scan|page)[_-][0-9]+$`)
)

// enrichSkipRules lists the skip checks in evaluation order. The first rule
// whose match function accepts the stem decides the outcome, and its logMsg is
// the message emitted at debug level.
var enrichSkipRules = []struct {
	logMsg string
	match  func(stem string) bool
}{
	{"vault.enrich: skip_generated", func(stem string) bool { return strings.HasPrefix(stem, "goclaw_gen_") }},
	// Length is measured in bytes, not runes.
	{"vault.enrich: skip_short", func(stem string) bool { return len(stem) < 3 }},
	{"vault.enrich: skip_digits", reDigitsOnly.MatchString},
	{"vault.enrich: skip_uuid", reUUID.MatchString},
	{"vault.enrich: skip_hex_hash", reHexHash.MatchString},
	{"vault.enrich: skip_mixed_junk", reMixedJunk.MatchString},
}

// shouldSkipEnrichment reports whether basename names a file that would only
// add noise during enrichment: generated output, very short names, pure
// digits, UUIDs, hex hashes, or generic prefix-plus-number names.
//
// The checks run against the stem, i.e. basename with everything from the
// last "." onward removed. A leading dot (as in ".gitignore") is not treated
// as an extension separator. All patterns are ASCII-only, so names written in
// other scripts (for example CJK) are not matched by them.
//
// The first matching rule logs a debug message naming the reason and the
// function returns true; if no rule matches it returns false.
func shouldSkipEnrichment(basename string) bool {
	stem := basename
	if dot := strings.LastIndexByte(basename, '.'); dot > 0 {
		stem = basename[:dot]
	}

	for _, rule := range enrichSkipRules {
		if rule.match(stem) {
			slog.Debug(rule.logMsg, "file", basename)
			return true
		}
	}
	return false
}

internal/vault/enrich_worker.go

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,18 +99,20 @@ func (w *EnrichWorker) resolveProviderForTenant(ctx context.Context, tenantID st
9999
// Stop cancels in-flight enrichment for the given tenant.
100100
// Safe to call even if no enrichment is running.
101101
func (w *EnrichWorker) Stop(tenantID string) {
102-
if cancel, ok := w.cancelFuncs.Load(tenantID); ok {
102+
if cancel, ok := w.cancelFuncs.LoadAndDelete(tenantID); ok {
103103
cancel.(context.CancelFunc)()
104-
w.cancelFuncs.Delete(tenantID)
105-
w.progress.Finish()
106-
slog.Info("vault.enrich: stopped by user", "tenant", tenantID)
107104
}
105+
// Always finish progress — ensures UI resets even if cancelFuncs was empty.
106+
w.progress.Finish()
107+
slog.Info("vault.enrich: stopped by user", "tenant", tenantID)
108108
}
109109

110110
// IsRunning returns true if enrichment is in progress for the tenant.
111111
func (w *EnrichWorker) IsRunning(tenantID string) bool {
112-
_, ok := w.cancelFuncs.Load(tenantID)
113-
return ok
112+
if _, ok := w.cancelFuncs.Load(tenantID); ok {
113+
return true
114+
}
115+
return w.progress.Status().Running
114116
}
115117

116118
// EnqueueUnenriched fetches documents with empty summary and emits enrichment events.
@@ -127,8 +129,8 @@ func (w *EnrichWorker) EnqueueUnenriched(ctx context.Context, tenantID, workspac
127129

128130
count := 0
129131
for _, doc := range docs {
130-
// Skip auto-generated media files — they create noise links.
131-
if strings.HasPrefix(filepath.Base(doc.Path), "goclaw_gen_") {
132+
// Skip meaningless filenames — they create noise links.
133+
if shouldSkipEnrichment(filepath.Base(doc.Path)) {
132134
continue
133135
}
134136
agentID := ""
@@ -178,10 +180,8 @@ func (w *EnrichWorker) Handle(ctx context.Context, event eventbus.DomainEvent) e
178180
return nil
179181
}
180182

181-
// Skip auto-generated media files (goclaw_gen_*) — they create excessive
182-
// noise links due to similar embeddings. These are typically image outputs
183-
// that don't benefit from semantic linking.
184-
if basename := filepath.Base(payload.Path); strings.HasPrefix(basename, "goclaw_gen_") {
183+
// Skip meaningless filenames — they create noise links.
184+
if shouldSkipEnrichment(filepath.Base(payload.Path)) {
185185
return nil
186186
}
187187

@@ -201,7 +201,12 @@ func (w *EnrichWorker) Handle(ctx context.Context, event eventbus.DomainEvent) e
201201
return nil // another goroutine already processing this agent's queue
202202
}
203203

204-
w.processBatch(ctx, key)
204+
// Create per-tenant cancel context for stop capability.
205+
cancelCtx, cancel := context.WithCancel(ctx)
206+
w.cancelFuncs.Store(payload.TenantID, cancel)
207+
w.processBatch(cancelCtx, key)
208+
// Clean up after batch completes naturally.
209+
w.cancelFuncs.Delete(payload.TenantID)
205210
return nil
206211
}
207212

@@ -217,6 +222,10 @@ type enriched struct {
217222
// overwhelm the LLM provider with hundreds of concurrent requests.
218223
func (w *EnrichWorker) processBatch(ctx context.Context, key string) {
219224
for {
225+
if ctx.Err() != nil {
226+
w.queue.TryFinish(key)
227+
return
228+
}
220229
items := w.queue.Drain(key)
221230
if len(items) == 0 {
222231
if w.queue.TryFinish(key) {

internal/vault/rescan.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func RescanWorkspace(ctx context.Context, params RescanParams, vs store.VaultSto
117117
// can call progress.Start(total) before workers receive events.
118118
// Skip files with meaningless names (auto-generated, hash-named, etc.) — they
119119
// create noise links and shouldn't be counted in progress tracking.
120-
if bus != nil && !strings.HasPrefix(filepath.Base(relPath), "goclaw_gen_") {
120+
if bus != nil && !shouldSkipEnrichment(filepath.Base(relPath)) {
121121
result.PendingEvents = append(result.PendingEvents, eventbus.DomainEvent{
122122
ID: uuid.Must(uuid.NewV7()).String(),
123123
Type: eventbus.EventVaultDocUpserted,

0 commit comments

Comments
 (0)