Skip to content

Commit 0336e90

Browse files
author
razvan
committed
feat(indexing): add extension breakdown to indexing progress and exclude tmp dir
- Added `Breakdown map[string]int` to `LangStatus` to track sub-extension counts per language (e.g., TS vs JS vs Vue).
- Refactored `CountAllFiles` to return a `FileCountResult` struct containing both main counts and extension breakdowns.
- Updated `engine.go` to populate `Breakdown` during indexing pre-scan, allowing user interfaces to distinguish between types like TypeScript and JavaScript files.
- Added `tmp` to hardcoded exclusion directories to prevent test fixture workspaces from skewing index counts.
- Added comprehensive unit tests for `CountAllFiles` and JSON round-trip stability.
1 parent 4118928 commit 0336e90

5 files changed

Lines changed: 297 additions & 16 deletions

File tree

internal/service/engine/engine.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,12 +1039,12 @@ func (e *Engine) IndexWorkspace(ctx context.Context, path string, recreate bool)
10391039
// This gives us the real on_disk totals for accurate progress reporting,
10401040
// instead of using len(changedFiles) which only reflects modified files.
10411041
fileCounts := e.indexer.CountAllFiles(wctx.Root, excludePatterns)
1042-
logger.Instance.Info("[IDX] ws=%s file counts: %v", wsName, fileCounts)
1042+
logger.Instance.Info("[IDX] ws=%s file counts: %v (breakdowns: %v)", wsName, fileCounts.Counts, fileCounts.Breakdowns)
10431043

10441044
// Sort languages by file count descending so the dominant language is indexed
10451045
// first and AI search works immediately for the most relevant code.
10461046
sort.Slice(languages, func(i, j int) bool {
1047-
return fileCounts[languages[i]] > fileCounts[languages[j]]
1047+
return fileCounts.Counts[languages[i]] > fileCounts.Counts[languages[j]]
10481048
})
10491049
logger.Instance.Info("[IDX] ws=%s indexing order: %v", wsName, languages)
10501050

@@ -1059,17 +1059,21 @@ func (e *Engine) IndexWorkspace(ctx context.Context, path string, recreate bool)
10591059
if s.Languages == nil {
10601060
s.Languages = make(map[string]indexer.LangStatus)
10611061
}
1062-
// Pre-populate real on_disk counts so languages with 0 changed files still appear.
1062+
// Pre-populate real on_disk counts and extension breakdowns so languages
1063+
// with 0 changed files still appear, and consumers see sub-type detail.
10631064
for _, l := range languages {
10641065
entry := s.Languages[l]
1065-
entry.OnDisk = fileCounts[l]
1066+
entry.OnDisk = fileCounts.Counts[l]
1067+
if bd, ok := fileCounts.Breakdowns[l]; ok {
1068+
entry.Breakdown = bd
1069+
}
10661070
s.Languages[l] = entry
10671071
}
10681072
indexer.SaveIndexStatus(wctx.Root, s)
10691073

10701074
var indexErrors []string
10711075
for _, lang := range languages {
1072-
diskTotal := fileCounts[lang]
1076+
diskTotal := fileCounts.Counts[lang]
10731077
collection := wctx.CollectionName(lang)
10741078
logger.Instance.Info("[IDX] ws=%s lang=%s ▶ starting (on_disk=%d)", wsName, lang, diskTotal)
10751079

pkg/indexer/count_files_test.go

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
package indexer
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
// Import parsers so they register via init()
9+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/css"
10+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/docs"
11+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/go"
12+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/html"
13+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/javascript"
14+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/php"
15+
_ "github.com/doITmagic/rag-code-mcp/pkg/parser/python"
16+
)
17+
18+
// createFile is a test helper that creates a file with minimal content.
19+
func createFile(t *testing.T, path string) {
20+
t.Helper()
21+
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
22+
t.Fatal(err)
23+
}
24+
if err := os.WriteFile(path, []byte("// test"), 0o644); err != nil {
25+
t.Fatal(err)
26+
}
27+
}
28+
29+
func TestCountAllFiles_Breakdown(t *testing.T) {
30+
root := t.TempDir()
31+
svc := &Service{} // CountAllFiles doesn't need embedder or store
32+
33+
// Create a mixed JS/TS workspace
34+
createFile(t, filepath.Join(root, "src", "app.ts"))
35+
createFile(t, filepath.Join(root, "src", "utils.ts"))
36+
createFile(t, filepath.Join(root, "src", "types.tsx"))
37+
createFile(t, filepath.Join(root, "src", "legacy.js"))
38+
createFile(t, filepath.Join(root, "src", "config.mjs"))
39+
createFile(t, filepath.Join(root, "src", "App.vue"))
40+
41+
// Create Go files
42+
createFile(t, filepath.Join(root, "backend", "main.go"))
43+
createFile(t, filepath.Join(root, "backend", "handler.go"))
44+
45+
// Create docs
46+
createFile(t, filepath.Join(root, "README.md"))
47+
createFile(t, filepath.Join(root, "config.yaml"))
48+
49+
// Create PHP
50+
createFile(t, filepath.Join(root, "web", "index.php"))
51+
52+
result := svc.CountAllFiles(root, nil)
53+
54+
// Verify total counts
55+
if result.Counts["javascript"] != 6 {
56+
t.Errorf("javascript count: got %d, want 6", result.Counts["javascript"])
57+
}
58+
if result.Counts["go"] != 2 {
59+
t.Errorf("go count: got %d, want 2", result.Counts["go"])
60+
}
61+
if result.Counts["docs"] != 2 {
62+
t.Errorf("docs count: got %d, want 2", result.Counts["docs"])
63+
}
64+
if result.Counts["php"] != 1 {
65+
t.Errorf("php count: got %d, want 1", result.Counts["php"])
66+
}
67+
68+
// Verify JavaScript breakdown — the key test!
69+
jsBd := result.Breakdowns["javascript"]
70+
if jsBd == nil {
71+
t.Fatal("expected javascript breakdown to be non-nil")
72+
}
73+
if jsBd[".ts"] != 2 {
74+
t.Errorf("javascript .ts: got %d, want 2", jsBd[".ts"])
75+
}
76+
if jsBd[".tsx"] != 1 {
77+
t.Errorf("javascript .tsx: got %d, want 1", jsBd[".tsx"])
78+
}
79+
if jsBd[".js"] != 1 {
80+
t.Errorf("javascript .js: got %d, want 1", jsBd[".js"])
81+
}
82+
if jsBd[".mjs"] != 1 {
83+
t.Errorf("javascript .mjs: got %d, want 1", jsBd[".mjs"])
84+
}
85+
if jsBd[".vue"] != 1 {
86+
t.Errorf("javascript .vue: got %d, want 1", jsBd[".vue"])
87+
}
88+
89+
// Verify Go breakdown (single extension)
90+
goBd := result.Breakdowns["go"]
91+
if goBd[".go"] != 2 {
92+
t.Errorf("go .go: got %d, want 2", goBd[".go"])
93+
}
94+
95+
// Verify docs breakdown
96+
docsBd := result.Breakdowns["docs"]
97+
if docsBd[".md"] != 1 {
98+
t.Errorf("docs .md: got %d, want 1", docsBd[".md"])
99+
}
100+
if docsBd[".yaml"] != 1 {
101+
t.Errorf("docs .yaml: got %d, want 1", docsBd[".yaml"])
102+
}
103+
}
104+
105+
func TestCountAllFiles_ExcludePatterns(t *testing.T) {
106+
root := t.TempDir()
107+
svc := &Service{}
108+
109+
createFile(t, filepath.Join(root, "src", "app.ts"))
110+
createFile(t, filepath.Join(root, "src", "utils.js"))
111+
createFile(t, filepath.Join(root, "dist", "bundle.js")) // should be excluded
112+
createFile(t, filepath.Join(root, "build", "output.js")) // should be excluded
113+
114+
result := svc.CountAllFiles(root, []string{"dist", "build"})
115+
116+
if result.Counts["javascript"] != 2 {
117+
t.Errorf("javascript count with excludes: got %d, want 2", result.Counts["javascript"])
118+
}
119+
120+
jsBd := result.Breakdowns["javascript"]
121+
if jsBd[".ts"] != 1 {
122+
t.Errorf("javascript .ts with excludes: got %d, want 1", jsBd[".ts"])
123+
}
124+
if jsBd[".js"] != 1 {
125+
t.Errorf("javascript .js with excludes: got %d, want 1", jsBd[".js"])
126+
}
127+
}
128+
129+
func TestCountAllFiles_EmptyDir(t *testing.T) {
130+
root := t.TempDir()
131+
svc := &Service{}
132+
133+
result := svc.CountAllFiles(root, nil)
134+
135+
if len(result.Counts) != 0 {
136+
t.Errorf("expected empty counts for empty dir, got %v", result.Counts)
137+
}
138+
if len(result.Breakdowns) != 0 {
139+
t.Errorf("expected empty breakdowns for empty dir, got %v", result.Breakdowns)
140+
}
141+
}
142+
143+
func TestCountAllFiles_NodeModulesExcluded(t *testing.T) {
144+
root := t.TempDir()
145+
svc := &Service{}
146+
147+
createFile(t, filepath.Join(root, "src", "app.ts"))
148+
createFile(t, filepath.Join(root, "node_modules", "lib", "index.js")) // auto-excluded
149+
150+
result := svc.CountAllFiles(root, nil)
151+
152+
if result.Counts["javascript"] != 1 {
153+
t.Errorf("expected 1 (node_modules excluded), got %d", result.Counts["javascript"])
154+
}
155+
if result.Breakdowns["javascript"][".ts"] != 1 {
156+
t.Errorf("expected .ts=1, got %d", result.Breakdowns["javascript"][".ts"])
157+
}
158+
}

pkg/indexer/index_status.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ type IndexStatus struct {
2727

2828
// LangStatus holds indexing stats for a single language.
2929
type LangStatus struct {
30-
OnDisk int `json:"on_disk"` // total files on disk for this language
31-
Changed int `json:"-"` // internal: files that need processing (hidden from AI consumers)
32-
Processed int `json:"processed"` // files processed so far
30+
OnDisk int `json:"on_disk"` // total files on disk for this language
31+
Changed int `json:"-"` // internal: files that need processing (hidden from AI consumers)
32+
Processed int `json:"processed"` // files processed so far
33+
Breakdown map[string]int `json:"breakdown,omitempty"` // extension → count (e.g. ".ts": 37, ".js": 2)
3334
}
3435

3536
// callerChain returns a compact caller stack (skipping skip frames) for debugging.

pkg/indexer/index_status_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,102 @@ func TestLoadIndexStatusMissing(t *testing.T) {
4141
t.Fatal("expected nil for missing file")
4242
}
4343
}
44+
45+
func TestIndexStatusBreakdownRoundTrip(t *testing.T) {
46+
wsRoot := t.TempDir()
47+
48+
status := &IndexStatus{
49+
StartedAt: "2026-04-01T00:00:00Z",
50+
Languages: map[string]LangStatus{
51+
"javascript": {
52+
OnDisk: 41,
53+
Processed: 41,
54+
Breakdown: map[string]int{
55+
".ts": 30,
56+
".tsx": 7,
57+
".js": 3,
58+
".vue": 1,
59+
},
60+
},
61+
"go": {
62+
OnDisk: 80,
63+
Processed: 80,
64+
Breakdown: map[string]int{
65+
".go": 80,
66+
},
67+
},
68+
"docs": {
69+
OnDisk: 15,
70+
Processed: 15,
71+
Breakdown: map[string]int{
72+
".md": 10,
73+
".yaml": 3,
74+
".json": 2,
75+
},
76+
},
77+
},
78+
}
79+
80+
SaveIndexStatus(wsRoot, status)
81+
loaded := LoadIndexStatus(wsRoot)
82+
83+
if loaded == nil {
84+
t.Fatal("expected non-nil after save")
85+
}
86+
87+
// Verify JavaScript breakdown
88+
jsStatus := loaded.Languages["javascript"]
89+
if jsStatus.OnDisk != 41 {
90+
t.Errorf("javascript OnDisk: got %d, want 41", jsStatus.OnDisk)
91+
}
92+
if jsStatus.Breakdown[".ts"] != 30 {
93+
t.Errorf("javascript .ts: got %d, want 30", jsStatus.Breakdown[".ts"])
94+
}
95+
if jsStatus.Breakdown[".tsx"] != 7 {
96+
t.Errorf("javascript .tsx: got %d, want 7", jsStatus.Breakdown[".tsx"])
97+
}
98+
if jsStatus.Breakdown[".js"] != 3 {
99+
t.Errorf("javascript .js: got %d, want 3", jsStatus.Breakdown[".js"])
100+
}
101+
if jsStatus.Breakdown[".vue"] != 1 {
102+
t.Errorf("javascript .vue: got %d, want 1", jsStatus.Breakdown[".vue"])
103+
}
104+
if len(jsStatus.Breakdown) != 4 {
105+
t.Errorf("javascript breakdown length: got %d, want 4", len(jsStatus.Breakdown))
106+
}
107+
108+
// Verify docs breakdown has multiple extensions
109+
docsStatus := loaded.Languages["docs"]
110+
if docsStatus.Breakdown[".md"] != 10 {
111+
t.Errorf("docs .md: got %d, want 10", docsStatus.Breakdown[".md"])
112+
}
113+
114+
// Verify go breakdown (single extension)
115+
goStatus := loaded.Languages["go"]
116+
if goStatus.Breakdown[".go"] != 80 {
117+
t.Errorf("go .go: got %d, want 80", goStatus.Breakdown[".go"])
118+
}
119+
}
120+
121+
func TestIndexStatusBreakdownOmitEmpty(t *testing.T) {
122+
wsRoot := t.TempDir()
123+
124+
// Status without breakdown — should not appear in JSON
125+
status := &IndexStatus{
126+
StartedAt: "2026-04-01T00:00:00Z",
127+
Languages: map[string]LangStatus{
128+
"go": {OnDisk: 50, Processed: 50},
129+
},
130+
}
131+
132+
SaveIndexStatus(wsRoot, status)
133+
loaded := LoadIndexStatus(wsRoot)
134+
135+
if loaded == nil {
136+
t.Fatal("expected non-nil after save")
137+
}
138+
goStatus := loaded.Languages["go"]
139+
if goStatus.Breakdown != nil {
140+
t.Errorf("expected nil breakdown for omitempty, got %v", goStatus.Breakdown)
141+
}
142+
}

pkg/indexer/service.go

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func (s *Service) IndexWorkspace(ctx context.Context, root string, collection st
7878
if d.IsDir() {
7979
name := d.Name()
8080
// Basic exclusion
81-
if strings.HasPrefix(name, ".") || name == "vendor" || name == "node_modules" {
81+
if strings.HasPrefix(name, ".") || name == "vendor" || name == "node_modules" || name == "tmp" {
8282
return filepath.SkipDir
8383
}
8484
// User exclusion
@@ -612,19 +612,29 @@ func (s *Service) symbolToMap(sym parser.Symbol) map[string]interface{} {
612612
return res
613613
}
614614

615+
// FileCountResult holds the results of a file count scan:
616+
// per-language totals and per-extension breakdowns within each language.
617+
type FileCountResult struct {
618+
Counts map[string]int // langName → total count
619+
Breakdowns map[string]map[string]int // langName → (extension → count)
620+
}
621+
615622
// CountAllFiles counts files per language in root using a single WalkDir pass,
616623
// applying the same directory exclusion rules as IndexWorkspace.
617-
// It returns a map[langName]count that can be used to pre-populate progress
618-
// totals before indexing begins, avoiding O(languages × files) traversals.
619-
func (s *Service) CountAllFiles(root string, excludePatterns []string) map[string]int {
620-
counts := make(map[string]int)
624+
// It returns a FileCountResult with per-language totals and per-extension
625+
// breakdowns, used to pre-populate progress totals before indexing begins.
626+
func (s *Service) CountAllFiles(root string, excludePatterns []string) FileCountResult {
627+
result := FileCountResult{
628+
Counts: make(map[string]int),
629+
Breakdowns: make(map[string]map[string]int),
630+
}
621631
_ = filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error {
622632
if err != nil {
623633
return nil
624634
}
625635
if d.IsDir() {
626636
name := d.Name()
627-
if strings.HasPrefix(name, ".") || name == "vendor" || name == "node_modules" {
637+
if strings.HasPrefix(name, ".") || name == "vendor" || name == "node_modules" || name == "tmp" {
628638
return filepath.SkipDir
629639
}
630640
for _, p := range excludePatterns {
@@ -638,8 +648,17 @@ func (s *Service) CountAllFiles(root string, excludePatterns []string) map[strin
638648
if a == nil {
639649
return nil
640650
}
641-
counts[a.Name()]++
651+
lang := a.Name()
652+
result.Counts[lang]++
653+
654+
// Track per-extension breakdown
655+
ext := strings.ToLower(filepath.Ext(path))
656+
if result.Breakdowns[lang] == nil {
657+
result.Breakdowns[lang] = make(map[string]int)
658+
}
659+
result.Breakdowns[lang][ext]++
660+
642661
return nil
643662
})
644-
return counts
663+
return result
645664
}

0 commit comments

Comments
 (0)