Skip to content

Commit 46ed6e6

Browse files
authored
bench(compliance): scanner pipeline benchmarks with baselines
bench(compliance): scanner pipeline benchmarks with baselines
2 parents 0b4a02f + f09378d commit 46ed6e6

File tree

5 files changed

+669
-94
lines changed

5 files changed

+669
-94
lines changed

internal/backends/scip/loader.go

Lines changed: 170 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,9 @@ import (
44
"fmt"
55
"os"
66
"path/filepath"
7+
"runtime"
8+
"sort"
9+
"sync"
710
"time"
811

912
"github.com/SimplyLiz/CodeMCP/internal/errors"
@@ -26,6 +29,9 @@ type SCIPIndex struct {
2629
// Documents are all indexed documents
2730
Documents []*Document
2831

32+
// DocumentsByPath is an O(1) lookup map from relative path to document
33+
DocumentsByPath map[string]*Document
34+
2935
// Symbols maps symbol IDs to symbol information
3036
Symbols map[string]*SymbolInformation
3137

@@ -44,9 +50,6 @@ type SCIPIndex struct {
4450

4551
// IndexedCommit is the git commit the index was built from
4652
IndexedCommit string
47-
48-
// raw is the raw protobuf index
49-
raw *scippb.Index
5053
}
5154

5255
// LoadSCIPIndex loads a SCIP index from the specified path
@@ -93,93 +96,185 @@ func LoadSCIPIndex(path string) (*SCIPIndex, error) {
9396
)
9497
}
9598

96-
// Convert to internal representation
97-
scipIndex := &SCIPIndex{
98-
Metadata: convertMetadata(index.Metadata),
99-
Documents: convertDocuments(index.Documents),
100-
Symbols: make(map[string]*SymbolInformation),
101-
RefIndex: make(map[string][]*OccurrenceRef),
102-
ConvertedSymbols: make(map[string]*SCIPSymbol),
103-
ContainerIndex: make(map[string]string),
104-
LoadedAt: time.Now(),
105-
raw: &index,
106-
}
107-
108-
// Build symbol map and reference index in a single pass
109-
for _, doc := range scipIndex.Documents {
110-
// Index symbols
111-
for _, sym := range doc.Symbols {
112-
scipIndex.Symbols[sym.Symbol] = sym
113-
}
99+
// Convert to internal representation using parallel document processing.
100+
nWorkers := runtime.GOMAXPROCS(0)
101+
102+
// Phase 1: convert documents and build per-doc indexes in parallel.
103+
type docResult struct {
104+
doc *Document
105+
symbols map[string]*SymbolInformation
106+
refEntries map[string][]*OccurrenceRef
107+
containerEntries map[string]string
108+
}
114109

115-
// Build inverted reference index for O(1) lookups
116-
for _, occ := range doc.Occurrences {
117-
if occ.Symbol != "" {
118-
scipIndex.RefIndex[occ.Symbol] = append(
119-
scipIndex.RefIndex[occ.Symbol],
120-
&OccurrenceRef{Doc: doc, Occ: occ},
121-
)
110+
results := make([]docResult, len(index.Documents))
111+
112+
var wg sync.WaitGroup
113+
sem := make(chan struct{}, nWorkers)
114+
115+
for i, pbDoc := range index.Documents {
116+
wg.Add(1)
117+
sem <- struct{}{}
118+
go func(i int, pbDoc *scippb.Document) {
119+
defer wg.Done()
120+
defer func() { <-sem }()
121+
122+
doc := convertDocument(pbDoc)
123+
r := docResult{
124+
doc: doc,
125+
symbols: make(map[string]*SymbolInformation, len(doc.Symbols)),
126+
refEntries: make(map[string][]*OccurrenceRef),
127+
containerEntries: make(map[string]string),
122128
}
123-
}
124129

125-
// Build container index for O(1) containment lookup
126-
// First collect all definition occurrences with enclosing ranges
127-
type defScope struct {
128-
symbol string
129-
startLine int32
130-
endLine int32
131-
}
132-
var defScopes []defScope
133-
for _, occ := range doc.Occurrences {
134-
if occ.SymbolRoles&SymbolRoleDefinition != 0 && len(occ.EnclosingRange) >= 3 {
135-
startLine := occ.EnclosingRange[0]
136-
var endLine int32
137-
if len(occ.EnclosingRange) >= 4 {
138-
endLine = occ.EnclosingRange[2]
139-
} else {
140-
endLine = startLine
130+
// Index symbols
131+
for _, sym := range doc.Symbols {
132+
r.symbols[sym.Symbol] = sym
133+
}
134+
135+
// Build inverted reference index for O(1) lookups
136+
for _, occ := range doc.Occurrences {
137+
if occ.Symbol != "" {
138+
r.refEntries[occ.Symbol] = append(
139+
r.refEntries[occ.Symbol],
140+
&OccurrenceRef{Doc: doc, Occ: occ},
141+
)
141142
}
142-
defScopes = append(defScopes, defScope{
143-
symbol: occ.Symbol,
144-
startLine: startLine,
145-
endLine: endLine,
146-
})
147143
}
148-
}
149144

150-
// For each occurrence, find its innermost containing scope
151-
for _, occ := range doc.Occurrences {
152-
if len(occ.Range) < 2 {
153-
continue
145+
// Build container index.
146+
// Collect definition occurrences that have enclosing ranges.
147+
type defScope struct {
148+
symbol string
149+
startLine int32
150+
endLine int32
154151
}
155-
occLine := occ.Range[0]
156-
157-
// Find the smallest (innermost) scope containing this occurrence
158-
var bestScope *defScope
159-
var bestSize int32 = -1
160-
for i := range defScopes {
161-
ds := &defScopes[i]
162-
if occLine >= ds.startLine && occLine <= ds.endLine {
163-
size := ds.endLine - ds.startLine
164-
if bestScope == nil || size < bestSize {
165-
bestScope = ds
166-
bestSize = size
152+
var defScopes []defScope
153+
for _, occ := range doc.Occurrences {
154+
if occ.SymbolRoles&SymbolRoleDefinition != 0 && len(occ.EnclosingRange) >= 3 {
155+
startLine := occ.EnclosingRange[0]
156+
var endLine int32
157+
if len(occ.EnclosingRange) >= 4 {
158+
endLine = occ.EnclosingRange[2]
159+
} else {
160+
endLine = startLine
167161
}
162+
defScopes = append(defScopes, defScope{
163+
symbol: occ.Symbol,
164+
startLine: startLine,
165+
endLine: endLine,
166+
})
168167
}
169168
}
170169

171-
if bestScope != nil {
172-
key := fmt.Sprintf("%s:%d:%d", doc.RelativePath, occ.Range[0], occ.Range[1])
173-
scipIndex.ContainerIndex[key] = bestScope.symbol
170+
if len(defScopes) > 0 {
171+
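// Sort by scope size ascending so the first match is the innermost; sort.Slice is not stable, so equal-sized scopes have no guaranteed relative order (sort.SliceStable would keep occurrence order).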
172+
sort.Slice(defScopes, func(a, b int) bool {
173+
return (defScopes[a].endLine - defScopes[a].startLine) <
174+
(defScopes[b].endLine - defScopes[b].startLine)
175+
})
176+
177+
for _, occ := range doc.Occurrences {
178+
if len(occ.Range) < 2 {
179+
continue
180+
}
181+
occLine := occ.Range[0]
182+
for idx := range defScopes {
183+
ds := &defScopes[idx]
184+
if occLine >= ds.startLine && occLine <= ds.endLine {
185+
key := fmt.Sprintf("%s:%d:%d", doc.RelativePath, occ.Range[0], occ.Range[1])
186+
r.containerEntries[key] = ds.symbol
187+
break // first match is innermost (sorted by size asc)
188+
}
189+
}
190+
}
174191
}
192+
193+
results[i] = r
194+
}(i, pbDoc)
195+
}
196+
wg.Wait()
197+
198+
// Merge per-doc results into the main index (serial, fast map assignment).
199+
// Pre-size maps from the summed per-document entry counts (an upper bound when keys repeat across documents) to reduce rehashing.
200+
totalSyms := 0
201+
totalRefs := 0
202+
totalContainer := 0
203+
docs := make([]*Document, len(results))
204+
for i, r := range results {
205+
docs[i] = r.doc
206+
totalSyms += len(r.symbols)
207+
totalRefs += len(r.refEntries)
208+
totalContainer += len(r.containerEntries)
209+
}
210+
211+
scipIndex := &SCIPIndex{
212+
Metadata: convertMetadata(index.Metadata),
213+
Documents: docs,
214+
DocumentsByPath: make(map[string]*Document, len(docs)),
215+
Symbols: make(map[string]*SymbolInformation, totalSyms),
216+
RefIndex: make(map[string][]*OccurrenceRef, totalRefs),
217+
ConvertedSymbols: make(map[string]*SCIPSymbol, totalSyms),
218+
ContainerIndex: make(map[string]string, totalContainer),
219+
LoadedAt: time.Now(),
220+
}
221+
222+
for _, doc := range docs {
223+
scipIndex.DocumentsByPath[doc.RelativePath] = doc
224+
}
225+
for _, r := range results {
226+
for k, v := range r.symbols {
227+
scipIndex.Symbols[k] = v
228+
}
229+
for k, v := range r.refEntries {
230+
scipIndex.RefIndex[k] = append(scipIndex.RefIndex[k], v...)
175231
}
232+
for k, v := range r.containerEntries {
233+
scipIndex.ContainerIndex[k] = v
234+
}
235+
}
236+
237+
// Phase 2: pre-convert all symbols in parallel.
238+
// RefIndex and Symbols are fully built at this point (read-only from here).
239+
type symResult struct {
240+
id string
241+
sym *SCIPSymbol
242+
}
243+
244+
symIDs := make([]string, 0, len(scipIndex.Symbols))
245+
for id := range scipIndex.Symbols {
246+
symIDs = append(symIDs, id)
247+
}
248+
249+
symCh := make(chan symResult, len(symIDs))
250+
batchSize := (len(symIDs) + nWorkers - 1) / nWorkers
251+
if batchSize < 1 {
252+
batchSize = 1
176253
}
177254

178-
// Pre-convert all symbols to avoid repeated conversion during queries
179-
for symbolId, symInfo := range scipIndex.Symbols {
180-
if converted, err := convertToSCIPSymbol(symInfo, scipIndex); err == nil {
181-
scipIndex.ConvertedSymbols[symbolId] = converted
255+
var wg2 sync.WaitGroup
256+
for b := 0; b*batchSize < len(symIDs); b++ {
257+
start := b * batchSize
258+
end := start + batchSize
259+
if end > len(symIDs) {
260+
end = len(symIDs)
182261
}
262+
wg2.Add(1)
263+
go func(ids []string) {
264+
defer wg2.Done()
265+
for _, id := range ids {
266+
if converted, err := convertToSCIPSymbol(scipIndex.Symbols[id], scipIndex); err == nil {
267+
symCh <- symResult{id: id, sym: converted}
268+
}
269+
}
270+
}(symIDs[start:end])
271+
}
272+
go func() {
273+
wg2.Wait()
274+
close(symCh)
275+
}()
276+
for r := range symCh {
277+
scipIndex.ConvertedSymbols[r.id] = r.sym
183278
}
184279

185280
// Extract indexed commit from metadata if available
@@ -203,12 +298,7 @@ func (i *SCIPIndex) IsStale(headCommit string) bool {
203298

204299
// GetDocument retrieves a document by its relative path via DocumentsByPath, returning nil when the path is absent or the map was never populated.
205300
func (i *SCIPIndex) GetDocument(relativePath string) *Document {
206-
for _, doc := range i.Documents {
207-
if doc.RelativePath == relativePath {
208-
return doc
209-
}
210-
}
211-
return nil
301+
return i.DocumentsByPath[relativePath]
212302
}
213303

214304
// GetSymbol retrieves symbol information by ID

0 commit comments

Comments
 (0)