Skip to content

Commit f744dcc

Browse files
committed
perf: stream git log output via bufio.Scanner, fix bench map hint
Replace cmd.Output()+strings.Split with StdoutPipe+bufio.Scanner so the full git log is never held in memory as one bulk string. On a repo with 10k commits the old path buffered ~500 KB of output, copied it into a string, and then built a []string of every line ([]byte→string→[]string); now each line is processed in place from bufio.Scanner's internal buffer, which grows to at most 64 KB. Also switch from strings.TrimSpace to bytes.TrimRight(b, "\r") — precise: it strips only a trailing carriage return and never scans leading whitespace — and fix BenchmarkCoChangePipelineSimulated's pairs map hint from commits×files (20k) to files×(files-1)/2 (190), the actual ceiling on unique pairs.
1 parent 4510d4f commit f744dcc

2 files changed

Lines changed: 35 additions & 16 deletions

File tree

internal/perf/analyzer.go

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package perf
22

33
import (
4+
"bufio"
5+
"bytes"
46
"context"
57
"fmt"
68
"log/slog"
@@ -240,11 +242,14 @@ func (a *Analyzer) buildCoChangePairs(ctx context.Context, since time.Time, scop
240242
cmd := exec.CommandContext(ctx, "git", args...)
241243
cmd.Dir = a.repoRoot
242244

243-
out, err := cmd.Output()
245+
var stderrBuf bytes.Buffer
246+
cmd.Stderr = &stderrBuf
247+
248+
stdout, err := cmd.StdoutPipe()
244249
if err != nil {
245-
if exitErr, ok := err.(*exec.ExitError); ok && len(exitErr.Stderr) > 0 {
246-
return nil, nil, fmt.Errorf("git log: %s", exitErr.Stderr)
247-
}
250+
return nil, nil, fmt.Errorf("git log pipe: %w", err)
251+
}
252+
if err := cmd.Start(); err != nil {
248253
return nil, nil, fmt.Errorf("git log: %w", err)
249254
}
250255

@@ -255,22 +260,35 @@ func (a *Analyzer) buildCoChangePairs(ctx context.Context, since time.Time, scop
255260
seen := make(map[string]bool, 32)
256261
var currentFiles []string
257262

258-
// Parse output: groups of files per commit, separated by blank lines.
259-
for _, line := range strings.Split(string(out), "\n") {
260-
line = strings.TrimSpace(line)
261-
if strings.HasPrefix(line, "COMMIT ") {
262-
// Flush previous group.
263+
// Stream git output line by line — avoids loading the full log into memory
264+
// and copying it to a string before splitting.
265+
commitPrefix := []byte("COMMIT ")
266+
scanner := bufio.NewScanner(stdout)
267+
for scanner.Scan() {
268+
b := bytes.TrimRight(scanner.Bytes(), "\r")
269+
if bytes.HasPrefix(b, commitPrefix) {
263270
a.recordCommit(currentFiles, pairCounts, fileTotals, seen)
264271
currentFiles = currentFiles[:0]
265272
continue
266273
}
267-
if line == "" {
274+
if len(b) == 0 {
268275
continue
269276
}
270-
currentFiles = append(currentFiles, line)
277+
currentFiles = append(currentFiles, string(b))
278+
}
279+
if scanErr := scanner.Err(); scanErr != nil {
280+
_ = cmd.Wait()
281+
return nil, nil, fmt.Errorf("reading git log: %w", scanErr)
271282
}
272283
a.recordCommit(currentFiles, pairCounts, fileTotals, seen)
273284

285+
if err := cmd.Wait(); err != nil {
286+
if s := stderrBuf.String(); s != "" {
287+
return nil, nil, fmt.Errorf("git log: %s", s)
288+
}
289+
return nil, nil, fmt.Errorf("git log: %w", err)
290+
}
291+
274292
return pairCounts, fileTotals, nil
275293
}
276294

internal/perf/perf_bench_test.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,11 @@ import (
3939
// correlationFilter/~20kpairs: ~372 µs/op, 0 B/op, 0 allocs/op
4040
//
4141
// Notable: recordCommit is O(files²) per commit — the dominant cost on repos
42-
// with large commits (fmt sweeps, mass renames). The seen map is now allocated
43-
// once in buildCoChangePairs and reused across commits (range-delete to clear),
44-
// reducing CoChangePipeline allocs by ~97% at 1k commits. The ignore filter
45-
// cuts pairing work by dropping testdata/vendor before O(n²).
42+
// with large commits (fmt sweeps, mass renames). The seen map is allocated once
43+
// in buildCoChangePairs and reused across commits (range-delete to clear),
44+
// reducing CoChangePipeline allocs by ~97% at 1k commits. git output is parsed
45+
// via bufio.Scanner to avoid loading the full log into memory before processing.
46+
// The ignore filter cuts pairing work by dropping testdata/vendor before O(n²).
4647
//
4748
// Use benchstat for before/after comparison:
4849
// go test -bench=. -benchmem -count=6 -run=^$ ./internal/perf > before.txt
@@ -162,7 +163,7 @@ func BenchmarkCoChangePipelineSimulated(b *testing.B) {
162163
b.ReportAllocs()
163164
b.ResetTimer()
164165
for i := 0; i < b.N; i++ {
165-
pairs := make(map[filePair]int, sc.commits*sc.files)
166+
pairs := make(map[filePair]int, sc.files*(sc.files-1)/2)
166167
totals := make(map[string]int, sc.files*2)
167168
seen := make(map[string]bool, sc.files)
168169
for _, batch := range batches {

0 commit comments

Comments
 (0)