Parallelize index building with a worker pool; bump Go toolchain and deps.

NOTE(review): this patch was reconstructed from a copy whose newlines and
indentation had been collapsed. Indentation and alignment padding in context
lines are inferred, so apply with `git apply --ignore-whitespace` or
regenerate the patch from the working tree. Blob hashes on the `index` lines
predate the review changes to internal/index/builder.go.

Review changes relative to the submitted patch (internal/index/builder.go):
  * Build now waits for the collector goroutine before checking the walk
    error, so it never returns while goroutines are still mutating the
    Builder.
  * Build's doc comment notes that file IDs follow result-arrival order.

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 0b443f3..83a4ad6 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -17,9 +17,9 @@ jobs:
     - uses: actions/checkout@v4
 
     - name: Set up Go
-      uses: actions/setup-go@v4
+      uses: actions/setup-go@v5
       with:
-        go-version: '1.20'
+        go-version: '1.25'
 
     - name: Build
       run: go build -v ./...
diff --git a/go.mod b/go.mod
index 9df9b47..e897de7 100644
--- a/go.mod
+++ b/go.mod
@@ -1,10 +1,13 @@
 module grepturbo
 
-go 1.20
+go 1.25.0
+
+require (
+	github.com/spf13/cobra v1.10.2
+	golang.org/x/sys v0.30.0
+)
 
 require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
-	github.com/spf13/cobra v1.10.2 // indirect
 	github.com/spf13/pflag v1.0.9 // indirect
-	golang.org/x/sys v0.42.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 4f009be..248c894 100644
--- a/go.sum
+++ b/go.sum
@@ -7,6 +7,6 @@ github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiT
 github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
-golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
-golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
+golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/internal/index/builder.go b/internal/index/builder.go
index 44baa70..4d05283 100644
--- a/internal/index/builder.go
+++ b/internal/index/builder.go
@@ -4,6 +4,8 @@ import (
 	"io/fs"
 	"os"
 	"path/filepath"
+	"runtime"
+	"sync"
 	"unicode/utf8"
 
 	"grepturbo/internal/posting"
@@ -65,7 +67,15 @@ var defaultSkipDirs = map[string]bool{
 	".fastregex": true,
 }
 
-// Build walks all files under rootDir and indexes each one.
+type extractResult struct {
+	path     string
+	trigrams []trigram.T
+}
+
+// Build walks all files under rootDir and indexes each one concurrently.
 // Directories listed in skip are skipped entirely (e.g. "node_modules").
 // Directories and files that fail to read are silently skipped.
+//
+// File IDs are assigned in the order worker results reach the collector,
+// so the ID given to a particular path can differ between runs.
 func (b *Builder) Build(rootDir string, skip ...string) error {
@@ -77,6 +87,48 @@ func (b *Builder) Build(rootDir string, skip ...string) error {
 		skipSet[s] = true
 	}
 
+	paths := make(chan string, 100)
+	results := make(chan extractResult, 100)
+
+	// Worker pool: read files and extract trigrams
+	var wg sync.WaitGroup
+	numWorkers := runtime.GOMAXPROCS(0)
+	for i := 0; i < numWorkers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for path := range paths {
+				data, err := os.ReadFile(path)
+				if err != nil || !utf8.Valid(data) || len(data) > maxFileSize {
+					continue
+				}
+				results <- extractResult{
+					path:     path,
+					trigrams: trigram.Extract(string(data)),
+				}
+			}
+		}()
+	}
+
+	// Signal workers are done
+	go func() {
+		wg.Wait()
+		close(results)
+	}()
+
+	// Collector: update Builder state (sequential, lock-free)
+	done := make(chan struct{})
+	go func() {
+		for res := range results {
+			fileID := uint32(len(b.Files))
+			b.Files = append(b.Files, res.path)
+			for _, t := range res.trigrams {
+				b.Posts.AddBatch(t, []uint32{fileID})
+			}
+		}
+		close(done)
+	}()
+
 	err := filepath.WalkDir(rootDir, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return nil
@@ -87,12 +139,18 @@ func (b *Builder) Build(rootDir string, skip ...string) error {
 			}
 			return nil
 		}
-		b.Add(path)
+		paths <- path
 		return nil
 	})
+	close(paths)
+
+	// Wait for the collector even when the walk failed, so no goroutine is
+	// still appending to b.Files or b.Posts after Build returns.
+	<-done
+
 	if err != nil {
 		return err
 	}
 	b.Posts.Finalize()
 	return nil
 }
diff --git a/scripts/benchcmd/main.go b/scripts/benchcmd/main.go
index 1a9d0d7..1b33880 100644
--- a/scripts/benchcmd/main.go
+++ b/scripts/benchcmd/main.go
@@ -9,7 +9,9 @@ import (
 	"io/fs"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strings"
+	"sync"
 	"unicode/utf8"
 
 	"grepturbo/internal/index"
@@ -28,6 +30,11 @@ type result struct {
 	TotalMs float64 `json:"total_ms"`
 }
 
+type extractResult struct {
+	path     string
+	trigrams []trigram.T
+}
+
 func main() {
 	root := flag.String("root", ".", "root directory to index")
 	out := flag.String("out", "/tmp/grepturbo_bench_idx", "output index dir")
@@ -51,6 +58,46 @@ func main() {
 
 	// ── Phase: walk + extract ─────────────────────────────────────────────────
 	walkStart := time.Now()
+
+	paths := make(chan string, 100)
+	results := make(chan extractResult, 100)
+
+	var wg sync.WaitGroup
+	numWorkers := runtime.GOMAXPROCS(0)
+	for i := 0; i < numWorkers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for path := range paths {
+				data, err := os.ReadFile(path)
+				if err != nil || !utf8.Valid(data) || len(data) > maxFileSize {
+					continue
+				}
+				results <- extractResult{
+					path:     path,
+					trigrams: trigram.Extract(string(data)),
+				}
+			}
+		}()
+	}
+
+	go func() {
+		wg.Wait()
+		close(results)
+	}()
+
+	done := make(chan struct{})
+	go func() {
+		for res := range results {
+			fileID := uint32(len(b.Files))
+			b.Files = append(b.Files, res.path)
+			for _, t := range res.trigrams {
+				b.Posts.AddBatch(t, []uint32{fileID})
+			}
+		}
+		close(done)
+	}()
+
 	filepath.WalkDir(*root, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return nil
@@ -61,19 +108,12 @@ func main() {
 			}
 			return nil
 		}
-
-		data, err := os.ReadFile(path)
-		if err != nil || !utf8.Valid(data) || len(data) > maxFileSize {
-			return nil
-		}
-
-		fileID := uint32(len(b.Files))
-		b.Files = append(b.Files, path)
-		for _, t := range trigram.Extract(string(data)) {
-			b.Posts.AddBatch(t, []uint32{fileID})
-		}
+		paths <- path
 		return nil
 	})
+	close(paths)
+	<-done
+
 	walkExtractMs := ms(time.Since(walkStart))
 
 	// ── Phase: finalize ───────────────────────────────────────────────────────