Skip to content

Commit 1dd1fae

Browse files
refactor: simplify reading chunk logic
1 parent 60ebfba commit 1dd1fae

3 files changed

Lines changed: 24 additions & 99 deletions

File tree

engine/chunk/chunk.go

Lines changed: 21 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"bytes"
88
"errors"
99
"fmt"
10-
"io"
1110
"sync"
1211
"unicode"
1312

@@ -130,71 +129,51 @@ func (c *Chunk) GetFileThreshold() int64 {
130129

131130
// ReadChunk reads the next chunk of data from file
132131
func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) {
133-
chunk, ok := c.GetBuf()
132+
// borrow a []byte from the pool and seed it with raw data from the file (up to chunk size + peek size)
133+
rawData, ok := c.GetPeekedBuf()
134134
if !ok {
135-
return "", fmt.Errorf("expected *[]byte, got %T", chunk)
135+
return "", fmt.Errorf("expected *bytes.Buffer, got %T", rawData)
136136
}
137-
defer c.PutBuf(chunk)
137+
defer c.PutPeekedBuf(rawData)
138+
n, err := reader.Read(*rawData)
138139

139-
n, err := reader.Read(*chunk)
140140
var chunkStr string
141141
// "Callers should always process the n > 0 bytes returned before considering the error err."
142142
// https://pkg.go.dev/io#Reader
143143
if n > 0 {
144144
// only check the filetype at the start of file
145-
if totalLines == 0 && ShouldSkipFile((*chunk)[:n]) {
145+
if totalLines == 0 && ShouldSkipFile((*rawData)[:n]) {
146146
return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType)
147147
}
148148

149-
chunkStr, err = c.processChunk(reader, (*chunk)[:n])
150-
if err != nil {
151-
return "", err
152-
}
149+
chunkStr, err = c.generateChunk((*rawData)[:n])
153150
}
154151
if err != nil {
155152
return "", err
156153
}
157154
return chunkStr, nil
158155
}
159156

160-
// processChunk processes the chunk, reading until a safe boundary
161-
func (c *Chunk) processChunk(reader *bufio.Reader, chunk []byte) (string, error) {
162-
peekBuf, ok := c.GetPeekBuf(chunk)
157+
// generateChunk processes a block of raw data and generates the chunk to be scanned
158+
func (c *Chunk) generateChunk(rawData []byte) (string, error) {
159+
// Borrow a buffer from the pool and seed it with raw data (up to chunk size)
160+
initialChunkLen := min(len(rawData), c.size)
161+
chunkData, ok := c.GetBuf(rawData[:initialChunkLen])
163162
if !ok {
164-
return "", fmt.Errorf("expected *bytes.Buffer, got %T", peekBuf)
165-
}
166-
defer c.PutPeekBuf(peekBuf)
167-
168-
if readErr := c.readUntilSafeBoundary(reader, len(chunk), peekBuf); readErr != nil {
169-
return "", fmt.Errorf("failed to read until safe boundary for file: %w", readErr)
170-
}
171-
172-
return peekBuf.String(), nil
173-
}
174-
175-
// readUntilSafeBoundary (hopefully) avoids splitting (https://github.com/gitleaks/gitleaks/issues/1651)
176-
func (c *Chunk) readUntilSafeBoundary(r *bufio.Reader, n int, peekBuf *bytes.Buffer) error {
177-
if peekBuf.Len() == 0 {
178-
return nil
163+
return "", fmt.Errorf("expected *bytes.Buffer, got %T", chunkData)
179164
}
165+
defer c.PutBuf(chunkData)
180166

181-
// keep reading until see our “\n…\n” boundary or hit limits
182-
for peekBuf.Len()-n < c.MaxPeekSize {
183-
if endsWithTwoNewlines(peekBuf.Bytes()) {
184-
return nil
185-
}
186-
187-
b, err := r.ReadByte()
188-
if err != nil {
189-
if err == io.EOF {
190-
return nil
191-
}
192-
return fmt.Errorf("failed to read byte: %w", err)
167+
// keep extending the chunk until detecting the “\n...\n” pattern (i.e. a safe boundary)
168+
// or reaching the maximum chunk length (i.e. chunk size + peek size)
169+
for i := chunkData.Len(); i < len(rawData); i++ {
170+
if endsWithTwoNewlines(rawData[:i]) {
171+
break
193172
}
194-
peekBuf.WriteByte(b)
173+
chunkData.WriteByte(rawData[i])
195174
}
196175

197-
return nil
176+
return chunkData.String(), nil
198177
}
199178

200179
// endsWithTwoNewlines returns true if b ends in at least two '\n's (ignoring any number of ' ', '\r', or '\t' between them)

engine/chunk/chunk_mock.go

Lines changed: 0 additions & 55 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/engine.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ func Init(engineConfig EngineConfig) (IEngine, error) {
9595
detector: *detector,
9696
validator: *validation.NewValidator(),
9797
semaphore: semaphore.NewSemaphore(),
98-
chunk: chunk.NewChunk(),
98+
chunk: chunk.New(),
9999

100100
ignoredIds: engineConfig.IgnoredIds,
101101
allowedValues: engineConfig.AllowedValues,
@@ -170,9 +170,10 @@ func (e *Engine) detectChunks(item plugins.ISourceItem, secretsChannel chan *sec
170170
_ = f.Close()
171171
}()
172172

173-
reader := bufio.NewReaderSize(f, e.chunk.GetSize())
173+
reader := bufio.NewReaderSize(f, e.chunk.GetMaxPeekSize())
174174
totalLines := 0
175175

176+
// Read the file in chunks until EOF
176177
for {
177178
chunkStr, err := e.chunk.ReadChunk(reader, totalLines)
178179
if err != nil {

0 commit comments

Comments
 (0)