|
7 | 7 | "bytes" |
8 | 8 | "errors" |
9 | 9 | "fmt" |
10 | | - "io" |
11 | 10 | "sync" |
12 | 11 | "unicode" |
13 | 12 |
|
@@ -130,71 +129,51 @@ func (c *Chunk) GetFileThreshold() int64 { |
130 | 129 |
|
131 | 130 | // ReadChunk reads the next chunk of data from file |
132 | 131 | func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) { |
133 | | - chunk, ok := c.GetBuf() |
| 132 | + // borrow a []byte from the pool and seed it with raw data from the file (up to chunk size + peek size) |
| 133 | + rawData, ok := c.GetPeekedBuf() |
134 | 134 | if !ok { |
135 | | - return "", fmt.Errorf("expected *[]byte, got %T", chunk) |
| 135 | + return "", fmt.Errorf("expected *bytes.Buffer, got %T", rawData) |
136 | 136 | } |
137 | | - defer c.PutBuf(chunk) |
| 137 | + defer c.PutPeekedBuf(rawData) |
| 138 | + n, err := reader.Read(*rawData) |
138 | 139 |
|
139 | | - n, err := reader.Read(*chunk) |
140 | 140 | var chunkStr string |
141 | 141 | // "Callers should always process the n > 0 bytes returned before considering the error err." |
142 | 142 | // https://pkg.go.dev/io#Reader |
143 | 143 | if n > 0 { |
144 | 144 | // only check the filetype at the start of file |
145 | | - if totalLines == 0 && ShouldSkipFile((*chunk)[:n]) { |
| 145 | + if totalLines == 0 && ShouldSkipFile((*rawData)[:n]) { |
146 | 146 | return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType) |
147 | 147 | } |
148 | 148 |
|
149 | | - chunkStr, err = c.processChunk(reader, (*chunk)[:n]) |
150 | | - if err != nil { |
151 | | - return "", err |
152 | | - } |
| 149 | + chunkStr, err = c.generateChunk((*rawData)[:n]) |
153 | 150 | } |
154 | 151 | if err != nil { |
155 | 152 | return "", err |
156 | 153 | } |
157 | 154 | return chunkStr, nil |
158 | 155 | } |
159 | 156 |
|
160 | | -// processChunk processes the chunk, reading until a safe boundary |
161 | | -func (c *Chunk) processChunk(reader *bufio.Reader, chunk []byte) (string, error) { |
162 | | - peekBuf, ok := c.GetPeekBuf(chunk) |
| 157 | +// generateChunk processes a block of raw data and generates the chunk to be scanned |
| 158 | +func (c *Chunk) generateChunk(rawData []byte) (string, error) { |
| 159 | + // Borrow a buffer from the pool and seed it with raw data (up to chunk size) |
| 160 | + initialChunkLen := min(len(rawData), c.size) |
| 161 | + chunkData, ok := c.GetBuf(rawData[:initialChunkLen]) |
163 | 162 | if !ok { |
164 | | - return "", fmt.Errorf("expected *bytes.Buffer, got %T", peekBuf) |
165 | | - } |
166 | | - defer c.PutPeekBuf(peekBuf) |
167 | | - |
168 | | - if readErr := c.readUntilSafeBoundary(reader, len(chunk), peekBuf); readErr != nil { |
169 | | - return "", fmt.Errorf("failed to read until safe boundary for file: %w", readErr) |
170 | | - } |
171 | | - |
172 | | - return peekBuf.String(), nil |
173 | | -} |
174 | | - |
175 | | -// readUntilSafeBoundary (hopefully) avoids splitting (https://github.com/gitleaks/gitleaks/issues/1651) |
176 | | -func (c *Chunk) readUntilSafeBoundary(r *bufio.Reader, n int, peekBuf *bytes.Buffer) error { |
177 | | - if peekBuf.Len() == 0 { |
178 | | - return nil |
| 163 | + return "", fmt.Errorf("expected *bytes.Buffer, got %T", chunkData) |
179 | 164 | } |
| 165 | + defer c.PutBuf(chunkData) |
180 | 166 |
|
181 | | - // keep reading until see our “\n…\n” boundary or hit limits |
182 | | - for peekBuf.Len()-n < c.MaxPeekSize { |
183 | | - if endsWithTwoNewlines(peekBuf.Bytes()) { |
184 | | - return nil |
185 | | - } |
186 | | - |
187 | | - b, err := r.ReadByte() |
188 | | - if err != nil { |
189 | | - if err == io.EOF { |
190 | | - return nil |
191 | | - } |
192 | | - return fmt.Errorf("failed to read byte: %w", err) |
| 167 | + // keep seeding the chunk until the “\n...\n” pattern (i.e. a safe boundary) is detected |
| 168 | + // or the maximum chunk length (i.e. chunk size + peek size) is reached |
| 169 | + for i := chunkData.Len(); i < len(rawData); i++ { |
| 170 | + if endsWithTwoNewlines(rawData[:i]) { |
| 171 | + break |
193 | 172 | } |
194 | | - peekBuf.WriteByte(b) |
| 173 | + chunkData.WriteByte(rawData[i]) |
195 | 174 | } |
196 | 175 |
|
197 | | - return nil |
| 176 | + return chunkData.String(), nil |
198 | 177 | } |
199 | 178 |
|
200 | 179 | // endsWithTwoNewlines returns true if b ends in at least two '\n's (ignoring any number of ' ', '\r', or '\t' between them) |
|
0 commit comments