Skip to content

Commit 9bbb8e8

Browse files
[SMO-522] Fix heap overflow in large monorepo scans (#1026)
* fix: prevent heap overflow in large monorepo scans

  Add streaming-based filtering to globWithGitIgnore to prevent heap overflow when scanning large monorepos with 100k+ files. Instead of accumulating all file paths and filtering afterwards, files are now filtered during streaming, which dramatically reduces memory usage.

  Changes:
  - Add `filter` option to globWithGitIgnore for early filtering during streaming
  - Add createSupportedFilesFilter helper to create filter from supported files
  - Update getPackageFilesForScan to use streaming filter
  - Add comprehensive tests for the new filter functionality

  Fixes SMO-522

* Update src/utils/glob.mts

  Signed-off-by: John-David Dalton <jdalton@users.noreply.github.com>

---------

Signed-off-by: John-David Dalton <jdalton@users.noreply.github.com>
Co-authored-by: John-David Dalton <jdalton@users.noreply.github.com>
1 parent f91f262 commit 9bbb8e8

File tree

4 files changed

+403
-15
lines changed

4 files changed

+403
-15
lines changed

src/utils/glob.mts

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,14 @@ export function filterBySupportedScanFiles(
164164
return filepaths.filter(p => micromatch.some(p, patterns, { dot: true }))
165165
}
166166

167+
export function createSupportedFilesFilter(
168+
supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'],
169+
): (filepath: string) => boolean {
170+
const patterns = getSupportedFilePatterns(supportedFiles)
171+
return (filepath: string) =>
172+
micromatch.some(filepath, patterns, { dot: true })
173+
}
174+
167175
export function getSupportedFilePatterns(
168176
supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'],
169177
): string[] {
@@ -178,6 +186,10 @@ export function getSupportedFilePatterns(
178186
}
179187

180188
type GlobWithGitIgnoreOptions = GlobOptions & {
189+
// Optional filter function to apply during streaming.
190+
// When provided, only files passing this filter are accumulated.
191+
// This is critical for memory efficiency when scanning large monorepos.
192+
filter?: ((filepath: string) => boolean) | undefined
181193
socketConfig?: SocketYml | undefined
182194
}
183195

@@ -187,6 +199,7 @@ export async function globWithGitIgnore(
187199
): Promise<string[]> {
188200
const {
189201
cwd = process.cwd(),
202+
filter,
190203
socketConfig,
191204
...additionalOptions
192205
} = { __proto__: null, ...options } as GlobWithGitIgnoreOptions
@@ -243,27 +256,39 @@ export async function globWithGitIgnore(
243256
...additionalOptions,
244257
} as GlobOptions
245258

246-
if (!hasNegatedPattern) {
259+
// When no filter is provided and no negated patterns exist, use the fast path.
260+
if (!hasNegatedPattern && !filter) {
247261
return await fastGlob.glob(patterns as string[], globOptions)
248262
}
249-
250263
// Add support for negated "ignore" patterns which many globbing libraries,
251264
// including 'fast-glob', 'globby', and 'tinyglobby', lack support for.
252-
const filtered: string[] = []
253-
const ig = ignore().add([...ignores])
265+
// Use streaming to avoid unbounded memory accumulation.
266+
// This is critical for large monorepos with 100k+ files.
267+
const results: string[] = []
268+
const ig = hasNegatedPattern ? ignore().add([...ignores]) : null
254269
const stream = fastGlob.globStream(
255270
patterns as string[],
256271
globOptions,
257272
) as AsyncIterable<string>
258273
for await (const p of stream) {
259-
// Note: the input files must be INSIDE the cwd. If you get strange looking
260-
// relative path errors here, most likely your path is outside the given cwd.
261-
const relPath = globOptions.absolute ? path.relative(cwd, p) : p
262-
if (!ig.ignores(relPath)) {
263-
filtered.push(p)
274+
// Check gitignore patterns with negation support.
275+
if (ig) {
276+
// Note: the input files must be INSIDE the cwd. If you get strange looking
277+
// relative path errors here, most likely your path is outside the given cwd.
278+
const relPath = globOptions.absolute ? path.relative(cwd, p) : p
279+
if (ig.ignores(relPath)) {
280+
continue
281+
}
282+
}
283+
// Apply the optional filter to reduce memory usage.
284+
// When scanning large monorepos, this filters early (e.g., to manifest files only)
285+
// instead of accumulating all 100k+ files and filtering later.
286+
if (filter && !filter(p)) {
287+
continue
264288
}
289+
results.push(p)
265290
}
266-
return filtered
291+
return results
267292
}
268293

269294
export async function globWorkspace(

src/utils/glob.test.mts

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
import { existsSync, readdirSync, rmSync } from 'node:fs'
2+
import path from 'node:path'
3+
import { fileURLToPath } from 'node:url'
4+
5+
import mockFs from 'mock-fs'
6+
import { afterEach, describe, expect, it } from 'vitest'
7+
8+
import { normalizePath } from '@socketsecurity/registry/lib/path'
9+
10+
import { NODE_MODULES } from '../constants.mjs'
11+
import {
12+
createSupportedFilesFilter,
13+
globWithGitIgnore,
14+
pathsToGlobPatterns,
15+
} from './glob.mts'
16+
17+
import type FileSystem from 'mock-fs/lib/filesystem'
18+
19+
// Filter functions defined at module scope to satisfy linting rules.
20+
function filterJsonFiles(filepath: string): boolean {
21+
return filepath.endsWith('.json')
22+
}
23+
24+
function filterTsFiles(filepath: string): boolean {
25+
return filepath.endsWith('.ts')
26+
}
27+
28+
const __filename = fileURLToPath(import.meta.url)
29+
const __dirname = path.dirname(__filename)
30+
31+
const rootNmPath = path.join(__dirname, '../..', NODE_MODULES)
32+
const mockFixturePath = normalizePath(path.join(__dirname, 'glob-mock'))
33+
const mockNmPath = normalizePath(rootNmPath)
34+
35+
// Remove broken symlinks in node_modules before loading to prevent mock-fs errors.
36+
function cleanupBrokenSymlinks(dirPath: string): void {
37+
try {
38+
if (!existsSync(dirPath)) {
39+
return
40+
}
41+
const entries = readdirSync(dirPath, { withFileTypes: true })
42+
for (const entry of entries) {
43+
const fullPath = path.join(dirPath, entry.name)
44+
try {
45+
if (entry.isSymbolicLink() && !existsSync(fullPath)) {
46+
// Symlink exists but target does not, remove it.
47+
rmSync(fullPath, { force: true })
48+
} else if (entry.isDirectory()) {
49+
// Recursively check subdirectories.
50+
cleanupBrokenSymlinks(fullPath)
51+
}
52+
} catch {
53+
// Ignore errors for individual entries.
54+
}
55+
}
56+
} catch {
57+
// If we cannot read the directory, skip cleanup.
58+
}
59+
}
60+
61+
// Clean up broken symlinks before loading node_modules.
62+
cleanupBrokenSymlinks(rootNmPath)
63+
64+
// Load node_modules with error handling for any remaining issues.
65+
const mockedNmCallback = (() => {
66+
try {
67+
return mockFs.load(rootNmPath)
68+
} catch (e) {
69+
// If loading fails due to broken symlinks or missing files, return empty mock.
70+
console.warn(
71+
`Warning: Failed to load node_modules for mock-fs: ${e instanceof Error ? e.message : String(e)}`,
72+
)
73+
return {}
74+
}
75+
})()
76+
77+
function mockTestFs(config: FileSystem.DirectoryItems) {
78+
return mockFs({
79+
...config,
80+
[mockNmPath]: mockedNmCallback,
81+
})
82+
}
83+
84+
describe('glob utilities', () => {
85+
afterEach(() => {
86+
mockFs.restore()
87+
})
88+
89+
describe('globWithGitIgnore()', () => {
90+
it('should find files matching glob patterns', async () => {
91+
mockTestFs({
92+
[`${mockFixturePath}/package.json`]: '{}',
93+
[`${mockFixturePath}/src/index.ts`]: '',
94+
})
95+
96+
const results = await globWithGitIgnore(['**/*.json'], {
97+
cwd: mockFixturePath,
98+
})
99+
100+
expect(results.map(normalizePath)).toEqual([
101+
`${mockFixturePath}/package.json`,
102+
])
103+
})
104+
105+
it('should respect .gitignore files', async () => {
106+
mockTestFs({
107+
[`${mockFixturePath}/.gitignore`]: 'ignored/**',
108+
[`${mockFixturePath}/package.json`]: '{}',
109+
[`${mockFixturePath}/ignored/package.json`]: '{}',
110+
[`${mockFixturePath}/included/package.json`]: '{}',
111+
})
112+
113+
const results = await globWithGitIgnore(['**/*.json'], {
114+
cwd: mockFixturePath,
115+
})
116+
117+
expect(results.map(normalizePath).sort()).toEqual([
118+
`${mockFixturePath}/included/package.json`,
119+
`${mockFixturePath}/package.json`,
120+
])
121+
})
122+
123+
it('should handle negated patterns in .gitignore', async () => {
124+
mockTestFs({
125+
[`${mockFixturePath}/.gitignore`]: 'ignored/**\n!ignored/keep.json',
126+
[`${mockFixturePath}/package.json`]: '{}',
127+
[`${mockFixturePath}/ignored/excluded.json`]: '{}',
128+
[`${mockFixturePath}/ignored/keep.json`]: '{}',
129+
})
130+
131+
const results = await globWithGitIgnore(['**/*.json'], {
132+
cwd: mockFixturePath,
133+
})
134+
135+
// The negated pattern should allow keep.json to be included.
136+
expect(results.map(normalizePath).sort()).toEqual([
137+
`${mockFixturePath}/ignored/keep.json`,
138+
`${mockFixturePath}/package.json`,
139+
])
140+
})
141+
142+
it('should apply filter function during streaming to reduce memory', async () => {
143+
// Create a mock filesystem with many files.
144+
const files: FileSystem.DirectoryItems = {}
145+
const fileCount = 100
146+
for (let i = 0; i < fileCount; i += 1) {
147+
files[`${mockFixturePath}/file${i}.txt`] = 'content'
148+
files[`${mockFixturePath}/file${i}.json`] = '{}'
149+
}
150+
// Add a gitignore with negated pattern to trigger the streaming path.
151+
files[`${mockFixturePath}/.gitignore`] = 'temp/\n!temp/keep.json'
152+
mockTestFs(files)
153+
154+
const results = await globWithGitIgnore(['**/*'], {
155+
cwd: mockFixturePath,
156+
filter: filterJsonFiles,
157+
})
158+
159+
// Should only include .json files (100 files).
160+
expect(results).toHaveLength(fileCount)
161+
for (const result of results) {
162+
expect(result.endsWith('.json')).toBe(true)
163+
}
164+
})
165+
166+
it('should apply filter without negated patterns', async () => {
167+
mockTestFs({
168+
[`${mockFixturePath}/package.json`]: '{}',
169+
[`${mockFixturePath}/src/index.ts`]: '',
170+
[`${mockFixturePath}/src/utils.ts`]: '',
171+
[`${mockFixturePath}/readme.md`]: '',
172+
})
173+
174+
const results = await globWithGitIgnore(['**/*'], {
175+
cwd: mockFixturePath,
176+
filter: filterTsFiles,
177+
})
178+
179+
expect(results.map(normalizePath).sort()).toEqual([
180+
`${mockFixturePath}/src/index.ts`,
181+
`${mockFixturePath}/src/utils.ts`,
182+
])
183+
})
184+
185+
it('should combine filter with negated gitignore patterns', async () => {
186+
mockTestFs({
187+
[`${mockFixturePath}/.gitignore`]: 'build/**\n!build/manifest.json',
188+
[`${mockFixturePath}/package.json`]: '{}',
189+
[`${mockFixturePath}/src/index.ts`]: '',
190+
[`${mockFixturePath}/build/output.js`]: '',
191+
[`${mockFixturePath}/build/manifest.json`]: '{}',
192+
})
193+
194+
const results = await globWithGitIgnore(['**/*'], {
195+
cwd: mockFixturePath,
196+
filter: filterJsonFiles,
197+
})
198+
199+
// Should include package.json and the negated build/manifest.json, but not build/output.js.
200+
expect(results.map(normalizePath).sort()).toEqual([
201+
`${mockFixturePath}/build/manifest.json`,
202+
`${mockFixturePath}/package.json`,
203+
])
204+
})
205+
})
206+
207+
describe('createSupportedFilesFilter()', () => {
208+
it('should create a filter function matching supported file patterns', () => {
209+
const supportedFiles = {
210+
npm: {
211+
packagejson: { pattern: 'package.json' },
212+
packagelockjson: { pattern: 'package-lock.json' },
213+
},
214+
}
215+
216+
const filter = createSupportedFilesFilter(supportedFiles)
217+
218+
expect(filter('/path/to/package.json')).toBe(true)
219+
expect(filter('/path/to/package-lock.json')).toBe(true)
220+
expect(filter('/path/to/random.txt')).toBe(false)
221+
expect(filter('/path/to/nested/package.json')).toBe(true)
222+
})
223+
})
224+
225+
describe('pathsToGlobPatterns()', () => {
226+
it('should convert "." to "**/*"', () => {
227+
expect(pathsToGlobPatterns(['.'])).toEqual(['**/*'])
228+
expect(pathsToGlobPatterns(['./'])).toEqual(['**/*'])
229+
})
230+
231+
it('should append "/**/*" to directory paths', () => {
232+
mockTestFs({
233+
[`${mockFixturePath}/subdir`]: {
234+
'file.txt': '',
235+
},
236+
})
237+
238+
// The function checks if path is a directory using isDirSync.
239+
const result = pathsToGlobPatterns(['subdir'], mockFixturePath)
240+
expect(result).toEqual(['subdir/**/*'])
241+
})
242+
243+
it('should keep file paths unchanged', () => {
244+
mockTestFs({
245+
[`${mockFixturePath}/file.txt`]: '',
246+
})
247+
248+
const result = pathsToGlobPatterns(['file.txt'], mockFixturePath)
249+
expect(result).toEqual(['file.txt'])
250+
})
251+
})
252+
})

src/utils/path-resolve.mts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import { isDirSync } from '@socketsecurity/registry/lib/fs'
99

1010
import constants, { NODE_MODULES, NPM } from '../constants.mts'
1111
import {
12-
filterBySupportedScanFiles,
12+
createSupportedFilesFilter,
1313
globWithGitIgnore,
1414
pathsToGlobPatterns,
1515
} from './glob.mts'
@@ -114,13 +114,17 @@ export async function getPackageFilesForScan(
114114
...options,
115115
} as PackageFilesForScanOptions
116116

117-
const filepaths = await globWithGitIgnore(
117+
// Apply the supported files filter during streaming to avoid accumulating
118+
// all files in memory. This is critical for large monorepos with 100k+ files
119+
// where accumulating all paths before filtering causes OOM errors.
120+
const filter = createSupportedFilesFilter(supportedFiles)
121+
122+
return await globWithGitIgnore(
118123
pathsToGlobPatterns(inputPaths, options?.cwd),
119124
{
120125
cwd,
126+
filter,
121127
socketConfig,
122128
},
123129
)
124-
125-
return filterBySupportedScanFiles(filepaths!, supportedFiles)
126130
}

0 commit comments

Comments
 (0)