@@ -13,67 +13,145 @@ internal sealed class FileScanner
1313 private readonly bool _noDefaultExcludes ;
1414 private readonly bool _noGitignore ;
1515 private readonly bool _followSymlinks ;
16+ private readonly bool _trackStats ;
17+ private long _dirsVisited ;
1618
/// <summary>
/// Directory names that are skipped by default. A directory is excluded when its final
/// path segment equals one of these names (case-insensitive) and default excludes have
/// not been turned off via the constructor.
/// </summary>
private static readonly FrozenSet<string> DefaultExcludedDirs = new[]
{
    // Version control metadata
    ".git", ".hg", ".svn",

    // Build output and dependency folders
    "bin", "obj", "node_modules", "__pycache__",

    // Package manager caches
    ".npm", ".nuget", ".cargo", ".rustup", ".gradle", ".m2",
    ".pnpm-store", "bower_components",
    ".bun", ".deno", ".gem", ".cocoapods", ".pub-cache",

    // Language runtimes and version managers (typically very large, not project source)
    ".nvm", ".dotnet", ".local", ".conda", ".virtualenvs", ".venvs",
    ".android", ".sdkman", ".jabba", ".swiftly",

    // Python virtual environments and tool caches
    "venv", ".venv", ".tox", ".mypy_cache", ".pytest_cache", ".ruff_cache",

    // IDE / editor state
    ".idea", ".vs", ".fleet",
    ".vscode", ".vscode-insiders", ".cursor", ".windsurf",

    // AI / ML tooling state and caches (may hold multi-GB model weights)
    ".ollama", ".lmstudio", ".keras", ".matplotlib",
    ".claude", ".codex", ".cline", ".aider", ".copilot",

    // Container / cloud / infrastructure tooling
    ".docker", ".minikube", ".kube",
    ".terraform", ".pulumi",

    // Miscellaneous caches and generated directories
    ".cache", ".Trash", ".Trashes",
    ".next", ".turbo", ".angular", ".parcel-cache",
    "coverage", ".nyc_output",

    // macOS / filesystem metadata
    ".Spotlight-V100", ".fseventsd", ".TemporaryItems",
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);
51+
/// <summary>
/// Additional directory names excluded only when the process runs on macOS: system and
/// application folders that are not expected to hold user source code but can contain
/// tens of thousands of subdirectories. On every other OS this set is empty.
/// </summary>
// NOTE(review): the scanner appears to match these by bare directory name at any depth,
// so a project folder called "Library" or "Applications" inside a repository would also
// be skipped on macOS. Confirm whether the match should be limited to the home directory.
private static readonly FrozenSet<string> MacOsExcludedDirs = !OperatingSystem.IsMacOS()
    ? FrozenSet<string>.Empty
    : new[] { "Library", "Applications", "Movies", "Music", "Pictures" }
        .ToFrozenSet(StringComparer.OrdinalIgnoreCase);
2161
/// <summary>
/// File extensions (including the leading dot, compared case-insensitively) that are
/// treated as binary content by name alone.
/// </summary>
// NOTE(review): presumably consulted by IsBinary before any content sniffing; the lookup
// order relative to KnownTextExtensions is not visible here, so the two sets must stay
// disjoint to keep classification unambiguous.
private static readonly FrozenSet<string> KnownBinaryExtensions =
    FrozenSet.ToFrozenSet(
    [
        // Executables / libraries
        ".exe", ".dll", ".pdb", ".obj", ".bin", ".so", ".dylib", ".framework",
        // Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg",
        ".tiff", ".tif", ".heic", ".heif", ".avif", ".raw", ".cr2", ".nef",
        // Audio
        ".mp3", ".wav", ".flac", ".ogg", ".aac", ".wma", ".m4a", ".opus",
        // Video
        // ".ts" (MPEG transport stream) is deliberately NOT listed: it is also the
        // TypeScript source extension and is already present in KnownTextExtensions.
        ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v",
        // Archives
        ".zip", ".gz", ".tar", ".7z", ".rar", ".bz2", ".xz", ".zst", ".lz4",
        ".cab", ".dmg", ".iso", ".img", ".pkg", ".deb", ".rpm",
        // Documents (binary formats)
        ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        ".odt", ".ods", ".odp", ".pages", ".numbers", ".keynote",
        // Fonts
        ".woff", ".woff2", ".ttf", ".otf", ".eot",
        // .NET / Java
        ".nupkg", ".snupkg", ".ttkb", ".class", ".jar", ".war", ".ear",
        // Compiled objects
        ".pyc", ".pyo", ".o", ".a", ".lib", ".ko",
        // Databases
        ".db", ".sqlite", ".sqlite3", ".mdb", ".ldb",
        // Misc binary
        ".dat", ".DS_Store", ".localized",
    ], StringComparer.OrdinalIgnoreCase);
3491
/// <summary>
/// File extensions (including the leading dot, compared case-insensitively) that are
/// recognised as text by name alone.
/// </summary>
private static readonly FrozenSet<string> KnownTextExtensions = new[]
{
    // .NET sources, projects and views
    ".cs", ".csx", ".fs", ".fsx", ".vb",
    ".sln", ".slnx", ".csproj", ".fsproj", ".vbproj", ".props", ".targets",
    ".razor", ".cshtml",

    // Data / configuration
    ".json", ".jsonl", ".jsonc", ".xml", ".yaml", ".yml", ".toml",
    ".ini", ".cfg", ".conf", ".config", ".properties",
    ".env", ".lock", ".plist",

    // Markup / documentation
    ".md", ".mdx", ".txt", ".text", ".log", ".csv", ".tsv",
    ".rst", ".adoc", ".tex", ".latex",

    // Web
    ".html", ".htm", ".css", ".scss", ".sass", ".less", ".styl",
    ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".mts", ".cts",
    ".vue", ".svelte", ".astro",

    // Programming languages
    ".py", ".pyi", ".rb", ".go", ".rs", ".java", ".kt", ".kts", ".scala",
    ".c", ".h", ".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx",
    ".swift", ".m", ".mm",
    ".sh", ".bash", ".zsh", ".fish", ".ps1", ".psm1", ".bat", ".cmd",
    ".sql", ".graphql", ".gql", ".proto",
    // ".r" and ".R" are the same entry under the case-insensitive comparer.
    ".r", ".R", ".jl", ".lua", ".pl", ".pm", ".php",
    ".tf", ".hcl", ".dockerfile", ".makefile",
    ".zig", ".nim", ".dart", ".ex", ".exs", ".erl", ".hrl",
    ".clj", ".cljs", ".cljc", ".edn",
    ".hs", ".lhs", ".elm", ".ml", ".mli", ".f90", ".f95",

    // DevOps / dot-config files
    ".gitignore", ".gitattributes", ".editorconfig", ".prettierrc",
    ".eslintrc", ".babelrc", ".npmrc",
    ".dockerignore", ".helmignore",
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);
55126
/// <summary>True when the current process is running on Windows.</summary>
private static readonly bool s_isWindows = OperatingSystem.IsWindows();

/// <summary>
/// Counters gathered by the most recently completed scan; an empty
/// <see cref="ScanStats"/> until a scan has finished.
/// </summary>
public ScanStats Stats { get; private set; } = new ScanStats();
59130
/// <summary>
/// Approximate running count of directories the scanner has entered. The value is read
/// atomically, so it can be polled from another thread for progress reporting while a
/// scan is still in flight.
/// </summary>
public long DirsVisited
{
    get { return Interlocked.Read(ref _dirsVisited); }
}
135+
/// <summary>
/// Creates a scanner with the given filtering options. Pattern sequences are copied
/// into private lists, so later changes to the caller's collections have no effect.
/// </summary>
/// <param name="includePatterns">
/// When non-empty, only files matching at least one of these patterns are kept.
/// <c>null</c> is treated as "no include patterns".
/// </param>
/// <param name="excludePatterns">
/// Files matching any of these patterns are filtered out. <c>null</c> is treated as empty.
/// </param>
/// <param name="maxFileSize">Files whose length exceeds this many bytes are skipped (default 50 MiB).</param>
/// <param name="noDefaultExcludes">Disables the built-in excluded-directory name lists.</param>
/// <param name="noGitignore">Disables loading of <c>.gitignore</c> files from scanned directories.</param>
/// <param name="followSymlinks">
/// Stored for the directory walk. NOTE(review): presumably controls whether symlinked
/// directories are descended into; the consuming code is not visible here.
/// </param>
/// <param name="trackStats">
/// Enables the <c>Stopwatch</c>-based timing counters (gitignore matching, size stat,
/// binary detection). When false, those timestamps are not taken.
/// </param>
public FileScanner(
    IEnumerable<string>? includePatterns = null,
    IEnumerable<string>? excludePatterns = null,
    long maxFileSize = 50 * 1024 * 1024,
    bool noDefaultExcludes = false,
    bool noGitignore = false,
    bool followSymlinks = false,
    bool trackStats = false)
{
    // Snapshot the pattern sequences; a missing sequence becomes an empty list.
    _includePatterns = includePatterns is null
        ? new List<string>()
        : includePatterns.ToList();
    _excludePatterns = excludePatterns is null
        ? new List<string>()
        : excludePatterns.ToList();

    _maxFileSize = maxFileSize;

    // Behaviour switches.
    _noDefaultExcludes = noDefaultExcludes;
    _noGitignore = noGitignore;
    _followSymlinks = followSymlinks;
    _trackStats = trackStats;
}
75153
76- public IReadOnlyList < string > Scan ( string rootPath )
154+ public IReadOnlyList < string > Scan ( string rootPath , CancellationToken cancellationToken = default )
77155 {
78156 rootPath = Path . GetFullPath ( rootPath ) ;
79157
@@ -105,7 +183,7 @@ public IReadOnlyList<string> Scan(string rootPath)
105183
106184 var localStats = new ScanStats ( ) ;
107185 var results = new List < string > ( ) ;
108- ScanDirectory ( rootPath , rootPath , rootPrefixLen , parentIgnores , anyParentHasFileRules , results , localStats , depth : 0 ) ;
186+ ScanDirectory ( rootPath , rootPath , rootPrefixLen , parentIgnores , anyParentHasFileRules , results , localStats , depth : 0 , cancellationToken ) ;
109187 results . Sort ( StringComparer . OrdinalIgnoreCase ) ;
110188 Stats = localStats ;
111189 return results ;
@@ -127,10 +205,14 @@ private static FileSystemEnumerable<DirEntry> EnumerateEntries(string dirPath)
127205 {
128206 return new FileSystemEnumerable < DirEntry > (
129207 dirPath ,
130- static ( ref FileSystemEntry entry ) => new DirEntry (
131- entry . ToFullPath ( ) ,
132- entry . IsDirectory ,
133- entry . Attributes . HasFlag ( FileAttributes . ReparsePoint ) ) ,
208+ static ( ref FileSystemEntry entry ) =>
209+ {
210+ var isDir = entry . IsDirectory ; // Free on Unix (uses d_type from readdir)
211+ // Only access entry.Attributes for directories — this triggers fstatat()
212+ // on Unix. For files we don't need symlink info, so skip the extra syscall.
213+ var isSymlink = isDir && ( entry . Attributes & FileAttributes . ReparsePoint ) != 0 ;
214+ return new DirEntry ( entry . ToFullPath ( ) , isDir , isSymlink ) ;
215+ } ,
134216 new EnumerationOptions
135217 {
136218 IgnoreInaccessible = true ,
@@ -153,8 +235,12 @@ private void ScanDirectory(
153235 bool anyHasFileRules ,
154236 List < string > results ,
155237 ScanStats localStats ,
156- int depth )
238+ int depth ,
239+ CancellationToken cancellationToken )
157240 {
241+ cancellationToken . ThrowIfCancellationRequested ( ) ;
242+ Interlocked . Increment ( ref _dirsVisited ) ;
243+
158244 // Load .gitignore from this directory if it exists
159245 var localMatcher = _noGitignore ? null : LoadLocalGitignore ( dirPath ) ;
160246 List < ( string Directory , GitignoreMatcher Matcher ) > effectiveIgnores ;
@@ -207,7 +293,7 @@ private void ScanDirectory(
207293
208294 var dirName = Path . GetFileName ( entry . FullPath ) ;
209295
210- if ( ! _noDefaultExcludes && DefaultExcludedDirs . Contains ( dirName ) )
296+ if ( ! _noDefaultExcludes && ( DefaultExcludedDirs . Contains ( dirName ) || MacOsExcludedDirs . Contains ( dirName ) ) )
211297 {
212298 localStats . DirsDefaultExcluded ++ ;
213299 continue ;
@@ -225,10 +311,10 @@ private void ScanDirectory(
225311 relDir = relDir . Replace ( '\\ ' , '/' ) ;
226312 }
227313
228- var t0 = Stopwatch . GetTimestamp ( ) ;
314+ var t0 = _trackStats ? Stopwatch . GetTimestamp ( ) : 0 ;
229315 var ignored = IsIgnoredByGitignore ( effectiveIgnores , rootPath , entry . FullPath , relDir + "/" ) ||
230316 IsIgnoredByGitignore ( effectiveIgnores , rootPath , entry . FullPath , relDir ) ;
231- localStats . TicksGitignoreMatch += Stopwatch . GetTimestamp ( ) - t0 ;
317+ if ( _trackStats ) localStats . TicksGitignoreMatch += Stopwatch . GetTimestamp ( ) - t0 ;
232318
233319 if ( ignored )
234320 {
@@ -257,52 +343,51 @@ private void ScanDirectory(
257343 // in the effective set can match files (only directory-only rules exist).
258344 if ( effectiveHasFileRules )
259345 {
260- var t0 = Stopwatch . GetTimestamp ( ) ;
346+ var t0 = _trackStats ? Stopwatch . GetTimestamp ( ) : 0 ;
261347 var ignored = IsIgnoredByGitignore ( effectiveIgnores , rootPath , entry . FullPath , relativePath ) ;
262- localStats . TicksGitignoreMatch += Stopwatch . GetTimestamp ( ) - t0 ;
348+ if ( _trackStats ) localStats . TicksGitignoreMatch += Stopwatch . GetTimestamp ( ) - t0 ;
263349 if ( ignored )
264350 {
265351 localStats . FilesGitignored ++ ;
266352 continue ;
267353 }
268354 }
269355
270- if ( _includePatterns . Count > 0 &&
271- ! _includePatterns . Any ( p => MatchesPattern ( relativePath , p ) ) )
356+ if ( _includePatterns . Count > 0 && ! MatchesAnyPattern ( relativePath , _includePatterns ) )
272357 {
273358 localStats . FilesFilteredOut ++ ;
274359 continue ;
275360 }
276361
277- if ( _excludePatterns . Any ( p => MatchesPattern ( relativePath , p ) ) )
362+ if ( _excludePatterns . Count > 0 && MatchesAnyPattern ( relativePath , _excludePatterns ) )
278363 {
279364 localStats . FilesFilteredOut ++ ;
280365 continue ;
281366 }
282367
283368 // Defer stat() until after gitignore/filter checks pass.
284- var tStat = Stopwatch . GetTimestamp ( ) ;
369+ var tStat = _trackStats ? Stopwatch . GetTimestamp ( ) : 0 ;
285370 long fileSize ;
286371 try
287372 {
288373 fileSize = new FileInfo ( entry . FullPath ) . Length ;
289374 }
290375 catch ( IOException )
291376 {
292- localStats . TicksStatSize += Stopwatch . GetTimestamp ( ) - tStat ;
377+ if ( _trackStats ) localStats . TicksStatSize += Stopwatch . GetTimestamp ( ) - tStat ;
293378 continue ;
294379 }
295- localStats . TicksStatSize += Stopwatch . GetTimestamp ( ) - tStat ;
380+ if ( _trackStats ) localStats . TicksStatSize += Stopwatch . GetTimestamp ( ) - tStat ;
296381
297382 if ( fileSize > _maxFileSize )
298383 {
299384 localStats . FilesTooLarge ++ ;
300385 continue ;
301386 }
302387
303- var tBin = Stopwatch . GetTimestamp ( ) ;
388+ var tBin = _trackStats ? Stopwatch . GetTimestamp ( ) : 0 ;
304389 var isBin = IsBinary ( entry . FullPath , fileSize ) ;
305- localStats . TicksBinaryDetect += Stopwatch . GetTimestamp ( ) - tBin ;
390+ if ( _trackStats ) localStats . TicksBinaryDetect += Stopwatch . GetTimestamp ( ) - tBin ;
306391
307392 if ( isBin )
308393 {
@@ -326,20 +411,20 @@ private void ScanDirectory(
326411 return ;
327412 }
328413
329- // Parallelize top-level and second-level subdirectories for throughput.
330- // Going deeper adds thread pool overhead that outweighs the benefit .
331- if ( depth <= 1 && subDirs . Count > 1 )
414+ // Parallelize shallow subdirectories for throughput on large trees .
415+ // depth <= 3 ensures heavy subtrees (e.g. ~/GitHub/<repo>/) are also parallelized .
416+ if ( depth <= 3 && subDirs . Count > 1 )
332417 {
333418 var bags = new ConcurrentBag < ( List < string > Results , ScanStats Stats ) > ( ) ;
334419 try
335420 {
336- Parallel . ForEach ( subDirs , subDir =>
421+ Parallel . ForEach ( subDirs , new ParallelOptions { CancellationToken = cancellationToken } , subDir =>
337422 {
338423 var localResults = new List < string > ( ) ;
339424 var localSubStats = new ScanStats ( ) ;
340425 try
341426 {
342- ScanDirectory ( subDir , rootPath , rootPrefixLen , effectiveIgnores , effectiveHasFileRules , localResults , localSubStats , depth + 1 ) ;
427+ ScanDirectory ( subDir , rootPath , rootPrefixLen , effectiveIgnores , effectiveHasFileRules , localResults , localSubStats , depth + 1 , cancellationToken ) ;
343428 }
344429 catch ( Exception ex ) when ( ex is IOException or UnauthorizedAccessException or PathTooLongException )
345430 {
@@ -369,7 +454,7 @@ private void ScanDirectory(
369454 {
370455 try
371456 {
372- ScanDirectory ( subDir , rootPath , rootPrefixLen , effectiveIgnores , effectiveHasFileRules , results , localStats , depth + 1 ) ;
457+ ScanDirectory ( subDir , rootPath , rootPrefixLen , effectiveIgnores , effectiveHasFileRules , results , localStats , depth + 1 , cancellationToken ) ;
373458 }
374459 catch ( Exception ex ) when ( ex is IOException or UnauthorizedAccessException or PathTooLongException )
375460 {
@@ -553,6 +638,18 @@ private static bool IsIgnoredByGitignore(
553638 return false ;
554639 }
555640
/// <summary>
/// Reports whether <paramref name="relativePath"/> matches at least one entry of
/// <paramref name="patterns"/>. Stops at the first match; an empty list yields false.
/// </summary>
/// <param name="relativePath">Path to test, as passed through to <c>MatchesPattern</c>.</param>
/// <param name="patterns">Patterns to try, in list order.</param>
/// <returns><c>true</c> on the first matching pattern; otherwise <c>false</c>.</returns>
private static bool MatchesAnyPattern(string relativePath, List<string> patterns)
{
    // Indexed walk: needs neither a delegate nor an enumerator per call.
    for (var index = 0; index < patterns.Count; index++)
    {
        if (MatchesPattern(relativePath, patterns[index]))
        {
            return true;
        }
    }

    return false;
}
652+
556653 private static bool MatchesPattern ( string relativePath , string pattern )
557654 {
558655 // relativePath is already normalized to '/' on entry
0 commit comments