Skip to content

Commit 482b84f

Browse files
HavenDV and claude committed
Fix stack overflow on large files + optimize FileScanner for huge directories
BytePairEncoding.FindPartsHeap: heap-allocate 5 internal arrays when n > 512 instead of stackalloc, preventing stack overflow on thread pool threads when tokenizing large files (e.g., 500KB+).

FileScanner optimizations for scanning directories like ~/ (500K+ dirs):
- Expand default-excluded dirs from 7 to ~45 (package caches, runtimes, IDE state, AI tools)
- Add macOS-specific excludes (Library, Applications, Movies, Music, Pictures)
- Avoid fstatat() syscall on files (only access entry.Attributes for directories)
- Replace HasFlag with bitwise AND to avoid boxing
- Guard Stopwatch.GetTimestamp() behind trackStats flag
- Add CancellationToken support to Scan/ScanDirectory
- Replace LINQ .Any() with manual loop in hot path
- Increase parallel directory depth from 1 to 3
- Expand known binary/text extension lists to reduce binary detection I/O
- Add progress indicator (DirsVisited counter + Timer in Program.cs)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 38411db commit 482b84f

3 files changed

Lines changed: 194 additions & 51 deletions

File tree

src/cli/Tiktoken.Cli/IO/FileScanner.cs

Lines changed: 136 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,67 +13,145 @@ internal sealed class FileScanner
1313
private readonly bool _noDefaultExcludes;
1414
private readonly bool _noGitignore;
1515
private readonly bool _followSymlinks;
16+
private readonly bool _trackStats;
17+
private long _dirsVisited;
1618

1719
private static readonly FrozenSet<string> DefaultExcludedDirs =
1820
FrozenSet.ToFrozenSet(
19-
[".git", ".hg", ".svn", "node_modules", "__pycache__", "bin", "obj"],
20-
StringComparer.OrdinalIgnoreCase);
21+
[
22+
// Version control
23+
".git", ".hg", ".svn",
24+
// Build output
25+
"bin", "obj", "node_modules", "__pycache__",
26+
// Package manager caches
27+
".npm", ".nuget", ".cargo", ".rustup", ".gradle", ".m2",
28+
".pnpm-store", "bower_components",
29+
".bun", ".deno", ".gem", ".cocoapods", ".pub-cache",
30+
// Language runtimes / version managers (multi-GB, never contain source)
31+
".nvm", ".dotnet", ".local", ".conda", ".virtualenvs", ".venvs",
32+
".android", ".sdkman", ".jabba", ".swiftly",
33+
// Python virtual environments / caches
34+
"venv", ".venv", ".tox", ".mypy_cache", ".pytest_cache", ".ruff_cache",
35+
// IDE / editor state
36+
".idea", ".vs", ".fleet",
37+
".vscode", ".vscode-insiders", ".cursor", ".windsurf",
38+
// AI / ML tool caches (model weights, multi-GB)
39+
".ollama", ".lmstudio", ".keras", ".matplotlib",
40+
".claude", ".codex", ".cline", ".aider", ".copilot",
41+
// Container / cloud / infra
42+
".docker", ".minikube", ".kube",
43+
".terraform", ".pulumi",
44+
// Misc caches and generated dirs
45+
".cache", ".Trash", ".Trashes",
46+
".next", ".turbo", ".angular", ".parcel-cache",
47+
"coverage", ".nyc_output",
48+
// macOS / filesystem metadata
49+
".Spotlight-V100", ".fseventsd", ".TemporaryItems",
50+
], StringComparer.OrdinalIgnoreCase);
51+
52+
/// <summary>
/// Additional directory names that are skipped only when the process runs on macOS.
/// These are system/application folders that never hold user source code yet can
/// contain tens of thousands of subdirectories. On every other platform the set is
/// empty, so lookups against it never match. Names are compared case-insensitively.
/// </summary>
private static readonly FrozenSet<string> MacOsExcludedDirs =
    !OperatingSystem.IsMacOS()
        ? FrozenSet<string>.Empty
        : FrozenSet.ToFrozenSet(
            ["Library", "Applications", "Movies", "Music", "Pictures"],
            StringComparer.OrdinalIgnoreCase);
2161

2262
/// <summary>
/// File extensions (including the leading dot) that are treated as binary without
/// inspecting file contents. Lookups are case-insensitive.
/// </summary>
/// <remarks>
/// ".ts" (MPEG transport stream) is intentionally NOT listed here: the same extension
/// is registered in KnownTextExtensions for TypeScript sources, and an extension that
/// appears in both sets would be classified by whichever set happens to be consulted
/// first. In a source tree TypeScript is by far the more likely meaning.
/// NOTE(review): confirm the lookup order in IsBinary; it is not visible from this hunk.
/// </remarks>
private static readonly FrozenSet<string> KnownBinaryExtensions =
    FrozenSet.ToFrozenSet(
    [
        // Executables / libraries
        ".exe", ".dll", ".pdb", ".obj", ".bin", ".so", ".dylib", ".framework",
        // Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg",
        ".tiff", ".tif", ".heic", ".heif", ".avif", ".raw", ".cr2", ".nef",
        // Audio
        ".mp3", ".wav", ".flac", ".ogg", ".aac", ".wma", ".m4a", ".opus",
        // Video (no ".ts" — it collides with TypeScript, see remarks)
        ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v",
        // Archives
        ".zip", ".gz", ".tar", ".7z", ".rar", ".bz2", ".xz", ".zst", ".lz4",
        ".cab", ".dmg", ".iso", ".img", ".pkg", ".deb", ".rpm",
        // Documents (binary formats)
        ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        ".odt", ".ods", ".odp", ".pages", ".numbers", ".keynote",
        // Fonts
        ".woff", ".woff2", ".ttf", ".otf", ".eot",
        // .NET / Java
        ".nupkg", ".snupkg", ".ttkb", ".class", ".jar", ".war", ".ear",
        // Compiled objects
        ".pyc", ".pyo", ".o", ".a", ".lib", ".ko",
        // Databases
        ".db", ".sqlite", ".sqlite3", ".mdb", ".ldb",
        // Misc binary
        ".dat", ".DS_Store", ".localized",
    ], StringComparer.OrdinalIgnoreCase);
3491

3592
private static readonly FrozenSet<string> KnownTextExtensions =
3693
FrozenSet.ToFrozenSet(
3794
[
95+
// .NET
3896
".cs", ".csx", ".fs", ".fsx", ".vb",
39-
".json", ".jsonl", ".xml", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
97+
".sln", ".slnx", ".csproj", ".fsproj", ".vbproj", ".props", ".targets",
98+
".razor", ".cshtml",
99+
// Data / Config
100+
".json", ".jsonl", ".jsonc", ".xml", ".yaml", ".yml", ".toml",
101+
".ini", ".cfg", ".conf", ".config", ".properties",
102+
".env", ".lock", ".plist",
103+
// Markup / Documentation
40104
".md", ".mdx", ".txt", ".text", ".log", ".csv", ".tsv",
41-
".html", ".htm", ".css", ".scss", ".sass", ".less",
105+
".rst", ".adoc", ".tex", ".latex",
106+
// Web
107+
".html", ".htm", ".css", ".scss", ".sass", ".less", ".styl",
42108
".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".mts", ".cts",
109+
".vue", ".svelte", ".astro",
110+
// Languages
43111
".py", ".pyi", ".rb", ".go", ".rs", ".java", ".kt", ".kts", ".scala",
44112
".c", ".h", ".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx",
45113
".swift", ".m", ".mm",
46114
".sh", ".bash", ".zsh", ".fish", ".ps1", ".psm1", ".bat", ".cmd",
47115
".sql", ".graphql", ".gql", ".proto",
48116
".r", ".R", ".jl", ".lua", ".pl", ".pm", ".php",
49117
".tf", ".hcl", ".dockerfile", ".makefile",
118+
".zig", ".nim", ".dart", ".ex", ".exs", ".erl", ".hrl",
119+
".clj", ".cljs", ".cljc", ".edn",
120+
".hs", ".lhs", ".elm", ".ml", ".mli", ".f90", ".f95",
121+
// DevOps / Config files
50122
".gitignore", ".gitattributes", ".editorconfig", ".prettierrc",
51-
".sln", ".slnx", ".csproj", ".fsproj", ".vbproj", ".props", ".targets",
52-
".razor", ".cshtml",
53-
".env", ".lock",
123+
".eslintrc", ".babelrc", ".npmrc",
124+
".dockerignore", ".helmignore",
54125
], StringComparer.OrdinalIgnoreCase);
55126

56127
private static readonly bool s_isWindows = OperatingSystem.IsWindows();
57128

58129
public ScanStats Stats { get; private set; } = new();
59130

131+
/// <summary>
/// Approximate count of directories the scanner has entered so far. The value is read
/// atomically, so it can be polled from another thread (e.g. for progress reporting)
/// while a scan is still running.
/// </summary>
public long DirsVisited
{
    get { return Interlocked.Read(ref _dirsVisited); }
}
135+
60136
/// <summary>
/// Creates a scanner with the given filtering options.
/// </summary>
/// <param name="includePatterns">Patterns a file must match to be kept; <c>null</c> is treated as an empty list.</param>
/// <param name="excludePatterns">Patterns that cause a file to be dropped; <c>null</c> is treated as an empty list.</param>
/// <param name="maxFileSize">Size limit in bytes; larger files are counted as too large and skipped. Defaults to 50 MiB.</param>
/// <param name="noDefaultExcludes">When <c>true</c>, the built-in excluded-directory lists are not applied.</param>
/// <param name="noGitignore">When <c>true</c>, per-directory .gitignore files are not loaded.</param>
/// <param name="followSymlinks">Stored as-is; its use lies outside this block.</param>
/// <param name="trackStats">When <c>true</c>, timing counters are collected via Stopwatch during the scan.</param>
public FileScanner(
    IEnumerable<string>? includePatterns = null,
    IEnumerable<string>? excludePatterns = null,
    long maxFileSize = 50 * 1024 * 1024,
    bool noDefaultExcludes = false,
    bool noGitignore = false,
    bool followSymlinks = false,
    bool trackStats = false)
{
    // Materialise the pattern sequences once (include first, then exclude) so later
    // matching works on private lists rather than on the caller's enumerables.
    _includePatterns = includePatterns is not null ? includePatterns.ToList() : new List<string>();
    _excludePatterns = excludePatterns is not null ? excludePatterns.ToList() : new List<string>();

    // Plain option flags are copied verbatim.
    _maxFileSize = maxFileSize;
    _noDefaultExcludes = noDefaultExcludes;
    _noGitignore = noGitignore;
    _followSymlinks = followSymlinks;
    _trackStats = trackStats;
}
75153

76-
public IReadOnlyList<string> Scan(string rootPath)
154+
public IReadOnlyList<string> Scan(string rootPath, CancellationToken cancellationToken = default)
77155
{
78156
rootPath = Path.GetFullPath(rootPath);
79157

@@ -105,7 +183,7 @@ public IReadOnlyList<string> Scan(string rootPath)
105183

106184
var localStats = new ScanStats();
107185
var results = new List<string>();
108-
ScanDirectory(rootPath, rootPath, rootPrefixLen, parentIgnores, anyParentHasFileRules, results, localStats, depth: 0);
186+
ScanDirectory(rootPath, rootPath, rootPrefixLen, parentIgnores, anyParentHasFileRules, results, localStats, depth: 0, cancellationToken);
109187
results.Sort(StringComparer.OrdinalIgnoreCase);
110188
Stats = localStats;
111189
return results;
@@ -127,10 +205,14 @@ private static FileSystemEnumerable<DirEntry> EnumerateEntries(string dirPath)
127205
{
128206
return new FileSystemEnumerable<DirEntry>(
129207
dirPath,
130-
static (ref FileSystemEntry entry) => new DirEntry(
131-
entry.ToFullPath(),
132-
entry.IsDirectory,
133-
entry.Attributes.HasFlag(FileAttributes.ReparsePoint)),
208+
static (ref FileSystemEntry entry) =>
209+
{
210+
var isDir = entry.IsDirectory; // Free on Unix (uses d_type from readdir)
211+
// Only access entry.Attributes for directories — this triggers fstatat()
212+
// on Unix. For files we don't need symlink info, so skip the extra syscall.
213+
var isSymlink = isDir && (entry.Attributes & FileAttributes.ReparsePoint) != 0;
214+
return new DirEntry(entry.ToFullPath(), isDir, isSymlink);
215+
},
134216
new EnumerationOptions
135217
{
136218
IgnoreInaccessible = true,
@@ -153,8 +235,12 @@ private void ScanDirectory(
153235
bool anyHasFileRules,
154236
List<string> results,
155237
ScanStats localStats,
156-
int depth)
238+
int depth,
239+
CancellationToken cancellationToken)
157240
{
241+
cancellationToken.ThrowIfCancellationRequested();
242+
Interlocked.Increment(ref _dirsVisited);
243+
158244
// Load .gitignore from this directory if it exists
159245
var localMatcher = _noGitignore ? null : LoadLocalGitignore(dirPath);
160246
List<(string Directory, GitignoreMatcher Matcher)> effectiveIgnores;
@@ -207,7 +293,7 @@ private void ScanDirectory(
207293

208294
var dirName = Path.GetFileName(entry.FullPath);
209295

210-
if (!_noDefaultExcludes && DefaultExcludedDirs.Contains(dirName))
296+
if (!_noDefaultExcludes && (DefaultExcludedDirs.Contains(dirName) || MacOsExcludedDirs.Contains(dirName)))
211297
{
212298
localStats.DirsDefaultExcluded++;
213299
continue;
@@ -225,10 +311,10 @@ private void ScanDirectory(
225311
relDir = relDir.Replace('\\', '/');
226312
}
227313

228-
var t0 = Stopwatch.GetTimestamp();
314+
var t0 = _trackStats ? Stopwatch.GetTimestamp() : 0;
229315
var ignored = IsIgnoredByGitignore(effectiveIgnores, rootPath, entry.FullPath, relDir + "/") ||
230316
IsIgnoredByGitignore(effectiveIgnores, rootPath, entry.FullPath, relDir);
231-
localStats.TicksGitignoreMatch += Stopwatch.GetTimestamp() - t0;
317+
if (_trackStats) localStats.TicksGitignoreMatch += Stopwatch.GetTimestamp() - t0;
232318

233319
if (ignored)
234320
{
@@ -257,52 +343,51 @@ private void ScanDirectory(
257343
// in the effective set can match files (only directory-only rules exist).
258344
if (effectiveHasFileRules)
259345
{
260-
var t0 = Stopwatch.GetTimestamp();
346+
var t0 = _trackStats ? Stopwatch.GetTimestamp() : 0;
261347
var ignored = IsIgnoredByGitignore(effectiveIgnores, rootPath, entry.FullPath, relativePath);
262-
localStats.TicksGitignoreMatch += Stopwatch.GetTimestamp() - t0;
348+
if (_trackStats) localStats.TicksGitignoreMatch += Stopwatch.GetTimestamp() - t0;
263349
if (ignored)
264350
{
265351
localStats.FilesGitignored++;
266352
continue;
267353
}
268354
}
269355

270-
if (_includePatterns.Count > 0 &&
271-
!_includePatterns.Any(p => MatchesPattern(relativePath, p)))
356+
if (_includePatterns.Count > 0 && !MatchesAnyPattern(relativePath, _includePatterns))
272357
{
273358
localStats.FilesFilteredOut++;
274359
continue;
275360
}
276361

277-
if (_excludePatterns.Any(p => MatchesPattern(relativePath, p)))
362+
if (_excludePatterns.Count > 0 && MatchesAnyPattern(relativePath, _excludePatterns))
278363
{
279364
localStats.FilesFilteredOut++;
280365
continue;
281366
}
282367

283368
// Defer stat() until after gitignore/filter checks pass.
284-
var tStat = Stopwatch.GetTimestamp();
369+
var tStat = _trackStats ? Stopwatch.GetTimestamp() : 0;
285370
long fileSize;
286371
try
287372
{
288373
fileSize = new FileInfo(entry.FullPath).Length;
289374
}
290375
catch (IOException)
291376
{
292-
localStats.TicksStatSize += Stopwatch.GetTimestamp() - tStat;
377+
if (_trackStats) localStats.TicksStatSize += Stopwatch.GetTimestamp() - tStat;
293378
continue;
294379
}
295-
localStats.TicksStatSize += Stopwatch.GetTimestamp() - tStat;
380+
if (_trackStats) localStats.TicksStatSize += Stopwatch.GetTimestamp() - tStat;
296381

297382
if (fileSize > _maxFileSize)
298383
{
299384
localStats.FilesTooLarge++;
300385
continue;
301386
}
302387

303-
var tBin = Stopwatch.GetTimestamp();
388+
var tBin = _trackStats ? Stopwatch.GetTimestamp() : 0;
304389
var isBin = IsBinary(entry.FullPath, fileSize);
305-
localStats.TicksBinaryDetect += Stopwatch.GetTimestamp() - tBin;
390+
if (_trackStats) localStats.TicksBinaryDetect += Stopwatch.GetTimestamp() - tBin;
306391

307392
if (isBin)
308393
{
@@ -326,20 +411,20 @@ private void ScanDirectory(
326411
return;
327412
}
328413

329-
// Parallelize top-level and second-level subdirectories for throughput.
330-
// Going deeper adds thread pool overhead that outweighs the benefit.
331-
if (depth <= 1 && subDirs.Count > 1)
414+
// Parallelize shallow subdirectories for throughput on large trees.
415+
// depth <= 3 ensures heavy subtrees (e.g. ~/GitHub/<repo>/) are also parallelized.
416+
if (depth <= 3 && subDirs.Count > 1)
332417
{
333418
var bags = new ConcurrentBag<(List<string> Results, ScanStats Stats)>();
334419
try
335420
{
336-
Parallel.ForEach(subDirs, subDir =>
421+
Parallel.ForEach(subDirs, new ParallelOptions { CancellationToken = cancellationToken }, subDir =>
337422
{
338423
var localResults = new List<string>();
339424
var localSubStats = new ScanStats();
340425
try
341426
{
342-
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, localResults, localSubStats, depth + 1);
427+
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, localResults, localSubStats, depth + 1, cancellationToken);
343428
}
344429
catch (Exception ex) when (ex is IOException or UnauthorizedAccessException or PathTooLongException)
345430
{
@@ -369,7 +454,7 @@ private void ScanDirectory(
369454
{
370455
try
371456
{
372-
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, results, localStats, depth + 1);
457+
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, results, localStats, depth + 1, cancellationToken);
373458
}
374459
catch (Exception ex) when (ex is IOException or UnauthorizedAccessException or PathTooLongException)
375460
{
@@ -553,6 +638,18 @@ private static bool IsIgnoredByGitignore(
553638
return false;
554639
}
555640

641+
/// <summary>
/// Reports whether <paramref name="relativePath"/> matches at least one of
/// <paramref name="patterns"/>, stopping at the first match.
/// </summary>
/// <param name="relativePath">Path to test, passed through unchanged to MatchesPattern.</param>
/// <param name="patterns">Candidate patterns; an empty list yields <c>false</c>.</param>
/// <returns><c>true</c> if any pattern matches; otherwise <c>false</c>.</returns>
private static bool MatchesAnyPattern(string relativePath, List<string> patterns)
{
    // Plain indexed loop: no delegate or enumerator allocation per call.
    for (var i = 0; i < patterns.Count; i++)
    {
        if (MatchesPattern(relativePath, patterns[i]))
        {
            return true;
        }
    }

    return false;
}
652+
556653
private static bool MatchesPattern(string relativePath, string pattern)
557654
{
558655
// relativePath is already normalized to '/' on entry

src/cli/Tiktoken.Cli/Program.cs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,25 @@
223223
{
224224
if (Directory.Exists(path))
225225
{
226-
var scanner = new FileScanner(include, exclude, maxFileSize, noDefaultExcludes, noGitignore, followSymlinks);
227-
var files = scanner.Scan(path);
226+
var scanner = new FileScanner(include, exclude, maxFileSize, noDefaultExcludes, noGitignore, followSymlinks, trackStats: stats);
227+
228+
Timer? progressTimer = null;
229+
if (progress)
230+
{
231+
progressTimer = new Timer(_ =>
232+
{
233+
Console.Error.Write($"\r Scanning directories... {scanner.DirsVisited:N0}");
234+
}, null, 200, 200);
235+
}
236+
237+
var files = scanner.Scan(path, cancellationToken);
238+
239+
if (progressTimer != null)
240+
{
241+
await progressTimer.DisposeAsync().ConfigureAwait(false);
242+
Console.Error.Write($"\r Scanning directories... {scanner.DirsVisited:N0} done\n");
243+
}
244+
228245
lastScanner = scanner;
229246
var root = Path.GetFullPath(path);
230247
foreach (var file in files)

0 commit comments

Comments
 (0)