Skip to content

Commit b97ecb9

Browse files
HavenDV and claude committed
Fix home directory crash + use ArrayPool in BPE heap path
FileScanner: catch all exceptions (not just IO types) in Parallel.ForEach workers and sequential branches. Nested Parallel.ForEach wraps inner errors in AggregateException, which escaped the previous catch filter (it handled only IOException, UnauthorizedAccessException, and PathTooLongException).

Also:
- Lower depth guard from 64 to 40 (no real source tree is that deep)
- Add macOS inner-Library excludes (Developer, Caches, Containers, etc.) as defense-in-depth for --no-default-excludes or scanning inside ~/Library
- Add Steam.AppBundle to default excludes (recursive .app bundle)

BytePairEncoding: replace heap allocations (new int[n]) with ArrayPool<int>.Shared.Rent/Return in FindPartsHeap to reduce GC pressure when tokenizing many large files.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 482b84f commit b97ecb9

2 files changed

Lines changed: 58 additions & 24 deletions

File tree

src/cli/Tiktoken.Cli/IO/FileScanner.cs

Lines changed: 31 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,8 @@ internal sealed class FileScanner
4747
"coverage", ".nyc_output",
4848
// macOS / filesystem metadata
4949
".Spotlight-V100", ".fseventsd", ".TemporaryItems",
50+
// App bundles with recursive symlink structures
51+
"Steam.AppBundle",
5052
], StringComparer.OrdinalIgnoreCase);
5153

5254
/// <summary>
@@ -55,7 +57,13 @@ internal sealed class FileScanner
5557
/// </summary>
5658
private static readonly FrozenSet<string> MacOsExcludedDirs = OperatingSystem.IsMacOS()
5759
? FrozenSet.ToFrozenSet(
58-
["Library", "Applications", "Movies", "Music", "Pictures"],
60+
[
61+
"Library", "Applications", "Movies", "Music", "Pictures",
62+
// Inside ~/Library — excluded as a second line of defense if Library itself
63+
// is not excluded (e.g. --no-default-excludes, or scanning inside ~/Library)
64+
"Developer", "Application Support", "Containers", "Group Containers",
65+
"Caches", "Logs", "Saved Application State", "WebKit",
66+
],
5967
StringComparer.OrdinalIgnoreCase)
6068
: FrozenSet<string>.Empty;
6169

@@ -257,8 +265,9 @@ private void ScanDirectory(
257265
effectiveHasFileRules = anyHasFileRules;
258266
}
259267

260-
// Guard against symlink loops and extremely deep paths
261-
if (depth > 64)
268+
// Guard against symlink loops, recursive .app bundles, and extremely deep paths
269+
// that would cause PathTooLongException. 40 levels is far beyond any real source tree.
270+
if (depth > 40)
262271
{
263272
localStats.DirsErrored++;
264273
return;
@@ -426,20 +435,27 @@ private void ScanDirectory(
426435
{
427436
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, localResults, localSubStats, depth + 1, cancellationToken);
428437
}
429-
catch (Exception ex) when (ex is IOException or UnauthorizedAccessException or PathTooLongException)
438+
catch (OperationCanceledException)
430439
{
440+
throw; // Let cancellation propagate to Parallel.ForEach
441+
}
442+
catch (Exception)
443+
{
444+
// Catch ALL exceptions including AggregateException from nested
445+
// Parallel.ForEach, IOException, PathTooLongException, etc.
431446
localSubStats.DirsErrored++;
432447
}
433448
bags.Add((localResults, localSubStats));
434449
});
435450
}
436-
catch (AggregateException ae)
451+
catch (OperationCanceledException)
437452
{
438-
// If any worker threw an unhandled exception, count it and continue
439-
foreach (var _ in ae.InnerExceptions)
440-
{
441-
localStats.DirsErrored++;
442-
}
453+
throw;
454+
}
455+
catch (Exception)
456+
{
457+
// AggregateException or any other exception from Parallel.ForEach
458+
localStats.DirsErrored++;
443459
}
444460

445461
foreach (var (subResults, subStats) in bags)
@@ -456,7 +472,11 @@ private void ScanDirectory(
456472
{
457473
ScanDirectory(subDir, rootPath, rootPrefixLen, effectiveIgnores, effectiveHasFileRules, results, localStats, depth + 1, cancellationToken);
458474
}
459-
catch (Exception ex) when (ex is IOException or UnauthorizedAccessException or PathTooLongException)
475+
catch (OperationCanceledException)
476+
{
477+
throw;
478+
}
479+
catch (Exception)
460480
{
461481
localStats.DirsErrored++;
462482
}

src/libs/Tiktoken.Core/BytePairEncoding.cs

Lines changed: 27 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
using System.Collections.Frozen;
88
#endif
99
#if NET8_0_OR_GREATER
10+
using System.Buffers;
1011
using Tiktoken.Core;
1112
#endif
1213

@@ -204,22 +205,35 @@ private static unsafe int FindPartsHeap(
204205
{
205206
var n = partsLength;
206207

207-
// For large pieces, heap-allocate to avoid stack overflow on thread pool threads.
208+
// For large pieces, rent from ArrayPool to avoid stack overflow on thread pool
209+
// threads and reduce GC pressure on repeated large-file tokenizations.
208210
// 5 arrays × n × 4 bytes each — for n > 512 this exceeds safe stackalloc limits.
209211
if (n > MaxStackAllocLength)
210212
{
211-
var nextArr = new int[n];
212-
var prevArr = new int[n];
213-
var ranksArr = new int[n];
214-
var heapArr = new int[n];
215-
var heapPosArr = new int[n];
216-
fixed (int* next = nextArr)
217-
fixed (int* prev = prevArr)
218-
fixed (int* ranks = ranksArr)
219-
fixed (int* heap = heapArr)
220-
fixed (int* heapPos = heapPosArr)
221-
{
222-
return FindPartsHeapCore(pieceSpan, resultIndexes, n, encoder, next, prev, ranks, heap, heapPos);
213+
var pool = ArrayPool<int>.Shared;
214+
var nextArr = pool.Rent(n);
215+
var prevArr = pool.Rent(n);
216+
var ranksArr = pool.Rent(n);
217+
var heapArr = pool.Rent(n);
218+
var heapPosArr = pool.Rent(n);
219+
try
220+
{
221+
fixed (int* next = nextArr)
222+
fixed (int* prev = prevArr)
223+
fixed (int* ranks = ranksArr)
224+
fixed (int* heap = heapArr)
225+
fixed (int* heapPos = heapPosArr)
226+
{
227+
return FindPartsHeapCore(pieceSpan, resultIndexes, n, encoder, next, prev, ranks, heap, heapPos);
228+
}
229+
}
230+
finally
231+
{
232+
pool.Return(heapPosArr);
233+
pool.Return(heapArr);
234+
pool.Return(ranksArr);
235+
pool.Return(prevArr);
236+
pool.Return(nextArr);
223237
}
224238
}
225239
else

0 commit comments

Comments
 (0)