tok/chunker_test.go at main · GrayCodeAI/tok · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
package tok_test

import (
	"strings"
	"testing"

	"github.com/GrayCodeAI/tok"
)

// TestChunkCode_GoFuncBoundaries chunks a small Go file (two functions and a
// struct type) and checks that at least two chunks come back, that every chunk
// carries a positive line range, non-empty content and a positive token count,
// and that at least one chunk reports a symbol name.
func TestChunkCode_GoFuncBoundaries(t *testing.T) {
	goSrc := `package main

import "fmt"

func Hello() {
	fmt.Println("hello")
}

func World() {
	fmt.Println("world")
}

type Config struct {
	Name string
	Port int
}
`
	pieces := tok.ChunkCode(goSrc, tok.ChunkOptions{MaxTokens: 500, MinTokens: 5, Language: "go"})
	if n := len(pieces); n < 2 {
		t.Fatalf("expected at least 2 chunks for Go code with multiple funcs, got %d", n)
	}

	// One pass covers both the per-chunk invariants and the symbol scan; the
	// missing-symbol failure is still reported after all per-chunk failures.
	symbolSeen := false
	for idx, piece := range pieces {
		if !(piece.StartLine > 0 && piece.EndLine > 0) {
			t.Errorf("chunk %d: invalid line range %d-%d", idx, piece.StartLine, piece.EndLine)
		}
		if len(piece.Content) == 0 {
			t.Errorf("chunk %d: empty content", idx)
		}
		if piece.Tokens < 1 {
			t.Errorf("chunk %d: expected positive token count, got %d", idx, piece.Tokens)
		}
		if piece.Symbol != "" {
			symbolSeen = true
		}
	}

	if !symbolSeen {
		t.Error("expected at least one chunk to have a symbol name")
	}
}

// TestChunkCode_PythonBoundaries chunks a small Python module (one class, one
// def, one async def) and passes when at least two chunks are produced and at
// least one chunk's Symbol is "Animal", "greet" or "fetch_data". On failure it
// logs every chunk's symbol, line range and token count first.
func TestChunkCode_PythonBoundaries(t *testing.T) {
	pySrc := `import os

class Animal:
    def __init__(self, name):
        self.name = name

    def speak(self):
        pass

def greet(name):
    print(f"Hello, {name}")

async def fetch_data():
    pass
`
	pieces := tok.ChunkCode(pySrc, tok.ChunkOptions{MaxTokens: 500, MinTokens: 5, Language: "python"})
	if n := len(pieces); n < 2 {
		t.Fatalf("expected at least 2 chunks for Python code, got %d", n)
	}

	// Any one of the expected class/def names is enough to pass.
	for _, piece := range pieces {
		switch piece.Symbol {
		case "Animal", "greet", "fetch_data":
			return
		}
	}

	// No known symbol was found: dump the chunks to aid debugging, then fail.
	t.Log("chunks:")
	for idx, piece := range pieces {
		t.Logf("  [%d] symbol=%q startLine=%d endLine=%d tokens=%d", idx, piece.Symbol, piece.StartLine, piece.EndLine, piece.Tokens)
	}
	t.Error("expected to find Python class or def symbol names")
}

// TestChunkCode_MergeSmallChunks feeds three empty one-line Go functions to
// ChunkCode with MinTokens set far above the size of any single function and
// expects no more than two chunks to come back.
func TestChunkCode_MergeSmallChunks(t *testing.T) {
	tinyFuncs := `package main

func A() {}

func B() {}

func C() {}
`
	// MinTokens=100 is intended to make each tiny function too small to
	// stand alone, so the chunker should fold them together.
	pieces := tok.ChunkCode(tinyFuncs, tok.ChunkOptions{MaxTokens: 500, MinTokens: 100, Language: "go"})

	if n := len(pieces); n > 2 {
		t.Errorf("expected small chunks to be merged (got %d chunks)", n)
	}
}

// TestChunkCode_SplitLargeChunks builds one Go function with 200 body lines
// (plus a blank line after every 20th line as a candidate split point) and
// chunks it with MaxTokens=100. It expects at least two chunks and that no
// chunk is larger than graceFactor times MaxTokens.
func TestChunkCode_SplitLargeChunks(t *testing.T) {
	// graceFactor is the slack allowed over MaxTokens before a chunk counts
	// as oversized. The bound is deliberately loose: the test checks that
	// splitting happens, not that the limit is met exactly.
	const graceFactor = 3

	// Every body line is identical, so build it once outside the loop.
	bodyLine := "    println(\"line " + strings.Repeat("x", 20) + "\")\n"

	var sb strings.Builder
	sb.WriteString("package main\n\nfunc BigFunc() {\n")
	for i := 0; i < 200; i++ {
		sb.WriteString(bodyLine)
		// Add occasional blank lines for split points.
		if i%20 == 0 && i > 0 {
			sb.WriteString("\n")
		}
	}
	sb.WriteString("}\n")

	opts := tok.ChunkOptions{MaxTokens: 100, MinTokens: 10, Language: "go"}
	chunks := tok.ChunkCode(sb.String(), opts)

	if len(chunks) < 2 {
		t.Fatalf("expected large function to be split into multiple chunks, got %d", len(chunks))
	}

	// Verify no chunk exceeds the grace bound, and report the bound that was
	// actually enforced so a failure is not misread as "exceeded MaxTokens".
	limit := opts.MaxTokens * graceFactor
	for i, c := range chunks {
		if c.Tokens > limit {
			t.Errorf("chunk %d has %d tokens, exceeds grace bound %d (%dx MaxTokens=%d)",
				i, c.Tokens, limit, graceFactor, opts.MaxTokens)
		}
	}
}

// TestDetectLanguageByExtension is a table test mapping file paths to the
// language name DetectLanguageByExtension is expected to return. The table
// includes an upper-case extension (".R"), a path with directories, an
// unrecognised extension and a name with no extension; the last two expect "".
func TestDetectLanguageByExtension(t *testing.T) {
	cases := []struct {
		file string
		lang string
	}{
		{"main.go", "go"},
		{"script.py", "python"},
		{"app.ts", "typescript"},
		{"index.js", "javascript"},
		{"lib.rs", "rust"},
		{"Main.java", "java"},
		{"app.rb", "ruby"},
		{"index.php", "php"},
		{"main.c", "c"},
		{"header.h", "c"},
		{"main.cpp", "cpp"},
		{"main.cc", "cpp"},
		{"Program.cs", "csharp"},
		{"App.kt", "kotlin"},
		{"App.swift", "swift"},
		{"main.dart", "dart"},
		{"setup.sh", "bash"},
		{"query.sql", "sql"},
		{"index.html", "html"},
		{"style.css", "css"},
		{"App.vue", "vue"},
		{"main.zig", "zig"},
		{"app.ex", "elixir"},
		{"Main.scala", "scala"},
		{"script.lua", "lua"},
		{"analysis.r", "r"},
		{"analysis.R", "r"},
		{"README.md", "markdown"},
		{"/path/to/file.go", "go"},
		{"unknown.xyz", ""},
		{"noext", ""},
	}
	for _, tc := range cases {
		if got := tok.DetectLanguageByExtension(tc.file); got != tc.lang {
			t.Errorf("DetectLanguageByExtension(%q) = %q, want %q", tc.file, got, tc.lang)
		}
	}
}

// TestDetectLanguageByExtension_EmptyAndUnknown checks that an empty path and
// a path with an unrecognised extension both yield the empty string.
func TestDetectLanguageByExtension_EmptyAndUnknown(t *testing.T) {
	probes := []struct {
		input string
		label string // how the input is described in the failure message
	}{
		{"", "empty path"},
		{"data.parquet", ".parquet"},
	}
	for _, p := range probes {
		got := tok.DetectLanguageByExtension(p.input)
		if got == "" {
			continue
		}
		t.Errorf("expected empty for %s, got %q", p.label, got)
	}
}

// TestSeparatorLevels chunks a single long line made of five sentences (no
// blank lines or line breaks inside it) with a small MaxTokens and expects at
// least two non-empty chunks. Per the original author's notes, this is meant
// to exercise the fallback to sentence-level separators when paragraph and
// line breaks are unavailable.
func TestSeparatorLevels(t *testing.T) {
	const runOn = "This is sentence one. This is sentence two. This is sentence three. This is sentence four. This is sentence five."

	// Prefix a package clause so the input is handled as Go source; the
	// run-on text follows it as one block.
	input := "package main\n\n" + runOn

	// MaxTokens is small on purpose so the text cannot fit in one chunk.
	pieces := tok.ChunkCode(input, tok.ChunkOptions{MaxTokens: 20, MinTokens: 1, Language: "go", Overlap: 0})

	count := len(pieces)
	if count < 2 {
		t.Fatalf("expected separator levels to split the text into multiple chunks, got %d", count)
	}

	// Every chunk must carry some content.
	for idx, piece := range pieces {
		if len(piece.Content) == 0 {
			t.Errorf("chunk %d: empty content", idx)
		}
	}
	t.Logf("separator levels produced %d chunks from run-on text", count)
}

// TestExtendedExtensions is a table test covering a wider set of file
// extensions and the language name DetectLanguageByExtension is expected to
// return for each. Several extensions share one language (for example ".pl"
// and ".pm" both expect "perl").
func TestExtendedExtensions(t *testing.T) {
	cases := []struct {
		file string
		lang string
	}{
		{"file.m", "objective-c"},
		{"file.mm", "objective-cpp"},
		{"file.pl", "perl"},
		{"file.pm", "perl"},
		{"file.hs", "haskell"},
		{"file.lhs", "haskell"},
		{"file.erl", "erlang"},
		{"file.hrl", "erlang"},
		{"file.clj", "clojure"},
		{"file.cljs", "clojure"},
		{"file.fs", "fsharp"},
		{"file.fsx", "fsharp"},
		{"file.ml", "ocaml"},
		{"file.mli", "ocaml"},
		{"file.jl", "julia"},
		{"file.nim", "nim"},
		{"file.cr", "crystal"},
		{"file.v", "v"},
		{"file.vhdl", "vhdl"},
		{"file.sol", "solidity"},
		{"file.proto", "protobuf"},
		{"file.graphql", "graphql"},
		{"file.gql", "graphql"},
		{"file.tf", "terraform"},
		{"file.hcl", "hcl"},
		{"file.cmake", "cmake"},
		{"file.gradle", "gradle"},
		{"file.groovy", "groovy"},
		{"file.ps1", "powershell"},
		{"file.psm1", "powershell"},
		{"file.bat", "batch"},
		{"file.cmd", "batch"},
		{"file.fish", "fish"},
		{"file.vim", "vim"},
		{"file.el", "elisp"},
		{"file.rkt", "racket"},
		{"file.pas", "pascal"},
		{"file.d", "d"},
		{"file.ada", "ada"},
		{"file.adb", "ada"},
		{"file.f90", "fortran"},
		{"file.f95", "fortran"},
		{"file.cob", "cobol"},
		{"file.tsx", "tsx"},
		{"file.jsx", "jsx"},
		{"file.svelte", "svelte"},
		{"file.astro", "astro"},
		{"file.prisma", "prisma"},
		{"file.env", "dotenv"},
		{"file.ini", "ini"},
		{"file.cfg", "ini"},
		{"file.conf", "conf"},
		{"file.nginx", "nginx"},
		{"file.dockerfile", "dockerfile"},
		{"file.containerfile", "dockerfile"},
		{"file.nix", "nix"},
		{"file.dhall", "dhall"},
		{"file.jsonnet", "jsonnet"},
		{"file.starlark", "starlark"},
		{"file.bzl", "starlark"},
		{"file.wasm", "wasm"},
		{"file.wat", "wat"},
	}
	for _, tc := range cases {
		if got := tok.DetectLanguageByExtension(tc.file); got != tc.lang {
			t.Errorf("DetectLanguageByExtension(%q) = %q, want %q", tc.file, got, tc.lang)
		}
	}
}

// TestKeepSeparator chunks the same three-function Go source once per
// KeepSeparator mode (SepLeft, SepRight, SepDiscard), each in its own subtest.
//
// The only hard assertion in every mode is that at least two chunks are
// produced. The first 60 bytes of each chunk are logged so the placement of
// the boundary ("func ...") lines can be inspected with `go test -v`. For
// SepDiscard the test also counts chunks that contain no "func " text and
// logs a note (without failing) when there are none.
//
// NOTE(review): the exact placement of boundary lines per mode is not
// asserted here; tighten these checks once the intended semantics of each
// mode are confirmed against the chunker implementation.
func TestKeepSeparator(t *testing.T) {
	source := `package main

func Alpha() {
	println("alpha")
}

func Beta() {
	println("beta")
}

func Gamma() {
	println("gamma")
}
`
	modes := []struct {
		name string
		opts tok.ChunkOptions
		// reportDiscards enables the informational "func "-free chunk
		// count that only makes sense for SepDiscard.
		reportDiscards bool
	}{
		{
			name: "SepLeft",
			opts: tok.ChunkOptions{
				MaxTokens:     500,
				MinTokens:     1,
				Language:      "go",
				Overlap:       0,
				KeepSeparator: tok.SepLeft,
			},
		},
		{
			name: "SepRight",
			opts: tok.ChunkOptions{
				MaxTokens:     500,
				MinTokens:     1,
				Language:      "go",
				Overlap:       0,
				KeepSeparator: tok.SepRight,
			},
		},
		{
			name: "SepDiscard",
			opts: tok.ChunkOptions{
				MaxTokens:     500,
				MinTokens:     1,
				Language:      "go",
				Overlap:       0,
				KeepSeparator: tok.SepDiscard,
			},
			reportDiscards: true,
		},
	}

	for _, mode := range modes {
		mode := mode // keep the closure correct on toolchains before Go 1.22
		t.Run(mode.name, func(t *testing.T) {
			chunks := tok.ChunkCode(source, mode.opts)
			if len(chunks) < 2 {
				t.Fatalf("expected multiple chunks, got %d", len(chunks))
			}

			if mode.reportDiscards {
				// Count chunks that do NOT contain "func "; those are
				// taken to have had their boundary line discarded.
				discardedCount := 0
				for _, c := range chunks {
					if !strings.Contains(c.Content, "func ") {
						discardedCount++
					}
				}
				if discardedCount == 0 {
					t.Log("expected at least some chunks to have their boundary line discarded")
				}
			}

			for i, c := range chunks {
				t.Logf("%s chunk %d: %q", mode.name, i, c.Content[:min(60, len(c.Content))])
			}
		})
	}
}

// TestOptimizedMerging generates ten tiny Go functions (FA through FJ) and
// chunks them with MinTokens=30 and MaxTokens=200. It fails if more than five
// of the resulting chunks are below MinTokens, or if any chunk is above
// MaxTokens.
func TestOptimizedMerging(t *testing.T) {
	const funcCount = 10

	var src strings.Builder
	src.WriteString("package main\n\n")
	for letter := 'A'; letter < 'A'+funcCount; letter++ {
		src.WriteString("func F" + string(letter) + "() {\n" +
			"    println(\"hello\")\n" +
			"}\n\n")
	}

	opts := tok.ChunkOptions{
		MaxTokens: 200,
		MinTokens: 30,
		Language:  "go",
		Overlap:   0,
	}
	pieces := tok.ChunkCode(src.String(), opts)

	// Count the chunks that stayed under the minimum size; merging is
	// expected to keep this number low.
	undersized := 0
	for _, piece := range pieces {
		if piece.Tokens >= opts.MinTokens {
			continue
		}
		undersized++
	}

	if undersized > 5 {
		t.Errorf("expected optimizer to merge small chunks, but %d/%d chunks are below MinTokens=%d",
			undersized, len(pieces), opts.MinTokens)
	}

	t.Logf("optimized merging: %d chunks from 10 tiny functions, %d below MinTokens",
		len(pieces), undersized)

	// Merging must never push a chunk past the upper limit.
	for idx, piece := range pieces {
		if piece.Tokens <= opts.MaxTokens {
			continue
		}
		t.Errorf("chunk %d exceeds MaxTokens: %d > %d", idx, piece.Tokens, opts.MaxTokens)
	}
}

// TestCustomLanguagePatterns registers a boundary pattern for a made-up
// language ("mylang"), checks that GetLanguagePatterns returns it and still
// returns a non-empty set for "go", then chunks a three-section document with
// the custom language and expects at least two chunks.
func TestCustomLanguagePatterns(t *testing.T) {
	// Register a custom language with a custom boundary pattern.
	tok.RegisterLanguagePatterns("mylang", []string{`^SECTION `})
	// Clean up: re-register with nil to avoid affecting other tests. This is
	// deferred so it also runs when one of the t.Fatal calls below aborts the
	// test; a trailing statement would be skipped in that case.
	defer tok.RegisterLanguagePatterns("mylang", nil)

	// Verify GetLanguagePatterns returns the custom patterns.
	patterns := tok.GetLanguagePatterns("mylang")
	if len(patterns) != 1 || patterns[0] != `^SECTION ` {
		t.Fatalf("expected custom patterns, got %v", patterns)
	}

	// Verify GetLanguagePatterns returns built-in patterns for known languages.
	goPatterns := tok.GetLanguagePatterns("go")
	if len(goPatterns) == 0 {
		t.Fatal("expected built-in patterns for Go")
	}

	// Use the custom language for chunking.
	source := `SECTION Intro
This is the introduction.
Some more text here.

SECTION Body
This is the body content.
More body text.

SECTION Conclusion
This is the conclusion.
`
	opts := tok.ChunkOptions{
		MaxTokens: 500,
		MinTokens: 1,
		Language:  "mylang",
		Overlap:   0,
	}
	chunks := tok.ChunkCode(source, opts)

	if len(chunks) < 2 {
		t.Fatalf("expected custom boundary to split into multiple chunks, got %d", len(chunks))
	}

	// Log the start of each chunk so the split points can be inspected.
	t.Logf("custom language produced %d chunks", len(chunks))
	for i, c := range chunks {
		t.Logf("  chunk %d: %q", i, c.Content[:min(50, len(c.Content))])
	}
}

// min returns the smaller of a and b. It is used by the tests in this file to
// cap the length of logged chunk previews. On Go 1.21+ this package-level
// definition takes precedence over the built-in min within the package.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}

// TestChunkCode_TokenCountingAccuracy chunks a minimal Go program and checks,
// for every chunk, that the Tokens field equals EstimateTokensPrecise applied
// to that chunk's Content, and that non-empty content never reports a
// non-positive token count.
func TestChunkCode_TokenCountingAccuracy(t *testing.T) {
	program := `package main

func main() {
	fmt.Println("hello world")
}
`
	pieces := tok.ChunkCode(program, tok.ChunkOptions{MaxTokens: 500, MinTokens: 5, Language: "go"})
	if len(pieces) == 0 {
		t.Fatal("expected at least one chunk")
	}

	for idx, piece := range pieces {
		// The stored count must agree with a fresh estimate of the content.
		if want := tok.EstimateTokensPrecise(piece.Content); piece.Tokens != want {
			t.Errorf("chunk %d: token count mismatch: chunk reports %d, EstimateTokensPrecise returns %d",
				idx, piece.Tokens, want)
		}

		// Sanity check: any content at all must count as at least one token.
		if piece.Tokens <= 0 && piece.Content != "" {
			t.Errorf("chunk %d: non-empty content but %d tokens", idx, piece.Tokens)
		}
	}
}