diff --git a/chunker.go b/chunker.go index eae31149..ed3ece29 100644 --- a/chunker.go +++ b/chunker.go @@ -761,266 +761,4 @@ func symbolReForLang(lang string) *regexp.Regexp { } } -// extensionLanguageMap maps file extensions to language names. -var extensionLanguageMap = map[string]string{ - // Go - ".go": "go", - // Python - ".py": "python", - ".pyw": "python", - // TypeScript / JavaScript - ".ts": "typescript", - ".tsx": "tsx", - ".js": "javascript", - ".jsx": "jsx", - // Rust - ".rs": "rust", - // Java - ".java": "java", - // Ruby - ".rb": "ruby", - // PHP - ".php": "php", - // C - ".c": "c", - ".h": "c", - // C++ - ".cpp": "cpp", - ".cc": "cpp", - ".cxx": "cpp", - ".hpp": "cpp", - // C# - ".cs": "csharp", - // Kotlin - ".kt": "kotlin", - ".kts": "kotlin", - // Swift - ".swift": "swift", - // Dart - ".dart": "dart", - // Shell - ".sh": "bash", - ".bash": "bash", - // SQL - ".sql": "sql", - // HTML - ".html": "html", - ".htm": "html", - // CSS - ".css": "css", - // Vue - ".vue": "vue", - // Zig - ".zig": "zig", - // Elixir - ".ex": "elixir", - ".exs": "elixir", - // Scala - ".scala": "scala", - // Lua - ".lua": "lua", - // R - ".r": "r", - ".R": "r", - // Markdown - ".md": "markdown", - // Objective-C / Objective-C++ - ".m": "objective-c", - ".mm": "objective-cpp", - // Perl - ".pl": "perl", - ".pm": "perl", - // Haskell - ".hs": "haskell", - ".lhs": "haskell", - // Erlang - ".erl": "erlang", - ".hrl": "erlang", - // Clojure - ".clj": "clojure", - ".cljs": "clojure", - // F# - ".fs": "fsharp", - ".fsx": "fsharp", - // OCaml - ".ml": "ocaml", - ".mli": "ocaml", - // Julia - ".jl": "julia", - // Nim - ".nim": "nim", - // Crystal - ".cr": "crystal", - // V - ".v": "v", - // VHDL - ".vhdl": "vhdl", - // Solidity - ".sol": "solidity", - // Protobuf - ".proto": "protobuf", - // GraphQL - ".graphql": "graphql", - ".gql": "graphql", - // Terraform / HCL - ".tf": "terraform", - ".hcl": "hcl", - // CMake - ".cmake": "cmake", - // Gradle - ".gradle": "gradle", - // Groovy - ".groovy": "groovy", - // PowerShell - ".ps1": "powershell", - ".psm1": "powershell", - // Batch - ".bat": "batch", - ".cmd": "batch", - // Fish - ".fish": "fish", - // Vim - ".vim": "vim", - // Emacs Lisp - ".el": "elisp", - // Racket - ".rkt": "racket", - // Pascal - ".pas": "pascal", - // D - ".d": "d", - // Ada - ".ada": "ada", - ".adb": "ada", - // Fortran - ".f90": "fortran", - ".f95": "fortran", - // COBOL - ".cob": "cobol", - // Svelte - ".svelte": "svelte", - // Astro - ".astro": "astro", - // Prisma - ".prisma": "prisma", - // dotenv - ".env": "dotenv", - // INI / Config - ".ini": "ini", - ".cfg": "ini", - // Config files - ".conf": "conf", - // Nginx - ".nginx": "nginx", - // Dockerfile - ".dockerfile": "dockerfile", - ".containerfile": "dockerfile", - // Nix - ".nix": "nix", - // Dhall - ".dhall": "dhall", - // Jsonnet - ".jsonnet": "jsonnet", - // Starlark - ".starlark": "starlark", - ".bzl": "starlark", - // WebAssembly - ".wasm": "wasm", - ".wat": "wat", - // YAML - ".yaml": "yaml", - ".yml": "yaml", - // JSON - ".json": "json", - // TOML - ".toml": "toml", - // XML - ".xml": "xml", - // SCSS / SASS / LESS - ".scss": "scss", - ".sass": "sass", - ".less": "less", - // CoffeeScript - ".coffee": "coffeescript", - // Elm - ".elm": "elm", - // PureScript - ".purs": "purescript", - // Assembly - ".asm": "assembly", - ".s": "assembly", - // Makefile - ".mk": "makefile", - // Diff / Patch - ".diff": "diff", - ".patch": "diff", - // LaTeX - ".tex": "latex", - // reStructuredText - ".rst": "rst", - // Org-mode - ".org": "org", - // CSV / TSV - ".csv": "csv", - ".tsv": "tsv", - // Dockerfile special - ".Dockerfile": "dockerfile", - // Scheme - ".scm": "scheme", - // Common Lisp - ".lisp": "commonlisp", - ".cl": "commonlisp", - // Prolog - ".pro": "prolog", - // Tcl - ".tcl": "tcl", - // R Markdown - ".rmd": "rmarkdown", - // Jupyter - ".ipynb": "jupyter", - // Terraform vars - ".tfvars": "terraform", - // Protocol Buffers - ".pb": "protobuf", - // Thrift - ".thrift": "thrift", - // Avro - ".avsc": "avro", - // GLSL / HLSL - ".glsl": "glsl", - ".hlsl": "hlsl", - // Cuda - ".cu": "cuda", - ".cuh": "cuda", - // CMakeLists - ".cmake.in": "cmake", - // Vagrant - ".vagrantfile": "ruby", - // Puppet - ".pp": "puppet", - // Chef - ".erb": "erb", - // Ansible - ".j2": "jinja2", - // Svelte - ".svx": "svelte", - // MDX - ".mdx": "mdx", -} - -// DetectLanguageByExtension returns the programming language name for a file -// path based on its extension. Returns "" for unknown extensions. -func DetectLanguageByExtension(path string) string { - ext := filepath.Ext(path) - if ext == "" { - return "" - } - if lang, ok := extensionLanguageMap[ext]; ok { - return lang - } - // Try lowercase for case-insensitive matching (except .R which is special) - if lang, ok := extensionLanguageMap[strings.ToLower(ext)]; ok { - return lang - } - return "" -} +// extensionLanguageMap and DetectLanguageByExtension moved to chunker_extensions.go. diff --git a/chunker_extensions.go b/chunker_extensions.go new file mode 100644 index 00000000..e0985f0d --- /dev/null +++ b/chunker_extensions.go @@ -0,0 +1,270 @@ +package tok + +import ( + "path/filepath" + "strings" +) + +// extensionLanguageMap maps file extensions to language names. +var extensionLanguageMap = map[string]string{ + // Go + ".go": "go", + // Python + ".py": "python", + ".pyw": "python", + // TypeScript / JavaScript + ".ts": "typescript", + ".tsx": "tsx", + ".js": "javascript", + ".jsx": "jsx", + // Rust + ".rs": "rust", + // Java + ".java": "java", + // Ruby + ".rb": "ruby", + // PHP + ".php": "php", + // C + ".c": "c", + ".h": "c", + // C++ + ".cpp": "cpp", + ".cc": "cpp", + ".cxx": "cpp", + ".hpp": "cpp", + // C# + ".cs": "csharp", + // Kotlin + ".kt": "kotlin", + ".kts": "kotlin", + // Swift + ".swift": "swift", + // Dart + ".dart": "dart", + // Shell + ".sh": "bash", + ".bash": "bash", + // SQL + ".sql": "sql", + // HTML + ".html": "html", + ".htm": "html", + // CSS + ".css": "css", + // Vue + ".vue": "vue", + // Zig + ".zig": "zig", + // Elixir + ".ex": "elixir", + ".exs": "elixir", + // Scala + ".scala": "scala", + // Lua + ".lua": "lua", + // R + ".r": "r", + ".R": "r", + // Markdown + ".md": "markdown", + // Objective-C / Objective-C++ + ".m": "objective-c", + ".mm": "objective-cpp", + // Perl + ".pl": "perl", + ".pm": "perl", + // Haskell + ".hs": "haskell", + ".lhs": "haskell", + // Erlang + ".erl": "erlang", + ".hrl": "erlang", + // Clojure + ".clj": "clojure", + ".cljs": "clojure", + // F# + ".fs": "fsharp", + ".fsx": "fsharp", + // OCaml + ".ml": "ocaml", + ".mli": "ocaml", + // Julia + ".jl": "julia", + // Nim + ".nim": "nim", + // Crystal + ".cr": "crystal", + // V + ".v": "v", + // VHDL + ".vhdl": "vhdl", + // Solidity + ".sol": "solidity", + // Protobuf + ".proto": "protobuf", + // GraphQL + ".graphql": "graphql", + ".gql": "graphql", + // Terraform / HCL + ".tf": "terraform", + ".hcl": "hcl", + // CMake + ".cmake": "cmake", + // Gradle + ".gradle": "gradle", + // Groovy + ".groovy": "groovy", + // PowerShell + ".ps1": "powershell", + ".psm1": "powershell", + // Batch + ".bat": "batch", + ".cmd": "batch", + // Fish + ".fish": "fish", + // Vim + ".vim": "vim", + // Emacs Lisp + ".el": "elisp", + // Racket + ".rkt": "racket", + // Pascal + ".pas": "pascal", + // D + ".d": "d", + // Ada + ".ada": "ada", + ".adb": "ada", + // Fortran + ".f90": "fortran", + ".f95": "fortran", + // COBOL + ".cob": "cobol", + // Svelte + ".svelte": "svelte", + // Astro + ".astro": "astro", + // Prisma + ".prisma": "prisma", + // dotenv + ".env": "dotenv", + // INI / Config + ".ini": "ini", + ".cfg": "ini", + // Config files + ".conf": "conf", + // Nginx + ".nginx": "nginx", + // Dockerfile + ".dockerfile": "dockerfile", + ".containerfile": "dockerfile", + // Nix + ".nix": "nix", + // Dhall + ".dhall": "dhall", + // Jsonnet + ".jsonnet": "jsonnet", + // Starlark + ".starlark": "starlark", + ".bzl": "starlark", + // WebAssembly + ".wasm": "wasm", + ".wat": "wat", + // YAML + ".yaml": "yaml", + ".yml": "yaml", + // JSON + ".json": "json", + // TOML + ".toml": "toml", + // XML + ".xml": "xml", + // SCSS / SASS / LESS + ".scss": "scss", + ".sass": "sass", + ".less": "less", + // CoffeeScript + ".coffee": "coffeescript", + // Elm + ".elm": "elm", + // PureScript + ".purs": "purescript", + // Assembly + ".asm": "assembly", + ".s": "assembly", + // Makefile + ".mk": "makefile", + // Diff / Patch + ".diff": "diff", + ".patch": "diff", + // LaTeX + ".tex": "latex", + // reStructuredText + ".rst": "rst", + // Org-mode + ".org": "org", + // CSV / TSV + ".csv": "csv", + ".tsv": "tsv", + // Dockerfile special + ".Dockerfile": "dockerfile", + // Scheme + ".scm": "scheme", + // Common Lisp + ".lisp": "commonlisp", + ".cl": "commonlisp", + // Prolog + ".pro": "prolog", + // Tcl + ".tcl": "tcl", + // R Markdown + ".rmd": "rmarkdown", + // Jupyter + ".ipynb": "jupyter", + // Terraform vars + ".tfvars": "terraform", + // Protocol Buffers + ".pb": "protobuf", + // Thrift + ".thrift": "thrift", + // Avro + ".avsc": "avro", + // GLSL / HLSL + ".glsl": "glsl", + ".hlsl": "hlsl", + // Cuda + ".cu": "cuda", + ".cuh": "cuda", + // CMakeLists + ".cmake.in": "cmake", + // Vagrant + ".vagrantfile": "ruby", + // Puppet + ".pp": "puppet", + // Chef + ".erb": "erb", + // Ansible + ".j2": "jinja2", + // Svelte + ".svx": "svelte", + // MDX + ".mdx": "mdx", +} + +// DetectLanguageByExtension returns the programming language name for a file +// path based on its extension. Returns "" for unknown extensions. +func DetectLanguageByExtension(path string) string { + ext := filepath.Ext(path) + if ext == "" { + return "" + } + if lang, ok := extensionLanguageMap[ext]; ok { + return lang + } + // Try lowercase for case-insensitive matching (except .R which is special) + if lang, ok := extensionLanguageMap[strings.ToLower(ext)]; ok { + return lang + } + return "" +} diff --git a/integration_advanced_test.go b/integration_advanced_test.go new file mode 100644 index 00000000..a58d0eb4 --- /dev/null +++ b/integration_advanced_test.go @@ -0,0 +1,473 @@ +package tok_test + +import ( + "strings" + "testing" + "time" + + "github.com/GrayCodeAI/tok" +) + +// --------------------------------------------------------------------------- +// 6. Performance - Test that compression completes within reasonable time +// --------------------------------------------------------------------------- + +func TestIntegration_Performance_SmallInput(t *testing.T) { + input := "Short text for performance test." + deadline := 500 * time.Millisecond + + start := time.Now() + tok.Compress(input) + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("small input compression took %v, expected under %v", elapsed, deadline) + } +} + +func TestIntegration_Performance_MediumInput(t *testing.T) { + input := strings.Repeat("This is a medium-length sentence for performance testing. ", 100) + deadline := 2 * time.Second + + start := time.Now() + tok.Compress(input, tok.Aggressive) + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("medium input compression took %v, expected under %v", elapsed, deadline) + } +} + +func TestIntegration_Performance_LargeInput(t *testing.T) { + // ~100 KB of content + input := strings.Repeat("This is a line of text that will be compressed by the full pipeline. It has enough content to exercise multiple layers. ", 2000) + // Race detector adds significant overhead; allow extra headroom when enabled. + deadline := 10 * time.Second + if raceEnabled() { + deadline = 120 * time.Second + } + + start := time.Now() + tok.Compress(input, tok.Aggressive) + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("large input compression took %v, expected under %v", elapsed, deadline) + } +} + +func TestIntegration_Performance_CodeInput(t *testing.T) { + var sb strings.Builder + for i := 0; i < 100; i++ { + sb.WriteString("func process") + sb.WriteString(strings.Repeat("x", 10)) + sb.WriteString("(input string) string {\n") + sb.WriteString("\tresult := strings.TrimSpace(input)\n") + sb.WriteString("\tresult = strings.ToLower(result)\n") + sb.WriteString("\treturn result\n") + sb.WriteString("}\n\n") + } + input := sb.String() + deadline := 5 * time.Second + + start := time.Now() + tok.Compress(input, tok.Code) + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("code input compression took %v, expected under %v", elapsed, deadline) + } +} + +func TestIntegration_Performance_RepeatedCalls(t *testing.T) { + input := "This is a sentence for testing repeated compression calls with the same input." + deadline := 5 * time.Second + + start := time.Now() + for i := 0; i < 100; i++ { + tok.Compress(input) + } + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("100 repeated compressions took %v, expected under %v", elapsed, deadline) + } +} + +func TestIntegration_Performance_CompressorReuse(t *testing.T) { + c := tok.NewCompressor(tok.Adaptive) + inputs := []string{ + "First input to the reusable compressor.", + "Second input with different content for comparison.", + "Third input that is a bit longer to test how the pipeline handles varied sizes across calls.", + strings.Repeat("Fourth input is quite large. ", 50), + } + + deadline := 5 * time.Second + start := time.Now() + for _, input := range inputs { + output, stats := c.Compress(input) + if output == "" { + t.Fatal("reusable compressor returned empty output") + } + if stats.OriginalTokens == 0 { + t.Fatal("reusable compressor returned zero OriginalTokens") + } + } + elapsed := time.Since(start) + + if elapsed > deadline { + t.Errorf("4 compressions with reused compressor took %v, expected under %v", elapsed, deadline) + } +} + +// --------------------------------------------------------------------------- +// 7. Configuration - Test that pipeline configuration affects output correctly +// --------------------------------------------------------------------------- + +func TestIntegration_Configuration_BudgetEnforcement(t *testing.T) { + input := strings.Repeat("word ", 500) // ~500 words + + _, tightStats := tok.Compress(input, tok.WithBudget(50)) + _, looseStats := tok.Compress(input, tok.WithBudget(500)) + + if tightStats.FinalTokens > 70 { + t.Errorf("tight budget (50) produced %d tokens, expected near 50", tightStats.FinalTokens) + } + // Loose budget should preserve more tokens + if looseStats.FinalTokens < tightStats.FinalTokens { + t.Errorf("loose budget (%d) produced fewer tokens (%d) than tight budget (%d)", + 500, looseStats.FinalTokens, tightStats.FinalTokens) + } +} + +func TestIntegration_Configuration_MinimalVsAggressive(t *testing.T) { + input := strings.Repeat("This is test content for comparing compression modes. ", 100) + + _, minimalStats := tok.Compress(input, tok.Minimal) + _, aggressiveStats := tok.Compress(input, tok.Aggressive) + + if minimalStats.OriginalTokens == 0 || aggressiveStats.OriginalTokens == 0 { + t.Fatal("OriginalTokens should be non-zero for both modes") + } + + // Aggressive mode should produce fewer or equal tokens compared to minimal + if aggressiveStats.FinalTokens > minimalStats.FinalTokens+10 { // allow small margin + t.Errorf("aggressive mode (%d tokens) should not produce more than minimal mode (%d tokens)", + aggressiveStats.FinalTokens, minimalStats.FinalTokens) + } +} + +func TestIntegration_Configuration_CodeVsLogTier(t *testing.T) { + codeInput := `func main() { + fmt.Println("hello") + x := 42 + return x +} + +func helper(s string) string { + return strings.TrimSpace(s) +}` + + _, codeStats := tok.Compress(codeInput, tok.Code) + _, logStats := tok.Compress(codeInput, tok.Log) + + // Both should produce non-empty output + if codeStats.OriginalTokens == 0 || logStats.OriginalTokens == 0 { + t.Fatal("both tiers should report non-zero OriginalTokens") + } + // Stats may differ based on tier-specific layers + if codeStats.FinalTokens == 0 { + t.Error("code tier should produce non-zero FinalTokens") + } + if logStats.FinalTokens == 0 { + t.Error("log tier should produce non-zero FinalTokens") + } +} + +func TestIntegration_Configuration_WithAndWithoutQuery(t *testing.T) { + input := `[INFO] Application started +[ERROR] Failed to connect to database +[WARN] Retrying connection +[INFO] Connected successfully +[ERROR] Timeout on request to /api/data +[INFO] Retrying request +[INFO] Request succeeded` + + _, noQueryStats := tok.Compress(input) + _, withQueryStats := tok.Compress(input, tok.WithQuery("database errors")) + + if noQueryStats.OriginalTokens == 0 || withQueryStats.OriginalTokens == 0 { + t.Fatal("OriginalTokens should be non-zero") + } + // Query-aware compression may produce different results (more or fewer tokens + // depending on relevance scoring); both should be valid + if noQueryStats.FinalTokens == 0 { + t.Error("no-query compression should produce non-zero FinalTokens") + } + if withQueryStats.FinalTokens == 0 { + t.Error("query-aware compression should produce non-zero FinalTokens") + } +} + +func TestIntegration_Configuration_AdaptiveTier_AutoDetectsContentType(t *testing.T) { + logInput := strings.Repeat("[INFO] 2026-05-28T10:00:00Z level=info msg=\"request processed\" status=200\n", 100) + _, logStats := tok.Compress(logInput, tok.Adaptive) + if logStats.OriginalTokens == 0 { + t.Fatal("adaptive tier should report non-zero OriginalTokens for log content") + } + + codeInput := strings.Repeat("func processItem(id int) error {\n\treturn nil\n}\n\n", 50) + _, codeStats := tok.Compress(codeInput, tok.Adaptive) + if codeStats.OriginalTokens == 0 { + t.Fatal("adaptive tier should report non-zero OriginalTokens for code content") + } +} + +func TestIntegration_Configuration_SurfaceTier_FastPath(t *testing.T) { + input := strings.Repeat("content line for surface tier testing\n", 100) + + start := time.Now() + output, stats := tok.Compress(input, tok.Surface) + elapsed := time.Since(start) + + if output == "" { + t.Fatal("surface tier returned empty output") + } + if stats.OriginalTokens == 0 { + t.Fatal("surface tier should report non-zero OriginalTokens") + } + // Surface tier (4 layers) should be fast + if elapsed > 2*time.Second { + t.Errorf("surface tier took %v, expected under 2s for medium input", elapsed) + } +} + +func TestIntegration_Configuration_StatsLayerBreakdown(t *testing.T) { + input := strings.Repeat("test content for layer stats verification\n", 100) + _, stats := tok.Compress(input, tok.Minimal) + + if stats.Layers == nil { + t.Fatal("stats.Layers should not be nil") + } + if len(stats.Layers) == 0 { + t.Error("stats.Layers should contain at least one layer stat") + } + // Verify that reported layers have reasonable values + for name, ls := range stats.Layers { + if ls.TokensSaved < 0 { + t.Errorf("layer %q has negative TokensSaved: %d", name, ls.TokensSaved) + } + if ls.DurationMs < 0 { + t.Errorf("layer %q has negative DurationMs: %d", name, ls.DurationMs) + } + } +} + +// --------------------------------------------------------------------------- +// Additional integration tests: Compressor reuse, concurrent safety +// --------------------------------------------------------------------------- + +func TestIntegration_Compressor_ReuseAcrossInputs(t *testing.T) { + c := tok.NewCompressor(tok.Minimal) + + inputs := []string{ + "First unique input for reuse testing.", + "Second completely different content here.", + "", + "Fourth input after empty.", + } + + for i, input := range inputs { + output, stats := c.Compress(input) + if input == "" { + if output != "" { + t.Errorf("call %d: empty input should produce empty output", i) + } + continue + } + if output == "" { + t.Errorf("call %d: non-empty input produced empty output", i) + } + if stats.OriginalTokens == 0 { + t.Errorf("call %d: OriginalTokens is zero for non-empty input", i) + } + } +} + +func TestIntegration_ConcurrentCompression(t *testing.T) { + input := strings.Repeat("concurrent safety test content with enough words\n", 50) + + done := make(chan bool, 20) + for i := 0; i < 20; i++ { + go func() { + output, stats := tok.Compress(input) + if output == "" { + t.Error("concurrent compression returned empty output") + } + if stats.OriginalTokens == 0 { + t.Error("concurrent compression returned zero OriginalTokens") + } + done <- true + }() + } + + for i := 0; i < 20; i++ { + select { + case <-done: + case <-time.After(10 * time.Second): + t.Fatal("concurrent compression timed out") + } + } +} + +func TestIntegration_ConcurrentCompressor(t *testing.T) { + c := tok.NewCompressor(tok.Adaptive) + input := strings.Repeat("compressor concurrent test input with enough content\n", 50) + + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func() { + output, stats := c.Compress(input) + if output == "" { + t.Error("concurrent compressor returned empty output") + } + if stats.OriginalTokens == 0 { + t.Error("concurrent compressor returned zero OriginalTokens") + } + done <- true + }() + } + + for i := 0; i < 10; i++ { + select { + case <-done: + case <-time.After(10 * time.Second): + t.Fatal("concurrent compressor timed out") + } + } +} + +// --------------------------------------------------------------------------- +// Additional integration tests: CompactionSchema and BuildCompactionPrompt +// --------------------------------------------------------------------------- + +func TestIntegration_CompactionSchema_ToPrompt(t *testing.T) { + schema := &tok.CompactionSchema{ + TaskOverview: "Implement compression pipeline tests", + CurrentState: "Writing integration tests for the tok package", + ImportantDiscoveries: []string{"Pipeline has 20 layers", "Supports multiple tiers"}, + NextSteps: []string{"Run tests", "Fix failures"}, + ContextToPreserve: []string{"File: integration_test.go", "Module: github.com/GrayCodeAI/tok"}, + } + + prompt := schema.ToPrompt() + if prompt == "" { + t.Fatal("ToPrompt returned empty string") + } + if !strings.Contains(prompt, "Task Overview") { + t.Error("prompt should contain 'Task Overview' section") + } + if !strings.Contains(prompt, "Important Discoveries") { + t.Error("prompt should contain 'Important Discoveries' section") + } + if !strings.Contains(prompt, "compression pipeline") { + t.Error("prompt should contain the task overview text") + } +} + +func TestIntegration_CompactionSchema_ParseResponse(t *testing.T) { + jsonResponse := `{ + "task_overview": "Building test suite", + "current_state": "Almost done", + "important_discoveries": ["Finding 1", "Finding 2"], + "next_steps": ["Run tests"], + "context_to_preserve": ["key detail"] + }` + + schema, err := tok.ParseCompactionResponse(jsonResponse) + if err != nil { + t.Fatalf("ParseCompactionResponse failed: %v", err) + } + if schema.TaskOverview != "Building test suite" { + t.Errorf("TaskOverview = %q, want %q", schema.TaskOverview, "Building test suite") + } + if len(schema.ImportantDiscoveries) != 2 { + t.Errorf("ImportantDiscoveries count = %d, want 2", len(schema.ImportantDiscoveries)) + } +} + +func TestIntegration_CompactionSchema_ParseMarkdownFencedResponse(t *testing.T) { + fencedResponse := "```json\n{\n\t\"task_overview\": \"Test\",\n\t\"current_state\": \"Done\"\n}\n```" + + schema, err := tok.ParseCompactionResponse(fencedResponse) + if err != nil { + t.Fatalf("ParseCompactionResponse with markdown fences failed: %v", err) + } + if schema.TaskOverview != "Test" { + t.Errorf("TaskOverview = %q, want %q", schema.TaskOverview, "Test") + } +} + +func TestIntegration_BuildCompactionPrompt(t *testing.T) { + prompt := tok.BuildCompactionPrompt("some context to compress", 0) + if prompt == "" { + t.Fatal("BuildCompactionPrompt returned empty string") + } + if !strings.Contains(prompt, "some context to compress") { + t.Error("prompt should contain the input context") + } + if !strings.Contains(prompt, "task_overview") { + t.Error("prompt should contain schema field names") + } +} + +func TestIntegration_BuildCompactionPrompt_Truncation(t *testing.T) { + longContext := strings.Repeat("x", 10000) + prompt := tok.BuildCompactionPrompt(longContext, 1000) + // The context portion should be truncated to maxChars + if len(prompt) > 2000 { // system prompt + truncated context + t.Errorf("prompt length %d exceeds expected maximum", len(prompt)) + } +} + +// --------------------------------------------------------------------------- +// Additional integration tests: Token estimation precision +// --------------------------------------------------------------------------- + +func TestIntegration_TokenEstimation_PreciseVsFast(t *testing.T) { + // EstimateTokens uses BPE (precise); we verify it returns positive values + // for a variety of content types + contents := []string{ + "Hello, world!", + "func main() { fmt.Println(\"test\") }", + "The quick brown fox jumps over the lazy dog.", + strings.Repeat("a", 1000), + "JSON: {\"key\": \"value\", \"array\": [1, 2, 3]}", + } + + for _, c := range contents { + tokens := tok.EstimateTokens(c) + if tokens <= 0 { + t.Errorf("EstimateTokens(%q) = %d, want > 0", c[:min(30, len(c))], tokens) + } + } +} + +func TestIntegration_TokenEstimation_Monotonicity(t *testing.T) { + // Longer text should generally have more tokens + short := "Hello" + medium := "Hello, this is a medium-length sentence for testing." + long := "Hello, this is a much longer sentence that goes on and on with many more words and details about testing token estimation monotonicity properties." + + shortTok := tok.EstimateTokens(short) + medTok := tok.EstimateTokens(medium) + longTok := tok.EstimateTokens(long) + + if medTok < shortTok { + t.Errorf("medium tokens (%d) should be >= short tokens (%d)", medTok, shortTok) + } + if longTok < medTok { + t.Errorf("long tokens (%d) should be >= medium tokens (%d)", longTok, medTok) + } +} diff --git a/integration_test.go b/integration_test.go index 2f5465f5..ef881634 100644 --- a/integration_test.go +++ b/integration_test.go @@ -3,7 +3,6 @@ package tok_test import ( "strings" "testing" - "time" "github.com/GrayCodeAI/tok" ) @@ -591,466 +590,5 @@ func TestIntegration_EdgeCase_RepetitiveContent(t *testing.T) { } } -// --------------------------------------------------------------------------- -// 6. Performance - Test that compression completes within reasonable time -// --------------------------------------------------------------------------- - -func TestIntegration_Performance_SmallInput(t *testing.T) { - input := "Short text for performance test." - deadline := 500 * time.Millisecond - - start := time.Now() - tok.Compress(input) - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("small input compression took %v, expected under %v", elapsed, deadline) - } -} - -func TestIntegration_Performance_MediumInput(t *testing.T) { - input := strings.Repeat("This is a medium-length sentence for performance testing. ", 100) - deadline := 2 * time.Second - - start := time.Now() - tok.Compress(input, tok.Aggressive) - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("medium input compression took %v, expected under %v", elapsed, deadline) - } -} - -func TestIntegration_Performance_LargeInput(t *testing.T) { - // ~100 KB of content - input := strings.Repeat("This is a line of text that will be compressed by the full pipeline. It has enough content to exercise multiple layers. ", 2000) - // Race detector adds significant overhead; allow extra headroom when enabled. - deadline := 10 * time.Second - if raceEnabled() { - deadline = 120 * time.Second - } - - start := time.Now() - tok.Compress(input, tok.Aggressive) - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("large input compression took %v, expected under %v", elapsed, deadline) - } -} - -func TestIntegration_Performance_CodeInput(t *testing.T) { - var sb strings.Builder - for i := 0; i < 100; i++ { - sb.WriteString("func process") - sb.WriteString(strings.Repeat("x", 10)) - sb.WriteString("(input string) string {\n") - sb.WriteString("\tresult := strings.TrimSpace(input)\n") - sb.WriteString("\tresult = strings.ToLower(result)\n") - sb.WriteString("\treturn result\n") - sb.WriteString("}\n\n") - } - input := sb.String() - deadline := 5 * time.Second - - start := time.Now() - tok.Compress(input, tok.Code) - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("code input compression took %v, expected under %v", elapsed, deadline) - } -} - -func TestIntegration_Performance_RepeatedCalls(t *testing.T) { - input := "This is a sentence for testing repeated compression calls with the same input." - deadline := 5 * time.Second - - start := time.Now() - for i := 0; i < 100; i++ { - tok.Compress(input) - } - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("100 repeated compressions took %v, expected under %v", elapsed, deadline) - } -} - -func TestIntegration_Performance_CompressorReuse(t *testing.T) { - c := tok.NewCompressor(tok.Adaptive) - inputs := []string{ - "First input to the reusable compressor.", - "Second input with different content for comparison.", - "Third input that is a bit longer to test how the pipeline handles varied sizes across calls.", - strings.Repeat("Fourth input is quite large. ", 50), - } - - deadline := 5 * time.Second - start := time.Now() - for _, input := range inputs { - output, stats := c.Compress(input) - if output == "" { - t.Fatal("reusable compressor returned empty output") - } - if stats.OriginalTokens == 0 { - t.Fatal("reusable compressor returned zero OriginalTokens") - } - } - elapsed := time.Since(start) - - if elapsed > deadline { - t.Errorf("4 compressions with reused compressor took %v, expected under %v", elapsed, deadline) - } -} - -// --------------------------------------------------------------------------- -// 7. Configuration - Test that pipeline configuration affects output correctly -// --------------------------------------------------------------------------- - -func TestIntegration_Configuration_BudgetEnforcement(t *testing.T) { - input := strings.Repeat("word ", 500) // ~500 words - - _, tightStats := tok.Compress(input, tok.WithBudget(50)) - _, looseStats := tok.Compress(input, tok.WithBudget(500)) - - if tightStats.FinalTokens > 70 { - t.Errorf("tight budget (50) produced %d tokens, expected near 50", tightStats.FinalTokens) - } - // Loose budget should preserve more tokens - if looseStats.FinalTokens < tightStats.FinalTokens { - t.Errorf("loose budget (%d) produced fewer tokens (%d) than tight budget (%d)", - 500, looseStats.FinalTokens, tightStats.FinalTokens) - } -} - -func TestIntegration_Configuration_MinimalVsAggressive(t *testing.T) { - input := strings.Repeat("This is test content for comparing compression modes. ", 100) - - _, minimalStats := tok.Compress(input, tok.Minimal) - _, aggressiveStats := tok.Compress(input, tok.Aggressive) - - if minimalStats.OriginalTokens == 0 || aggressiveStats.OriginalTokens == 0 { - t.Fatal("OriginalTokens should be non-zero for both modes") - } - - // Aggressive mode should produce fewer or equal tokens compared to minimal - if aggressiveStats.FinalTokens > minimalStats.FinalTokens+10 { // allow small margin - t.Errorf("aggressive mode (%d tokens) should not produce more than minimal mode (%d tokens)", - aggressiveStats.FinalTokens, minimalStats.FinalTokens) - } -} - -func TestIntegration_Configuration_CodeVsLogTier(t *testing.T) { - codeInput := `func main() { - fmt.Println("hello") - x := 42 - return x -} - -func helper(s string) string { - return strings.TrimSpace(s) -}` - - _, codeStats := tok.Compress(codeInput, tok.Code) - _, logStats := tok.Compress(codeInput, tok.Log) - - // Both should produce non-empty output - if codeStats.OriginalTokens == 0 || logStats.OriginalTokens == 0 { - t.Fatal("both tiers should report non-zero OriginalTokens") - } - // Stats may differ based on tier-specific layers - if codeStats.FinalTokens == 0 { - t.Error("code tier should produce non-zero FinalTokens") - } - if logStats.FinalTokens == 0 { - t.Error("log tier should produce non-zero FinalTokens") - } -} - -func TestIntegration_Configuration_WithAndWithoutQuery(t *testing.T) { - input := `[INFO] Application started -[ERROR] Failed to connect to database -[WARN] Retrying connection -[INFO] Connected successfully -[ERROR] Timeout on request to /api/data -[INFO] Retrying request -[INFO] Request succeeded` - - _, noQueryStats := tok.Compress(input) - _, withQueryStats := tok.Compress(input, tok.WithQuery("database errors")) - - if noQueryStats.OriginalTokens == 0 || withQueryStats.OriginalTokens == 0 { - t.Fatal("OriginalTokens should be non-zero") - } - // Query-aware compression may produce different results (more or fewer tokens - // depending on relevance scoring); both should be valid - if noQueryStats.FinalTokens == 0 { - t.Error("no-query compression should produce non-zero FinalTokens") - } - if withQueryStats.FinalTokens == 0 { - t.Error("query-aware compression should produce non-zero FinalTokens") - } -} - -func TestIntegration_Configuration_AdaptiveTier_AutoDetectsContentType(t *testing.T) { - logInput := strings.Repeat("[INFO] 2026-05-28T10:00:00Z level=info msg=\"request processed\" status=200\n", 100) - _, logStats := tok.Compress(logInput, tok.Adaptive) - if logStats.OriginalTokens == 0 { - t.Fatal("adaptive tier should report non-zero OriginalTokens for log content") - } - - codeInput := strings.Repeat("func processItem(id int) error {\n\treturn nil\n}\n\n", 50) - _, codeStats := tok.Compress(codeInput, tok.Adaptive) - if codeStats.OriginalTokens == 0 { - t.Fatal("adaptive tier should report non-zero OriginalTokens for code content") - } -} - -func TestIntegration_Configuration_SurfaceTier_FastPath(t *testing.T) { - input := strings.Repeat("content line for surface tier testing\n", 100) - - start := time.Now() - output, stats := tok.Compress(input, tok.Surface) - elapsed := time.Since(start) - - if output == "" { - t.Fatal("surface tier returned empty output") - } - if stats.OriginalTokens == 0 { - t.Fatal("surface tier should report non-zero OriginalTokens") - } - // Surface tier (4 layers) should be fast - if elapsed > 2*time.Second { - t.Errorf("surface tier took %v, expected under 2s for medium input", elapsed) - } -} - -func TestIntegration_Configuration_StatsLayerBreakdown(t *testing.T) { - input := strings.Repeat("test content for layer stats verification\n", 100) - _, stats := tok.Compress(input, tok.Minimal) - - if stats.Layers == nil { - t.Fatal("stats.Layers should not be nil") - } - if len(stats.Layers) == 0 { - t.Error("stats.Layers should contain at least one layer stat") - } - // Verify that reported layers have reasonable values - for name, ls := range stats.Layers { - if ls.TokensSaved < 0 { - t.Errorf("layer %q has negative TokensSaved: %d", name, ls.TokensSaved) - } - if ls.DurationMs < 0 { - t.Errorf("layer %q has negative DurationMs: %d", name, ls.DurationMs) - } - } -} - -// --------------------------------------------------------------------------- -// Additional integration tests: Compressor reuse, concurrent safety -// --------------------------------------------------------------------------- - -func TestIntegration_Compressor_ReuseAcrossInputs(t *testing.T) { - c := tok.NewCompressor(tok.Minimal) - - inputs := []string{ - "First unique input for reuse testing.", - "Second completely different content here.", - "", - "Fourth input after empty.", - } - - for i, input := range inputs { - output, stats := c.Compress(input) - if input == "" { - if output != "" { - t.Errorf("call %d: empty input should produce empty output", i) - } - continue - } - if output == "" { - t.Errorf("call %d: non-empty input produced empty output", i) - } - if stats.OriginalTokens == 0 { - t.Errorf("call %d: OriginalTokens is zero for non-empty input", i) - } - } -} - -func TestIntegration_ConcurrentCompression(t *testing.T) { - input := strings.Repeat("concurrent safety test content with enough words\n", 50) - - done := make(chan bool, 20) - for i := 0; i < 20; i++ { - go func() { - output, stats := tok.Compress(input) - if output == "" { - t.Error("concurrent compression returned empty output") - } - if stats.OriginalTokens == 0 { - t.Error("concurrent compression returned zero OriginalTokens") - } - done <- true - }() - } - - for i := 0; i < 20; i++ { - select { - case <-done: - case <-time.After(10 * time.Second): - t.Fatal("concurrent compression timed out") - } - } -} - -func TestIntegration_ConcurrentCompressor(t *testing.T) { - c := tok.NewCompressor(tok.Adaptive) - input := strings.Repeat("compressor concurrent test input with enough content\n", 50) - - done := make(chan bool, 10) - for i := 0; i < 10; i++ { - go func() { - output, stats := c.Compress(input) - if output == "" { - t.Error("concurrent compressor returned empty output") - } - if stats.OriginalTokens == 0 { - t.Error("concurrent compressor returned zero OriginalTokens") - } - done <- true - }() - } - - for i := 0; i < 10; i++ { - select { - case <-done: - case <-time.After(10 * time.Second): - t.Fatal("concurrent compressor timed out") - } - } -} - -// --------------------------------------------------------------------------- -// Additional integration tests: CompactionSchema and BuildCompactionPrompt -// --------------------------------------------------------------------------- - -func TestIntegration_CompactionSchema_ToPrompt(t *testing.T) { - schema := &tok.CompactionSchema{ - TaskOverview: "Implement compression pipeline tests", - CurrentState: "Writing integration tests for the tok package", - ImportantDiscoveries: []string{"Pipeline has 20 layers", "Supports multiple tiers"}, - NextSteps: []string{"Run tests", "Fix failures"}, - ContextToPreserve: []string{"File: integration_test.go", "Module: github.com/GrayCodeAI/tok"}, - } - - prompt := schema.ToPrompt() - if prompt == "" { - t.Fatal("ToPrompt returned empty string") - } - if !strings.Contains(prompt, "Task Overview") { - t.Error("prompt should contain 'Task Overview' section") - } - if !strings.Contains(prompt, "Important Discoveries") { - t.Error("prompt should contain 'Important Discoveries' section") - } - if !strings.Contains(prompt, "compression pipeline") { - t.Error("prompt should contain the task overview text") - } -} - -func TestIntegration_CompactionSchema_ParseResponse(t *testing.T) { - jsonResponse := `{ - "task_overview": "Building test suite", - "current_state": "Almost done", - "important_discoveries": ["Finding 1", "Finding 2"], - "next_steps": ["Run tests"], - "context_to_preserve": ["key detail"] - }` - - schema, err := tok.ParseCompactionResponse(jsonResponse) - if err != nil { - t.Fatalf("ParseCompactionResponse failed: %v", err) - } - if schema.TaskOverview != "Building test suite" { - t.Errorf("TaskOverview = %q, want %q", schema.TaskOverview, "Building test suite") - } - if len(schema.ImportantDiscoveries) != 2 { - t.Errorf("ImportantDiscoveries count = %d, want 2", len(schema.ImportantDiscoveries)) - } -} - -func TestIntegration_CompactionSchema_ParseMarkdownFencedResponse(t *testing.T) { - fencedResponse := "```json\n{\n\t\"task_overview\": \"Test\",\n\t\"current_state\": \"Done\"\n}\n```" - - schema, err := tok.ParseCompactionResponse(fencedResponse) - if err != nil { - t.Fatalf("ParseCompactionResponse with markdown fences failed: %v", err) - } - if schema.TaskOverview != "Test" { - t.Errorf("TaskOverview = %q, want %q", schema.TaskOverview, "Test") - } -} - -func TestIntegration_BuildCompactionPrompt(t *testing.T) { - prompt := tok.BuildCompactionPrompt("some context to compress", 0) - if prompt == "" { - t.Fatal("BuildCompactionPrompt returned empty string") - } - if !strings.Contains(prompt, "some context to compress") { - t.Error("prompt should contain the input context") - } - if !strings.Contains(prompt, "task_overview") { - t.Error("prompt should contain schema field names") - } -} - -func TestIntegration_BuildCompactionPrompt_Truncation(t *testing.T) { - longContext := strings.Repeat("x", 10000) - prompt := tok.BuildCompactionPrompt(longContext, 1000) - // The context portion should be truncated to maxChars - if len(prompt) > 2000 { // system prompt + truncated context - t.Errorf("prompt length %d exceeds expected maximum", len(prompt)) - } -} - -// --------------------------------------------------------------------------- -// Additional integration tests: Token estimation precision -// --------------------------------------------------------------------------- - -func TestIntegration_TokenEstimation_PreciseVsFast(t *testing.T) { - // EstimateTokens uses BPE (precise); we verify it returns positive values - // for a variety of content types - contents := []string{ - "Hello, world!", - "func main() { fmt.Println(\"test\") }", - "The quick brown fox jumps over the lazy dog.", - strings.Repeat("a", 1000), - "JSON: {\"key\": \"value\", \"array\": [1, 2, 3]}", - } - - for _, c := range contents { - tokens := tok.EstimateTokens(c) - if tokens <= 0 { - t.Errorf("EstimateTokens(%q) = %d, want > 0", c[:min(30, len(c))], tokens) - } - } -} - -func TestIntegration_TokenEstimation_Monotonicity(t *testing.T) { - // Longer text should generally have more tokens - short := "Hello" - medium := "Hello, this is a medium-length sentence for testing." - long := "Hello, this is a much longer sentence that goes on and on with many more words and details about testing token estimation monotonicity properties." - - shortTok := tok.EstimateTokens(short) - medTok := tok.EstimateTokens(medium) - longTok := tok.EstimateTokens(long) - - if medTok < shortTok { - t.Errorf("medium tokens (%d) should be >= short tokens (%d)", medTok, shortTok) - } - if longTok < medTok { - t.Errorf("long tokens (%d) should be >= medium tokens (%d)", longTok, medTok) - } -} +// Performance, Configuration, concurrency, CompactionSchema, and token +// estimation precision integration tests moved to integration_advanced_test.go.