1- import { spawn } from 'node:child_process' ;
2- import { mkdtemp , readdir , readFile , rm , writeFile } from 'node:fs/promises' ;
3- import { tmpdir } from 'node:os' ;
41import path from 'node:path' ;
5- import { fileURLToPath } from 'node:url' ;
62import { buildClaudeArgs } from '../claude-invocation.ts' ;
73import { compareBenchmark } from '../compare.ts' ;
84import { readConfig } from '../config.ts' ;
95import { analyzeClaudeJsonl } from '../transcript.ts' ;
106import type { BenchmarkArtifacts , BenchmarkRunMetadata } from '../types.ts' ;
117
12- const repoRoot = path . resolve ( path . dirname ( fileURLToPath ( import . meta. url ) ) , '../../../..' ) ;
13-
148function line ( value : unknown ) : string {
159 return JSON . stringify ( value ) ;
1610}
@@ -42,28 +36,6 @@ function runMetadata(wallClockSeconds: number): BenchmarkRunMetadata {
4236 } ;
4337}
4438
45- function runParserScript ( args : string [ ] ) : Promise < {
46- exitCode : number | null ;
47- stdout : string ;
48- stderr : string ;
49- } > {
50- return new Promise ( ( resolve , reject ) => {
51- const child = spawn ( 'python3' , args , { stdio : [ 'ignore' , 'pipe' , 'pipe' ] } ) ;
52- const stdout : Buffer [ ] = [ ] ;
53- const stderr : Buffer [ ] = [ ] ;
54- child . stdout . on ( 'data' , ( chunk : Buffer ) => stdout . push ( chunk ) ) ;
55- child . stderr . on ( 'data' , ( chunk : Buffer ) => stderr . push ( chunk ) ) ;
56- child . on ( 'error' , reject ) ;
57- child . on ( 'close' , ( exitCode ) => {
58- resolve ( {
59- exitCode,
60- stdout : Buffer . concat ( stdout ) . toString ( 'utf8' ) ,
61- stderr : Buffer . concat ( stderr ) . toString ( 'utf8' ) ,
62- } ) ;
63- } ) ;
64- } ) ;
65- }
66-
6739describe ( 'Claude UI benchmark tool configuration' , ( ) => {
6840 it ( 'loads Claude invocation and tool analysis from suite config' , ( ) => {
6941 const config = readConfig (
@@ -612,6 +584,120 @@ describe('Claude UI benchmark tool configuration', () => {
612584 } ) ;
613585 } ) ;
614586
587+ it ( 'handles tracked tool results without content' , ( ) => {
588+ const config = readConfig (
589+ {
590+ name : 'private CLI weather' ,
591+ prompt : 'weather.md' ,
592+ failurePatterns : [ 'WAIT_TIMEOUT' ] ,
593+ toolAnalysis : {
594+ matchers : [
595+ {
596+ kind : 'bashCommand' ,
597+ commandPrefix : 'privatecli' ,
598+ shortName : 'privatecli.other' ,
599+ } ,
600+ ] ,
601+ } ,
602+ } ,
603+ 'private-cli.yml' ,
604+ ) ;
605+ const transcript = [
606+ line ( {
607+ type : 'assistant' ,
608+ message : {
609+ content : [
610+ {
611+ type : 'tool_use' ,
612+ id : 'tool-1' ,
613+ name : 'Bash' ,
614+ input : { command : 'privatecli --version' } ,
615+ } ,
616+ ] ,
617+ } ,
618+ } ) ,
619+ line ( {
620+ type : 'user' ,
621+ message : {
622+ content : [ { type : 'tool_result' , tool_use_id : 'tool-1' , is_error : true } ] ,
623+ } ,
624+ } ) ,
625+ ] . join ( '\n' ) ;
626+
627+ const audit = analyzeClaudeJsonl ( transcript , {
628+ toolAnalysis : config . toolAnalysis ,
629+ failurePatterns : config . failurePatterns ,
630+ } ) ;
631+
632+ expect ( audit . failures ) . toEqual ( [
633+ {
634+ id : 'tool-1' ,
635+ fullName : 'Bash' ,
636+ shortName : 'privatecli.other' ,
637+ line : 2 ,
638+ message : '' ,
639+ } ,
640+ ] ) ;
641+ expect ( audit . patternFailures ) . toEqual ( [ ] ) ;
642+ } ) ;
643+
644+ it ( 'counts repeated matches in one Bash failure result once' , ( ) => {
645+ const config = readConfig (
646+ {
647+ name : 'private CLI weather' ,
648+ prompt : 'weather.md' ,
649+ toolAnalysis : {
650+ matchers : [
651+ {
652+ kind : 'bashCommand' ,
653+ commandPrefix : 'privatecli' ,
654+ shortName : 'privatecli.other' ,
655+ } ,
656+ ] ,
657+ } ,
658+ } ,
659+ 'private-cli.yml' ,
660+ ) ;
661+ const transcript = [
662+ line ( {
663+ type : 'assistant' ,
664+ message : {
665+ content : [
666+ {
667+ type : 'tool_use' ,
668+ id : 'tool-1' ,
669+ name : 'Bash' ,
670+ input : { command : 'privatecli one && privatecli two' } ,
671+ } ,
672+ ] ,
673+ } ,
674+ } ) ,
675+ line ( {
676+ type : 'user' ,
677+ message : {
678+ content : [
679+ {
680+ type : 'tool_result' ,
681+ tool_use_id : 'tool-1' ,
682+ is_error : true ,
683+ content : 'Exit code 1' ,
684+ } ,
685+ ] ,
686+ } ,
687+ } ) ,
688+ ] . join ( '\n' ) ;
689+
690+ const audit = analyzeClaudeJsonl ( transcript , { toolAnalysis : config . toolAnalysis } ) ;
691+ const result = compareBenchmark ( config , audit , runMetadata ( 600 ) ) ;
692+
693+ expect ( audit . trackedSequence . map ( ( call ) => call . shortName ) ) . toEqual ( [
694+ 'privatecli.other' ,
695+ 'privatecli.other' ,
696+ ] ) ;
697+ expect ( audit . failures ) . toHaveLength ( 1 ) ;
698+ expect ( result . completion . issueCount ) . toBe ( 1 ) ;
699+ } ) ;
700+
615701 it ( 'marks the benchmark incomplete when Claude exits non-zero' , ( ) => {
616702 const config = readConfig (
617703 {
@@ -633,53 +719,55 @@ describe('Claude UI benchmark tool configuration', () => {
633719 } ) ;
634720 } ) ;
635721
636- it ( 'lets the parser include configured non-MCP tool names' , async ( ) => {
637- const dir = await mkdtemp ( path . join ( tmpdir ( ) , 'claude-ui-parser-' ) ) ;
638- try {
639- const jsonlPath = path . join ( dir , 'claude.jsonl' ) ;
640- const outputPath = path . join ( dir , 'parsed' ) ;
641- await writeFile (
642- jsonlPath ,
643- [
644- line ( {
645- type : 'assistant' ,
646- message : {
647- content : [
648- {
649- type : 'tool_use' ,
650- id : 'tool-1' ,
651- name : 'Bash' ,
652- input : { command : 'vendorcli ui screen --json' } ,
653- } ,
654- ] ,
655- } ,
656- } ) ,
657- line ( {
658- type : 'user' ,
659- message : { content : [ { type : 'tool_result' , tool_use_id : 'tool-1' , content : 'ok' } ] } ,
660- } ) ,
661- ] . join ( '\n' ) ,
662- 'utf8' ,
663- ) ;
664-
665- const result = await runParserScript ( [
666- path . join ( repoRoot , 'benchmarks/claude-ui/parse_claude_conversation.py' ) ,
667- jsonlPath ,
668- outputPath ,
669- '--tool-prefix=mcp__xcodebuildmcp' ,
670- '--tool-name=Bash' ,
671- ] ) ;
672-
673- expect ( result . exitCode ) . toBe ( 0 ) ;
674- expect ( await readdir ( outputPath ) ) . toEqual ( [
675- '0001_tool_call_Bash.md' ,
676- '0002_tool_result_Bash.md' ,
677- ] ) ;
678- expect ( await readFile ( path . join ( outputPath , '0001_tool_call_Bash.md' ) , 'utf8' ) ) . toContain (
679- 'vendorcli ui screen --json' ,
680- ) ;
681- } finally {
682- await rm ( dir , { recursive : true , force : true } ) ;
683- }
722+ it ( 'keeps configured non-MCP tool names in transcript analysis' , ( ) => {
723+ const config = readConfig (
724+ {
725+ name : 'vendor CLI weather' ,
726+ prompt : 'weather.md' ,
727+ toolAnalysis : {
728+ matchers : [
729+ {
730+ kind : 'bashCommand' ,
731+ commandPrefix : 'vendorcli ui screen' ,
732+ shortName : 'vendorcli.screen' ,
733+ } ,
734+ ] ,
735+ } ,
736+ } ,
737+ 'vendor-cli.yml' ,
738+ ) ;
739+ const transcript = [
740+ line ( {
741+ type : 'assistant' ,
742+ message : {
743+ content : [
744+ {
745+ type : 'tool_use' ,
746+ id : 'tool-1' ,
747+ name : 'Bash' ,
748+ input : { command : 'vendorcli ui screen --json' } ,
749+ } ,
750+ ] ,
751+ } ,
752+ } ) ,
753+ ] . join ( '\n' ) ;
754+
755+ const audit = analyzeClaudeJsonl ( transcript , { toolAnalysis : config . toolAnalysis } ) ;
756+
757+ expect (
758+ audit . trackedSequence . map ( ( call ) => ( {
759+ fullName : call . fullName ,
760+ shortName : call . shortName ,
761+ isUiAutomation : call . isUiAutomation ,
762+ line : call . line ,
763+ } ) ) ,
764+ ) . toEqual ( [
765+ {
766+ fullName : 'Bash' ,
767+ shortName : 'vendorcli.screen' ,
768+ isUiAutomation : false ,
769+ line : 1 ,
770+ } ,
771+ ] ) ;
684772 } ) ;
685773} ) ;
0 commit comments