getsentry
diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts
Lines changed: 36 additions & 0 deletions
diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts
Lines changed: 164 additions & 76 deletions
diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts
Lines changed: 18 additions & 0 deletions
diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts
Lines changed: 18 additions & 3 deletions
@@ -359,6 +359,42 @@ describe('Claude UI benchmark analysis', () => {
     ).toThrow('weather.yml.claude.activateSkill: requires skillDirs');
   });
 
+  it('rejects activateSkill that does not match skillDirs when loading config', () => {
+    expect(() =>
+      readConfig(
+        {
+          name: 'weather',
+          prompt: 'prompt.md',
+          claude: {
+            skillDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'],
+            activateSkill: 'other-skill',
+            isolatedWorkingDirectory: true,
+          },
+        },
+        'weather.yml',
+      ),
+    ).toThrow('weather.yml.claude.activateSkill: must match a basename from skillDirs');
+  });
+
+  it('rejects duplicate skillDir basenames when loading config', () => {
+    expect(() =>
+      readConfig(
+        {
+          name: 'weather',
+          prompt: 'prompt.md',
+          claude: {
+            skillDirs: [
+              'benchmarks/claude-ui/local/skills/vendor-cli',
+              'benchmarks/claude-ui/fixtures/skills/vendor-cli',
+            ],
+            isolatedWorkingDirectory: true,
+          },
+        },
+        'weather.yml',
+      ),
+    ).toThrow("weather.yml.claude.skillDirs: duplicate basename 'vendor-cli'");
+  });
+
   it('rejects invalid session defaults when loading config', () => {
     expect(() =>
       readConfig(
 
@@ -1,16 +1,10 @@
-import { spawn } from 'node:child_process';
-import { mkdtemp, readdir, readFile, rm, writeFile } from 'node:fs/promises';
-import { tmpdir } from 'node:os';
 import path from 'node:path';
-import { fileURLToPath } from 'node:url';
 import { buildClaudeArgs } from '../claude-invocation.ts';
 import { compareBenchmark } from '../compare.ts';
 import { readConfig } from '../config.ts';
 import { analyzeClaudeJsonl } from '../transcript.ts';
 import type { BenchmarkArtifacts, BenchmarkRunMetadata } from '../types.ts';
 
-const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..');
-
 function line(value: unknown): string {
   return JSON.stringify(value);
 }
@@ -42,28 +36,6 @@ function runMetadata(wallClockSeconds: number): BenchmarkRunMetadata {
   };
 }
 
-function runParserScript(args: string[]): Promise<{
-  exitCode: number | null;
-  stdout: string;
-  stderr: string;
-}> {
-  return new Promise((resolve, reject) => {
-    const child = spawn('python3', args, { stdio: ['ignore', 'pipe', 'pipe'] });
-    const stdout: Buffer[] = [];
-    const stderr: Buffer[] = [];
-    child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
-    child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
-    child.on('error', reject);
-    child.on('close', (exitCode) => {
-      resolve({
-        exitCode,
-        stdout: Buffer.concat(stdout).toString('utf8'),
-        stderr: Buffer.concat(stderr).toString('utf8'),
-      });
-    });
-  });
-}
-
 describe('Claude UI benchmark tool configuration', () => {
   it('loads Claude invocation and tool analysis from suite config', () => {
     const config = readConfig(
@@ -612,6 +584,120 @@ describe('Claude UI benchmark tool configuration', () => {
     });
   });
 
+  it('handles tracked tool results without content', () => {
+    const config = readConfig(
+      {
+        name: 'private CLI weather',
+        prompt: 'weather.md',
+        failurePatterns: ['WAIT_TIMEOUT'],
+        toolAnalysis: {
+          matchers: [
+            {
+              kind: 'bashCommand',
+              commandPrefix: 'privatecli',
+              shortName: 'privatecli.other',
+            },
+          ],
+        },
+      },
+      'private-cli.yml',
+    );
+    const transcript = [
+      line({
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'tool-1',
+              name: 'Bash',
+              input: { command: 'privatecli --version' },
+            },
+          ],
+        },
+      }),
+      line({
+        type: 'user',
+        message: {
+          content: [{ type: 'tool_result', tool_use_id: 'tool-1', is_error: true }],
+        },
+      }),
+    ].join('\n');
+
+    const audit = analyzeClaudeJsonl(transcript, {
+      toolAnalysis: config.toolAnalysis,
+      failurePatterns: config.failurePatterns,
+    });
+
+    expect(audit.failures).toEqual([
+      {
+        id: 'tool-1',
+        fullName: 'Bash',
+        shortName: 'privatecli.other',
+        line: 2,
+        message: '',
+      },
+    ]);
+    expect(audit.patternFailures).toEqual([]);
+  });
+
+  it('counts repeated matches in one Bash failure result once', () => {
+    const config = readConfig(
+      {
+        name: 'private CLI weather',
+        prompt: 'weather.md',
+        toolAnalysis: {
+          matchers: [
+            {
+              kind: 'bashCommand',
+              commandPrefix: 'privatecli',
+              shortName: 'privatecli.other',
+            },
+          ],
+        },
+      },
+      'private-cli.yml',
+    );
+    const transcript = [
+      line({
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'tool-1',
+              name: 'Bash',
+              input: { command: 'privatecli one && privatecli two' },
+            },
+          ],
+        },
+      }),
+      line({
+        type: 'user',
+        message: {
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'tool-1',
+              is_error: true,
+              content: 'Exit code 1',
+            },
+          ],
+        },
+      }),
+    ].join('\n');
+
+    const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis });
+    const result = compareBenchmark(config, audit, runMetadata(600));
+
+    expect(audit.trackedSequence.map((call) => call.shortName)).toEqual([
+      'privatecli.other',
+      'privatecli.other',
+    ]);
+    expect(audit.failures).toHaveLength(1);
+    expect(result.completion.issueCount).toBe(1);
+  });
+
   it('marks the benchmark incomplete when Claude exits non-zero', () => {
     const config = readConfig(
       {
@@ -633,53 +719,55 @@ describe('Claude UI benchmark tool configuration', () => {
     });
   });
 
-  it('lets the parser include configured non-MCP tool names', async () => {
-    const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-parser-'));
-    try {
-      const jsonlPath = path.join(dir, 'claude.jsonl');
-      const outputPath = path.join(dir, 'parsed');
-      await writeFile(
-        jsonlPath,
-        [
-          line({
-            type: 'assistant',
-            message: {
-              content: [
-                {
-                  type: 'tool_use',
-                  id: 'tool-1',
-                  name: 'Bash',
-                  input: { command: 'vendorcli ui screen --json' },
-                },
-              ],
-            },
-          }),
-          line({
-            type: 'user',
-            message: { content: [{ type: 'tool_result', tool_use_id: 'tool-1', content: 'ok' }] },
-          }),
-        ].join('\n'),
-        'utf8',
-      );
-
-      const result = await runParserScript([
-        path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'),
-        jsonlPath,
-        outputPath,
-        '--tool-prefix=mcp__xcodebuildmcp',
-        '--tool-name=Bash',
-      ]);
-
-      expect(result.exitCode).toBe(0);
-      expect(await readdir(outputPath)).toEqual([
-        '0001_tool_call_Bash.md',
-        '0002_tool_result_Bash.md',
-      ]);
-      expect(await readFile(path.join(outputPath, '0001_tool_call_Bash.md'), 'utf8')).toContain(
-        'vendorcli ui screen --json',
-      );
-    } finally {
-      await rm(dir, { recursive: true, force: true });
-    }
+  it('keeps configured non-MCP tool names in transcript analysis', () => {
+    const config = readConfig(
+      {
+        name: 'vendor CLI weather',
+        prompt: 'weather.md',
+        toolAnalysis: {
+          matchers: [
+            {
+              kind: 'bashCommand',
+              commandPrefix: 'vendorcli ui screen',
+              shortName: 'vendorcli.screen',
+            },
+          ],
+        },
+      },
+      'vendor-cli.yml',
+    );
+    const transcript = [
+      line({
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'tool-1',
+              name: 'Bash',
+              input: { command: 'vendorcli ui screen --json' },
+            },
+          ],
+        },
+      }),
+    ].join('\n');
+
+    const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis });
+
+    expect(
+      audit.trackedSequence.map((call) => ({
+        fullName: call.fullName,
+        shortName: call.shortName,
+        isUiAutomation: call.isUiAutomation,
+        line: call.line,
+      })),
+    ).toEqual([
+      {
+        fullName: 'Bash',
+        shortName: 'vendorcli.screen',
+        isUiAutomation: false,
+        line: 1,
+      },
+    ]);
   });
 });
@@ -1,4 +1,5 @@
 import { readFile } from 'node:fs/promises';
+import path from 'node:path';
 import { parse as parseYaml } from 'yaml';
 import * as z from 'zod';
 import { sessionDefaultsSchema } from '../../utils/session-defaults-schema.ts';
@@ -116,10 +117,27 @@ function readClaudeInvocationConfig(
     throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`);
   }
   const skillDirs = readOptionalStringArray(raw, 'skillDirs', source);
+  if (skillDirs !== undefined) {
+    const basenames = new Set<string>();
+    for (const skillDir of skillDirs) {
+      const basename = path.basename(skillDir);
+      if (basenames.has(basename)) {
+        throw new Error(`${source}.skillDirs: duplicate basename '${basename}'`);
+      }
+      basenames.add(basename);
+    }
+  }
   const activateSkill = readOptionalString(raw, 'activateSkill', source);
   if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) {
     throw new Error(`${source}.activateSkill: requires skillDirs`);
   }
+  if (
+    activateSkill !== undefined &&
+    skillDirs !== undefined &&
+    !skillDirs.some((skillDir) => path.basename(skillDir) === activateSkill)
+  ) {
+    throw new Error(`${source}.activateSkill: must match a basename from skillDirs`);
+  }
 
   return {
     useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source),
 
@@ -218,6 +218,7 @@ function runCommand(opts: {
     const started = process.hrtime.bigint();
     let stdoutBuffer = '';
     let terminalResultExitCode: number | undefined;
+    let terminalResultRequestedTermination = false;
     let terminalResultTimer: NodeJS.Timeout | undefined;
     let timeoutTimer: NodeJS.Timeout | undefined;
     let hardKillTimer: NodeJS.Timeout | undefined;
@@ -272,13 +273,20 @@ function runCommand(opts: {
       if (terminalResultExitCode !== undefined || opts.terminalJsonResultGraceMs === undefined)
         return;
       terminalResultExitCode = result.is_error === true ? 1 : 0;
-      terminalResultTimer = setTimeout(terminateChild, opts.terminalJsonResultGraceMs);
+      terminalResultTimer = setTimeout(() => {
+        terminalResultRequestedTermination = true;
+        terminateChild();
+      }, opts.terminalJsonResultGraceMs);
       terminalResultTimer.unref();
     };
 
     if (opts.timeoutMs !== undefined) {
       timeoutTimer = setTimeout(() => {
-        timedOut = true;
+        if (terminalResultExitCode === undefined) {
+          timedOut = true;
+        } else {
+          terminalResultRequestedTermination = true;
+        }
         terminateChild();
       }, opts.timeoutMs);
       timeoutTimer.unref();
@@ -325,12 +333,19 @@ function runCommand(opts: {
       clearTimeoutTimer();
       clearHardKillTimer();
       const durationSeconds = Number(process.hrtime.bigint() - started) / 1_000_000_000;
+      const resolvedExitCode =
+        terminalResultExitCode !== undefined &&
+        (terminalResultRequestedTermination || exitCode === 0 || exitCode === null)
+          ? terminalResultExitCode
+          : timedOut
+            ? 143
+            : (exitCode ?? null);
       stdout.end();
       stderr.end();
       Promise.all([finished(stdout), finished(stderr)])
         .then(() =>
           resolve({
-            exitCode: timedOut ? 143 : (exitCode ?? terminalResultExitCode ?? null),
+            exitCode: resolvedExitCode,
             durationSeconds,
           }),
         )