Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 64 additions & 12 deletions packages/evals/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
* Evals CLI entry point.
*
* Modes:
* - `evals` (no args) → interactive REPL
* - `evals run <target> …` → single-shot run with rich progress
* - `evals list [tier]` → list discovered tasks
* - `evals config [sub]` → print / get / set defaults
* - `evals experiments [sub]` → inspect / compare Braintrust runs
* - `evals new <tier> <cat> <name>` → scaffold a task file
* - `evals help` / `-h` → help
* - `evals` (no args) → interactive REPL
* - `evals --quiet` / `evals -q` → REPL with no banner / welcome / inline warnings
* - `evals run <target> …` → single-shot run with rich progress
* - `evals list [tier]` → list discovered tasks
* - `evals config [sub]` → print / get / set defaults
* - `evals experiments [sub]` → inspect / compare Braintrust runs
* - `evals doctor` / `health` → env-key + config + discovery health report
* - `evals new <tier> <cat> <name>`→ scaffold a task file
* - `evals help` / `-h` → help
*
* Env vars:
* - EVALS_NO_WELCOME=1 → suppress first-run welcome panel (REPL only)
*
* No child processes. All runs flow through framework/runEvals in-process.
*
Expand Down Expand Up @@ -95,14 +100,19 @@ const args = process.argv.slice(2);
process.on("SIGINT", () => void handleSignal("SIGINT"));
process.on("SIGTERM", () => void handleSignal("SIGTERM"));

// REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
// REPL-only (they suppress chrome); other args route to the argv switch.
/** Reports whether `a` is exactly one of the quiet flags: `--quiet` or `-q`. */
const isQuietFlag = (a: string): boolean => {
  if (a === "-q") return true;
  return a === "--quiet";
};
const replLaunch = args.length === 0 || args.every(isQuietFlag);

// Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
// handler that does cooperative-then-aggressive abort instead, so this
// path is only active when the REPL is not being launched.
//
// Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
// so we forward it ourselves.
let cleanupArgvInput = (): void => {};
if (args.length > 0 && process.stdin.isTTY) {
if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
const readline = await import("node:readline");
const wasRaw = process.stdin.isRaw;
readline.emitKeypressEvents(process.stdin);
Expand Down Expand Up @@ -149,10 +159,30 @@ const args = process.argv.slice(2);
await runCommand(resolved);
}

/**
 * Reports whether a single argv token asks for help.
 *
 * @param token - The token to test; `undefined` (e.g. a missing position) is
 *   accepted and never matches.
 * @returns `true` only for the exact strings `--help`, `-h`, or `help`.
 */
const isHelpToken = (token: string | undefined): boolean => {
  switch (token) {
    case "--help":
    case "-h":
    case "help":
      return true;
    default:
      return false;
  }
};

/**
 * Decides whether the arguments following `config` amount to a help request.
 *
 * A help token in the first position always counts. Otherwise the second
 * position is checked, and when the first token is `core` the third position
 * is checked as well. Later positions are never inspected.
 *
 * @param tokens - The arguments that follow the `config` command.
 * @returns `true` when one of the inspected positions holds a help token.
 */
const isConfigHelpInvocation = (tokens: string[]): boolean => {
  const [first, second, third] = tokens;
  if (isHelpToken(first)) return true;
  // `core` shifts the inspected window one position deeper
  // (presumably a nested sub-namespace — confirm against handleConfig).
  return first === "core"
    ? isHelpToken(second) || isHelpToken(third)
    : isHelpToken(second);
};

/**
 * Decides whether the arguments following `experiments` amount to a help
 * request: a help token in either of the first two positions counts, and
 * later positions are ignored.
 *
 * @param tokens - The arguments that follow the `experiments` command.
 */
const isExperimentsHelpInvocation = (tokens: string[]): boolean => {
  const leading = tokens.slice(0, 2);
  return leading.some((candidate) => isHelpToken(candidate));
};

// Whether to write the first-run marker in `finally`. Help-only paths and
// the doctor command don't count as "first uses" — they're discovery
// actions. The REPL marks itself.
let shouldMarkFirstRun = false;

try {
if (args.length === 0) {
if (replLaunch) {
const { startRepl } = await import("./tui/repl.js");
await startRepl(ENTRY_DIR);
const quiet = args.some(isQuietFlag);
await startRepl(ENTRY_DIR, { quiet });
return;
}

Expand All @@ -162,15 +192,15 @@ const args = process.argv.slice(2);
// after the command. Later positions are arguments or flag values and
// must not be swallowed (e.g. `evals run act --help` would otherwise
// print run help instead of erroring on the unknown `--help` flag).
const wantsHelp =
subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help";
const wantsHelp = isHelpToken(subArgs[0]);

switch (command) {
case "run": {
if (wantsHelp) {
printRunHelp();
return;
}
shouldMarkFirstRun = true;
await executeRun(subArgs);
return;
}
Expand All @@ -180,6 +210,7 @@ const args = process.argv.slice(2);
printListHelp();
return;
}
shouldMarkFirstRun = true;
const detailed =
subArgs.includes("--detailed") || subArgs.includes("-d");
const tierFilter = subArgs.find((a) => !a.startsWith("-"));
Expand All @@ -192,24 +223,36 @@ const args = process.argv.slice(2);
}

case "config": {
shouldMarkFirstRun = !isConfigHelpInvocation(subArgs);
const { handleConfig } = await import("./tui/commands/config.js");
await handleConfig(subArgs, ENTRY_DIR);
return;
}

case "experiments": {
shouldMarkFirstRun = !isExperimentsHelpInvocation(subArgs);
const { handleExperiments } = await import(
"./tui/commands/experiments.js"
);
await handleExperiments(subArgs);
return;
}

case "doctor":
case "health": {
// Doctor is a diagnostic, not a "first use" — don't mark the marker.
const { handleDoctor } = await import("./tui/commands/doctor.js");
const exitCode = await handleDoctor(subArgs, ENTRY_DIR);
if (exitCode !== 0) process.exitCode = exitCode;
return;
}

case "new": {
if (wantsHelp) {
printNewHelp();
return;
}
shouldMarkFirstRun = true;
const { scaffoldTask } = await import("./tui/commands/new.js");
scaffoldTask(subArgs);
return;
Expand All @@ -223,6 +266,7 @@ const args = process.argv.slice(2);

default: {
// Unknown first arg → treat as run target: `evals act` == `evals run act`
shouldMarkFirstRun = true;
await executeRun(args);
return;
}
Expand All @@ -231,6 +275,14 @@ const args = process.argv.slice(2);
console.error(red(`Error: ${(err as Error).message}`));
process.exitCode = 1;
} finally {
if (shouldMarkFirstRun) {
try {
const { markFirstRunComplete } = await import("./tui/welcomeState.js");
markFirstRunComplete(ENTRY_DIR);
} catch {
// best-effort
}
}
cleanupArgvInput();
}
})();
8 changes: 8 additions & 0 deletions packages/evals/scripts/build-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ if (fs.existsSync(distConfigPath)) {
...existing.defaults,
};
}
// Preserve the first-run welcome marker across rebuilds so a contributor
// who's already seen the welcome on the dist path doesn't see it again
// after every `pnpm run build:cli`. If the source has _meta and dist
// doesn't (fresh dist install), the source value is inherited via the
// sourceConfig literal — already correct.
if (existing._meta) {
sourceConfig._meta = { ...sourceConfig._meta, ...existing._meta };
}
} catch {
// invalid existing config – overwrite entirely
}
Expand Down
70 changes: 70 additions & 0 deletions packages/evals/tests/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ const SOURCE_CONFIG = path.join(
"evals.config.json",
);

// File-level snapshot/restore: any `evals run …` invocation through the
// real CLI writes `_meta.firstRunCompletedAt` into the source config
// (because the test runs in source mode). Restore at the end so the
// repo file stays pristine.
//
// The snapshot starts out `undefined` so the restore hook can tell whether
// the initial read actually succeeded.
let __fileLevelConfigSnapshot: string | undefined;
beforeAll(() => {
  __fileLevelConfigSnapshot = fs.readFileSync(SOURCE_CONFIG, "utf-8");
});
afterAll(() => {
  // Nothing to restore if the snapshot was never captured. Passing
  // `undefined` to writeFileSync would throw a TypeError and obscure
  // whatever went wrong during setup.
  if (__fileLevelConfigSnapshot === undefined) return;
  fs.writeFileSync(SOURCE_CONFIG, __fileLevelConfigSnapshot);
});

async function runCli(
args: string[],
): Promise<{ stdout: string; stderr: string; code: number }> {
Expand All @@ -38,6 +50,17 @@ async function runCli(
}
}

/**
 * Removes the top-level `_meta` entry from the source config file and writes
 * the result back as 2-space-indented JSON with a trailing newline.
 */
function resetSourceWelcomeMeta(): void {
  const raw = fs.readFileSync(SOURCE_CONFIG, "utf-8");
  const parsed = JSON.parse(raw);
  delete parsed._meta;
  const serialized = JSON.stringify(parsed, null, 2) + "\n";
  fs.writeFileSync(SOURCE_CONFIG, serialized);
}

/**
 * Reads the source config file and returns its `_meta.firstRunCompletedAt`
 * value, or `undefined` when `_meta` is absent.
 */
function readSourceWelcomeCompletedAt(): string | undefined {
  const raw = fs.readFileSync(SOURCE_CONFIG, "utf-8");
  const { _meta } = JSON.parse(raw);
  return _meta?.firstRunCompletedAt;
}

describe("CLI entrypoint", () => {
it("shows help", async () => {
const { stdout, code } = await runCli(["-h"]);
Expand All @@ -59,6 +82,39 @@ describe("CLI entrypoint", () => {
expect(stdout).toContain("compare");
});

it("includes doctor in top-level help", async () => {
  // The top-level help output must mention the doctor command.
  const result = await runCli(["-h"]);
  expect(result.code).toBe(0);
  expect(result.stdout).toContain("doctor");
});

it("shows doctor help via --help", async () => {
  const result = await runCli(["doctor", "--help"]);
  expect(result.code).toBe(0);
  // Both the command name and the documented --json flag must be shown.
  for (const expected of ["evals doctor", "--json"]) {
    expect(result.stdout).toContain(expected);
  }
  // Hidden --probe flag must not appear
  expect(result.stdout).not.toContain("--probe");
});

it("doctor --json emits a parseable report", async () => {
  const result = await runCli(["doctor", "--json"]);
  // --json always exits 0 regardless of verdict
  expect(result.code).toBe(0);
  // stdout must be valid JSON exposing the expected property paths.
  const report = JSON.parse(result.stdout);
  for (const propertyPath of ["verdict", "runtime.node", "keys.openai"]) {
    expect(report).toHaveProperty(propertyPath);
  }
  expect(Array.isArray(report.reasons)).toBe(true);
});

it("health is an alias for doctor", async () => {
  // `health --json` must succeed and produce a JSON report with a verdict.
  const result = await runCli(["health", "--json"]);
  expect(result.code).toBe(0);
  const report = JSON.parse(result.stdout);
  expect(report).toHaveProperty("verdict");
});

it("shows experiments compare help", async () => {
const { stdout, code } = await runCli(["experiments", "compare", "-h"]);
expect(code).toBe(0);
Expand Down Expand Up @@ -141,6 +197,20 @@ describe("CLI entrypoint", () => {
},
);

it("does not mark first-run complete for nested help invocations", async () => {
  // Start from a config with no `_meta` so any marker write is detectable.
  resetSourceWelcomeMeta();

  const nestedHelpInvocations = [
    ["config", "set", "--help"],
    ["experiments", "compare", "--help"],
  ];

  for (const args of nestedHelpInvocations) {
    const result = await runCli(args);
    expect(result.code).toBe(0);
    expect(result.stdout).toContain("evals");
    // Checked after every invocation, not just at the end of the loop.
    expect(readSourceWelcomeCompletedAt()).toBeUndefined();
  }
});

// Regression: help interception must not reach into value positions.
// `config set <key> <value>` must surface a parse/value error, not silently
// print help — otherwise `--help` would be a magical sentinel anywhere.
Expand Down
Loading
Loading