diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts index 27966b491..b9c59b64b 100644 --- a/packages/evals/cli.ts +++ b/packages/evals/cli.ts @@ -2,13 +2,18 @@ * Evals CLI entry point. * * Modes: - * - `evals` (no args) → interactive REPL - * - `evals run …` → single-shot run with rich progress - * - `evals list [tier]` → list discovered tasks - * - `evals config [sub]` → print / get / set defaults - * - `evals experiments [sub]` → inspect / compare Braintrust runs - * - `evals new ` → scaffold a task file - * - `evals help` / `-h` → help + * - `evals` (no args) → interactive REPL + * - `evals --quiet` / `evals -q` → REPL with no banner / welcome / inline warnings + * - `evals run …` → single-shot run with rich progress + * - `evals list [tier]` → list discovered tasks + * - `evals config [sub]` → print / get / set defaults + * - `evals experiments [sub]` → inspect / compare Braintrust runs + * - `evals doctor` / `health` → env-key + config + discovery health report + * - `evals new `→ scaffold a task file + * - `evals help` / `-h` → help + * + * Env vars: + * - EVALS_NO_WELCOME=1 → suppress first-run welcome panel (REPL only) * * No child processes. All runs flow through framework/runEvals in-process. * @@ -95,6 +100,11 @@ const args = process.argv.slice(2); process.on("SIGINT", () => void handleSignal("SIGINT")); process.on("SIGTERM", () => void handleSignal("SIGTERM")); + // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are + // REPL-only (they suppress chrome); other args route to the argv switch. + const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q"; + const replLaunch = args.length === 0 || args.every(isQuietFlag); + // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress // handler that does cooperative-then-aggressive abort instead — this // path is only active when no arg-less REPL is running. 
@@ -102,7 +112,7 @@ const args = process.argv.slice(2); // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation, // so we forward it ourselves. let cleanupArgvInput = (): void => {}; - if (args.length > 0 && process.stdin.isTTY) { + if (!replLaunch && args.length > 0 && process.stdin.isTTY) { const readline = await import("node:readline"); const wasRaw = process.stdin.isRaw; readline.emitKeypressEvents(process.stdin); @@ -149,10 +159,30 @@ const args = process.argv.slice(2); await runCommand(resolved); } + const isHelpToken = (token: string | undefined): boolean => + token === "--help" || token === "-h" || token === "help"; + + const isConfigHelpInvocation = (tokens: string[]): boolean => { + if (isHelpToken(tokens[0])) return true; + if (tokens[0] === "core") { + return isHelpToken(tokens[1]) || isHelpToken(tokens[2]); + } + return isHelpToken(tokens[1]); + }; + + const isExperimentsHelpInvocation = (tokens: string[]): boolean => + isHelpToken(tokens[0]) || isHelpToken(tokens[1]); + + // Whether to write the first-run marker in `finally`. Help-only paths and + // the doctor command don't count as "first uses" — they're discovery + // actions. The REPL marks itself. + let shouldMarkFirstRun = false; + try { - if (args.length === 0) { + if (replLaunch) { const { startRepl } = await import("./tui/repl.js"); - await startRepl(ENTRY_DIR); + const quiet = args.some(isQuietFlag); + await startRepl(ENTRY_DIR, { quiet }); return; } @@ -162,8 +192,7 @@ const args = process.argv.slice(2); // after the command. Later positions are arguments or flag values and // must not be swallowed (e.g. `evals run act --help` would otherwise // print run help instead of erroring on the unknown `--help` flag). 
- const wantsHelp = - subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help"; + const wantsHelp = isHelpToken(subArgs[0]); switch (command) { case "run": { @@ -171,6 +200,7 @@ const args = process.argv.slice(2); printRunHelp(); return; } + shouldMarkFirstRun = true; await executeRun(subArgs); return; } @@ -180,6 +210,7 @@ const args = process.argv.slice(2); printListHelp(); return; } + shouldMarkFirstRun = true; const detailed = subArgs.includes("--detailed") || subArgs.includes("-d"); const tierFilter = subArgs.find((a) => !a.startsWith("-")); @@ -192,12 +223,14 @@ const args = process.argv.slice(2); } case "config": { + shouldMarkFirstRun = !isConfigHelpInvocation(subArgs); const { handleConfig } = await import("./tui/commands/config.js"); await handleConfig(subArgs, ENTRY_DIR); return; } case "experiments": { + shouldMarkFirstRun = !isExperimentsHelpInvocation(subArgs); const { handleExperiments } = await import( "./tui/commands/experiments.js" ); @@ -205,11 +238,21 @@ const args = process.argv.slice(2); return; } + case "doctor": + case "health": { + // Doctor is a diagnostic, not a "first use" — don't mark the marker. 
+ const { handleDoctor } = await import("./tui/commands/doctor.js"); + const exitCode = await handleDoctor(subArgs, ENTRY_DIR); + if (exitCode !== 0) process.exitCode = exitCode; + return; + } + case "new": { if (wantsHelp) { printNewHelp(); return; } + shouldMarkFirstRun = true; const { scaffoldTask } = await import("./tui/commands/new.js"); scaffoldTask(subArgs); return; @@ -223,6 +266,7 @@ const args = process.argv.slice(2); default: { // Unknown first arg → treat as run target: `evals act` == `evals run act` + shouldMarkFirstRun = true; await executeRun(args); return; } @@ -231,6 +275,14 @@ const args = process.argv.slice(2); console.error(red(`Error: ${(err as Error).message}`)); process.exitCode = 1; } finally { + if (shouldMarkFirstRun) { + try { + const { markFirstRunComplete } = await import("./tui/welcomeState.js"); + markFirstRunComplete(ENTRY_DIR); + } catch { + // best-effort + } + } cleanupArgvInput(); } })(); diff --git a/packages/evals/scripts/build-cli.ts b/packages/evals/scripts/build-cli.ts index a45cbeed0..35c87c77a 100644 --- a/packages/evals/scripts/build-cli.ts +++ b/packages/evals/scripts/build-cli.ts @@ -50,6 +50,14 @@ if (fs.existsSync(distConfigPath)) { ...existing.defaults, }; } + // Preserve the first-run welcome marker across rebuilds so a contributor + // who's already seen the welcome on the dist path doesn't see it again + // after every `pnpm run build:cli`. If the source has _meta and dist + // doesn't (fresh dist install), the source value is inherited via the + // sourceConfig literal — already correct. 
+ if (existing._meta) { + sourceConfig._meta = { ...sourceConfig._meta, ...existing._meta }; + } } catch { // invalid existing config – overwrite entirely } diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts index 596305a42..e285f3e01 100644 --- a/packages/evals/tests/cli.test.ts +++ b/packages/evals/tests/cli.test.ts @@ -15,6 +15,18 @@ const SOURCE_CONFIG = path.join( "evals.config.json", ); +// File-level snapshot/restore: any `evals run …` invocation through the +// real CLI writes `_meta.firstRunCompletedAt` into the source config +// (because the test runs in source mode). Restore at the end so the +// repo file stays pristine. +let __fileLevelConfigSnapshot: string; +beforeAll(() => { + __fileLevelConfigSnapshot = fs.readFileSync(SOURCE_CONFIG, "utf-8"); +}); +afterAll(() => { + fs.writeFileSync(SOURCE_CONFIG, __fileLevelConfigSnapshot); +}); + async function runCli( args: string[], ): Promise<{ stdout: string; stderr: string; code: number }> { @@ -38,6 +50,17 @@ async function runCli( } } +function resetSourceWelcomeMeta(): void { + const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8")); + delete config._meta; + fs.writeFileSync(SOURCE_CONFIG, JSON.stringify(config, null, 2) + "\n"); +} + +function readSourceWelcomeCompletedAt(): string | undefined { + const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8")); + return config._meta?.firstRunCompletedAt; +} + describe("CLI entrypoint", () => { it("shows help", async () => { const { stdout, code } = await runCli(["-h"]); @@ -59,6 +82,39 @@ describe("CLI entrypoint", () => { expect(stdout).toContain("compare"); }); + it("includes doctor in top-level help", async () => { + const { stdout, code } = await runCli(["-h"]); + expect(code).toBe(0); + expect(stdout).toContain("doctor"); + }); + + it("shows doctor help via --help", async () => { + const { stdout, code } = await runCli(["doctor", "--help"]); + expect(code).toBe(0); + expect(stdout).toContain("evals 
doctor"); + expect(stdout).toContain("--json"); + // Hidden --probe flag must not appear + expect(stdout).not.toContain("--probe"); + }); + + it("doctor --json emits a parseable report", async () => { + const { stdout, code } = await runCli(["doctor", "--json"]); + // --json always exits 0 regardless of verdict + expect(code).toBe(0); + const payload = JSON.parse(stdout); + expect(payload).toHaveProperty("verdict"); + expect(payload).toHaveProperty("runtime.node"); + expect(payload).toHaveProperty("keys.openai"); + expect(Array.isArray(payload.reasons)).toBe(true); + }); + + it("health is an alias for doctor", async () => { + const { stdout, code } = await runCli(["health", "--json"]); + expect(code).toBe(0); + const payload = JSON.parse(stdout); + expect(payload).toHaveProperty("verdict"); + }); + it("shows experiments compare help", async () => { const { stdout, code } = await runCli(["experiments", "compare", "-h"]); expect(code).toBe(0); @@ -141,6 +197,20 @@ describe("CLI entrypoint", () => { }, ); + it("does not mark first-run complete for nested help invocations", async () => { + resetSourceWelcomeMeta(); + + for (const args of [ + ["config", "set", "--help"], + ["experiments", "compare", "--help"], + ]) { + const { stdout, code } = await runCli(args); + expect(code).toBe(0); + expect(stdout).toContain("evals"); + expect(readSourceWelcomeCompletedAt()).toBeUndefined(); + } + }); + // Regression: help interception must not reach into value positions. // `config set ` must surface a parse/value error, not silently // print help — otherwise `--help` would be a magical sentinel anywhere. 
diff --git a/packages/evals/tests/tui/doctor.test.ts b/packages/evals/tests/tui/doctor.test.ts new file mode 100644 index 000000000..7b44bdf3d --- /dev/null +++ b/packages/evals/tests/tui/doctor.test.ts @@ -0,0 +1,213 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { handleDoctor } from "../../tui/commands/doctor.js"; +import { __resetPackageEnvCacheForTests } from "../../tui/welcomeStatus.js"; + +const PROVIDER_KEYS = [ + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", + "GEMINI_API_KEY", + "BROWSERBASE_API_KEY", + "BROWSERBASE_PROJECT_ID", + "BB_API_KEY", + "BB_PROJECT_ID", + "BRAINTRUST_API_KEY", +]; + +const savedEnv: Record = {}; +const tempDirs: string[] = []; +let savedDisablePkgEnv: string | undefined; + +type DoctorJsonReport = { + verdict: string; + reasons: string[]; + [key: string]: unknown; +}; + +function clearProviderKeys(): void { + for (const key of PROVIDER_KEYS) { + savedEnv[key] = process.env[key]; + delete process.env[key]; + } + // Neutralize the package-local .env loader so tests don't depend on + // whatever real keys the developer happens to have at packages/evals/.env. 
+ savedDisablePkgEnv = process.env.EVALS_DISABLE_PACKAGE_ENV; + process.env.EVALS_DISABLE_PACKAGE_ENV = "1"; + __resetPackageEnvCacheForTests(); +} + +function restoreProviderKeys(): void { + for (const key of PROVIDER_KEYS) { + if (savedEnv[key] === undefined) delete process.env[key]; + else process.env[key] = savedEnv[key]; + } + if (savedDisablePkgEnv === undefined) { + delete process.env.EVALS_DISABLE_PACKAGE_ENV; + } else { + process.env.EVALS_DISABLE_PACKAGE_ENV = savedDisablePkgEnv; + } + __resetPackageEnvCacheForTests(); +} + +function makeTempEntryDir(defaults?: Record): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "evals-doctor-")); + tempDirs.push(dir); + fs.writeFileSync( + path.join(dir, "evals.config.json"), + JSON.stringify( + { defaults: defaults ?? { env: "local", trials: 3 }, benchmarks: {} }, + null, + 2, + ), + ); + return dir; +} + +async function runDoctorJson( + entryDir: string, +): Promise<{ exit: number; report: DoctorJsonReport }> { + const chunks: string[] = []; + const spy = vi + .spyOn(console, "log") + .mockImplementation((...args: unknown[]) => { + chunks.push( + args.map((a) => (typeof a === "string" ? 
a : String(a))).join(" "), + ); + }); + let exit: number; + try { + exit = await handleDoctor(["--json"], entryDir); + } finally { + spy.mockRestore(); + } + const out = chunks.join("\n"); + return { exit, report: JSON.parse(out) as DoctorJsonReport }; +} + +beforeEach(() => clearProviderKeys()); +afterEach(() => { + restoreProviderKeys(); + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) fs.rmSync(dir, { recursive: true, force: true }); + } + vi.restoreAllMocks(); +}); + +describe("handleDoctor --json", () => { + it("always exits 0 even on fail verdict", async () => { + const entryDir = makeTempEntryDir(); + const { exit, report } = await runDoctorJson(entryDir); + expect(exit).toBe(0); + expect(report.verdict).toBe("fail"); // zero provider keys + }); + + it("emits the full schema (verdict, runtime, config, discovery, keys, reasons)", async () => { + const entryDir = makeTempEntryDir(); + const { report } = await runDoctorJson(entryDir); + expect(report).toHaveProperty("verdict"); + expect(report).toHaveProperty("runtime.node"); + expect(report).toHaveProperty("runtime.mode"); + expect(report).toHaveProperty("config.path"); + expect(report).toHaveProperty("config.env"); + expect(report).toHaveProperty("discovery.ok"); + expect(report).toHaveProperty("keys.openai.state"); + expect(report).toHaveProperty("keys.browserbase.apiKey"); + expect(Array.isArray(report.reasons)).toBe(true); + }); +}); + +describe("handleDoctor verdicts", () => { + it("fail — zero provider keys", async () => { + const entryDir = makeTempEntryDir(); + const { report } = await runDoctorJson(entryDir); + expect(report.verdict).toBe("fail"); + expect(report.reasons.join(" ")).toMatch(/No provider API key/); + }); + + it("fail — env=browserbase with both BB vars missing", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + __resetPackageEnvCacheForTests(); + const entryDir = makeTempEntryDir({ env: "browserbase", trials: 3 }); + const { report } = await 
runDoctorJson(entryDir); + expect(report.verdict).toBe("fail"); + expect(report.reasons.join(" ")).toMatch(/env=browserbase/); + }); + + it("warn — provider key present but Braintrust missing", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + __resetPackageEnvCacheForTests(); + const entryDir = makeTempEntryDir({ env: "local", trials: 3 }); + const { report } = await runDoctorJson(entryDir); + expect(report.verdict).toBe("warn"); + expect(report.reasons.join(" ")).toMatch(/BRAINTRUST_API_KEY missing/); + }); + + it("warn — Browserbase partially configured", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + process.env.BRAINTRUST_API_KEY = "bt-test"; + process.env.BROWSERBASE_API_KEY = "bb-test"; + __resetPackageEnvCacheForTests(); + const entryDir = makeTempEntryDir({ env: "local", trials: 3 }); + const { report } = await runDoctorJson(entryDir); + expect(report.verdict).toBe("warn"); + expect(report.reasons.join(" ")).toMatch(/Browserbase is partially/); + }); + + it("ok — provider + Braintrust set, no BB needed (env=local)", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + process.env.BRAINTRUST_API_KEY = "bt-test"; + __resetPackageEnvCacheForTests(); + const entryDir = makeTempEntryDir({ env: "local", trials: 3 }); + const { report } = await runDoctorJson(entryDir); + expect(report.verdict).toBe("ok"); + expect(report.reasons).toEqual([]); + }); +}); + +describe("handleDoctor exit code", () => { + it("returns 1 on fail (human output)", async () => { + const entryDir = makeTempEntryDir(); + const spy = vi.spyOn(console, "log").mockImplementation(() => {}); + const exit = await handleDoctor([], entryDir); + spy.mockRestore(); + expect(exit).toBe(1); + }); + + it("returns 0 on ok (human output)", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + process.env.BRAINTRUST_API_KEY = "bt-test"; + __resetPackageEnvCacheForTests(); + const entryDir = makeTempEntryDir({ env: "local", trials: 3 }); + const spy = vi.spyOn(console, 
"log").mockImplementation(() => {}); + const exit = await handleDoctor([], entryDir); + spy.mockRestore(); + expect(exit).toBe(0); + }); +}); + +describe("handleDoctor --help", () => { + it("prints help and exits 0 on --help", async () => { + const entryDir = makeTempEntryDir(); + const spy = vi.spyOn(console, "log").mockImplementation(() => {}); + const exit = await handleDoctor(["--help"], entryDir); + // Capture before restore — vitest's mockRestore wipes mock.calls + const text = spy.mock.calls.flat().join("\n"); + spy.mockRestore(); + expect(exit).toBe(0); + expect(text).toContain("evals doctor"); + // Hidden flag should NOT appear in help + expect(text).not.toContain("--probe"); + }); + + it("prints help and exits 0 on -h", async () => { + const entryDir = makeTempEntryDir(); + const spy = vi.spyOn(console, "log").mockImplementation(() => {}); + const exit = await handleDoctor(["-h"], entryDir); + spy.mockRestore(); + expect(exit).toBe(0); + }); +}); diff --git a/packages/evals/tests/tui/welcomeState.test.ts b/packages/evals/tests/tui/welcomeState.test.ts new file mode 100644 index 000000000..bf7ee118d --- /dev/null +++ b/packages/evals/tests/tui/welcomeState.test.ts @@ -0,0 +1,98 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { + isFirstRun, + markFirstRunComplete, + readWelcomeMeta, +} from "../../tui/welcomeState.js"; +import { readConfig, writeConfig } from "../../tui/commands/config.js"; + +const tempDirs: string[] = []; + +function makeTempEntryDir(initial?: Record): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "evals-welcome-state-")); + tempDirs.push(dir); + fs.writeFileSync( + path.join(dir, "evals.config.json"), + JSON.stringify( + { defaults: {}, benchmarks: {}, ...(initial ?? 
{}) }, + null, + 2, + ), + ); + return dir; +} + +afterEach(() => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) fs.rmSync(dir, { recursive: true, force: true }); + } +}); + +describe("welcomeState", () => { + it("isFirstRun is true on a fresh config", () => { + const dir = makeTempEntryDir(); + expect(isFirstRun(dir)).toBe(true); + expect(readWelcomeMeta(dir)).toEqual({}); + }); + + it("markFirstRunComplete writes _meta with ISO timestamp + version", () => { + const dir = makeTempEntryDir(); + markFirstRunComplete(dir); + const meta = readWelcomeMeta(dir); + expect(meta.firstRunCompletedAt).toMatch(/^\d{4}-\d{2}-\d{2}T/); + expect(meta.version).toBe(1); + expect(isFirstRun(dir)).toBe(false); + }); + + it("markFirstRunComplete is idempotent — second call doesn't overwrite timestamp", () => { + const dir = makeTempEntryDir(); + markFirstRunComplete(dir); + const first = readWelcomeMeta(dir).firstRunCompletedAt; + // small delay to ensure timestamps would differ if rewritten + const now = Date.now(); + while (Date.now() - now < 5) { + /* spin */ + } + markFirstRunComplete(dir); + expect(readWelcomeMeta(dir).firstRunCompletedAt).toBe(first); + }); + + it("marker round-trips without clobbering defaults / core / benchmarks", () => { + const dir = makeTempEntryDir({ + defaults: { trials: 7, env: "browserbase" }, + core: { tool: "understudy_code" }, + benchmarks: { webvoyager: { limit: 12 } }, + }); + markFirstRunComplete(dir); + const config = readConfig(dir); + expect(config.defaults.trials).toBe(7); + expect(config.defaults.env).toBe("browserbase"); + expect(config.core?.tool).toBe("understudy_code"); + expect(config.benchmarks).toMatchObject({ webvoyager: { limit: 12 } }); + expect(config._meta?.firstRunCompletedAt).toBeDefined(); + }); + + it("writeConfig prune does not drop _meta when present", () => { + const dir = makeTempEntryDir(); + const config = readConfig(dir); + config._meta = { firstRunCompletedAt: "2026-05-10T00:00:00Z", 
version: 1 }; + writeConfig(dir, config); + const reread = readConfig(dir); + expect(reread._meta).toEqual({ + firstRunCompletedAt: "2026-05-10T00:00:00Z", + version: 1, + }); + }); + + it("missing config — readWelcomeMeta returns {} instead of throwing", () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "evals-welcome-state-")); + tempDirs.push(dir); + // No evals.config.json written + expect(readWelcomeMeta(dir)).toEqual({}); + expect(isFirstRun(dir)).toBe(true); + }); +}); diff --git a/packages/evals/tests/tui/welcomeStatus.test.ts b/packages/evals/tests/tui/welcomeStatus.test.ts new file mode 100644 index 000000000..b94a58b32 --- /dev/null +++ b/packages/evals/tests/tui/welcomeStatus.test.ts @@ -0,0 +1,190 @@ +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { + snapshotEnv, + renderInlineWarning, + hasZeroProviderKeys, + __resetPackageEnvCacheForTests, +} from "../../tui/welcomeStatus.js"; + +const PROVIDER_KEYS = [ + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", + "GEMINI_API_KEY", + "BROWSERBASE_API_KEY", + "BROWSERBASE_PROJECT_ID", + "BB_API_KEY", + "BB_PROJECT_ID", + "BRAINTRUST_API_KEY", +]; + +const savedEnv: Record = {}; +let savedDisablePkgEnv: string | undefined; + +function clearProviderKeys(): void { + for (const key of PROVIDER_KEYS) { + savedEnv[key] = process.env[key]; + delete process.env[key]; + } + // Neutralize the package-local .env loader so tests don't depend on + // whatever real keys the developer happens to have at packages/evals/.env. 
+ savedDisablePkgEnv = process.env.EVALS_DISABLE_PACKAGE_ENV; + process.env.EVALS_DISABLE_PACKAGE_ENV = "1"; + __resetPackageEnvCacheForTests(); +} + +function restoreProviderKeys(): void { + for (const key of PROVIDER_KEYS) { + if (savedEnv[key] === undefined) delete process.env[key]; + else process.env[key] = savedEnv[key]; + } + if (savedDisablePkgEnv === undefined) { + delete process.env.EVALS_DISABLE_PACKAGE_ENV; + } else { + process.env.EVALS_DISABLE_PACKAGE_ENV = savedDisablePkgEnv; + } + __resetPackageEnvCacheForTests(); +} + +describe("snapshotEnv", () => { + beforeEach(() => clearProviderKeys()); + afterEach(() => restoreProviderKeys()); + + it("reports all missing when no provider keys are set", () => { + const s = snapshotEnv(); + expect(s.openai.state).toBe("missing"); + expect(s.anthropic.state).toBe("missing"); + expect(s.google.state).toBe("missing"); + expect(s.browserbase.apiKey).toBe("missing"); + expect(s.browserbase.projectId).toBe("missing"); + expect(s.braintrust.state).toBe("missing"); + }); + + it("detects OpenAI from process.env with the right source", () => { + process.env.OPENAI_API_KEY = "sk-test"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.openai.state).toBe("set"); + expect(s.openai.source).toBe("process-env"); + }); + + it("prefers GOOGLE_GENERATIVE_AI_API_KEY over GEMINI_API_KEY", () => { + process.env.GEMINI_API_KEY = "gemini-test"; + process.env.GOOGLE_GENERATIVE_AI_API_KEY = "google-test"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.google.state).toBe("set"); + expect(s.google.var).toBe("GOOGLE_GENERATIVE_AI_API_KEY"); + }); + + it("falls back to GEMINI_API_KEY when canonical is missing", () => { + process.env.GEMINI_API_KEY = "gemini-only"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.google.state).toBe("set"); + expect(s.google.var).toBe("GEMINI_API_KEY"); + }); + + it("treats BB_* alias keys as set with viaAlias=true", () => { + 
process.env.BB_API_KEY = "bb-key"; + process.env.BB_PROJECT_ID = "bb-proj"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.browserbase.apiKey).toBe("set"); + expect(s.browserbase.projectId).toBe("set"); + expect(s.browserbase.viaAlias).toBe(true); + }); + + it("does not flag viaAlias when canonical BB names are present", () => { + process.env.BROWSERBASE_API_KEY = "bb-key"; + process.env.BROWSERBASE_PROJECT_ID = "bb-proj"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.browserbase.viaAlias).toBe(false); + }); + + it("does NOT flag viaAlias when canonical + alias are mixed (one of each)", () => { + // viaAlias should mean "all present BB values are alias-only". If the + // user set BROWSERBASE_API_KEY (canonical) AND BB_PROJECT_ID (alias), + // the dim "(via BB_API_KEY)" hint would be misleading — suppress it. + process.env.BROWSERBASE_API_KEY = "bb-key"; + process.env.BB_PROJECT_ID = "bb-proj"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.browserbase.apiKey).toBe("set"); + expect(s.browserbase.projectId).toBe("set"); + expect(s.browserbase.viaAlias).toBe(false); + }); + + it("flags viaAlias when only one BB var is present and it came via alias", () => { + process.env.BB_API_KEY = "bb-key"; + // BROWSERBASE_PROJECT_ID intentionally absent + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.browserbase.apiKey).toBe("set"); + expect(s.browserbase.projectId).toBe("missing"); + expect(s.browserbase.viaAlias).toBe(true); + }); + + it("partial BB — one of two vars set", () => { + process.env.BROWSERBASE_API_KEY = "bb-key"; + __resetPackageEnvCacheForTests(); + const s = snapshotEnv(); + expect(s.browserbase.apiKey).toBe("set"); + expect(s.browserbase.projectId).toBe("missing"); + }); +}); + +describe("hasZeroProviderKeys", () => { + beforeEach(() => clearProviderKeys()); + afterEach(() => restoreProviderKeys()); + + it("true when all three providers missing", () => { + 
expect(hasZeroProviderKeys(snapshotEnv())).toBe(true); + }); + + it("false with only OpenAI set", () => { + process.env.OPENAI_API_KEY = "sk-test"; + __resetPackageEnvCacheForTests(); + expect(hasZeroProviderKeys(snapshotEnv())).toBe(false); + }); + + it("false with only Anthropic set", () => { + process.env.ANTHROPIC_API_KEY = "ak-test"; + __resetPackageEnvCacheForTests(); + expect(hasZeroProviderKeys(snapshotEnv())).toBe(false); + }); + + it("false with only Google set (via GEMINI_API_KEY)", () => { + process.env.GEMINI_API_KEY = "gemini-test"; + __resetPackageEnvCacheForTests(); + expect(hasZeroProviderKeys(snapshotEnv())).toBe(false); + }); +}); + +describe("renderInlineWarning", () => { + beforeEach(() => clearProviderKeys()); + afterEach(() => restoreProviderKeys()); + + it("returns a non-null warning when zero provider keys", () => { + const out = renderInlineWarning(snapshotEnv()); + expect(out).not.toBeNull(); + // Strip ANSI for substring match. + // eslint-disable-next-line no-control-regex + const plain = (out ?? "").replace(/\[[0-9;]*m/g, ""); + expect(plain).toContain("No provider API key found"); + expect(plain).toContain("evals doctor"); + }); + + it("returns null when at least one provider key is set", () => { + process.env.OPENAI_API_KEY = "sk-test"; + __resetPackageEnvCacheForTests(); + expect(renderInlineWarning(snapshotEnv())).toBeNull(); + }); + + it("returns null even when Braintrust+BB are missing but a provider is set", () => { + process.env.ANTHROPIC_API_KEY = "ak-test"; + __resetPackageEnvCacheForTests(); + expect(renderInlineWarning(snapshotEnv())).toBeNull(); + }); +}); diff --git a/packages/evals/tui/banner.ts b/packages/evals/tui/banner.ts index 92ceba494..d2e789767 100644 --- a/packages/evals/tui/banner.ts +++ b/packages/evals/tui/banner.ts @@ -1,6 +1,9 @@ /** * ASCII art banner for REPL mode. - * Same block-letter style as the agents dev-cli. 
+ * + * Pure ASCII output — the tip line that used to live here is now + * `printTipLine()` in tui/welcome.ts so the REPL can choose between + * "extended welcome" (first-run) and "banner + tip" (returning user). */ import { c } from "./format.js"; @@ -16,7 +19,4 @@ ${c.bbBold}╚══════╝ ╚═══╝ ╚═╝ ╚═╝╚═ export function printBanner(): void { console.log(BANNER_ART); - console.log( - `${c.dim} Type ${c.reset}help${c.dim} for commands, ${c.reset}exit${c.dim} to quit${c.reset}\n`, - ); } diff --git a/packages/evals/tui/commands/config.ts b/packages/evals/tui/commands/config.ts index 1ebfc4267..be12e703c 100644 --- a/packages/evals/tui/commands/config.ts +++ b/packages/evals/tui/commands/config.ts @@ -36,10 +36,24 @@ export type CoreConfigSection = { startup?: string; }; -type ConfigFile = { +/** + * First-run / welcome metadata. Persisted inside `evals.config.json` so it + * follows the same per-mode (source vs. dist) storage as `defaults`/`core`. + * Owned by tui/welcomeState.ts; the type lives here because it round-trips + * through readConfig/writeConfig. + */ +export type WelcomeMeta = { + /** ISO 8601 timestamp when the first-run welcome was completed. */ + firstRunCompletedAt?: string; + /** Schema version for the welcome marker (currently 1). */ + version?: number; +}; + +export type ConfigFile = { defaults: Defaults; benchmarks?: Record; core?: CoreConfigSection; + _meta?: WelcomeMeta; }; const VALID_KEYS: Array = [ @@ -76,6 +90,7 @@ export function readConfig(entryDir: string): ConfigFile { defaults: raw.defaults ?? {}, benchmarks: raw.benchmarks ?? {}, core: raw.core ?? undefined, + _meta: raw._meta ?? undefined, }; } catch (error) { if ( diff --git a/packages/evals/tui/commands/doctor.ts b/packages/evals/tui/commands/doctor.ts new file mode 100644 index 000000000..58c425f61 --- /dev/null +++ b/packages/evals/tui/commands/doctor.ts @@ -0,0 +1,458 @@ +/** + * `evals doctor` — on-demand health report. + * + * The single canonical surface for env-key status. 
Replaces what earlier + * drafts proposed as an always-on status row in the REPL; the REPL itself + * only emits a single inline line when zero provider keys are present + * (see tui/welcomeStatus.ts). + * + * Sections: + * 1. Runtime — node version, Stagehand version, mode (source/dist) + * 2. Config — evals.config.json path, defaults.env/trials/concurrency, core.* + * 3. Discovery — total tasks + core/bench split + * 4. API keys — full matrix from snapshotEnv() with source provenance + * 5. Verdict — ok | warn | fail; exit code 0 | 0 | 1 (sans --json) + * + * Flags: + * --json machine-readable output, always exit 0 + * --help/-h prints printDoctorHelp() + * --probe HIDDEN. Issues a tiny no-op LLM call to verify the OpenAI key + * actually works. Used in CI; not advertised in --help. + */ + +import fs from "node:fs"; +import path from "node:path"; +import { + bold, + cyan, + dim, + gray, + green, + red, + yellow, + padRight, +} from "../format.js"; +import { readConfig, resolveConfigPath } from "./config.js"; +import { resolveKey, snapshotEnv, type EnvSnapshot } from "../welcomeStatus.js"; +import { getPackageRootDir, getRuntimeTasksRoot } from "../../runtimePaths.js"; +import { discoverTasks } from "../../framework/discovery.js"; +import type { TaskRegistry } from "../../framework/types.js"; + +type Verdict = "ok" | "warn" | "fail"; + +type RuntimeInfo = { + node: string; + stagehand: string | null; + mode: "source" | "dist"; +}; + +type ConfigSummary = { + path: string; + env: string | null; + trials: number | null; + concurrency: number | null; + core: { tool: string | null; startup: string | null }; +}; + +type DiscoverySummary = { + ok: boolean; + total: number; + core: number; + bench: number; + error?: string; + root: string; +}; + +type DoctorReport = { + verdict: Verdict; + runtime: RuntimeInfo; + config: ConfigSummary; + discovery: DiscoverySummary; + keys: EnvSnapshot; + reasons: string[]; +}; + +// 
--------------------------------------------------------------------------- +// Help +// --------------------------------------------------------------------------- + +export function printDoctorHelp(): void { + const HELP_COL = 28; + const row = (left: string, right: string): string => + ` ${padRight(left, HELP_COL)} ${right}`; + console.log( + [ + "", + ` ${bold("evals doctor")} ${dim("[options]")}`, + "", + " Health report: env-key matrix, config locations, discovered tasks, runtime.", + "", + ` ${bold("Options:")}`, + "", + row(cyan("--json"), "Emit machine-readable JSON (always exits 0)"), + row(cyan("--help, -h"), "Show this help"), + "", + ` ${bold("Aliases:")} ${gray("evals health")}`, + "", + ` ${bold("Exit codes:")}`, + "", + row(gray("0"), "ok / warn"), + row(gray("1"), "fail (zero provider keys, broken env=browserbase, etc.)"), + "", + ].join("\n"), + ); +} + +// --------------------------------------------------------------------------- +// Report assembly +// --------------------------------------------------------------------------- + +function readStagehandVersion(): string | null { + try { + const repoRoot = path.dirname(getPackageRootDir()); + const corePkgPath = path.join(repoRoot, "core", "package.json"); + const corePkg = JSON.parse(fs.readFileSync(corePkgPath, "utf-8")); + return typeof corePkg.version === "string" ? corePkg.version : null; + } catch { + return null; + } +} + +function detectMode(entryDir: string): "source" | "dist" { + // Anchor on the actual built location (`packages/evals/dist/cli`) so a + // user whose checkout happens to live under a path containing `/dist/` + // (e.g. `~/work/dist/stagehand/...`) isn't misclassified. + return entryDir.endsWith("/dist/cli") || entryDir.endsWith("\\dist\\cli") + ? 
/**
 * Derive the doctor verdict and the human-readable reasons behind it.
 *
 * Rules:
 *   fail — discovery threw, OR no provider key (OpenAI / Anthropic / Google)
 *          is set, OR `defaults.env` is "browserbase" while both Browserbase
 *          variables are missing.
 *   warn — none of the above, but Browserbase is half-configured (exactly one
 *          of API key / project ID set) or the Braintrust key is missing.
 *   ok   — otherwise.
 *
 * A failing verdict returns only the failure reasons; warning-level reasons
 * are collected only when nothing failed.
 *
 * @param keys      Environment snapshot of which keys are present.
 * @param config    Config summary; only `env` is consulted.
 * @param discovery Outcome of task discovery.
 * @returns The verdict and its reasons, in the order the checks run.
 */
function computeVerdict(
  keys: EnvSnapshot,
  config: ConfigSummary,
  discovery: DiscoverySummary,
): { verdict: Verdict; reasons: string[] } {
  const { apiKey, projectId } = keys.browserbase;

  // ── Failure-level checks ────────────────────────────────────────────
  const failures: string[] = [];

  if (!discovery.ok) {
    failures.push(`Discovery failed: ${discovery.error ?? "unknown error"}`);
  }

  const noProviderKey = [keys.openai, keys.anthropic, keys.google].every(
    (entry) => entry.state === "missing",
  );
  if (noProviderKey) {
    failures.push(
      "No provider API key found (OpenAI / Anthropic / Google all missing).",
    );
  }

  if (
    config.env === "browserbase" &&
    apiKey === "missing" &&
    projectId === "missing"
  ) {
    failures.push(
      "env=browserbase but both BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID are missing.",
    );
  }

  // Each failing check contributes exactly one reason, so "any reason so
  // far" is the same thing as "at least one failing check".
  if (failures.length > 0) {
    return { verdict: "fail", reasons: failures };
  }

  // ── Warning-level checks ────────────────────────────────────────────
  const warnings: string[] = [];

  const oneSided = (present: string, absent: string): boolean =>
    present === "set" && absent === "missing";
  if (oneSided(apiKey, projectId) || oneSided(projectId, apiKey)) {
    warnings.push(
      "Browserbase is partially configured (one of API key / project ID is missing).",
    );
  }

  if (keys.braintrust.state === "missing") {
    warnings.push(
      "BRAINTRUST_API_KEY missing — `experiments` commands will fail.",
    );
  }

  return warnings.length > 0
    ? { verdict: "warn", reasons: warnings }
    : { verdict: "ok", reasons: warnings };
}
/**
 * Print the doctor report as human-readable text on stdout.
 *
 * Sections are printed in a fixed order: Runtime, Config, Discovery,
 * API keys, Status (verdict plus one line per reason). When the verdict is
 * not "ok", a closing hint points at the package-local `.env` file.
 *
 * @param report The assembled doctor report to render.
 */
function renderHuman(report: DoctorReport): void {
  const r = report;
  console.log("");
  console.log(` ${bold("Stagehand evals · doctor")}`);
  console.log("");

  console.log(` ${bold("Runtime")}`);
  console.log(` ${padRight("Node", 15)} ${r.runtime.node}`);
  console.log(
    ` ${padRight("Stagehand", 15)} ${r.runtime.stagehand ?? gray("(unknown)")} ${dim("(packages/core/package.json)")}`,
  );
  console.log(` ${padRight("Mode", 15)} ${r.runtime.mode}`);
  console.log("");

  console.log(` ${bold("Config")}`);
  console.log(` ${padRight("evals.config.json", 22)} ${dim(r.config.path)}`);
  // Unset defaults are displayed with the fallback values shown here.
  console.log(` ${padRight("env", 22)} ${cyan(r.config.env ?? "local")}`);
  console.log(
    ` ${padRight("trials", 22)} ${cyan(String(r.config.trials ?? 3))}`,
  );
  console.log(
    ` ${padRight("concurrency", 22)} ${cyan(String(r.config.concurrency ?? 3))}`,
  );
  console.log(
    ` ${padRight("core.tool", 22)} ${
      r.config.core.tool
        ? cyan(r.config.core.tool)
        : gray("(runner default: understudy_code)")
    }`,
  );
  if (r.config.core.startup) {
    console.log(
      ` ${padRight("core.startup", 22)} ${cyan(r.config.core.startup)}`,
    );
  }
  console.log("");

  console.log(` ${bold("Discovery")}`);
  if (r.discovery.ok) {
    console.log(
      ` ${padRight("Tasks", 22)} ${cyan(String(r.discovery.total))} ${dim(`(core: ${r.discovery.core} · bench: ${r.discovery.bench})`)}`,
    );
    console.log(` ${padRight("Tasks root", 22)} ${dim(r.discovery.root)}`);
  } else {
    console.log(` ${red("✗ failed")} ${dim(r.discovery.error ?? "")}`);
    console.log(` ${padRight("Tasks root", 22)} ${dim(r.discovery.root)}`);
  }
  console.log("");

  console.log(` ${bold("API keys")}`);
  console.log(keyRow("OPENAI_API_KEY", r.keys.openai));
  console.log(keyRow("ANTHROPIC_API_KEY", r.keys.anthropic));
  // Label the Google row with whichever variable actually held the value.
  const googleLabel = r.keys.google.var ?? "GOOGLE_GENERATIVE_AI_API_KEY";
  console.log(
    keyRow(googleLabel, {
      state: r.keys.google.state,
      source: r.keys.google.source,
    }),
  );

  // `viaAlias` is a property of the Browserbase pair as a whole: it is true
  // when at least one BB var is present and none came from a canonical name.
  // It can therefore be true while one of the two vars is still missing
  // (e.g. only BB_PROJECT_ID set). The alias note is attached only to a row
  // that is actually set, so a missing key is never rendered as
  // "✗ missing (via BB_API_KEY)".
  const bb = r.keys.browserbase;
  const bbRow = (
    label: string,
    state: "set" | "missing",
    aliasName: string,
  ): string => {
    const value = state === "set" ? green("✓ set") : red("✗ missing");
    const note =
      state === "set" && bb.viaAlias ? ` ${dim(`(via ${aliasName})`)}` : "";
    return ` ${padRight(label, 30)} ${value}${note}`;
  };
  console.log(bbRow("BROWSERBASE_API_KEY", bb.apiKey, "BB_API_KEY"));
  console.log(bbRow("BROWSERBASE_PROJECT_ID", bb.projectId, "BB_PROJECT_ID"));

  console.log(
    keyRow(
      "BRAINTRUST_API_KEY",
      r.keys.braintrust,
      "(needed for `experiments`)",
    ),
  );
  console.log("");

  console.log(` ${bold("Status")}`);
  if (r.verdict === "ok") {
    console.log(` ${green("✓ ok")}`);
  } else if (r.verdict === "warn") {
    console.log(` ${yellow("⚠ warn")}`);
  } else {
    console.log(` ${red("✗ fail")}`);
  }
  for (const reason of r.reasons) {
    console.log(` ${dim("— " + reason)}`);
  }
  console.log("");

  if (r.verdict !== "ok") {
    console.log(
      ` ${dim("To set keys: edit")} ${cyan(path.join(getPackageRootDir(), ".env"))} ${dim("or export them in your shell.")}`,
    );
    console.log("");
  }
}
/**
 * Hidden `--probe` check: verify that the resolved OpenAI key is accepted by
 * the OpenAI API.
 *
 * Issues a single `GET /v1/models?limit=1` request — a model-list call, which
 * is cheaper than a chat completion. The request is bounded by a timeout so
 * a stalled connection cannot hang the caller indefinitely.
 *
 * @param keys Environment snapshot; only `keys.openai.state` is consulted.
 * @returns `{ ok: true }` when the API answers with a 2xx status; otherwise
 *   `{ ok: false, error }` describing the missing key, the HTTP status, or
 *   the network/timeout failure. Never throws.
 */
async function runOpenAIProbe(
  keys: EnvSnapshot,
): Promise<{ ok: boolean; error?: string }> {
  // Upper bound for the whole request.
  const probeTimeoutMs = 10000;

  if (keys.openai.state !== "set") {
    return { ok: false, error: "OPENAI_API_KEY missing" };
  }
  // Use the SAME resolution as the snapshot — i.e. check process.env AND
  // packages/evals/.env. If we only read process.env here, a key stored
  // only in the package-local .env would show "✓ set" in the snapshot but
  // probe with an empty bearer token and silently fail with an auth error.
  const { value: apiKey } = resolveKey("OPENAI_API_KEY");
  if (!apiKey) {
    return { ok: false, error: "OPENAI_API_KEY missing after resolution" };
  }
  try {
    const res = await fetch("https://api.openai.com/v1/models?limit=1", {
      headers: { Authorization: `Bearer ${apiKey}` },
      // Without a signal, fetch waits for as long as the socket stays open.
      signal: AbortSignal.timeout(probeTimeoutMs),
    });
    if (!res.ok) {
      return { ok: false, error: `HTTP ${res.status}` };
    }
    return { ok: true };
  } catch (err) {
    // Anything can be thrown; only Error instances carry a `.message`.
    return {
      ok: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
"unknown"}`); + report.verdict = "fail"; + } + } + + if (wantJson) { + renderJson(report); + return 0; // --json always exits 0; verdict is in the payload + } + + renderHuman(report); + + if (report.verdict === "fail") return 1; + return 0; +} diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts index 3339a676d..d9cc20738 100644 --- a/packages/evals/tui/commands/help.ts +++ b/packages/evals/tui/commands/help.ts @@ -29,6 +29,10 @@ export function printHelp(): void { "Inspect and compare Braintrust experiment runs", ), row(`${cyan("new")} ${dim(" ")}`, "Scaffold a new task"), + row( + `${cyan("doctor")} ${dim("[--json]")}`, + "Health report (env keys, config, discovery)", + ), row(cyan("help"), "Show this help"), row(cyan("clear"), "Clear the screen"), row(cyan("exit"), "Exit the REPL"), diff --git a/packages/evals/tui/repl.ts b/packages/evals/tui/repl.ts index fe970a538..a21687eea 100644 --- a/packages/evals/tui/repl.ts +++ b/packages/evals/tui/repl.ts @@ -17,6 +17,7 @@ import { import { printList } from "./commands/list.js"; import { handleConfig } from "./commands/config.js"; import { handleExperiments } from "./commands/experiments.js"; +import { handleDoctor } from "./commands/doctor.js"; import { runCommand } from "./commands/run.js"; import { scaffoldTask } from "./commands/new.js"; import { parseRunArgs, resolveRunOptions } from "./commands/parse.js"; @@ -24,6 +25,9 @@ import { readConfig } from "./commands/config.js"; import { discoverTasks } from "../framework/discovery.js"; import type { TaskRegistry } from "../framework/types.js"; import { getRuntimeTasksRoot } from "../runtimePaths.js"; +import { printExtendedWelcome, printTipLine } from "./welcome.js"; +import { snapshotEnv, renderInlineWarning } from "./welcomeStatus.js"; +import { isFirstRun, markFirstRunComplete } from "./welcomeState.js"; function tokenize(input: string): string[] { const tokens: string[] = []; @@ -52,19 +56,58 @@ function tokenize(input: string): 
string[] { return tokens; } -export async function startRepl(entryDir: string): Promise { - printBanner(); +export type ReplOptions = { + /** Suppress banner, welcome, and any inline warnings. Output is just the prompt. */ + quiet?: boolean; +}; + +export async function startRepl( + entryDir: string, + options: ReplOptions = {}, +): Promise { + const quiet = options.quiet === true; + const noWelcome = quiet || Boolean(process.env.EVALS_NO_WELCOME); const resolvedTasksRoot = getRuntimeTasksRoot(); let registry: TaskRegistry; try { registry = await discoverTasks(resolvedTasksRoot, false); - console.log(dim(` Discovered ${registry.tasks.length} tasks\n`)); } catch (err) { console.error(red(` Failed to discover tasks: ${(err as Error).message}`)); process.exit(1); } + // ─── Onboarding chrome ─────────────────────────────────────────────── + // First-run-only welcome panel; otherwise just the banner + tip line. + // The only inline output about env state is the zero-keys warning, + // surfaced when no welcome panel is shown. Discovery count is NOT + // printed (use `list` or `evals doctor` instead). + if (!quiet) { + printBanner(); + const showExtendedWelcome = !noWelcome && isFirstRun(entryDir); + if (showExtendedWelcome) { + printExtendedWelcome({ snapshot: snapshotEnv(), registry }); + } else { + const warning = renderInlineWarning(snapshotEnv()); + if (warning && process.stdout.isTTY) { + console.log(warning); + } + printTipLine(); + } + console.log(""); + // Mark the marker pre-prompt so even an immediate Ctrl+C counts as + // "first-run complete" — we don't want to re-prompt on every relaunch + // when the user dismisses the welcome. + // + // Gated on `!quiet`: a `evals --quiet` invocation (often used by CI / + // automation that pipes into the REPL) must NOT burn the first-run + // marker, since the user never had a chance to see the welcome. 
/** Map a key state to its coloured glyph: green check when set, red cross when missing. */
function tagIcon(state: "set" | "missing"): string {
  if (state === "set") {
    return green("✓");
  }
  return red("✗");
}
/**
 * Single-glyph summary of the Browserbase pair for the welcome panel.
 *
 * Green check when both the API key and the project ID are set, red cross
 * when both are missing, and a red warning sign for any other combination
 * (only one of the two present).
 */
function browserbaseLabel(s: EnvSnapshot): string {
  const { apiKey, projectId } = s.browserbase;
  const bothAre = (state: string): boolean =>
    apiKey === state && projectId === state;

  if (bothAre("set")) return green("✓");
  return red(bothAre("missing") ? "✗" : "⚠");
}
${cyan("act")}).`, + ); + lines.push( + ` ${dim("·")} ${cyan("help")} for commands · ${cyan("evals doctor")} for env health · ${cyan("exit")} to quit.`, + ); + lines.push(""); + lines.push(` ${bold("Health")}`); + lines.push( + ` AI: ${providerLabel(ctx.snapshot)} BB: ${browserbaseLabel(ctx.snapshot)} Braintrust: ${tagIcon(ctx.snapshot.braintrust.state)}`, + ); + lines.push( + ` ${dim("Run")} ${cyan("evals doctor")} ${dim("for setup help.")}`, + ); + lines.push(""); + lines.push(` ${bold("Try first")}`); + lines.push( + ` ${cyan("list")} ${dim("# see what tasks exist")}`, + ); + lines.push( + ` ${cyan("run act")} ${dim("# run the act category once (env=local)")}`, + ); + lines.push( + ` ${cyan("experiments list")} ${dim("# recent Braintrust runs (needs BRAINTRUST_API_KEY)")}`, + ); + lines.push( + ` ${cyan("config")} ${dim("# see your defaults")}`, + ); + lines.push(""); + console.log(lines.join("\n")); +} + +/** + * The compact tip line that prints on every non-quiet launch. + * Replaces the line that used to live at banner.ts:19-21. + */ +export function printTipLine(): void { + console.log( + ` ${dim("Type")} ${cyan("help")} ${dim("for commands,")} ${cyan("exit")} ${dim("to quit")}`, + ); +} diff --git a/packages/evals/tui/welcomeState.ts b/packages/evals/tui/welcomeState.ts new file mode 100644 index 000000000..18b1626a9 --- /dev/null +++ b/packages/evals/tui/welcomeState.ts @@ -0,0 +1,56 @@ +/** + * First-run welcome state. + * + * The marker (`_meta.firstRunCompletedAt`) lives inside `evals.config.json` + * so it follows the same per-mode (source vs. dist) storage rules as the + * rest of the config. This is intentional: a contributor switching between + * `pnpm evals` and a globally installed CLI sees the welcome again — that's + * acceptable and avoids a separate cross-install state location. + * + * `scripts/build-cli.ts` preserves `_meta` across rebuilds so the dist + * config inherits the source marker on first build. 
/**
 * Persist the "first run completed" marker into the config's `_meta` block.
 *
 * Does nothing when a `firstRunCompletedAt` timestamp is already stored, so
 * repeated launches never rewrite the file. Any read or write failure is
 * swallowed on purpose: the marker is best-effort, and the only consequence
 * of not persisting it is that the welcome panel shows again next launch.
 *
 * @param entryDir Directory used by `readConfig` / `writeConfig` to locate
 *   the config file.
 */
export function markFirstRunComplete(entryDir: string): void {
  try {
    const stored = readConfig(entryDir);
    const previousMeta = stored._meta;
    if (!previousMeta?.firstRunCompletedAt) {
      stored._meta = {
        ...previousMeta,
        firstRunCompletedAt: new Date().toISOString(),
        version: CURRENT_SCHEMA_VERSION,
      };
      writeConfig(entryDir, stored);
    }
  } catch {
    // Best-effort: failing to persist just means the welcome shows again.
  }
}
Adding more inline cases here + * is a deliberate policy change, not a code edit. + */ + +import fs from "node:fs"; +import path from "node:path"; +import dotenv from "dotenv"; +import { cyan, dim, yellow } from "./format.js"; +import { getPackageRootDir } from "../runtimePaths.js"; + +export type KeyState = "set" | "missing"; +export type KeySource = "process-env" | "package-dotenv" | "none"; + +export type ProviderKeyEntry = { + state: KeyState; + source: KeySource; +}; + +export type GoogleKeyEntry = ProviderKeyEntry & { + /** Which env var actually held the value, or null if missing. */ + var: "GOOGLE_GENERATIVE_AI_API_KEY" | "GEMINI_API_KEY" | null; +}; + +export type BrowserbaseKeyEntry = { + apiKey: KeyState; + projectId: KeyState; + /** True if only the BB_* alias variants are present (not the canonical names). */ + viaAlias: boolean; +}; + +export type EnvSnapshot = { + openai: ProviderKeyEntry; + anthropic: ProviderKeyEntry; + google: GoogleKeyEntry; + browserbase: BrowserbaseKeyEntry; + braintrust: ProviderKeyEntry; +}; + +// --------------------------------------------------------------------------- +// Package-dotenv loader (one-shot, cached for the process lifetime). +// Mirrors the pattern in lib/braintrust-report.ts:365-373 — read +// packages/evals/.env so users running `pnpm evals` from the repo root +// (cwd ≠ packages/evals) still see their package-local keys. +// --------------------------------------------------------------------------- + +let cachedPackageEnv: Record | null = null; +let packageEnvLoaded = false; + +function loadPackageEnv(): Record { + // Test escape hatch: set EVALS_DISABLE_PACKAGE_ENV=1 to skip reading the + // package-local .env file. Lives in process.env (not a module-scoped + // variable) so it works regardless of how the welcomeStatus module is + // resolved by the test runner. + if (process.env.EVALS_DISABLE_PACKAGE_ENV === "1") return {}; + if (packageEnvLoaded) return cachedPackageEnv ?? 
/**
 * Look up one environment variable and report where it came from.
 *
 * `process.env` wins; the package-local `.env` is consulted only when the
 * process environment has no non-empty value for `name`. Empty strings count
 * as absent in both places.
 *
 * Exported so callers that need the raw value can share the resolution order
 * used by `snapshotEnv()`, which itself exposes only state and source.
 *
 * @param name Environment variable name.
 * @returns The value with its source, or `{ value: "", source: "none" }`
 *   when neither location provides one.
 */
export function resolveKey(name: string): { value: string; source: KeySource } {
  const processValue = process.env[name];
  if (processValue) {
    return { value: processValue, source: "process-env" };
  }

  const dotenvValue = loadPackageEnv()[name];
  return dotenvValue
    ? { value: dotenvValue, source: "package-dotenv" }
    : { value: "", source: "none" };
}
/**
 * Summarise the Browserbase credentials.
 *
 * Each of the two values (API key, project ID) counts as "set" when either
 * its canonical `BROWSERBASE_*` name or its `BB_*` alias resolves to a
 * non-empty value.
 *
 * `viaAlias` is true when at least one alias is present and neither canonical
 * name is — i.e. every Browserbase value that exists was supplied through an
 * alias. A mix of canonical and alias names yields false so the alias note is
 * not shown misleadingly.
 */
function browserbaseEntry(): BrowserbaseKeyEntry {
  const isPresent = (name: string): boolean =>
    resolveKey(name).value.length > 0;

  const canonicalApi = isPresent("BROWSERBASE_API_KEY");
  const canonicalProject = isPresent("BROWSERBASE_PROJECT_ID");
  const aliasApi = isPresent("BB_API_KEY");
  const aliasProject = isPresent("BB_PROJECT_ID");

  const anyAlias = aliasApi || aliasProject;
  const anyCanonical = canonicalApi || canonicalProject;

  return {
    apiKey: canonicalApi || aliasApi ? "set" : "missing",
    projectId: canonicalProject || aliasProject ? "set" : "missing",
    viaAlias: anyAlias && !anyCanonical,
  };
}
/**
 * True when none of the three model providers (OpenAI, Anthropic, Google)
 * has a key set in the snapshot. Browserbase and Braintrust are not
 * considered.
 */
export function hasZeroProviderKeys(s: EnvSnapshot): boolean {
  return [s.openai, s.anthropic, s.google].every(
    (entry) => entry.state === "missing",
  );
}