browserbase · miguelg719 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts
@@ -2,13 +2,18 @@
  * Evals CLI entry point.
  *
  * Modes:
- *   - `evals` (no args)          → interactive REPL
- *   - `evals run <target> …`     → single-shot run with rich progress
- *   - `evals list [tier]`        → list discovered tasks
- *   - `evals config [sub]`       → print / get / set defaults
- *   - `evals experiments [sub]`  → inspect / compare Braintrust runs
- *   - `evals new <tier> <cat> <name>` → scaffold a task file
- *   - `evals help` / `-h`        → help
+ *   - `evals` (no args)              → interactive REPL
+ *   - `evals --quiet` / `evals -q`   → REPL with no banner / welcome / inline warnings
+ *   - `evals run <target> …`         → single-shot run with rich progress
+ *   - `evals list [tier]`            → list discovered tasks
+ *   - `evals config [sub]`           → print / get / set defaults
+ *   - `evals experiments [sub]`      → inspect / compare Braintrust runs
+ *   - `evals doctor` / `health`      → env-key + config + discovery health report
+ *   - `evals new <tier> <cat> <name>` → scaffold a task file
+ *   - `evals help` / `-h`            → help
+ *
+ * Env vars:
+ *   - EVALS_NO_WELCOME=1             → suppress first-run welcome panel (REPL only)
  *
  * No child processes. All runs flow through framework/runEvals in-process.
  *
@@ -95,14 +100,19 @@ const args = process.argv.slice(2);
   process.on("SIGINT", () => void handleSignal("SIGINT"));
   process.on("SIGTERM", () => void handleSignal("SIGTERM"));
 
+  // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
+  // REPL-only (they suppress chrome); other args route to the argv switch.
+  const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q";
+  const replLaunch = args.length === 0 || args.every(isQuietFlag);
+
   // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
   // handler that does cooperative-then-aggressive abort instead — this
   // path is only active when no arg-less REPL is running.
   //
   // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
   // so we forward it ourselves.
   let cleanupArgvInput = (): void => {};
-  if (args.length > 0 && process.stdin.isTTY) {
+  if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
     const readline = await import("node:readline");
     const wasRaw = process.stdin.isRaw;
     readline.emitKeypressEvents(process.stdin);
@@ -149,10 +159,30 @@ const args = process.argv.slice(2);
     await runCommand(resolved);
   }
 
+  const isHelpToken = (token: string | undefined): boolean =>
+    token === "--help" || token === "-h" || token === "help";
+
+  const isConfigHelpInvocation = (tokens: string[]): boolean => {
+    if (isHelpToken(tokens[0])) return true;
+    if (tokens[0] === "core") {
+      return isHelpToken(tokens[1]) || isHelpToken(tokens[2]);
+    }
+    return isHelpToken(tokens[1]);
+  };
+
+  const isExperimentsHelpInvocation = (tokens: string[]): boolean =>
+    isHelpToken(tokens[0]) || isHelpToken(tokens[1]);
+
+  // Whether to write the first-run marker in `finally`. Help-only paths and
+  // the doctor command don't count as "first uses" — they're discovery
+  // actions. The REPL marks itself.
+  let shouldMarkFirstRun = false;
+
   try {
-    if (args.length === 0) {
+    if (replLaunch) {
       const { startRepl } = await import("./tui/repl.js");
-      await startRepl(ENTRY_DIR);
+      const quiet = args.some(isQuietFlag);
+      await startRepl(ENTRY_DIR, { quiet });
       return;
     }
 
@@ -162,15 +192,15 @@ const args = process.argv.slice(2);
     // after the command. Later positions are arguments or flag values and
     // must not be swallowed (e.g. `evals run act --help` would otherwise
     // print run help instead of erroring on the unknown `--help` flag).
-    const wantsHelp =
-      subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help";
+    const wantsHelp = isHelpToken(subArgs[0]);
 
     switch (command) {
       case "run": {
         if (wantsHelp) {
           printRunHelp();
           return;
         }
+        shouldMarkFirstRun = true;
         await executeRun(subArgs);
         return;
       }
@@ -180,6 +210,7 @@ const args = process.argv.slice(2);
           printListHelp();
           return;
         }
+        shouldMarkFirstRun = true;
         const detailed =
           subArgs.includes("--detailed") || subArgs.includes("-d");
         const tierFilter = subArgs.find((a) => !a.startsWith("-"));
@@ -192,24 +223,36 @@ const args = process.argv.slice(2);
       }
 
       case "config": {
+        shouldMarkFirstRun = !isConfigHelpInvocation(subArgs);
         const { handleConfig } = await import("./tui/commands/config.js");
         await handleConfig(subArgs, ENTRY_DIR);
         return;
       }
 
       case "experiments": {
+        shouldMarkFirstRun = !isExperimentsHelpInvocation(subArgs);
         const { handleExperiments } = await import(
           "./tui/commands/experiments.js"
         );
         await handleExperiments(subArgs);
         return;
       }
 
+      case "doctor":
+      case "health": {
+        // Doctor is a diagnostic, not a "first use" — don't write the marker.
+        const { handleDoctor } = await import("./tui/commands/doctor.js");
+        const exitCode = await handleDoctor(subArgs, ENTRY_DIR);
+        if (exitCode !== 0) process.exitCode = exitCode;
+        return;
+      }
+
       case "new": {
         if (wantsHelp) {
           printNewHelp();
           return;
         }
+        shouldMarkFirstRun = true;
         const { scaffoldTask } = await import("./tui/commands/new.js");
         scaffoldTask(subArgs);
         return;
@@ -223,6 +266,7 @@ const args = process.argv.slice(2);
 
       default: {
         // Unknown first arg → treat as run target: `evals act` == `evals run act`
+        shouldMarkFirstRun = true;
         await executeRun(args);
         return;
       }
@@ -231,6 +275,14 @@ const args = process.argv.slice(2);
     console.error(red(`Error: ${(err as Error).message}`));
     process.exitCode = 1;
   } finally {
+    if (shouldMarkFirstRun) {
+      try {
+        const { markFirstRunComplete } = await import("./tui/welcomeState.js");
+        markFirstRunComplete(ENTRY_DIR);
+      } catch {
+        // best-effort
+      }
+    }
     cleanupArgvInput();
   }
 })();
diff --git a/packages/evals/scripts/build-cli.ts b/packages/evals/scripts/build-cli.ts
@@ -50,6 +50,14 @@ if (fs.existsSync(distConfigPath)) {
         ...existing.defaults,
       };
     }
+    // Preserve the first-run welcome marker across rebuilds so a contributor
+    // who's already seen the welcome on the dist path doesn't see it again
+    // after every `pnpm run build:cli`. If the source has _meta and dist
+    // doesn't (fresh dist install), the source value is inherited via the
+    // sourceConfig literal — already correct.
+    if (existing._meta) {
+      sourceConfig._meta = { ...sourceConfig._meta, ...existing._meta };
+    }
   } catch {
     // invalid existing config – overwrite entirely
   }

diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts
@@ -15,6 +15,18 @@ const SOURCE_CONFIG = path.join(
   "evals.config.json",
 );
 
+// File-level snapshot/restore: any non-help `evals run|list|config|…`
+// invocation through the real CLI writes `_meta.firstRunCompletedAt` into
+// the source config (because the test runs in source mode). Restore at the
+// end so the repo file stays pristine.
+let __fileLevelConfigSnapshot: string;
+beforeAll(() => {
+  __fileLevelConfigSnapshot = fs.readFileSync(SOURCE_CONFIG, "utf-8");
+});
+afterAll(() => {
+  fs.writeFileSync(SOURCE_CONFIG, __fileLevelConfigSnapshot);
+});
+
 async function runCli(
   args: string[],
 ): Promise<{ stdout: string; stderr: string; code: number }> {
@@ -38,6 +50,17 @@ async function runCli(
   }
 }
 
+function resetSourceWelcomeMeta(): void {
+  const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8"));
+  delete config._meta;
+  fs.writeFileSync(SOURCE_CONFIG, JSON.stringify(config, null, 2) + "\n");
+}
+
+function readSourceWelcomeCompletedAt(): string | undefined {
+  const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8"));
+  return config._meta?.firstRunCompletedAt;
+}
+
 describe("CLI entrypoint", () => {
   it("shows help", async () => {
     const { stdout, code } = await runCli(["-h"]);
@@ -59,6 +82,39 @@ describe("CLI entrypoint", () => {
     expect(stdout).toContain("compare");
   });
 
+  it("includes doctor in top-level help", async () => {
+    const { stdout, code } = await runCli(["-h"]);
+    expect(code).toBe(0);
+    expect(stdout).toContain("doctor");
+  });
+
+  it("shows doctor help via --help", async () => {
+    const { stdout, code } = await runCli(["doctor", "--help"]);
+    expect(code).toBe(0);
+    expect(stdout).toContain("evals doctor");
+    expect(stdout).toContain("--json");
+    // Hidden --probe flag must not appear
+    expect(stdout).not.toContain("--probe");
+  });
+
+  it("doctor --json emits a parseable report", async () => {
+    const { stdout, code } = await runCli(["doctor", "--json"]);
+    // --json always exits 0 regardless of verdict
+    expect(code).toBe(0);
+    const payload = JSON.parse(stdout);
+    expect(payload).toHaveProperty("verdict");
+    expect(payload).toHaveProperty("runtime.node");
+    expect(payload).toHaveProperty("keys.openai");
+    expect(Array.isArray(payload.reasons)).toBe(true);
+  });
+
+  it("health is an alias for doctor", async () => {
+    const { stdout, code } = await runCli(["health", "--json"]);
+    expect(code).toBe(0);
+    const payload = JSON.parse(stdout);
+    expect(payload).toHaveProperty("verdict");
+  });
+
   it("shows experiments compare help", async () => {
     const { stdout, code } = await runCli(["experiments", "compare", "-h"]);
     expect(code).toBe(0);
@@ -141,6 +197,20 @@ describe("CLI entrypoint", () => {
     },
   );
 
+  it("does not mark first-run complete for nested help invocations", async () => {
+    resetSourceWelcomeMeta();
+
+    for (const args of [
+      ["config", "set", "--help"],
+      ["experiments", "compare", "--help"],
+    ]) {
+      const { stdout, code } = await runCli(args);
+      expect(code).toBe(0);
+      expect(stdout).toContain("evals");
+      expect(readSourceWelcomeCompletedAt()).toBeUndefined();
+    }
+  });
+
   // Regression: help interception must not reach into value positions.
   // `config set <key> <value>` must surface a parse/value error, not silently
   // print help — otherwise `--help` would be a magical sentinel anywhere.