diff --git a/apps/desktop/scripts/build-sidecar.ts b/apps/desktop/scripts/build-sidecar.ts index cf248e910..74b56f9e8 100644 --- a/apps/desktop/scripts/build-sidecar.ts +++ b/apps/desktop/scripts/build-sidecar.ts @@ -45,7 +45,11 @@ await rm(EXECUTOR_OUT_DIR, { recursive: true, force: true }); await mkdir(EXECUTOR_OUT_DIR, { recursive: true }); await cp(sourceBinDir, EXECUTOR_OUT_DIR, { recursive: true }); -if (process.platform !== "win32") { +// Restore the unix executable bit — keyed on the TARGET, not the host. A +// windows-target cross-build (BUN_TARGET=bun-windows-x64 on macOS/linux) stages +// `executor.exe`, which needs no bit; chmod'ing a non-existent `executor` there +// would ENOENT. +if (!targetPackage.includes("windows")) { await chmod(join(EXECUTOR_OUT_DIR, "executor"), 0o755); } diff --git a/e2e/AGENTS.md b/e2e/AGENTS.md index a77694871..78964b0bd 100644 --- a/e2e/AGENTS.md +++ b/e2e/AGENTS.md @@ -130,6 +130,80 @@ When handing results to the user, follow the evidence contract in the root [AGENTS.md](../AGENTS.md) (direct run links + a live instance + what to try); [RUNNING.md](../RUNNING.md) has the current sharing/demo mechanics. +## Authoring from a live browser (`browse` → `promote`) + +You don't have to hand-write a browser scenario. Drive a running instance's web +UI one step at a time, then turn the recorded journey into a committed scenario. +The generated test drives the same Browser surface the exploration drove, so it +is the real test, not a transcript of one — develop the flow, then crystallize +it. 
+ +```sh +cd e2e +bun run cli up cloud # a live instance to develop against +bun run cli browse cloud goto / # each step REPLAYS the whole flow from a +bun run cli browse cloud click link Policies # clean browser and prints the page's controls +bun run cli browse cloud at-url /policies # (role · name) + a screenshot, so the next +bun run cli browse cloud see "No policies yet" # step is written against what's actually there +bun run cli promote cloud "Policies · a fresh workspace has none" +``` + +Each `browse` replays every step so far, so what you are building is, at every +moment, exactly what `promote` emits — a step that doesn't reproduce fails here, +not in CI. Steps: `goto <path>`, `click <role> <name>`, `click-text <text>`, +`fill <field> <value>`, `press <key>`, and the assertions `see <text>` / +`at-url <substring>`. `--label "…"` names a step (it becomes the `step(...)` +group); `browse <target> show | undo | reset` manages the journey. + +`promote` writes `<target>/<slug>.gen.test.ts` and runs it against the live +instance, producing the usual run artifacts (session.mp4, step screenshots, +trace). A journey with no assertion is refused — a scenario must prove +something. From then on the file is an ordinary scenario: edit it, add API/MCP +checks, drop the `.gen` once it's yours. The journey itself lives in +`.dev/<target>.journey.json` (gitignored), not the repo. + +## Desktop targets (the app on real OSes, filmed) + +The packaged desktop app runs as its own targets, each landing in its own +`runs/<target>/` bucket with a video. One shared scenario (`desktop-vm/`) and the +shared driver (`src/vm/desktop.ts`) + setup plumbing (`setup/desktop-vm.ts`); one +project + globalsetup per guest OS. + +- **`desktop-packaged`** — the real electron-builder bundle on THIS machine's + display (the supervised-daemon attach path). Needs a logged-in GUI session. +- **`desktop-macos` / `desktop-linux`** — the same bundle inside a guest VM, + driven over CDP from the host and filmed.
The globalsetup boots the guest + (tart), builds + pushes the bundle, brings the app up with + `--remote-debugging-port`, forwards it, and the scenario connects + drives + + records. Provisioned automatically — or attach to a running guest with + `E2E_DESKTOP_VM_IP=<ip>`: + + ```sh + vitest run --project desktop-macos # or desktop-linux + ``` + +The guests run tart `--no-graphics` (no host window, never steals focus) but +still have a usable display: + +- **macOS**: the base image's autologin reaches a real Aqua session + (WindowServer/Dock/Finder). Launch the app INTO it with `sudo launchctl asuser + <uid> …` (a plain SSH spawn lands in a non-GUI session); the unsigned arm64 + bundle is ad-hoc `codesign`'d in the guest; `screencapture` films it. +- **linux**: no window server, so the app renders into an `Xvfb` display with a + minimal WM (`openbox` — without it the electron window never maps); the window + maps tiny (10x10) so the globalsetup `xdotool`-resizes it to fill, and ffmpeg + `x11grab` films it. `--no-sandbox` (the chrome-sandbox needs setuid root). + +Base images (`admin`/`admin`): `executor-macos-base` (cirruslabs sequoia, autologin) +and `executor-linux-base` (cirruslabs ubuntu + Xvfb/ffmpeg/openbox/xdotool + +electron runtime libs). The bundle's `executor` binary is cross-compiled for the +guest (`BUN_TARGET`), and electron-builder's `dir` target assembles the unpacked +app on macOS — so both bundles build on this Mac. + +Note: `desktop-packaged`'s `guiAvailable()` probe (`launchctl managername`) reads +"Background" over SSH even when Aqua is up, so it's host-only; the VM targets gate +on a CDP page target instead.
+ ## Discovering endpoints - The full OpenAPI spec: `curl http://127.0.0.1:/api/openapi.json` diff --git a/e2e/desktop-vm/console-renders.test.ts b/e2e/desktop-vm/console-renders.test.ts new file mode 100644 index 000000000..c24658784 --- /dev/null +++ b/e2e/desktop-vm/console-renders.test.ts @@ -0,0 +1,75 @@ +// The PACKAGED desktop app, on camera, inside a GUI guest — driven over CDP from +// the host. ONE scenario shared by every desktop-<os> project (desktop-macos, +// desktop-linux): the same bundle and CDP driver, proving it renders on a guest +// OS and filming the actual console. The desktop-<os> globalsetup boots the +// guest, launches the app, forwards its --remote-debugging-port (E2E_DESKTOP_CDP_PORT) +// and publishes the guest IP; this scenario connects, drives, and records. The +// run lands in runs/<target>/ (its own per-OS bucket). Without a guest it skips +// honestly, like desktop-packaged without a display. +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; + +import { expect, it } from "@effect/vitest"; +import { Effect } from "effect"; + +import { scenario } from "../src/scenario"; +import { RunDir } from "../src/services"; +import { CdpPage, pageWsUrl, recordGuestScreen } from "../src/vm/desktop"; + +const NAME = "Desktop (packaged, in a VM) · the bundle renders its console"; +const cdpPort = process.env.E2E_DESKTOP_CDP_PORT; +const guestIp = process.env.E2E_DESKTOP_VM_IP; +const recSeconds = Number(process.env.E2E_DESKTOP_REC_SECONDS ?? "12"); +const os: "macos" | "linux" | "windows" = + process.env.E2E_TARGET === "desktop-windows" + ? "windows" + : process.env.E2E_TARGET === "desktop-linux" + ? "linux" + : "macos"; + +/** Drive the packaged console over the forwarded CDP port: start the guest + * screen recording into `runDir`, wait for the nav text, save a screenshot, + * assert the nav rendered, then wait for the recording. The CDP connection is + * closed on every path. */ +const run = async (runDir: string) => { + const cdp = await CdpPage.connect(await pageWsUrl(Number(cdpPort))); + try { + await cdp.command("Runtime.enable"); + await cdp.command("Page.enable"); + + // Film the console while we drive it (OS-aware capture lands a playable mp4).
+ const recording = recordGuestScreen( + guestIp as string, + recSeconds, + join(runDir, "session.mp4"), + os, + ); + + // Reaching the nav proves the packaged bundle booted and connected to its + // daemon on this OS. + await cdp.waitForText("Integrations", 60_000).catch(() => cdp.waitForText("Settings", 60_000)); + writeFileSync(join(runDir, "01-console-rendered.png"), await cdp.screenshot()); + + const body = await cdp.command<{ result?: { value?: string } }>("Runtime.evaluate", { + expression: "document.body.innerText", + returnByValue: true, + }); + expect(body.result?.value ?? "", "the packaged console rendered its nav").toContain( + "Integrations", + ); + + await recording; + } finally { + cdp.close(); + } +}; + +if (!cdpPort || !guestIp) { + it.skip(`${NAME} (needs a desktop guest — set E2E_DESKTOP_VM_IP or run the desktop-<os> project)`, () => {}); +} else { + // Literal name (not NAME) so the run's test.ts review artifact captures it. + scenario( + "Desktop (packaged, in a VM) · the bundle renders its console", + { timeout: 180_000 }, + Effect.gen(function* () { + const runDir = yield* RunDir; + yield* Effect.promise(() => run(runDir)); + }), + ); +} diff --git a/e2e/scripts/cli.ts b/e2e/scripts/cli.ts index 38b13caaa..d619b99f3 100644 --- a/e2e/scripts/cli.ts +++ b/e2e/scripts/cli.ts @@ -7,6 +7,8 @@ // bun scripts/cli.ts api [json] // bun scripts/cli.ts mcp tools | call [json] // bun scripts/cli.ts ledger [workos|autumn] +// bun scripts/cli.ts browse <target> <step…> (goto/click/see/…; show|undo|reset) +// bun scripts/cli.ts promote <target> "<name>" // bun scripts/cli.ts logs // bun scripts/cli.ts down // @@ -25,6 +27,9 @@ import { networkInterfaces } from "node:os"; import { join } from "node:path"; import { fileURLToPath } from "node:url"; +import type { JourneyFile } from "../src/journey/codegen"; +import type { Role, Step } from "../src/journey/steps"; + const e2eDir = fileURLToPath(new URL("..", import.meta.url)); const devDir = join(e2eDir, ".dev"); const cliPath = 
fileURLToPath(import.meta.url); @@ -443,6 +448,308 @@ const ledger = async (targetName: string, service = "workos") => { console.log(JSON.stringify(entries, null, 2)); }; +// --- browser journeys ------------------------------------------------------ +// Drive a live instance's web UI step by step. Each step appends to a journey +// file and REPLAYS the whole flow from a clean browser, so what you build is, at +// every moment, exactly what `promote` emits as a scenario — develop the flow, +// then crystallize it (e2e/AGENTS.md) with no translation gap. The agent can't +// see the screen, so every step returns the page's interactive controls (the +// vocabulary the next step is written against) plus a screenshot for a human. + +/** Where a target's journey lives under .dev: the JSON file and its screenshot dir. */ +const journeyPaths = (target: string) => ({ + file: join(devDir, `${target}.journey.json`), + shotsDir: join(devDir, `${target}.journey`), +}); + +/** Load the target's journey. Any read or parse failure yields a fresh, empty + * journey (org: true) instead of an error. */ +const readJourney = (target: string): JourneyFile => { + try { + return JSON.parse(readFileSync(journeyPaths(target).file, "utf8")) as JourneyFile; + } catch { + return { target, org: true, steps: [] }; + } +}; + +/** Persist the journey, creating .dev if it does not exist yet. */ +const writeJourney = (journey: JourneyFile) => { + mkdirSync(devDir, { recursive: true }); + writeFileSync(journeyPaths(journey.target).file, JSON.stringify(journey, null, 1)); +}; + +/** The roles the `click` verb accepts. */ +const STEP_ROLES = new Set<Role>([ + "link", + "button", + "heading", + "textbox", + "tab", + "menuitem", + "checkbox", +]); + +/** One CLI step verb → a Step. The verbs are deliberately plain English so the + * journey reads like instructions: goto / click / click-text / fill / press / + * see / at-url / run / request. Throws a usage Error when a verb's required + * arguments are missing, or when the verb or the click role is unknown. */ +const parseStep = ( + verb: string, + args: ReadonlyArray<string>, + opts: { readonly label?: string; readonly contains?: string }, +): Step => { + const withLabel = opts.label ? { label: opts.label } : {}; + const withContains = opts.contains !== undefined ? 
{ contains: opts.contains } : {}; + switch (verb) { + case "goto": { + const path = args[0]; + if (!path) throw new Error("usage: browse <target> goto <path> (e.g. goto /)"); + return { kind: "goto", path, ...withLabel }; + } + case "click": { + const role = args[0]; + const name = args.slice(1).join(" "); + if (!role || !name) { + throw new Error( + "usage: browse <target> click <role> <name> (role: " + [...STEP_ROLES].join("|") + ")", + ); + } + if (!STEP_ROLES.has(role as Role)) throw new Error(`unknown role ${JSON.stringify(role)}`); + return { kind: "clickRole", role: role as Role, name, ...withLabel }; + } + case "click-text": { + const text = args.join(" "); + if (!text) throw new Error("usage: browse <target> click-text <text>"); + return { kind: "clickText", text, ...withLabel }; + } + case "fill": { + const field = args[0]; + const value = args.slice(1).join(" "); + if (!field || args.length < 2) throw new Error("usage: browse <target> fill <field> <value>"); + return { kind: "fill", field, value, ...withLabel }; + } + case "press": { + const key = args[0]; + if (!key) throw new Error("usage: browse <target> press <key> (e.g. press Enter)"); + return { kind: "press", key, ...withLabel }; + } + case "see": { + const text = args.join(" "); + if (!text) + throw new Error("usage: browse <target> see <text> (asserts the text is visible)"); + return { kind: "expectText", text, ...withLabel }; + } + case "at-url": { + const contains = args[0]; + if (!contains) throw new Error("usage: browse <target> at-url <substring>"); + return { kind: "expectUrl", contains, ...withLabel }; + } + case "run": { + const command = args.join(" "); + if (!command) { + throw new Error( + 'usage: browse <target> run "<command>" [--contains <text>] ({base} = the instance URL)', + ); + } + return { kind: "run", command, ...withContains, ...withLabel }; + } + case "request": { + // The method is upper-cased so `request get /x` and `request GET /x` record the same step. + const method = (args[0] ?? 
"").toUpperCase(); + const path = args[1]; + if (!method || !path) { + throw new Error("usage: browse <target> request <METHOD> <path> [--contains <text>]"); + } + return { kind: "request", method, path, ...withContains, ...withLabel }; + } + default: + throw new Error( + `unknown step ${JSON.stringify(verb)} — goto | click | click-text | fill | press | see | at-url | run | request`, + ); + } +}; + +const printJourney = (journey: JourneyFile) => { + if (journey.steps.length === 0) return console.log(`${journey.target}: empty journey`); + console.log(`${journey.target} journey (${journey.steps.length} steps, org=${journey.org}):`); + journey.steps.forEach((step, index) => { + console.log(` ${String(index + 1).padStart(2)}. ${stepLabelOf(step)}`); + }); +}; + +// A local copy of the label default so printing doesn't pull in the browser +// module (which imports playwright). Kept trivially in sync with steps.ts. +const stepLabelOf = (step: Step): string => { + if (step.label) return step.label; + if (step.kind === "goto") return `goto ${step.path}`; + if (step.kind === "clickRole") return `click ${step.role} ${JSON.stringify(step.name)}`; + if (step.kind === "clickText") return `click-text ${JSON.stringify(step.text)}`; + if (step.kind === "fill") return `fill ${JSON.stringify(step.field)}`; + if (step.kind === "press") return `press ${step.key}`; + if (step.kind === "expectText") return `see ${JSON.stringify(step.text)}`; + if (step.kind === "expectUrl") return `at-url ${JSON.stringify(step.contains)}`; + if (step.kind === "run") { + return `run ${JSON.stringify(step.command)}${step.contains ? ` → ${JSON.stringify(step.contains)}` : ""}`; + } + return `request ${step.method} ${step.path}${step.contains ? 
` → ${JSON.stringify(step.contains)}` : ""}`; +}; + +/** `browse <target> …`: append one step to the target's journey (or `undo` the + * last one), replay the whole journey from a clean browser, and print what the + * page now offers. The journey file is only rewritten when the replay passes; + * `show` prints it and `reset` deletes it without touching the browser. */ +const browse = async (raw: ReadonlyArray<string>) => { + const target = raw[0]; + if (!target) throw new Error("usage: browse <target> <step…> (or show | undo | reset)"); + + // Re-parse tokens here: --label takes a value, which the top-level flag/arg + // split would mangle. + let label: string | undefined; + let contains: string | undefined; + const positional: string[] = []; + const bools = new Set<string>(); + const tokens = raw.slice(1); + for (let i = 0; i < tokens.length; i++) { + const token = tokens[i]!; + if (token === "--label") { + label = tokens[++i]; + // A trailing flag with nothing after it is a typo, not "no label". + if (label === undefined) throw new Error("--label needs a value"); + continue; + } + if (token.startsWith("--label=")) { + label = token.slice("--label=".length); + continue; + } + if (token === "--contains") { + contains = tokens[++i]; + if (contains === undefined) throw new Error("--contains needs a value"); + continue; + } + if (token.startsWith("--contains=")) { + contains = token.slice("--contains=".length); + continue; + } + if (token.startsWith("--")) { + bools.add(token); + continue; + } + positional.push(token); + } + + const verb = positional[0]; + const { file, shotsDir } = journeyPaths(target); + let journey = readJourney(target); + + if (verb === "show") return printJourney(journey); + if (verb === "reset") { + rmSync(file, { force: true }); + rmSync(shotsDir, { recursive: true, force: true }); + return console.log(`${target}: journey reset`); + } + if (!verb) throw new Error("usage: browse <target> <step…> (or show | undo | reset)"); + + // --no-org only applies to a journey's FIRST step, and it has to be applied + // before the identity is minted — otherwise that first replay runs with an + // org identity while the saved journey (and every later replay) is org-less. + if (verb !== "undo" && journey.steps.length === 0 && bools.has("--no-org")) { + journey = { ...journey, org: false }; + } + + // The flow runs as ONE minted identity per replay — same as the generated + // test mints one per run, so a stateful journey behaves identically. + const { target: resolved } = await loadTarget(target); + const identity = await runEffect( + resolved.newIdentity(journey.org ? 
undefined : { org: false }), + ); + const { replayJourney } = await import("../src/journey/run"); + + let steps = [...journey.steps]; + if (verb === "undo") { + if (steps.length === 0) throw new Error("nothing to undo"); + steps = steps.slice(0, -1); + } else { + steps = [...steps, parseStep(verb, positional.slice(1), { label, contains })]; + } + + mkdirSync(shotsDir, { recursive: true }); + const shot = join(shotsDir, `${String(steps.length).padStart(2, "0")}.png`); + const observation = await replayJourney(resolved, identity, steps, { screenshotPath: shot }); + + if (observation.failedStep) { + const failed = observation.failedStep; + const isNew = verb !== "undo" && failed.index === steps.length - 1; + console.log( + `✗ ${isNew ? "this step" : `step ${failed.index + 1} (${stepLabelOf(steps[failed.index]!)})`} failed — journey unchanged.`, + ); + console.log(` ${failed.error.split("\n")[0]}`); + console.log(` screenshot: ${observation.screenshotPath}`); + return; + } + + journey = { ...journey, steps }; + writeJourney(journey); + + console.log(`→ ${observation.url} ${JSON.stringify(observation.title)}`); + console.log(` screenshot: ${observation.screenshotPath}`); + if (observation.lastOutput) { + const lines = observation.lastOutput.split("\n").slice(0, 12); + console.log("output:"); + for (const line of lines) console.log(` ${line}`); + } + printJourney(journey); + if (observation.controls.length > 0) { + console.log("controls on the page (role · name):"); + for (const control of observation.controls.slice(0, 40)) { + console.log(` ${control.role.padEnd(9)} ${control.name}`); + } + if (observation.controls.length > 40) { + console.log(` … ${observation.controls.length - 40} more`); + } + } +}; + +const promote = async (raw: ReadonlyArray<string>) => { + const target = raw[0]; + const positional = raw.slice(1).filter((arg) => !arg.startsWith("--")); + const name = positional[0]; + 
const noRun = raw.includes("--no-run"); + if (!target || !name) { + throw new Error('usage: promote <target> "<name>" [--no-run]'); + } + const journey = readJourney(target); + if (journey.steps.length === 0) { + throw new Error(`no journey for ${target} — build one with \`browse ${target} …\` first`); + } + const { codegenScenario, journeyHasAssertion, journeyHasBrowserStep } = + await import("../src/journey/codegen"); + if (!journeyHasBrowserStep(journey)) { + throw new Error( + "this journey is all terminal/HTTP steps — `promote` generates browser-anchored scenarios. Add a browser step, or write a CLI/API test directly.", + ); + } + if (!journeyHasAssertion(journey)) { + throw new Error( + "this journey has no assertion (a `see`, `at-url`, `request`, or `run --contains` step), so the scenario would prove nothing. Add one, then promote.", + ); + } + + const source = codegenScenario(name, journey); + const slug = name + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 80); + // A name with no ASCII letters or digits would otherwise produce `.gen.test.ts`. + if (!slug) { + throw new Error(`scenario name ${JSON.stringify(name)} has no letters or digits to build a file name from`); + } + // Scope the file to the target's own dir (cloud/, selfhost/) so it runs only + // where its UI selectors apply, not on every cross-target host. + const relFile = join(target, `${slug}.gen.test.ts`); + // The target dir normally exists already; create it so a first promote for a + // new target doesn't fail with ENOENT. + mkdirSync(join(e2eDir, target), { recursive: true }); + writeFileSync(join(e2eDir, relFile), source); + console.log(`wrote ${relFile} (${journey.steps.length} steps from the ${target} journey)`); + + const urlKey = `E2E_${target.toUpperCase()}_URL`; + const url = readState(target)?.env?.[urlKey]; + if (noRun || !url) { + console.log( + `run it against the live instance:\n ${urlKey}=<url> bunx vitest run --project ${target} ${relFile}`, + ); + return; + } + + console.log(`running it against the live instance (${url}) …`); + const proc = spawn("bunx", ["vitest", "run", "--project", target, relFile], { + cwd: e2eDir, + stdio: "inherit", + env: { ...process.env, [urlKey]: url }, + }); + // A spawn failure (e.g. bunx missing) emits `error` and may never emit `exit`; + // reject so the wait cannot hang. + const code: number = await new Promise<number>((resolve, reject) => { + proc.on("error", reject); + proc.on("exit", (c) => resolve(c ?? 
1)); + }); + const slugDir = join("runs", target, slug); + console.log( + code === 0 + ? `\n✓ passed. Artifacts (test.ts, result.json, session.mp4/film.mp4): e2e/${slugDir}\n view: cd e2e && bun run serve → #/${target}/${slug}` + : `\n✗ the generated test failed (exit ${code}). The flow that passed live did not pass as a committed test — inspect ${relFile} and e2e/${slugDir}.`, + ); +}; + // --- lifecycle commands ---------------------------------------------------- const status = () => { @@ -508,9 +815,24 @@ const HELP = `e2e dev CLI — the scenario primitives, interactive (see e2e/AGEN api [json] typed API call as a fresh identity mcp tools | call [json] MCP session call ledger [workos|autumn] the emulator's request ledger (cloud) + browse <target> <step…> drive the live web UI, one step at a time; each step + replays the whole flow and prints the page's controls + steps: goto <path> | click <role> <name> | click-text <text> + | fill <field> <value> | press <key> + | see <text> | at-url <substring> + | run "<command>" | request <METHOD> <path> + flags: --label "…", --no-org, --contains <text> + ({base} in a run command = the instance URL) + browse <target> show | undo | reset + promote <target> "<name>" turn the recorded journey into a committed scenario + (<target>/<slug>.gen.test.ts) and run it (--no-run) logs dump the instance's dev-server log down tear down (kills servers, removes tailscale serves) +A browser journey IS the scenario: develop the flow with \`browse\`, then +\`promote\` it. The generated test drives the same surface, so a reviewer judges +it by reading the test and watching its video. + Instances live in e2e/.dev/.json — a state file marks a DELIBERATE long-lived instance. Use the booted instance for e2e too: E2E_SELFHOST_URL= vitest run --project selfhost `; @@ -534,6 +856,10 @@ const main = async () => { return mcpCall(args[0] ?? "", args[1], args.slice(2)); case "ledger": return ledger(args[0] ?? "", args[1]); + case "browse": + return browse(rest); + case "promote": + return promote(rest); case "logs": return logs(args[0] ?? 
""); case "down": diff --git a/e2e/selfhost/detected-auth-immutable-ui.test.ts b/e2e/selfhost/detected-auth-immutable-ui.test.ts new file mode 100644 index 000000000..2b9bfa795 --- /dev/null +++ b/e2e/selfhost/detected-auth-immutable-ui.test.ts @@ -0,0 +1,185 @@ +// Selfhost-only (browser): a spec/probe-DETECTED auth method is immutable in +// the add flow. The shared AuthMethodListEditor renders detected methods as a +// disabled, read-only summary ("Pulled from spec. Remove to override.") with no +// kind selector, so a user can't silently retype the spec's method into a kind +// nothing backs. A method the user adds by hand stays fully editable. Both the +// MCP and OpenAPI add flows compose the same editor, so one behavior, two +// surfaces. Selfhost runs with EXECUTOR_ALLOW_LOCAL_NETWORK so the probe/analyze +// can reach the loopback fixtures. Video is the artifact. +import { randomBytes } from "node:crypto"; +import { createServer } from "node:http"; + +import { expect } from "@effect/vitest"; +import { Effect } from "effect"; +import { makeGreetingMcpServer, serveMcpServerWithOAuth } from "@executor-js/plugin-mcp/testing"; +import { OAuthTestServer } from "@executor-js/sdk/testing"; + +import { scenario } from "../src/scenario"; +import { Browser, Target } from "../src/services"; + +const REMOVE_HINT = "Pulled from spec. Remove to override."; + +scenario( + "Detected auth · an MCP probe's OAuth method is immutable in the add flow", + {}, + Effect.scoped( + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + // OAuth-protected server: the probe 401s with resource metadata, so the + // method list seeds a single detected OAuth row (discovered metadata). 
+ const server = yield* serveMcpServerWithOAuth( + () => makeGreetingMcpServer({ name: `oauth-mcp-${randomBytes(3).toString("hex")}` }), + { path: "/mcp" }, + ); + const identity = yield* target.newIdentity(); + + yield* browser.session(identity, async ({ page, step }) => { + await step("Open the add-MCP flow pointed at the OAuth server", async () => { + await page.goto(`/integrations/add/mcp?url=${encodeURIComponent(server.endpoint)}`, { + waitUntil: "networkidle", + }); + await page.getByText("How does this server authenticate?").waitFor(); + await page.getByText("Method 1 · Detected").waitFor(); + }); + + await step("The detected method is locked: read-only, named, no selector", async () => { + // The kind is named explicitly ("OAuth"), the discovered-OAuth summary + // and override hint sit inside a disabled block, and there is NO + // editable kind selector (the FilterTabs render as buttons). + await page.getByText("OAuth", { exact: true }).first().waitFor(); + await page.getByText("OAuth metadata is discovered from this server").waitFor(); + await page.getByText(REMOVE_HINT).waitFor(); + expect( + await page.locator("[aria-disabled]").count(), + "the detected method renders a disabled (non-interactive) block", + ).toBeGreaterThan(0); + expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "no editable kind selector is shown for the detected method", + ).toBe(0); + }); + + await step("A hand-added method keeps the full editable selector", async () => { + await page.getByRole("button", { name: "Add method" }).click(); + await page.getByText("Method 2").waitFor(); + // The added row defaults to API key and exposes the None/API key/OAuth + // kind tabs (buttons) — the detected row above still shows none. 
+ expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "the added method exposes the kind selector", + ).toBeGreaterThan(0); + await page.getByText(REMOVE_HINT).waitFor(); + }); + }); + }), + ).pipe(Effect.provide(OAuthTestServer.layer())), +); + +/** A real 127.0.0.1 server that serves a static OpenAPI spec for the add flow. */ +const serveSpec = (body: string) => + Effect.acquireRelease( + Effect.callback<{ readonly url: string; readonly close: () => void }>((resume) => { + const server = createServer((_request, response) => { + response.writeHead(200, { "content-type": "application/json" }); + response.end(body); + }); + server.listen(0, "127.0.0.1", () => { + const address = server.address(); + const port = typeof address === "object" && address ? address.port : 0; + resume( + Effect.succeed({ + url: `http://127.0.0.1:${port}/spec.json`, + close: () => { + server.close(); + server.closeAllConnections(); + }, + }), + ); + }); + }), + (server) => Effect.sync(server.close), + ); + +const apiKeyAndOAuthSpec = (): string => + JSON.stringify({ + openapi: "3.0.3", + info: { title: "Acme Immutable Auth Fixture", version: "1.0.0" }, + servers: [{ url: "https://api.acme.test" }], + security: [{ bearerAuth: [] }, { acmeOAuth: ["read"] }], + components: { + securitySchemes: { + bearerAuth: { type: "http", scheme: "bearer" }, + acmeOAuth: { + type: "oauth2", + flows: { + authorizationCode: { + authorizationUrl: "https://api.acme.test/oauth/authorize", + tokenUrl: "https://api.acme.test/oauth/token", + scopes: { read: "Read access" }, + }, + }, + }, + }, + }, + paths: { + "/widgets": { + get: { + operationId: "listWidgets", + summary: "List widgets", + responses: { "200": { description: "ok" } }, + }, + }, + }, + }); + +scenario( + "Detected auth · OpenAPI spec-detected methods are immutable in the add flow", + {}, + Effect.scoped( + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + const spec = 
yield* serveSpec(apiKeyAndOAuthSpec()); + const identity = yield* target.newIdentity(); + + yield* browser.session(identity, async ({ page, step }) => { + await step("Analyze a spec that declares both API key and OAuth", async () => { + await page.goto(`/integrations/add/openapi`, { waitUntil: "networkidle" }); + await page + .getByPlaceholder(/openapi\.json/i) + .first() + .fill(spec.url); + await page.getByText("How does this API authenticate?").waitFor(); + await page.getByText("Method 2").waitFor(); + }); + + await step("Both detected methods are locked, named, read-only", async () => { + // Two detected methods, each with the override hint and its kind named + // ("API key" / "OAuth"); the OAuth one shows the spec's real endpoints + // read-only. No editable kind selector (FilterTabs render as buttons). + expect( + await page.getByText(REMOVE_HINT).count(), + "both detected methods show the remove-to-override hint", + ).toBe(2); + await page.getByText("https://api.acme.test/oauth/authorize").waitFor(); + await page.getByText("API key", { exact: true }).first().waitFor(); + await page.getByText("OAuth", { exact: true }).first().waitFor(); + expect( + await page.getByRole("button", { name: "OAuth", exact: true }).count(), + "no editable kind selector is shown for the detected methods", + ).toBe(0); + }); + + await step("A hand-added method keeps the full editable selector", async () => { + await page.getByRole("button", { name: "Add method" }).click(); + await page.getByText("Method 3").waitFor(); + expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "the added method exposes the kind selector", + ).toBeGreaterThan(0); + }); + }); + }), + ), +); diff --git a/e2e/setup/desktop-linux.globalsetup.ts b/e2e/setup/desktop-linux.globalsetup.ts new file mode 100644 index 000000000..46d535a1c --- /dev/null +++ b/e2e/setup/desktop-linux.globalsetup.ts @@ -0,0 +1,125 @@ +// desktop-linux: bring the PACKAGED app up inside a Linux guest and 
forward its +// CDP port (the shared attach/forward lives in ./desktop-vm). No window server, +// so the app renders into an Xvfb virtual display; ffmpeg x11grab (in the +// scenario's recorder) films that display. Simpler than macOS: no Aqua, no +// codesign, no launchctl — just background processes with DISPLAY set and +// --no-sandbox (the chrome-sandbox needs setuid root, pointless on a throwaway +// guest). The base image (executor-linux-base) carries Xvfb + ffmpeg + the +// electron runtime libs. +import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { basename, join } from "node:path"; + +import { pushDirAsTar } from "../src/vm/desktop"; +import { tartVm } from "../src/vm/tart"; +import { + attachOrProvision, + CDP_GUEST_PORT, + waitGuestHttp, + waitGuestPageTarget, + type ProvisionedGuest, +} from "./desktop-vm"; + +const DAEMON_PORT = 4789; +const GUEST_DIR = "/home/admin/exe"; +const GUEST_HOME = "/home/admin/exe-home"; +const DISPLAY = ":99"; + +const appDir = fileURLToPath(new URL("../../apps/desktop/", import.meta.url)); +/** Host-side paths of the unpacked linux-arm64 bundle and its two binaries. */ +const hostBundle = () => { + // electron-builder names the dir `linux-<arch>-unpacked` for non-x64. + const dir = join(appDir, "dist", "linux-arm64-unpacked"); + return { + dir, + exe: join(dir, "executor-desktop"), + executor: join(dir, "resources/executor/executor"), + }; +}; + +/** Build the packaged linux-arm64 bundle if it isn't on disk. The `executor` + * binary is cross-compiled here via BUN_TARGET (same as the cli-linux lane); + * electron-builder's `dir` target assembles the unpacked app on macOS without + * Docker. 
*/ +const ensureBundle = (): void => { + // Require both binaries, not just the directory: an interrupted build leaves + // the dir behind and would otherwise be mistaken for a finished bundle. + const { exe, executor } = hostBundle(); + if (existsSync(exe) && existsSync(executor)) return; + const run = (cmd: string, args: string[], env: Record<string, string> = {}) => + execFileSync(cmd, args, { cwd: appDir, stdio: "inherit", env: { ...process.env, ...env } }); + run("bun", ["./scripts/build-sidecar.ts"], { BUN_TARGET: "bun-linux-arm64" }); + run("bunx", ["--bun", "electron-vite", "build"]); + run( + "bunx", + [ + "--bun", + "electron-builder", + "--config", + "electron-builder.e2e.config.ts", + "--linux", + "--arm64", + ], + { CSC_IDENTITY_AUTO_DISCOVERY: "false" }, + ); +}; + +/** Provision a linux guest, push the bundle, start Xvfb + openbox, the + * supervised daemon, and the app with CDP enabled; resolves with the guest IP + * and a teardown that discards the VM. The VM is discarded on any failure. */ +const provisionLinux = async (): Promise<ProvisionedGuest> => { + ensureBundle(); + const { dir } = hostBundle(); + const vm = await tartVm("linux", "arm64").provision(); + try { + await vm.ssh(`rm -rf ${GUEST_DIR} ${GUEST_HOME}; mkdir -p ${GUEST_HOME}/.executor`); + await pushDirAsTar(vm.host, dir, GUEST_DIR); + + const guestApp = `${GUEST_DIR}/${basename(dir)}`; + const guestExe = `${guestApp}/executor-desktop`; + const guestExecutor = `${guestApp}/resources/executor/executor`; + await vm.ssh(`chmod +x '${guestExe}' '${guestExecutor}' 2>/dev/null || true`); + const env = `HOME=${GUEST_HOME} EXECUTOR_DATA_DIR=${GUEST_HOME}/.executor`; + + // A virtual display + a minimal WM (openbox) — without a window manager the + // electron window doesn't map onto the framebuffer that x11grab records. + await vm.ssh( + `pkill Xvfb 2>/dev/null; pkill openbox 2>/dev/null; ` + + `nohup Xvfb ${DISPLAY} -screen 0 1280x800x24 >/tmp/xvfb.log 2>&1 & sleep 2; ` + + `DISPLAY=${DISPLAY} nohup openbox >/tmp/openbox.log 2>&1 & sleep 1; echo up`, + ); + + // 1) the bundled daemon, supervised — the app attaches to this. 
+ await vm.ssh( + `nohup env ${env} EXECUTOR_SUPERVISED=1 EXECUTOR_AUTH_TOKEN=desktop-linux-e2e EXECUTOR_CLIENT=desktop ` + + `'${guestExecutor}' daemon run --foreground --port ${DAEMON_PORT} --hostname 127.0.0.1 ` + + `>/tmp/executor-daemon.log 2>&1 &`, + ); + if (!(await waitGuestHttp(vm, `http://127.0.0.1:${DAEMON_PORT}/`))) { + throw new Error( + "supervised daemon never came up in the guest (see /tmp/executor-daemon.log)", + ); + } + + // 2) the packaged app on the virtual display, with CDP enabled. + await vm.ssh( + `nohup env ${env} DISPLAY=${DISPLAY} '${guestExe}' --no-sandbox ` + + `--remote-debugging-port=${CDP_GUEST_PORT} --remote-allow-origins='*' ` + + `>/tmp/executor-app.log 2>&1 &`, + ); + if (!(await waitGuestPageTarget(vm, CDP_GUEST_PORT))) { + const log = (await vm.ssh("tail -40 /tmp/executor-app.log 2>/dev/null").catch(() => null)) + ?.stdout; + throw new Error(`the app's CDP page target never appeared:\n${log ?? "(no app log)"}`); + } + + // The electron window maps tiny (10x10) under Xvfb; size it to the screen so + // the x11grab recording captures the full console (CDP screenshots the + // renderer surface regardless, but the film grabs the X framebuffer). 
+ await vm.ssh( + `WID=$(DISPLAY=${DISPLAY} xdotool search --name executor-desktop | head -1); ` + + `[ -n "$WID" ] && DISPLAY=${DISPLAY} xdotool windowsize "$WID" 1280 800 windowmove "$WID" 0 0 || true`, + ); + + return { ip: vm.host, teardown: async () => void (await vm.discard()) }; + } catch (error) { + await vm.discard(); + throw error; + } +}; + +export default (): Promise<(() => Promise<void>) | void> => attachOrProvision(provisionLinux); diff --git a/e2e/setup/desktop-macos.globalsetup.ts b/e2e/setup/desktop-macos.globalsetup.ts new file mode 100644 index 000000000..b26707ca2 --- /dev/null +++ b/e2e/setup/desktop-macos.globalsetup.ts @@ -0,0 +1,108 @@ +// desktop-macos: bring the PACKAGED app up inside a macOS GUI guest and forward +// its CDP port (the shared attach/forward lives in ./desktop-vm). The guest runs +// tart `--no-graphics` (no host window) but the base image's autologin still +// reaches a real Aqua session, so the GUI renders and `screencapture` films it. +// We come up the SAME way desktop-packaged does — start the bundled daemon, then +// launch the app so it ATTACHES (no sidecar spawn → no first-run consent modal). +// The app must be launched INTO the Aqua session (`launchctl asuser`); a plain +// SSH spawn lands in a non-GUI session.
+import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { join } from "node:path"; + +import { pushDirAsTar } from "../src/vm/desktop"; +import { tartVm } from "../src/vm/tart"; +import { + attachOrProvision, + CDP_GUEST_PORT, + waitGuestHttp, + waitGuestPageTarget, + type ProvisionedGuest, +} from "./desktop-vm"; + +const DAEMON_PORT = 4789; +const GUEST_DIR = "/Users/admin/exe"; +const GUEST_HOME = "/Users/admin/exe-home"; + +const appDir = fileURLToPath(new URL("../../apps/desktop/", import.meta.url)); +const hostBundle = () => { + const app = join(appDir, "dist", "mac-arm64", "Executor.app"); + return { + app, + exe: join(app, "Contents/MacOS/Executor"), + executor: join(app, "Contents/Resources/executor/executor"), + }; +}; + +/** Build the packaged mac bundle if it isn't on disk (slow; reuse an existing + * dist/ while iterating). Mirrors desktop-packaged.globalsetup. */ +const ensureBundle = (): void => { + if (existsSync(hostBundle().app)) return; + const run = (cmd: string, args: string[]) => + execFileSync(cmd, args, { cwd: appDir, stdio: "inherit", env: { ...process.env } }); + run("bun", ["./scripts/build-sidecar.ts"]); + run("bunx", ["--bun", "electron-vite", "build"]); + execFileSync( + "bunx", + ["--bun", "electron-builder", "--config", "electron-builder.e2e.config.ts", "--mac"], + { + cwd: appDir, + stdio: "inherit", + env: { ...process.env, CSC_IDENTITY_AUTO_DISCOVERY: "false" }, + }, + ); +}; + +const provisionMac = async (): Promise => { + ensureBundle(); + const { exe, executor } = hostBundle(); + const vm = await tartVm("macos", "arm64").provision(); + try { + // Push the bundle (tar-stream, robust over the just-booted link) and clear + // the scp quarantine so it can run. 
+ await vm.ssh(`rm -rf ${GUEST_DIR} ${GUEST_HOME} && mkdir -p ${GUEST_HOME}/.executor`); + await pushDirAsTar(vm.host, hostBundle().app, GUEST_DIR); + await vm.ssh(`xattr -dr com.apple.quarantine ${GUEST_DIR} 2>/dev/null || true`); + // The e2e build is unsigned; an arm64 app needs at least an ad-hoc signature + // to execute, and the host build's signature isn't trusted on another Mac. + await vm.ssh( + `codesign --force --deep --sign - ${GUEST_DIR}/Executor.app 2>&1 | tail -2 || true`, + ); + + const guestExe = `${GUEST_DIR}/Executor.app/${exe.split("/Executor.app/")[1]}`; + const guestExecutor = `${GUEST_DIR}/Executor.app/${executor.split("/Executor.app/")[1]}`; + const env = `HOME=${GUEST_HOME} EXECUTOR_DATA_DIR=${GUEST_HOME}/.executor`; + + // 1) the bundled daemon, supervised — the app attaches to this. + await vm.ssh( + `nohup env ${env} EXECUTOR_SUPERVISED=1 EXECUTOR_AUTH_TOKEN=desktop-macos-e2e EXECUTOR_CLIENT=desktop ` + + `'${guestExecutor}' daemon run --foreground --port ${DAEMON_PORT} --hostname 127.0.0.1 ` + + `>/tmp/executor-daemon.log 2>&1 &`, + ); + if (!(await waitGuestHttp(vm, `http://127.0.0.1:${DAEMON_PORT}/`))) { + throw new Error( + "supervised daemon never came up in the guest (see /tmp/executor-daemon.log)", + ); + } + + // 2) the packaged app, launched INTO the Aqua session with CDP enabled. + await vm.ssh( + `U=$(id -u); sudo launchctl asuser $U bash -lc ` + + `'nohup env HOME=${GUEST_HOME} "${guestExe}" --remote-debugging-port=${CDP_GUEST_PORT} --remote-allow-origins="*" ` + + `>/tmp/executor-app.log 2>&1 &'`, + ); + if (!(await waitGuestPageTarget(vm, CDP_GUEST_PORT))) { + const log = (await vm.ssh("tail -40 /tmp/executor-app.log 2>/dev/null").catch(() => null)) + ?.stdout; + throw new Error(`the app's CDP page target never appeared:\n${log ?? 
"(no app log)"}`); + } + + return { ip: vm.host, teardown: async () => void (await vm.discard()) }; + } catch (error) { + await vm.discard(); + throw error; + } +}; + +export default (): Promise<(() => Promise) | void> => attachOrProvision(provisionMac); diff --git a/e2e/setup/desktop-vm.ts b/e2e/setup/desktop-vm.ts new file mode 100644 index 000000000..c15f15864 --- /dev/null +++ b/e2e/setup/desktop-vm.ts @@ -0,0 +1,91 @@ +// Shared plumbing for the desktop- globalsetups. Each OS setup supplies a +// `provision` that boots its guest and brings the packaged app up with +// --remote-debugging-port; this module handles the rest the same everywhere: +// attach to an already-running guest (E2E_DESKTOP_VM_IP) or provision a fresh +// one, then forward the guest's CDP port and publish it for the scenario. +import { guestTunnel } from "../src/vm/desktop"; +import type { VmHandle } from "../src/vm/types"; + +export const CDP_GUEST_PORT = 9222; + +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)); + +/** Poll until an HTTP endpoint inside the guest answers (any status — a 401 from + * the bearer-gated daemon still means "up"). HTTP, not lsof: the app may be + * owned by root (launchctl asuser), whose listening socket an unprivileged lsof + * can't see — a loopback HTTP probe works regardless of owner. */ +export const waitGuestHttp = async (vm: VmHandle, url: string, attempts = 60): Promise => { + for (let i = 0; i < attempts; i++) { + const r = await vm.ssh( + `curl -s -o /dev/null -w '%{http_code}' --max-time 5 ${url} 2>/dev/null || echo 000`, + ); + const code = r.stdout.trim().slice(-3); + if (code !== "000" && code !== "") return true; + await sleep(2000); + } + return false; +}; + +/** Poll until CDP advertises a real PAGE target — i.e. the app's window/renderer + * is up, not just the browser endpoint. On a cold guest the page appears a good + * bit after the port opens, so gating on this makes the scenario deterministic. 
*/ +export const waitGuestPageTarget = async ( + vm: VmHandle, + port: number, + attempts = 60, +): Promise => { + for (let i = 0; i < attempts; i++) { + const r = await vm.ssh( + `curl -s --max-time 5 http://127.0.0.1:${port}/json/list 2>/dev/null | grep -c '"type": "page"' || echo 0`, + ); + if (Number(r.stdout.trim() || "0") > 0) return true; + await sleep(2000); + } + return false; +}; + +export interface ProvisionedGuest { + readonly ip: string; + readonly teardown: () => Promise; +} + +/** + * The body every desktop-.globalsetup returns: attach to E2E_DESKTOP_VM_IP + * if set, else provision a fresh guest; then forward the guest's CDP port and + * publish it (+ the guest IP, for filming) for the worker. A provision/forward + * failure never fails the run — the scenario skips honestly, like + * desktop-packaged without a display. + */ +export const attachOrProvision = async ( + provision: () => Promise, +): Promise<(() => Promise) | void> => { + let ip = process.env.E2E_DESKTOP_VM_IP; + let teardownVm: (() => Promise) | undefined; + + if (!ip) { + // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: VM/host setup may fail; degrade to a skip + try { + const result = await provision(); + ip = result.ip; + teardownVm = result.teardown; + } catch (error) { + console.warn(`[desktop] provision failed, scenario will skip: ${String(error)}`); + return; + } + } + + // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: forwarding may fail; degrade to a skip + try { + const forward = await guestTunnel(ip, CDP_GUEST_PORT); + process.env.E2E_DESKTOP_CDP_PORT = String(forward.localPort); + process.env.E2E_DESKTOP_VM_IP = ip; + return async () => { + forward.close(); + await teardownVm?.(); + }; + } catch (error) { + console.warn(`[desktop] could not forward CDP from ${ip}: ${String(error)}`); + await teardownVm?.(); + return; + } +}; diff --git a/e2e/setup/desktop-windows.globalsetup.ts b/e2e/setup/desktop-windows.globalsetup.ts new file 
mode 100644 index 000000000..814df0871 --- /dev/null +++ b/e2e/setup/desktop-windows.globalsetup.ts @@ -0,0 +1,106 @@ +// desktop-windows: drive the PACKAGED app running in a Windows guest over CDP. +// Windows-in-a-VM works best with dockur (QEMU on a Linux/KVM host): autologin +// gives a real interactive session the app renders into, and QEMU `screendump` +// films the framebuffer directly — sidestepping the session-0 problem that +// defeats SSH-driven screenshots (the prior proof of this path). +// +// Unlike the tart targets this ATTACHES to a long-lived Windows host (the dockur +// guest stays up between runs, like a shared selfhost): it forwards the guest's +// --remote-debugging-port to the host over an SSH jump and publishes it. The +// shared scenario drives; the windows recorder (src/vm/desktop.ts) films via +// screendump. Without a reachable app it skips honestly. All connection details +// come from env (no baked-in host): +// E2E_DESKTOP_WIN_HOST (ssh alias of the docker/KVM host to jump through), +// _SSH_PORT (the guest's mapped OpenSSH port), _KEY, _USER; the recorder also +// reads _CONTAINER and _STORAGE. +import { spawn } from "node:child_process"; +import net from "node:net"; + +const SSH_PORT = process.env.E2E_DESKTOP_WIN_SSH_PORT ?? "2222"; +const KEY = process.env.E2E_DESKTOP_WIN_KEY ?? "/tmp/winkey"; +const USER = process.env.E2E_DESKTOP_WIN_USER ?? "Administrator"; +const CDP_GUEST_PORT = 9222; + +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)); + +const freePort = (): Promise => + new Promise((resolve, reject) => { + const srv = net.createServer(); + srv.on("error", reject); + srv.listen(0, "127.0.0.1", () => { + const port = (srv.address() as net.AddressInfo).port; + srv.close(() => resolve(port)); + }); + }); + +interface CdpTarget { + readonly type: string; + readonly webSocketDebuggerUrl?: string; +} + +/** Poll the forwarded port until the app advertises a CDP page target. 
*/ +const pageReady = async (port: number, attempts = 30): Promise => { + for (let i = 0; i < attempts; i++) { + const targets = (await fetch(`http://127.0.0.1:${port}/json/list`) + .then((r) => (r.ok ? r.json() : [])) + .catch(() => [])) as ReadonlyArray; + if (targets.some((t) => t.type === "page" && t.webSocketDebuggerUrl)) return true; + await sleep(2000); + } + return false; +}; + +export default async function setup(): Promise<(() => Promise) | void> { + const host = process.env.E2E_DESKTOP_WIN_HOST; + if (!host) { + console.warn( + "[desktop-windows] E2E_DESKTOP_WIN_HOST not set; scenario will skip. Point it at the ssh " + + "alias of a dockur/KVM Windows host running the packaged app with --remote-debugging-port.", + ); + return; + } + const localPort = await freePort(); + // mac:localPort → (jump host) → guest:9222. -p is the guest's mapped OpenSSH + // port on the host; the final hop into Windows carries the -L forward. + const tunnel = spawn( + "ssh", + [ + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "ConnectTimeout=12", + "-o", + "ServerAliveInterval=15", + "-J", + host, + "-p", + SSH_PORT, + "-i", + KEY, + "-L", + `${localPort}:127.0.0.1:${CDP_GUEST_PORT}`, + "-N", + `${USER}@127.0.0.1`, + ], + { stdio: "ignore" }, + ); + + if (!(await pageReady(localPort))) { + tunnel.kill(); + console.warn( + `[desktop-windows] no app/CDP reachable on the Windows host (${host}); scenario will skip. ` + + `Bring up the packaged app with --remote-debugging-port=${CDP_GUEST_PORT} in the dockur guest.`, + ); + return; + } + + process.env.E2E_DESKTOP_CDP_PORT = String(localPort); + // Non-empty so the scenario runs; the windows recorder uses E2E_DESKTOP_WIN_*. 
+ process.env.E2E_DESKTOP_VM_IP = host; + + return async () => { + tunnel.kill(); + }; +} diff --git a/e2e/src/journey/codegen.ts b/e2e/src/journey/codegen.ts new file mode 100644 index 000000000..77d540228 --- /dev/null +++ b/e2e/src/journey/codegen.ts @@ -0,0 +1,78 @@ +// Generate a committed scenario from a recorded journey. The output is a normal +// scenario file (the same shape a human writes, see e2e/AGENTS.md): an Effect +// body that yields Target + Browser, mints a fresh identity, and drives one +// browser session of labelled steps. Terminal (`run`) and HTTP (`request`) steps +// run INSIDE that session, interleaved with the UI, so page state is never lost +// between them. It is meant to be read and edited after generation — promotion +// is the START of a scenario's life, not a frozen artifact. +import { codegenStep, isAssertion, isBrowserStep, stepLabel, type Step } from "./steps"; + +export interface JourneyFile { + readonly target: string; + readonly org: boolean; + readonly steps: ReadonlyArray; +} + +const INDENT = " "; + +const stepBlock = (step: Step): string => { + const label = JSON.stringify(stepLabel(step)); + const body = codegenStep(step) + .split("\n") + .map((line) => `${INDENT} ${line}`) + .join("\n"); + return `${INDENT}await step(${label}, async () => {\n${body}\n${INDENT}});`; +}; + +/** A journey with no assertion proves nothing; one with no browser step isn't + * what this tool generates (write a CLI/API test directly). `promote` checks + * both. */ +export const journeyHasAssertion = (journey: JourneyFile): boolean => + journey.steps.some(isAssertion); + +export const journeyHasBrowserStep = (journey: JourneyFile): boolean => + journey.steps.some(isBrowserStep); + +export const codegenScenario = (name: string, journey: JourneyFile): string => { + const identityArg = journey.org ? 
"" : "{ org: false }"; + const body = journey.steps.map(stepBlock).join("\n"); + + const needsExec = journey.steps.some((step) => step.kind === "run"); + const needsExpect = journey.steps.some( + (step) => step.kind === "request" || (step.kind === "run" && step.contains !== undefined), + ); + + const imports: string[] = []; + if (needsExec) { + imports.push(`import { execFile } from "node:child_process";`); + imports.push(`import { promisify } from "node:util";`); + imports.push(""); + } + if (needsExpect) imports.push(`import { expect } from "@effect/vitest";`); + imports.push(`import { Effect } from "effect";`); + imports.push(""); + imports.push(`import { scenario } from "../src/scenario";`); + imports.push(`import { Browser, Target } from "../src/services";`); + + const execHelper = needsExec ? "\nconst execFileAsync = promisify(execFile);\n" : ""; + + return `// Generated from an interactive browser journey: \`bun scripts/cli.ts promote ${journey.target} ""\`. +// This is now an ordinary scenario — edit it freely. It drives the same Browser +// surface the exploration used, so a reviewer can judge the guarantee by +// reading it. Re-run with: E2E_${journey.target.toUpperCase()}_URL= vitest run --project ${journey.target} +${imports.join("\n")} +${execHelper} +scenario( + ${JSON.stringify(name)}, + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + const identity = yield* target.newIdentity(${identityArg}); + yield* browser.session(identity, async ({ page, step }) => { +${body} + }); + }), +); +`; +}; diff --git a/e2e/src/journey/run.ts b/e2e/src/journey/run.ts new file mode 100644 index 000000000..70d8916be --- /dev/null +++ b/e2e/src/journey/run.ts @@ -0,0 +1,104 @@ +// Replay a journey live, from a clean browser, and report what the page looks +// like afterward. 
This is the development loop: each `browse` command appends a +// step and replays the WHOLE journey from scratch, so the flow the agent is +// building is, at every moment, exactly what the generated test will run — a +// step that doesn't reproduce fails here, not later. The returned observation +// (url, title, the page's interactive controls) is how the agent, which can't +// see the screen, decides the next step; the screenshot is for a human. +import { chromium, type Page } from "playwright"; + +import type { Identity, Target } from "../target"; +import { executeStep, type Step } from "./steps"; + +export interface Control { + readonly role: string; + readonly name: string; +} + +export interface Observation { + readonly url: string; + readonly title: string; + /** The interactive elements on the page, as (role, accessible name) — the + * vocabulary the next clickRole/fill step is written against. */ + readonly controls: ReadonlyArray; + readonly screenshotPath: string; + /** Textual output of the last terminal/HTTP step, if any — so the agent sees + * what a `run`/`request` produced, not just the page. */ + readonly lastOutput?: string; + /** Index of the step that threw, with its message — undefined on success. */ + readonly failedStep?: { readonly index: number; readonly error: string }; +} + +const OBSERVED_ROLES = ["link", "button", "textbox", "tab", "menuitem", "checkbox"] as const; +const PER_ROLE_CAP = 30; + +/** A compact, deduped list of the page's interactive controls. Names come from + * the accessible name (text / aria-label / placeholder), trimmed. 
*/ +const snapshotControls = async (page: Page): Promise => { + const seen = new Set(); + const controls: Control[] = []; + for (const role of OBSERVED_ROLES) { + const elements = await page.getByRole(role).all(); + for (const element of elements.slice(0, PER_ROLE_CAP)) { + const raw = + (await element.textContent().catch(() => null))?.trim() || + (await element.getAttribute("aria-label").catch(() => null)) || + (await element.getAttribute("placeholder").catch(() => null)) || + ""; + const name = raw.replace(/\s+/g, " ").trim().slice(0, 70); + if (!name) continue; + const key = `${role}:${name}`; + if (seen.has(key)) continue; + seen.add(key); + controls.push({ role, name }); + } + } + return controls; +}; + +export const replayJourney = async ( + target: Target, + identity: Identity, + steps: ReadonlyArray, + options: { readonly screenshotPath: string }, +): Promise => { + const browser = await chromium.launch(); + let failedStep: Observation["failedStep"]; + try { + const context = await browser.newContext({ + colorScheme: "dark", + viewport: { width: 1280, height: 800 }, + baseURL: target.baseUrl, + }); + // Same identity injection the Browser surface does, so the live page is the + // logged-in page the generated test will drive. + if (identity.cookies?.length) { + await context.addCookies( + identity.cookies.map((cookie) => ({ ...cookie, url: target.baseUrl })), + ); + } + const page = await context.newPage(); + let lastOutput: string | undefined; + for (let index = 0; index < steps.length; index++) { + try { + const output = await executeStep({ page, baseUrl: target.baseUrl }, steps[index]!); + if (output !== undefined) lastOutput = output; + } catch (error) { + failedStep = { index, error: error instanceof Error ? 
error.message : String(error) }; + break; + } + } + await page.waitForLoadState("networkidle").catch(() => {}); + await page.screenshot({ path: options.screenshotPath }).catch(() => {}); + return { + url: page.url(), + title: await page.title().catch(() => ""), + controls: await snapshotControls(page).catch(() => []), + screenshotPath: options.screenshotPath, + lastOutput, + failedStep, + }; + } finally { + await browser.close(); + } +}; diff --git a/e2e/src/journey/steps.ts b/e2e/src/journey/steps.ts new file mode 100644 index 000000000..22eb4413d --- /dev/null +++ b/e2e/src/journey/steps.ts @@ -0,0 +1,236 @@ +// The journey Step DSL: the single source of truth shared by interactive +// exploration and the generated scenario. ONE step description is both +// (a) executed live against the real product while the agent develops a flow +// (`executeStep`), and (b) emitted as the matching Playwright line inside a +// committed scenario (`codegenStep`). Because both sides read the same record, +// "turn what I just did into a test" is a translation, not a reimplementation: +// the generated test drives the exact surface the exploration drove. +// +// Steps are plain JSON (they persist to .dev/.journey.json between CLI +// invocations), so they carry no closures — every action is a named primitive. +import { execFile } from "node:child_process"; +import { promisify } from "node:util"; + +import type { Page } from "playwright"; + +const execFileAsync = promisify(execFile); + +/** ARIA roles the journey can target. A closed set keeps codegen honest and + * the live `getByRole` calls type-safe (Playwright's role arg is a union). 
*/ +export type Role = "link" | "button" | "heading" | "textbox" | "tab" | "menuitem" | "checkbox"; + +export type Step = + | { readonly kind: "goto"; readonly path: string; readonly label?: string } + | { + readonly kind: "clickRole"; + readonly role: Role; + readonly name: string; + readonly label?: string; + } + | { readonly kind: "clickText"; readonly text: string; readonly label?: string } + | { + readonly kind: "fill"; + readonly field: string; + readonly value: string; + readonly label?: string; + } + | { readonly kind: "press"; readonly key: string; readonly label?: string } + | { readonly kind: "expectText"; readonly text: string; readonly label?: string } + | { readonly kind: "expectUrl"; readonly contains: string; readonly label?: string } + // A terminal command. `{base}` expands to the target's base URL, so a journey + // can hit the same instance the UI is driving (curl, npx add-mcp, executor …). + // `contains` (when set) asserts on the combined stdout+stderr. + | { + readonly kind: "run"; + readonly command: string; + readonly contains?: string; + readonly label?: string; + } + // An HTTP call through the page's own authenticated session (relative paths + // resolve against the base URL). `contains` asserts on the response body; + // without it, the assertion is a 2xx. + | { + readonly kind: "request"; + readonly method: string; + readonly path: string; + readonly contains?: string; + readonly label?: string; + }; + +/** Assertions are the steps a reviewer reads as the guarantee — a journey with + * none asserts nothing, so `promote` refuses it. A `run`/`request` is an + * assertion when it carries an expectation (`contains`, or `request`'s 2xx). 
*/ +export const isAssertion = (step: Step): boolean => + step.kind === "expectText" || + step.kind === "expectUrl" || + step.kind === "request" || + (step.kind === "run" && step.contains !== undefined); + +export const isBrowserStep = (step: Step): boolean => + step.kind !== "run" && step.kind !== "request"; + +/** The human-readable step name (the `step(label, …)` group + screenshot + * caption). The agent can override per step; this is the sensible default so a + * generated test reads as a journey even when labels were left implicit. */ +export const stepLabel = (step: Step): string => { + if (step.label) return step.label; + switch (step.kind) { + case "goto": + return `Open ${step.path}`; + case "clickRole": + return `Click the ${JSON.stringify(step.name)} ${step.role}`; + case "clickText": + return `Click ${JSON.stringify(step.text)}`; + case "fill": + return `Fill ${JSON.stringify(step.field)}`; + case "press": + return `Press ${step.key}`; + case "expectText": + return `See ${JSON.stringify(step.text)}`; + case "expectUrl": + return `Land on a URL containing ${JSON.stringify(step.contains)}`; + case "run": + return step.contains + ? `Run ${JSON.stringify(step.command)} and see ${JSON.stringify(step.contains)}` + : `Run ${JSON.stringify(step.command)}`; + case "request": + return step.contains + ? `${step.method} ${step.path} returns ${JSON.stringify(step.contains)}` + : `${step.method} ${step.path} succeeds`; + } +}; + +const ASSERT_TIMEOUT = 15_000; + +export interface StepContext { + readonly page: Page; + /** The target's base URL — `{base}` in a `run` command expands to this. */ + readonly baseUrl: string; +} + +/** Expand `{base}` so a terminal command can reach the instance under test. */ +const withBase = (command: string, baseUrl: string): string => + command.replaceAll("{base}", baseUrl); + +/** Drive one step against a live page. 
Assertions throw on failure (a missing + * text, a wrong URL, a non-matching command output) so it surfaces immediately + * while exploring, the same way it would fail the generated test. Returns any + * textual output (terminal / HTTP) so the caller can show it. */ +export const executeStep = async (ctx: StepContext, step: Step): Promise => { + const { page } = ctx; + switch (step.kind) { + case "goto": + await page.goto(step.path, { waitUntil: "networkidle" }); + return; + case "clickRole": + await page.getByRole(step.role, { name: step.name }).first().click(); + return; + case "clickText": + await page.getByText(step.text).first().click(); + return; + case "fill": + await page.getByLabel(step.field).first().fill(step.value); + return; + case "press": + await page.keyboard.press(step.key); + return; + case "expectText": + await page + .getByText(step.text) + .first() + .waitFor({ state: "visible", timeout: ASSERT_TIMEOUT }); + return; + case "expectUrl": + await page.waitForURL((url) => url.toString().includes(step.contains), { + timeout: ASSERT_TIMEOUT, + }); + return; + case "run": { + const result = await execFileAsync("sh", ["-c", withBase(step.command, ctx.baseUrl)]).catch( + (error: { stdout?: string; stderr?: string }) => ({ + stdout: error.stdout ?? "", + stderr: error.stderr ?? 
String(error), + }), + ); + const output = `${result.stdout}${result.stderr}`; + if (step.contains !== undefined && !output.includes(step.contains)) { + throw new Error( + `\`run\` output did not contain ${JSON.stringify(step.contains)}\n${output.slice(0, 1000)}`, + ); + } + return output.trim().slice(0, 2000); + } + case "request": { + const response = await page.request.fetch(step.path, { method: step.method }); + const body = await response.text(); + if (step.contains !== undefined) { + if (!body.includes(step.contains)) { + throw new Error( + `${step.method} ${step.path} body did not contain ${JSON.stringify(step.contains)} (status ${response.status()})`, + ); + } + } else if (!response.ok()) { + throw new Error(`${step.method} ${step.path} returned ${response.status()}`); + } + return `${response.status()} ${body.slice(0, 800)}`; + } + } +}; + +/** The Playwright line(s) for this step, as they appear inside the generated + * scenario's `step(label, async () => { … })` body. Mirrors `executeStep` + * exactly — same locator, same call — so live behavior and the test match. */ +export const codegenStep = (step: Step): string => { + const s = (value: string): string => JSON.stringify(value); + switch (step.kind) { + case "goto": + return `await page.goto(${s(step.path)}, { waitUntil: "networkidle" });`; + case "clickRole": + return `await page.getByRole(${s(step.role)}, { name: ${s(step.name)} }).first().click();`; + case "clickText": + return `await page.getByText(${s(step.text)}).first().click();`; + case "fill": + return `await page.getByLabel(${s(step.field)}).first().fill(${s(step.value)});`; + case "press": + return `await page.keyboard.press(${s(step.key)});`; + case "expectText": + // The repo's browser-assertion idiom: waiting for the element IS the + // assertion (a timeout fails the step with the locator in the message). 
+ return `await page.getByText(${s(step.text)}).first().waitFor();`; + case "expectUrl": + return `await page.waitForURL((url) => url.toString().includes(${s(step.contains)}));`; + case "run": { + const lines = [ + `const { stdout } = await execFileAsync("sh", ["-c", ${backtick(step.command)}]);`, + ]; + if (step.contains !== undefined) { + lines.push( + `expect(stdout, "the command output is as expected").toContain(${s(step.contains)});`, + ); + } + return lines.join("\n"); + } + case "request": { + const lines = [ + `const response = await page.request.fetch(${s(step.path)}, { method: ${s(step.method)} });`, + ]; + lines.push( + step.contains !== undefined + ? `expect(await response.text(), "the response is as expected").toContain(${s(step.contains)});` + : `expect(response.ok(), "the request succeeded").toBe(true);`, + ); + return lines.join("\n"); + } + } +}; + +/** A terminal command as a template literal so `{base}` becomes `target.baseUrl` + * (which is in scope in the generated body). Backticks in the command are + * escaped so the literal stays valid. */ +const backtick = (command: string): string => + "`" + + command + .replaceAll("\\", "\\\\") + .replaceAll("`", "\\`") + .replaceAll("{base}", "${target.baseUrl}") + + "`"; diff --git a/e2e/src/vm/desktop.ts b/e2e/src/vm/desktop.ts new file mode 100644 index 000000000..e04866b0a --- /dev/null +++ b/e2e/src/vm/desktop.ts @@ -0,0 +1,327 @@ +// Driving the PACKAGED desktop app inside a GUI guest, from the host. This is +// the shared substrate for the cross-OS desktop targets (Gap A): SSH plumbing, +// an SSH local-forward, a minimal CDP page client, and screen recording — the +// pieces proven against a tart macOS guest. The desktop- globalsetup boots +// the guest and launches the app; a scenario connects over CDP and records. 
+// +// Why these mechanics (macOS): a tart `--no-graphics` guest opens no host window +// (no focus stealing) yet, with the base image's autologin, still reaches a real +// Aqua session (WindowServer/Dock/Finder) the app can render into. A GUI app must +// be launched INTO that session (`sudo launchctl asuser …`); a plain SSH +// spawn lands in a non-GUI session. The app's --remote-debugging-port is then +// reachable over an SSH forward, and `screencapture` films the console. +import { execFile, spawn } from "node:child_process"; +import net from "node:net"; +import { basename, dirname } from "node:path"; +import { promisify } from "node:util"; + +const execFileP = promisify(execFile); + +const SSHPASS = process.env.E2E_SSHPASS_BIN ?? "/opt/homebrew/bin/sshpass"; +const GUEST_PASS = process.env.E2E_DESKTOP_VM_PASS ?? "admin"; +const GUEST_USER = process.env.E2E_DESKTOP_VM_USER ?? "admin"; +const SSH_OPTS = [ + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "ConnectTimeout=8", + "-o", + "LogLevel=ERROR", + // Password auth only (sshpass): a loaded SSH agent's keys would otherwise + // exhaust the guest's MaxAuthTries before the password is tried. 
+ "-o", + "PubkeyAuthentication=no", + "-o", + "IdentitiesOnly=yes", +]; + +export const sleep = (ms: number): Promise => + new Promise((resolve) => setTimeout(resolve, ms)); + +export const guestSsh = ( + ip: string, + command: string, +): Promise<{ stdout: string; stderr: string }> => + execFileP(SSHPASS, ["-p", GUEST_PASS, "ssh", ...SSH_OPTS, `${GUEST_USER}@${ip}`, command], { + maxBuffer: 64 * 1024 * 1024, + }); + +export const guestScpFrom = (ip: string, remote: string, local: string): Promise => + execFileP(SSHPASS, [ + "-p", + GUEST_PASS, + "scp", + ...SSH_OPTS, + `${GUEST_USER}@${ip}:${remote}`, + local, + ]); + +/** + * Push a directory into the guest by streaming a tar over ssh: one connection, + * no per-file round-trips, and the flowing data keeps the link alive — far more + * robust than `scp -r` of a big app bundle (thousands of files + symlinks), + * which drops mid-transfer on a freshly-booted guest. Retries once. The dir + * lands at `${remoteParent}/${basename(localDir)}`. 
+ */ +export const pushDirAsTar = async ( + ip: string, + localDir: string, + remoteParent: string, +): Promise => { + const parent = dirname(localDir); + const base = basename(localDir); + const remote = `${SSHPASS} -p ${GUEST_PASS} ssh ${SSH_OPTS.join(" ")} ${GUEST_USER}@${ip} ${JSON.stringify( + `mkdir -p ${remoteParent} && tar xf - -C ${remoteParent}`, + )}`; + const pipeline = `tar cf - -C ${JSON.stringify(parent)} ${JSON.stringify(base)} | ${remote}`; + // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: one retry over a flaky just-booted guest link + try { + await execFileP("sh", ["-c", pipeline], { maxBuffer: 16 * 1024 * 1024 }); + } catch { + await sleep(3000); + await execFileP("sh", ["-c", pipeline], { maxBuffer: 16 * 1024 * 1024 }); + } +}; + +const freePort = (): Promise => + new Promise((resolve, reject) => { + const srv = net.createServer(); + srv.on("error", reject); + srv.listen(0, "127.0.0.1", () => { + const port = (srv.address() as net.AddressInfo).port; + srv.close(() => resolve(port)); + }); + }); + +export interface Forward { + readonly localPort: number; + close(): void; +} + +/** SSH local-forward host:localPort → guest:guestPort; resolves once it binds. 
*/ +export const guestTunnel = async (ip: string, guestPort: number): Promise => { + const localPort = await freePort(); + const child = spawn( + SSHPASS, + [ + "-p", + GUEST_PASS, + "ssh", + ...SSH_OPTS, + "-N", + "-L", + `${localPort}:127.0.0.1:${guestPort}`, + `${GUEST_USER}@${ip}`, + ], + { stdio: "ignore" }, + ); + for (let i = 0; i < 40; i++) { + const ok = await new Promise((resolve) => { + const sock = net.connect({ host: "127.0.0.1", port: localPort }, () => { + sock.destroy(); + resolve(true); + }); + sock.on("error", () => resolve(false)); + sock.setTimeout(1000, () => { + sock.destroy(); + resolve(false); + }); + }); + if (ok) break; + await sleep(500); + } + return { localPort, close: () => child.kill() }; +}; + +const guestFileSize = (ip: string, remote: string): Promise => + guestSsh(ip, `stat -f%z ${remote} 2>/dev/null || stat -c%s ${remote} 2>/dev/null || echo 0`) + .then((r) => Number(r.stdout.trim() || "0")) + .catch(() => 0); + +/** + * Film the guest's screen for `seconds` and land it on the host as `localMp4` + * (mp4, plays everywhere). OS-aware capture: + * • macOS — `screencapture -V` to a .mov, then host-side ffmpeg to mp4. The + * first capture after a cold display can silently no-op, so warm it with a + * throwaway still and verify+retry. + * • linux — ffmpeg `x11grab` of the Xvfb display straight to mp4. + * Best-effort: failures never throw — "every run is watchable" wants the video, + * but a missing one shouldn't fail the run. Run it concurrently with the drive. + */ +export const recordGuestScreen = async ( + ip: string, + seconds: number, + localMp4: string, + os: "macos" | "linux" | "windows", +): Promise => { + if (os === "windows") { + // Windows can't screenshot the interactive desktop from an SSH session, so + // we film the VM framebuffer directly via QEMU's `screendump` (the dockur + // host runs the loop + ffmpeg; we pull the mp4). 
Host/container/storage come + // from env (no baked-in host); best-effort, so skip filming if unconfigured. + const host = process.env.E2E_DESKTOP_WIN_HOST; + const storage = process.env.E2E_DESKTOP_WIN_STORAGE; + if (!host || !storage) return; + const container = process.env.E2E_DESKTOP_WIN_CONTAINER ?? "exec-win"; + const frames = Math.max(8, seconds * 4); + const py = `import socket,time +s=socket.socket(socket.AF_UNIX); s.connect("/run/shm/monitor.sock"); time.sleep(0.2); s.recv(65536) +for i in range(${frames}): + s.sendall(("screendump /storage/frames/f%03d.ppm\\n"%i).encode()); time.sleep(0.2) + try: s.recv(65536) + except Exception: pass`; + const b64 = Buffer.from(py).toString("base64"); + const remote = + `S=${storage}; rm -rf "$S/frames"; mkdir -p "$S/frames"; ` + + `docker exec ${container} python3 -c "import base64;exec(base64.b64decode('${b64}'))"; ` + + `ffmpeg -y -framerate 4 -i "$S/frames/f%03d.ppm" -pix_fmt yuv420p -movflags +faststart "$S/win.mp4" >/dev/null 2>&1`; + await execFileP("ssh", ["-o", "ConnectTimeout=10", host, remote], { + maxBuffer: 16 * 1024 * 1024, + }).catch(() => undefined); + await execFileP("scp", [ + "-o", + "ConnectTimeout=10", + `${host}:${storage}/win.mp4`, + localMp4, + ]).catch(() => undefined); + return; + } + + if (os === "linux") { + const remote = "/tmp/executor-desktop-vm.mp4"; + await guestSsh( + ip, + `rm -f ${remote}; DISPLAY=:99 ffmpeg -y -f x11grab -video_size 1280x800 -framerate 15 ` + + `-i :99 -t ${seconds} -pix_fmt yuv420p ${remote} >/tmp/ffmpeg.log 2>&1`, + ).catch(() => undefined); + // The mostly-flat console compresses small under x264 — a real capture is + // ~30-60KB, a blank/failed one only a few KB. + if ((await guestFileSize(ip, remote)) > 12_000) { + await guestScpFrom(ip, remote, localMp4).catch(() => undefined); + } + return; + } + + const remoteMov = "/tmp/executor-desktop-vm.mov"; + // Warm the capture subsystem — the first screencapture after the display comes + // up can produce nothing. 
+ await guestSsh(ip, "screencapture -x /tmp/.warm.png 2>/dev/null; rm -f /tmp/.warm.png").catch( + () => undefined, + ); + for (let attempt = 0; attempt < 2; attempt++) { + await guestSsh(ip, `rm -f ${remoteMov}; screencapture -V ${seconds} -x ${remoteMov}`).catch( + () => undefined, + ); + if ((await guestFileSize(ip, remoteMov)) > 100_000) { + const localMov = `${localMp4}.mov`; + await guestScpFrom(ip, remoteMov, localMov).catch(() => undefined); + await execFileP("ffmpeg", [ + "-y", + "-i", + localMov, + "-c:v", + "libx264", + "-pix_fmt", + "yuv420p", + "-movflags", + "+faststart", + localMp4, + ]) + .then(() => execFileP("rm", ["-f", localMov])) + .catch(() => undefined); + return; + } + } +}; + +// --- a minimal CDP page client (same protocol as desktop-packaged's driver) -- + +interface CdpTarget { + readonly type: string; + readonly webSocketDebuggerUrl?: string; +} + +export class CdpPage { + private nextId = 1; + private readonly pending = new Map void>(); + + private constructor(private readonly socket: WebSocket) { + socket.addEventListener("message", (event) => { + if (typeof event.data !== "string") return; + const message = JSON.parse(event.data) as { id?: number; result?: unknown }; + if (message.id && this.pending.has(message.id)) { + this.pending.get(message.id)!(message.result); + this.pending.delete(message.id); + } + }); + } + + static connect = (url: string): Promise => + new Promise((resolve, reject) => { + const socket = new WebSocket(url); + const timer = setTimeout( + // oxlint-disable-next-line executor/no-promise-reject, executor/no-error-constructor -- boundary: WebSocket connection promise adapter + () => reject(new Error(`CDP connect timeout: ${url}`)), + 30_000, + ); + socket.addEventListener("open", () => { + clearTimeout(timer); + resolve(new CdpPage(socket)); + }); + socket.addEventListener("error", () => { + clearTimeout(timer); + // oxlint-disable-next-line executor/no-promise-reject, executor/no-error-constructor -- boundary: 
WebSocket connection promise adapter + reject(new Error(`CDP connect failed: ${url}`)); + }); + }); + + command = (method: string, params: Record = {}): Promise => { + const id = this.nextId++; + const result = new Promise((resolve) => + this.pending.set(id, (value) => resolve(value as T)), + ); + this.socket.send(JSON.stringify({ id, method, params })); + return result; + }; + + waitForText = async (text: string, timeoutMs: number): Promise => { + const deadline = Date.now() + timeoutMs; + const expression = `document.body?.innerText.includes(${JSON.stringify(text)}) ?? false`; + for (;;) { + const r = await this.command<{ result?: { value?: boolean } }>("Runtime.evaluate", { + expression, + returnByValue: true, + }); + if (r.result?.value) return; + // oxlint-disable-next-line executor/no-error-constructor -- boundary: a wait timeout is a plain failure here + if (Date.now() >= deadline) throw new Error(`timed out waiting for text: ${text}`); + await sleep(250); + } + }; + + screenshot = async (): Promise => { + const r = await this.command<{ data: string }>("Page.captureScreenshot", { format: "png" }); + return Buffer.from(r.data, "base64"); + }; + + close = (): void => this.socket.close(); +} + +/** The first drivable page target's WebSocket URL, fetched through the forward + * (so the returned ws URL already points at the local port). */ +export const pageWsUrl = async (localPort: number): Promise => { + const deadline = Date.now() + 60_000; + for (;;) { + const targets = (await fetch(`http://127.0.0.1:${localPort}/json/list`) + .then((r) => (r.ok ? 
r.json() : [])) + .catch(() => [])) as ReadonlyArray; + const page = targets.find((t) => t.type === "page" && t.webSocketDebuggerUrl); + if (page?.webSocketDebuggerUrl) return page.webSocketDebuggerUrl; + // oxlint-disable-next-line executor/no-error-constructor -- boundary: setup failure surfaced to the caller + if (Date.now() >= deadline) + throw new Error("no CDP page target (app not running with --remote-debugging-port?)"); + await sleep(500); + } +}; diff --git a/e2e/src/vm/tart.ts b/e2e/src/vm/tart.ts index 5ca696420..6b496dca5 100644 --- a/e2e/src/vm/tart.ts +++ b/e2e/src/vm/tart.ts @@ -30,6 +30,14 @@ const SSH_OPTS = [ "ServerAliveInterval=5", "-o", "LogLevel=ERROR", + // We authenticate with sshpass (password). A loaded SSH agent would otherwise + // offer its keys first and exhaust the guest's MaxAuthTries ("Too many + // authentication failures") before the password is tried — intermittently, + // depending on how many keys the agent holds. Force password-only. + "-o", + "PubkeyAuthentication=no", + "-o", + "IdentitiesOnly=yes", ]; const GUEST_USER = "admin"; const GUEST_PASS = "admin"; @@ -92,7 +100,15 @@ export const tartVm = (os: "macos" | "linux", arch: VmArch = "arm64"): VmProvide provision: async () => { const name = `executor-e2e-${os}-${process.pid}-${Math.floor(performance.now())}`; await execFileP(TART, ["clone", baseImage(os), name]); - const runProc = spawn(TART, ["run", name, "--no-graphics"], { stdio: "ignore" }); + // `--no-graphics` opens NO host window (never steals focus) yet the guest + // still has a virtual display: with the base image's autologin it reaches a + // real Aqua session (WindowServer/Dock/Finder), so even the packaged GUI app + // renders and `screencapture` records it. No windowed/VNC mode is needed. 
+ const runProc = spawn(TART, ["run", name, "--no-graphics"], { + stdio: "ignore", + detached: true, + }); + runProc.unref(); const tunnelClosers: Array<() => void> = []; let ip = ""; diff --git a/e2e/targets/desktop.ts b/e2e/targets/desktop.ts index 722ca08ac..5cded8a1a 100644 --- a/e2e/targets/desktop.ts +++ b/e2e/targets/desktop.ts @@ -9,7 +9,10 @@ import { Effect } from "effect"; import type { Target } from "../src/target"; export const desktopTarget = (): Target => ({ - name: "desktop", + // The project name (desktop / desktop-packaged / desktop-macos) so each lands + // in its own runs// bucket and viewer column — they're the same app + // in different harnesses (dev electron / packaged / packaged-in-a-VM). + name: process.env.E2E_TARGET ?? "desktop", baseUrl: "", mcpUrl: "", capabilities: new Set(), diff --git a/e2e/targets/registry.ts b/e2e/targets/registry.ts index 94e966746..c3d6aa351 100644 --- a/e2e/targets/registry.ts +++ b/e2e/targets/registry.ts @@ -19,6 +19,12 @@ const factories: Record Target> = { // The packaged desktop bundle launches its own app per scenario, same as // `desktop` — no standard surfaces to carry. See desktop-packaged.globalsetup. "desktop-packaged": desktopTarget, + // The packaged bundle inside a GUI guest (one per OS), driven over CDP from + // the host. Carries no surfaces (the scenario drives CDP itself). See + // desktop-.globalsetup. + "desktop-macos": desktopTarget, + "desktop-linux": desktopTarget, + "desktop-windows": desktopTarget, local: localTarget, // The supervised CLI daemon inside a VM, one project per guest OS — restart() // is a real reboot. See setup/cli.globalsetup.ts. 
diff --git a/e2e/vitest.config.ts b/e2e/vitest.config.ts index 74c45288f..408716954 100644 --- a/e2e/vitest.config.ts +++ b/e2e/vitest.config.ts @@ -77,6 +77,26 @@ export default defineConfig({ testTimeout: 360_000, hookTimeout: 600_000, }), + // The packaged desktop app inside a GUI guest, driven over CDP from the + // host and filmed (the cross-OS counterpart of desktop-packaged) — one + // shared scenario (desktop-vm/), one project per guest OS. The globalsetup + // provisions the guest, launches the bundle with --remote-debugging-port, + // and forwards it; the scenario connects, drives, and records the console. + // Each lands in runs//. Not in the default `npm run test` chain — + // run with `vitest run --project desktop-macos` (or desktop-linux). The VM + // is provisioned automatically; set E2E_DESKTOP_VM_IP to attach to an + // already-running guest instead. + // macos/linux provision a tart guest and build+push the ~450MB bundle; + // windows ATTACHES to a long-lived dockur guest over an SSH jump (no + // provision), so it needs no build but the same generous hooks. + ...(["macos", "linux", "windows"] as const).map((os) => + project(`desktop-${os}`, { + include: ["desktop-vm/**/*.test.ts"], + fileParallelism: false, + testTimeout: 300_000, + hookTimeout: 900_000, + }), + ), // The single-user local app. Each scenario launches its OWN `executor // web` via the CLI on a throwaway data dir + an OS-assigned port, so // there is no shared instance and scenarios are independent. 
Files run diff --git a/packages/react/src/components/auth-method-list-editor.tsx b/packages/react/src/components/auth-method-list-editor.tsx index 49b657629..45b1344e1 100644 --- a/packages/react/src/components/auth-method-list-editor.tsx +++ b/packages/react/src/components/auth-method-list-editor.tsx @@ -12,8 +12,9 @@ // --------------------------------------------------------------------------- import { useCallback, useEffect, useRef, useState } from "react"; -import { PlusIcon, XIcon } from "lucide-react"; +import { LockIcon, PlusIcon, XIcon } from "lucide-react"; +import { PlacementLine } from "../lib/auth-placements"; import { Button } from "./button"; import { FieldLabel } from "./field"; import { @@ -35,6 +36,11 @@ export interface AuthMethodSeed { export interface AuthMethodRow { readonly value: AuthTemplateEditorValue; + /** True when this row came from detection (a seed), false when the user added + * it. Detected rows are immutable — the spec/probe declared them — so the + * editor renders them read-only. Not inferred from `seedSlug`: some plugins + * (MCP) seed a detected method with a label but no slug. */ + readonly seeded: boolean; readonly seedSlug?: string; readonly seedLabel?: string; } @@ -59,6 +65,7 @@ export function useAuthMethodList(seeds: readonly AuthMethodSeed[]): AuthMethodL seeds.map( (seed: AuthMethodSeed): AuthMethodRow => ({ value: seed.value, + seeded: true, ...(seed.slug !== undefined ? { seedSlug: seed.slug } : {}), ...(seed.label !== undefined ? 
{ seedLabel: seed.label } : {}), }), @@ -79,7 +86,10 @@ export function useAuthMethodList(seeds: readonly AuthMethodSeed[]): AuthMethodL }, []); const addRow = useCallback(() => { - setRows((current: readonly AuthMethodRow[]) => [...current, { value: emptyApiKeyValue() }]); + setRows((current: readonly AuthMethodRow[]) => [ + ...current, + { value: emptyApiKeyValue(), seeded: false }, + ]); }, []); return { rows, setRowAt, removeRowAt, addRow }; @@ -115,36 +125,53 @@ export function AuthMethodListEditor(props: AuthMethodListEditorProps) { ) : null ) : (
- {list.rows.map((row: AuthMethodRow, index: number) => ( -
-
- - Method {index + 1} - {row.seedLabel ? ` · ${row.seedLabel}` : ""} - - + {list.rows.map((row: AuthMethodRow, index: number) => { + // A row seeded from detection is the spec's own auth declaration: + // it's IMMUTABLE here. We render it read-only (no kind selector, no + // editable fields) so a user can't silently retype the spec's + // method into something nothing backs (e.g. flipping a Bearer-token + // API to OAuth with empty endpoints). The escape hatch is to remove + // the row and add a custom one. Manually added rows (no seed) get + // the full editor. + const detected = row.seeded; + return ( +
+
+ + {detected ? : null} + + Method {index + 1} + {row.seedLabel ? ` · ${row.seedLabel}` : ""} + + + +
+ {detected ? ( + + ) : ( + list.setRowAt(index, next)} + {...(allowedKinds ? { allowedKinds } : {})} + {...(presets ? { presets } : {})} + {...(oauthMetadata ? { oauthMetadata } : {})} + /> + )}
- list.setRowAt(index, next)} - {...(allowedKinds ? { allowedKinds } : {})} - {...(presets ? { presets } : {})} - {...(oauthMetadata ? { oauthMetadata } : {})} - /> -
- ))} + ); + })}
)} {list.rows.length > 0 && props.footerHint ? ( @@ -153,3 +180,73 @@ export function AuthMethodListEditor(props: AuthMethodListEditorProps) { ); } + +/** One read-only `label value` line, mono value, for the detected summary. */ +function SpecField(props: { readonly label: string; readonly value: string }) { + return ( +
+ {props.label} + {props.value} +
+ ); +} + +/** Read-only view of a spec-detected method: shows what the spec declared + * (placements / OAuth endpoints) as a DISABLED, non-interactive block. The + * detected method is immutable here, so the summary is styled like a disabled + * field (muted, not-allowed cursor, text not selectable) to communicate that + * plainly. The only action is to remove the row (the header's X) and add a + * custom method to override. */ +function DetectedMethodSummary(props: { + readonly value: AuthTemplateEditorValue; + readonly oauthMetadata?: "editable" | "discovered"; +}) { + const { value, oauthMetadata } = props; + // Name the auth kind explicitly: a detection label like MCP's "Detected" + // doesn't say whether it's OAuth or an API key, so surface it here. + const kindLabel = + value.kind === "oauth" ? "OAuth" : value.kind === "apikey" ? "API key" : "No auth"; + return ( +
+

+ {kindLabel} +

+
+ {value.kind === "none" && ( +

No credential — tools are callable without an account.

+ )} + + {value.kind === "apikey" && + (value.placements.length > 0 ? ( +
+ {value.placements.map((placement, i: number) => ( + + ))} +
+ ) : null)} + + {value.kind === "oauth" && + (oauthMetadata === "discovered" ? ( +

+ OAuth metadata is discovered from this server when you connect an account. +

+ ) : ( +
+ {value.authorizationUrl ? ( + + ) : null} + {value.tokenUrl ? : null} + {value.scopes.length > 0 ? ( + + ) : null} +
+ ))} +
+ +

Pulled from spec. Remove to override.

+
+ ); +} diff --git a/packages/react/src/lib/auth-placements.tsx b/packages/react/src/lib/auth-placements.tsx index c4f49d36e..71a3383a2 100644 --- a/packages/react/src/lib/auth-placements.tsx +++ b/packages/react/src/lib/auth-placements.tsx @@ -106,8 +106,13 @@ export function PlacementLine(props: { readonly placement: Placement; readonly m : placement.carrier === "env" ? `${placement.name || "TOKEN"}=` : `?${placement.name || "api_key"}=`; + // Plain inline (not inline-flex): flex trims the whitespace at the edges of + // each child, which would drop the space after "Authorization:" and the + // trailing space carried by a prefix like "Bearer ", rendering + // "Authorization:Bearer••••••". whitespace-pre-wrap keeps those spaces while + // still allowing the line to wrap. return ( - + {lead} {placement.prefix ? ( {placement.prefix}