From 488419f2e4db7a9197af96a219b02d4afcf1e913 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:10:40 +0200
Subject: [PATCH 1/9] feat(testing-framework): add Gherkin/JS dual front-end
 flow-IR POC

Shared flow IR (variable table, named flows with scoped args/returns,
keyword-to-node policy) with three authoring surfaces: .feature files
via @cucumber/gherkin, a fluent typed JS API, and bindFeature sparse
overlays with drift validation. Includes offline demo and unit tests
with fake agents.
---
 packages/testing-framework/POC-GHERKIN.md     | 266 ++++++++++
 .../example/demo-app/index.html               | 121 +++++
 .../example/flows/shop.feature                |  41 ++
 .../example/flows/shop.flows.ts               |  63 +++
 .../example/flows/shop.overlay.ts             |  38 ++
 packages/testing-framework/package.json       |   2 +
 .../scripts/demo/scripted-agents.ts           | 119 +++++
 .../testing-framework/src/flow-ir/index.ts    |  25 +
 .../testing-framework/src/flow-ir/registry.ts |  64 +++
 .../src/flow-ir/run-scenario.ts               | 449 +++++++++++++++++
 .../src/flow-ir/substitute.ts                 |  36 ++
 .../testing-framework/src/flow-ir/types.ts    | 121 +++++
 .../src/frontends/gherkin/index.ts            | 237 +++++++++
 .../src/frontends/js/bind-feature.ts          | 386 +++++++++++++++
 .../src/frontends/js/index.ts                 | 223 +++++++++
 packages/testing-framework/src/index.ts       |  53 ++
 .../tests/unit-test/bind-feature.test.ts      | 342 +++++++++++++
 .../tests/unit-test/example-parity.test.ts    | 121 +++++
 .../tests/unit-test/flow-ir.test.ts           |  89 ++++
 .../tests/unit-test/gherkin-frontend.test.ts  | 159 ++++++
 .../tests/unit-test/helpers/fake-agents.ts    |  78 +++
 .../tests/unit-test/js-frontend.test.ts       | 149 ++++++
 .../tests/unit-test/run-scenario.test.ts      | 393 +++++++++++++++
 packages/testing-framework/vitest.config.ts   |   3 +
 pnpm-lock.yaml                                | 467 ++----------------
 25 files changed, 3617 insertions(+), 428 deletions(-)
 create mode 100644 packages/testing-framework/POC-GHERKIN.md
 create mode 100644 packages/testing-framework/example/demo-app/index.html
 create mode 100644 packages/testing-framework/example/flows/shop.feature
 create mode 100644 packages/testing-framework/example/flows/shop.flows.ts
 create mode 100644 packages/testing-framework/example/flows/shop.overlay.ts
 create mode 100644 packages/testing-framework/scripts/demo/scripted-agents.ts
 create mode 100644 packages/testing-framework/src/flow-ir/index.ts
 create mode 100644 packages/testing-framework/src/flow-ir/registry.ts
 create mode 100644 packages/testing-framework/src/flow-ir/run-scenario.ts
 create mode 100644 packages/testing-framework/src/flow-ir/substitute.ts
 create mode 100644 packages/testing-framework/src/flow-ir/types.ts
 create mode 100644 packages/testing-framework/src/frontends/gherkin/index.ts
 create mode 100644 packages/testing-framework/src/frontends/js/bind-feature.ts
 create mode 100644 packages/testing-framework/src/frontends/js/index.ts
 create mode 100644 packages/testing-framework/tests/unit-test/bind-feature.test.ts
 create mode 100644 packages/testing-framework/tests/unit-test/example-parity.test.ts
 create mode 100644 packages/testing-framework/tests/unit-test/flow-ir.test.ts
 create mode 100644 packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
 create mode 100644 packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
 create mode 100644 packages/testing-framework/tests/unit-test/js-frontend.test.ts
 create mode 100644 packages/testing-framework/tests/unit-test/run-scenario.test.ts

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
new file mode 100644
index 0000000000..385ced409d
--- /dev/null
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -0,0 +1,266 @@
+# POC: Two authoring front-ends over one shared flow-IR
+
+This POC extends the Phase 0 framework with composable, reusable "prompt
+flows" authored in **two surfaces** — a fluent JS/TS API and Gherkin
+`.feature` files — that compile to **one shared intermediate representation
+(flow-IR)**, which in turn lowers onto the existing engine node kinds
+(`ui` / `verify` / `soft` / `agent`). No step-definition code anywhere: every
+step is natural language executed by the AI agents. A third, **hybrid** mode
+(`bindFeature`) layers a sparse JS overlay over a `.feature` file.
+
+```
+ .feature files          .flows.ts files
+      │       └─────┐          │
+      │       bindFeature      │
+      │       (sparse JS       │
+      │        overlay)        │
+      │             │          │
+ frontends/gherkin  │     frontends/js
+ (@cucumber/gherkin │     (defineFlow / scenario /
+  pickles compiler) │      Given/When/Then/Soft …)
+      │             │          │
+      └────────┬────┴──────────┘
+               ▼
+        src/flow-ir  (ScenarioIR / FlowDefIR / FlowRegistry)
+               │   runScenario(): variable table, {var} substitution,
+               │   flow-call scoping & depth cap
+               ▼
+        engine/run-node.ts  (existing ui / verify / soft / agent)
+               │
+        UI Agent (aiAct / aiString)  +  GeneralAgentAdapter (verdicts)
+```
+
+## The IR (`src/flow-ir/`)
+
+Three step kinds (`types.ts`):
+
+| IR step    | Lowers to                                                        |
+| ---------- | ---------------------------------------------------------------- |
+| `prompt`   | one engine node: `ui` (setup/action), `verify`, `soft`, `agent`  |
+| `capture`  | structured extraction via the UI agent (`aiString`), stored in the variable table |
+| `callFlow` | the registered flow's steps, run in a fresh child scope          |
+
+**Variable table** (`substitute.ts`, `run-scenario.ts`): scenario-scoped,
+machine-owned. `capture` steps ("remember … as varName") extract values
+through `aiString`; later templates get **mechanical** `{varName}`
+substitution *before* any prompt is sent to a model. Unknown placeholders
+fail the step immediately (typo safety) without a model call. Model-owned
+prose conclusions keep flowing through the existing `StepOutput` channel —
+the two channels never mix.
+
+**Named flows** (`registry.ts`): parameterized prompt sequences in a
+`FlowRegistry`. Invocation semantics:
+
+- declared `params` only — missing/extra args fail the step;
+- a **fresh variable scope** inside the flow (args + its own captures);
+  caller variables are invisible;
+- only declared `returns` are copied back into the caller scope;
+- UI/browser state is naturally shared (same UI agent);
+- call depth is capped at 2 (`MAX_FLOW_CALL_DEPTH`); deeper nesting fails;
+- `memo: 'once-per-run'` is accepted but stubbed (TODO in
+  `run-scenario.ts`).
+
+**Keyword→policy mapping**: given-like → `ui` (setup), when-like → `ui`
+(action), then-like → `verify` (fail-closed), soft variants → `soft`
+(warn-only), advisory → `agent`.
+
+The executor `runScenario()` mirrors `runCase()`'s contract (same
+`CaseResult` shape plus a `variables` table; gating failures stop the flow,
+soft failures only warn) and reuses `runNode` directly, so the
+`GeneralAgentAdapter`, custom runtime nodes, context assembly and verdict
+fail-closed semantics are all the engine's existing behavior.
+
+## JS/TS front-end (`src/frontends/js/`)
+
+```ts
+import {
+  defineFlow, scenario, feature, createFlowRegistry,
+  Given, When, Then, Soft, remember, callFlow,
+} from '@midscene/testing-framework';
+
+const login = defineFlow({
+  name: 'Login',
+  params: ['role'],
+  returns: ['greeting'],
+  steps: [
+    When('I open the login page'),
+    When('I sign in as the "{role}" user'),
+    remember('the greeting shown in the header', 'greeting'),
+  ],
+});
+
+const checkout = scenario('Checkout as admin', [
+  Given('the demo shop is open on the home page'),
+  callFlow('Login', { role: 'admin' }),
+  remember('the price of the "Trail Backpack" product', 'price'),
+  'I add the "Trail Backpack" to the cart',          // bare string = When
+  Then('the cart total equals {price}'),
+]);
+```
+
+Keyword helpers are capitalized like cucumber-js. The capitalization is also
+load-bearing: a lowercase `then` export would make the module namespace a
+thenable and break dynamic `import()`. Every step is a plain JS value, so
+dynamic authoring (mapping over data, computed args, build-time conditionals)
+just works; `defineFlow` additionally runs cheap static scoping checks.
+
+## Gherkin front-end (`src/frontends/gherkin/`)
+
+`.feature` files are parsed with `@cucumber/gherkin` and compiled through
+its **pickles** API, so Scenario Outline expansion (example values
+substituted into step text), Background merging and tag inheritance come for
+free. Conventions on top:
+
+- `Given`/`When`/`Then`/`And`/`But` follow the keyword→policy mapping above;
+  pickle step types already resolve And/But to the last primary keyword;
+- `@soft` tag on a scenario turns its `Then` steps into `soft` nodes;
+- `I remember <description> as "varName"` → capture step;
+- `I run the "FlowName" flow with role "admin" and region "eu"` → flow
+  invocation;
+- a Scenario tagged `@flow` is registered as a flow definition instead of a
+  runnable scenario; params/returns are tags: `@param:role`,
+  `@returns:greeting`. Background steps are excluded from `@flow` pickles so
+  a reusable flow never replays the feature's setup.
+
+## Hybrid mode: `bindFeature` (`src/frontends/js/bind-feature.ts`)
+
+Modeled on jest-cucumber's inverted binding (JS attaches to a loaded
+`.feature` and the two are validated to stay in sync), with one deliberate
+difference: jest-cucumber must mirror *every* step in JS because steps need
+somewhere to put code. AI execution removes that need, so the overlay is
+**sparse** — Gherkin stays the source of truth and unmentioned
+scenarios/steps run as pure Gherkin, no restatement required.
+
+```ts
+import { bindFeature } from '@midscene/testing-framework';
+
+const bound = bindFeature('flows/shop.feature', {
+  scenarios: {
+    'Checkout as admin': {
+      vars: { couponCode: computeCoupon() },          // inject computed variables
+      steps: [
+        {
+          at: 'I add the "Trail Backpack" to the cart and open the cart',
+          after: ['apply the coupon code {couponCode} in the cart'], // insert
+        },
+        {
+          at: 'the cart total equals {price}',
+          node: 'soft',                                // override node kind
+          template: 'the cart total roughly equals {price}', // override prompt
+        },
+        { at: 'Login', args: { role: 'auditor' } },    // adjust flow-call args
+      ],
+    },
+    'Promo banner is advisory': { skip: true },        // per-scenario config
+  },
+});
+// bound: CompiledFeature — same shape as compileFeature(), run via runScenario.
+```
+
+Binding glue is **title + anchor**: scenarios are keyed by title (a Scenario
+Outline title patches every expansion), steps by exact anchor text (prompt
+template, capture description, or flow name) or by index. Anchors always
+resolve against the *original* step list, so inserts never shift one
+another. `template`/`node` apply to prompt steps, `template` to captures,
+`args` to flow calls — mismatches fail at bind time.
+
+**Drift validation with codegen**: every overlay reference is checked at
+bind/compile time, never at execution time. An overlay pointing at a renamed
+scenario or step throws an error that names the closest match
+("Did you mean …?") and pastes a ready-to-use starter overlay listing every
+real anchor — jest-cucumber's best trick, applied to a sparse overlay:
+
+```
+[midscene] bindFeature(shop.feature): scenario "Checkout as admin" has no
+step matching anchor "the cart total equals {prce}".
+Did you mean "the cart total equals {price}"?
+Available anchors:
+
+scenarios: {
+  "Checkout as admin": {
+    steps: [
+      { at: "the demo shop is open on the home page" },  // 0: ui node
+      { at: "Login" },                                   // 1: flow call Login(role)
+      ...
+```
+
+### Choosing a mode
+
+| Mode | Use when |
+| --- | --- |
+| Pure Gherkin (`compileFeature`) | Non-engineers own the suite; no computed values or per-env tweaks needed. |
+| Pure JS (`defineFlow`/`scenario`) | The suite is generated or heavily dynamic (loops, conditionals, computed prompts); no BDD stakeholders. |
+| Bound overlay (`bindFeature`) | Gherkin is the shared source of truth, but a few scenarios need computed variables, env-specific arg tweaks, inserted steps, or skip/only flags — without forking the feature file or restating it in JS. |
+
+## Example
+
+`example/flows/shop.feature` and `example/flows/shop.flows.ts` author the
+same suite — a `Login` flow reused by a checkout scenario, a `@soft` promo
+check, and a per-role login matrix (Scenario Outline vs `roles.map(...)`).
+The test `tests/unit-test/example-parity.test.ts` proves both compile to the
+same IR and produce identical execution traces (same prompts to the UI
+agent, same verify prompts to the general agent, same final variable table)
+through the shared executor.
+
+`example/flows/shop.overlay.ts` shows the hybrid mode on the same feature: a
+computed coupon code injected into the checkout scenario's variable table,
+an inserted "apply the coupon" step that uses it, the exact-total verify
+downgraded to a reworded soft check, and the promo scenario skipped — while
+the login-matrix scenarios stay untouched pure Gherkin.
+
+Run programmatically (no CLI wiring yet):
+
+```ts
+import { compileFeatureFile, createFlowRegistry, runScenario } from '@midscene/testing-framework';
+
+const { scenarios, flows } = compileFeatureFile('flows/shop.feature');
+const registry = createFlowRegistry(flows);
+for (const s of scenarios) {
+  const result = await runScenario({ scenario: s, registry, uiAgent, generalAgent });
+}
+```
+
+## Validation
+
+- `pnpm --filter @midscene/testing-framework test` — 100 tests, all green
+  (63 new across `flow-ir.test.ts`, `js-frontend.test.ts`,
+  `gherkin-frontend.test.ts`, `run-scenario.test.ts`, `bind-feature.test.ts`,
+  `example-parity.test.ts`; fakes only, no browsers / no model calls).
+
+## Open questions / next steps
+
+- **Runner integration**: `runAll` / the CLI only discover `*.yaml`. Wire
+  `.feature` and `*.flows.ts` discovery into `discoverCases` + `runScenario`
+  so both surfaces run via `midscene-tf run`.
+- **Typed captures**: `capture` always extracts strings (`aiString`); add
+  number/boolean/structured (`aiQuery`) tiers and maybe a declared type in
+  the "remember" convention.
+- **Memoization**: implement `once-per-run` (memo table keyed by flow name +
+  resolved args, replaying returns) — useful for login-type flows; decide
+  whether UI state divergence makes replay unsafe by default.
+- **Flow-call reporting**: inner flow steps are flattened into the case's
+  step list after an `info` "Entering flow …" marker; reports may want a
+  nested view instead.
+- **Cross-file flow registries**: today a registry is built per
+  feature/module; decide on project-level registration (config field, glob
+  for `*.flows.ts`, shared between Gherkin and JS suites).
+- **Gherkin arg syntax**: the `with key "value" and key "value"` convention
+  is regex-based; data tables (`PickleStepArgument`) would be a more
+  Gherkin-native way to pass args (and to seed variables).
+- **Variable channel vs prose**: verify nodes still see capture steps in the
+  assembled context (as past-step outputs). That is intentional (the agent
+  may ground its verdict), but worth revisiting if it blurs the
+  machine/model ownership line.
+- **Overlay scope**: `bindFeature` overlays target runnable scenarios only;
+  `@flow` definitions are deliberately not overlayable (a flow is shared by
+  many call sites, so a per-feature patch would act at a distance). If the
+  need is real, a separate `flows:` overlay section with explicit semantics
+  is the way in.
+- **Anchor identity for prompts**: text anchors match the compiled anchor
+  text (prompt template / capture description / flow name), not the raw
+  Gherkin line — e.g. anchoring a `remember` step means anchoring its
+  description, and outline-expanded steps must be anchored by the expanded
+  text (or index). Keeping the original pickle text on IR steps would let
+  anchors match the literal `.feature` line instead.
+- **skip/only enforcement**: `scenario.config` is attached at the IR level
+  but nothing consumes it until runner integration lands.
diff --git a/packages/testing-framework/example/demo-app/index.html b/packages/testing-framework/example/demo-app/index.html
new file mode 100644
index 0000000000..8da4189e2e
--- /dev/null
+++ b/packages/testing-framework/example/demo-app/index.html
@@ -0,0 +1,121 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Midscene POC Shop</title>
+    <style>
+      * { box-sizing: border-box; font-family: system-ui, sans-serif; }
+      body { margin: 0; color: #1f2933; }
+      header { display: flex; justify-content: space-between; align-items: center; padding: 16px 24px; background: #0b5fff; color: #fff; }
+      header button { padding: 8px 14px; border: none; border-radius: 6px; cursor: pointer; }
+      main { padding: 24px; max-width: 640px; margin: 0 auto; }
+      section { display: none; }
+      section.active { display: block; }
+      .card { border: 1px solid #e4e7eb; border-radius: 10px; padding: 16px; margin-bottom: 16px; }
+      .price { color: #0b5fff; font-weight: 600; }
+      button.primary { padding: 12px 20px; background: #0b5fff; color: #fff; border: none; border-radius: 6px; font-size: 16px; cursor: pointer; }
+      select, input { padding: 10px; border: 1px solid #cbd2d9; border-radius: 6px; margin-right: 8px; }
+      .row { display: flex; align-items: center; gap: 8px; margin: 12px 0; }
+      #cartTotal { font-size: 20px; font-weight: 700; }
+      .muted { color: #52606d; font-size: 14px; }
+    </style>
+  </head>
+  <body>
+    <header>
+      <strong>Midscene POC Shop</strong>
+      <div>
+        <span id="greeting" hidden></span>
+        <button id="navLogin">Login</button>
+      </div>
+    </header>
+    <main>
+      <section id="home" class="active">
+        <h2>Welcome to the POC shop</h2>
+        <div class="card" id="productCard">
+          <h3>Trail Backpack</h3>
+          <div class="price" id="productPrice">$129.00</div>
+          <p class="muted">Lightweight 28L pack for day hikes.</p>
+          <button class="primary" id="addToCart">Add to cart</button>
+        </div>
+        <button id="openCart">Open cart</button>
+      </section>
+
+      <section id="login">
+        <h2>Sign in</h2>
+        <div class="row">
+          <select id="role">
+            <option value="admin">admin</option>
+            <option value="guest">guest</option>
+          </select>
+          <button class="primary" id="signIn">Sign in with saved test credentials</button>
+        </div>
+      </section>
+
+      <section id="dashboard">
+        <h2 id="dashboardTitle"></h2>
+        <p class="muted">You are signed in. Use the header to keep shopping.</p>
+        <button id="backHome">Back to shop</button>
+      </section>
+
+      <section id="cart">
+        <h2>Your cart</h2>
+        <div class="card" id="cartLine" hidden>
+          <span>Trail Backpack</span> — <span class="price">$129.00</span>
+        </div>
+        <div class="row">
+          <input id="coupon" placeholder="Coupon code" />
+          <button id="applyCoupon">Apply coupon</button>
+        </div>
+        <p>Total: <span id="cartTotal">$0.00</span></p>
+        <p class="muted" id="couponNote" hidden></p>
+        <button id="backHome2">Back to shop</button>
+      </section>
+    </main>
+
+    <script>
+      const state = { role: null, inCart: false, discount: 0 }; // discount is a fraction taken off the price
+      const price = 129.0;
+      const show = (id) => { // mark only the section with this id as active
+        document
+          .querySelectorAll('section')
+          .forEach((s) => s.classList.toggle('active', s.id === id));
+      };
+      const renderTotal = () => { // recompute the total from state and write it into #cartTotal
+        const total = state.inCart ? price * (1 - state.discount) : 0;
+        document.getElementById('cartTotal').textContent = `$${total.toFixed(2)}`;
+      };
+      document.getElementById('navLogin').addEventListener('click', () => show('login'));
+      document.getElementById('signIn').addEventListener('click', () => {
+        state.role = document.getElementById('role').value;
+        const pretty = state.role[0].toUpperCase() + state.role.slice(1); // upper-case the first letter
+        const greeting = document.getElementById('greeting');
+        greeting.textContent = `Hello, ${pretty}!`;
+        greeting.hidden = false;
+        document.getElementById('dashboardTitle').textContent = `Dashboard (${state.role})`;
+        show('dashboard');
+      });
+      document.getElementById('addToCart').addEventListener('click', () => {
+        state.inCart = true;
+        document.getElementById('cartLine').hidden = false;
+        renderTotal();
+      });
+      document.getElementById('openCart').addEventListener('click', () => {
+        renderTotal();
+        show('cart');
+      });
+      document.getElementById('applyCoupon').addEventListener('click', () => {
+        const code = document.getElementById('coupon').value.trim();
+        if (!code) return; // an empty coupon field is ignored
+        state.discount = 0.1; // any non-empty code gives a flat 10% discount
+        const note = document.getElementById('couponNote');
+        note.textContent = `Coupon "${code}" applied: 10% off.`;
+        note.hidden = false;
+        renderTotal();
+      });
+      for (const id of ['backHome', 'backHome2']) { // both "Back to shop" buttons return to the home section
+        document.getElementById(id).addEventListener('click', () => show('home'));
+      }
+    </script>
+  </body>
+</html>
diff --git a/packages/testing-framework/example/flows/shop.feature b/packages/testing-framework/example/flows/shop.feature
new file mode 100644
index 0000000000..15349c0907
--- /dev/null
+++ b/packages/testing-framework/example/flows/shop.feature
@@ -0,0 +1,41 @@
+# POC: Gherkin front-end over the shared flow-IR.
+# Compile with `compileFeatureFile(...)` and execute with `runScenario(...)`.
+# The same flows + scenarios are authored in JS in ./shop.flows.ts.
+Feature: Checkout with a reusable login flow
+
+  Background:
+    Given the demo shop is open on the home page
+
+  # A named flow: registered in the FlowRegistry instead of run as a scenario.
+  # Params/returns are declared as tags; "{role}" is substituted mechanically
+  # from the caller's arguments before any prompt reaches the model.
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I open the login page
+    And I sign in as the "{role}" user with the saved test credentials
+    Then the dashboard for the "{role}" role is visible
+    When I remember the greeting message shown in the header as "greeting"
+
+  Scenario: Checkout as admin
+    When I run the "Login" flow with role "admin"
+    And I remember the price of the "Trail Backpack" product as "price"
+    When I add the "Trail Backpack" to the cart and open the cart
+    Then the cart total equals {price}
+    But the cart does not show any error banner
+
+  # @soft turns Then steps into soft nodes: failures warn, never gate.
+  @soft
+  Scenario: Promo banner is advisory
+    Then a promo banner is visible at the top of the page
+
+  # Scenario Outline examples are expanded by the Gherkin pickles compiler;
+  # "<role>" is replaced per example row, while "{greeting}" stays a runtime
+  # variable filled by the Login flow's declared return.
+  Scenario Outline: Login greets every role
+    When I run the "Login" flow with role "<role>"
+    Then the header greets the user with {greeting}
+
+    Examples:
+      | role  |
+      | admin |
+      | guest |
diff --git a/packages/testing-framework/example/flows/shop.flows.ts b/packages/testing-framework/example/flows/shop.flows.ts
new file mode 100644
index 0000000000..6d8ca7ff9d
--- /dev/null
+++ b/packages/testing-framework/example/flows/shop.flows.ts
@@ -0,0 +1,63 @@
+/**
+ * POC: JS/TS front-end over the shared flow-IR — the exact counterpart of
+ * ./shop.feature. Both compile to the same IR and run through `runScenario`.
+ */
+import {
+  Given,
+  Soft,
+  Then,
+  When,
+  callFlow,
+  createFlowRegistry,
+  defineFlow,
+  feature,
+  remember,
+  scenario,
+} from '@midscene/testing-framework';
+
+// A named flow: parameterized, fresh variable scope inside (only `role` is
+// visible), and only the declared return (`greeting`) flows back to callers.
+export const loginFlow = defineFlow({
+  name: 'Login',
+  params: ['role'],
+  returns: ['greeting'],
+  steps: [
+    When('I open the login page'),
+    When('I sign in as the "{role}" user with the saved test credentials'),
+    Then('the dashboard for the "{role}" role is visible'),
+    remember('the greeting message shown in the header', 'greeting'), // fills the declared return
+  ],
+});
+// Registry holding the Login flow, against which callFlow('Login', …) steps are resolved.
+export const registry = createFlowRegistry([loginFlow]);
+// Shared opening step: the JS stand-in for the Background in ./shop.feature.
+const background = Given('the demo shop is open on the home page');
+// JS counterpart of "Scenario: Checkout as admin" in ./shop.feature.
+export const checkoutAsAdmin = scenario('Checkout as admin', [
+  background,
+  callFlow('Login', { role: 'admin' }),
+  remember('the price of the "Trail Backpack" product', 'price'),
+  When('I add the "Trail Backpack" to the cart and open the cart'),
+  Then('the cart total equals {price}'),
+  Then('the cart does not show any error banner'), // written as "But …" after a Then in the feature
+]);
+// JS counterpart of the @soft scenario: Soft(...) replaces the tagged Then step.
+export const promoBanner = scenario('Promo banner is advisory', [
+  background,
+  Soft('a promo banner is visible at the top of the page'),
+]);
+
+// Dynamic authoring: plain JS replaces Scenario Outline examples.
+const roles = ['admin', 'guest'];
+// The feature: the two fixed scenarios above plus one generated login scenario per role.
+export const shopFeature = feature('Checkout with a reusable login flow', [
+  checkoutAsAdmin,
+  promoBanner,
+  ...roles.map((role) =>
+    scenario(`Login greets every role (${role})`, [
+      background,
+      callFlow('Login', { role }),
+      Then('the header greets the user with {greeting}'),
+    ]),
+  ),
+]);
diff --git a/packages/testing-framework/example/flows/shop.overlay.ts b/packages/testing-framework/example/flows/shop.overlay.ts
new file mode 100644
index 0000000000..8530a554e2
--- /dev/null
+++ b/packages/testing-framework/example/flows/shop.overlay.ts
@@ -0,0 +1,38 @@
+/**
+ * POC: hybrid authoring mode — ./shop.feature stays the source of truth,
+ * and this sparse overlay attaches JS only where it adds something. Every
+ * scenario/step not mentioned here runs as pure Gherkin. Drift between this
+ * overlay and the feature fails at bind time with a corrected starter
+ * snippet in the error message (jest-cucumber style).
+ */
+import { join } from 'node:path';
+import { bindFeature } from '@midscene/testing-framework';
+
+// Computed at bind time — exactly the kind of value Gherkin cannot express.
+const couponCode = `E2E-${new Date().toISOString().slice(0, 10)}`; // "E2E-" + today's UTC date (YYYY-MM-DD)
+// The feature path is built from __dirname, so it does not depend on the process cwd.
+export const bound = bindFeature(join(__dirname, 'shop.feature'), {
+  scenarios: {
+    'Checkout as admin': {
+      // (b) inject a computed variable into the scenario's variable table.
+      vars: { couponCode },
+      steps: [
+        {
+          // (c) insert an extra step that uses the injected variable.
+          at: 'I add the "Trail Backpack" to the cart and open the cart',
+          after: ['apply the coupon code {couponCode} in the cart'],
+        },
+        {
+          // (a) override: the total now includes the coupon discount, so
+          // downgrade the exact-total check to a non-gating soft node.
+          at: 'the cart total equals {price}',
+          node: 'soft',
+          template:
+            'the cart total equals {price} minus the "{couponCode}" coupon discount',
+        },
+      ],
+    },
+    // (d) per-scenario config at the IR level.
+    'Promo banner is advisory': { skip: true },
+  },
+});
diff --git a/packages/testing-framework/package.json b/packages/testing-framework/package.json
index e3a10f8e68..0729ff5610 100644
--- a/packages/testing-framework/package.json
+++ b/packages/testing-framework/package.json
@@ -38,6 +38,8 @@
     "test:u": "vitest --run -u"
   },
   "dependencies": {
+    "@cucumber/gherkin": "^39.1.0",
+    "@cucumber/messages": "^32.3.1",
     "@earendil-works/pi-ai": "^0.78.0",
     "@earendil-works/pi-coding-agent": "^0.78.0",
     "@midscene/core": "workspace:*",
diff --git a/packages/testing-framework/scripts/demo/scripted-agents.ts b/packages/testing-framework/scripts/demo/scripted-agents.ts
new file mode 100644
index 0000000000..437c5666b6
--- /dev/null
+++ b/packages/testing-framework/scripts/demo/scripted-agents.ts
@@ -0,0 +1,119 @@
+/**
+ * Offline scripted agents for the reference demo. They simulate a plausible
+ * shop journey (login → greeting → add to cart → totals → coupon) with a tiny
+ * state machine — no browser, no model API. The same shape as the test fakes
+ * in tests/unit-test/helpers, but behavior-driven instead of queue-driven so
+ * all three authoring modes can run against one simulation.
+ */
+import type { Agent } from '@midscene/core/agent';
+import type {
+  GeneralAgentAdapter,
+  GeneralAgentInput,
+  GeneralAgentResult,
+} from '../../src/general-agent/types';
+
+const PRICE = 129.0;
+/** Minimal in-memory shop state that ScriptedUiAgent reads and mutates. */
+class ShopSimulation {
+  role: string | null = null; // null until a sign-in has been simulated
+  inCart = false;
+  couponApplied = false;
+  /** Header greeting with the role's first letter upper-cased, or a placeholder when no role is set. */
+  get greeting(): string {
+    if (!this.role) return '(not signed in)';
+    return `Hello, ${this.role[0].toUpperCase()}${this.role.slice(1)}!`;
+  }
+  /** Cart total: 0 while the cart is empty, otherwise PRICE, reduced by 10% once a coupon is applied. */
+  get total(): number {
+    if (!this.inCart) return 0;
+    return this.couponApplied ? PRICE * 0.9 : PRICE;
+  }
+}
+/** Offline UI-agent fake: regex-matches instruction/prompt text and drives a private ShopSimulation. */
+export class ScriptedUiAgent {
+  private readonly sim = new ShopSimulation();
+  /** Applies the first matching action pattern to the simulation and returns a canned result sentence. */
+  async aiAct(instruction: string): Promise<string> {
+    const signIn = /sign in as the "([^"]+)" user/i.exec(instruction);
+    if (signIn) {
+      this.sim.role = signIn[1]; // the quoted role name captured from the instruction
+      return `Signed in as ${signIn[1]}; the dashboard is shown.`;
+    }
+    if (/add .*to the cart/i.test(instruction)) {
+      this.sim.inCart = true;
+      return 'Added "Trail Backpack" to the cart and opened the cart view.';
+    }
+    if (/apply the coupon code/i.test(instruction)) {
+      this.sim.couponApplied = true; // the code itself is not inspected
+      return `Applied the coupon; the total is now $${this.sim.total.toFixed(2)}.`;
+    }
+    if (/login page/i.test(instruction)) {
+      return 'The login page is open.';
+    }
+    if (/home page/i.test(instruction)) {
+      return 'The shop home page is open.';
+    }
+    return `Done: ${instruction}`; // unrecognized instructions are echoed back as done
+  }
+  /** Returns a fixed completion message; the prompt is ignored. */
+  async aiAsk(_prompt: string): Promise<string> {
+    return 'The requested action was completed on the simulated page.';
+  }
+  /** Keyword-based extraction: greeting, price, or cart badge/count; otherwise a placeholder string. */
+  async aiString(prompt: string): Promise<string> {
+    if (/greeting/i.test(prompt)) return this.sim.greeting;
+    if (/price/i.test(prompt)) return `$${PRICE.toFixed(2)}`;
+    if (/badge|count/i.test(prompt)) return this.sim.inCart ? '1' : '0';
+    return '(no value found on the simulated page)';
+  }
+  /** Stub exposing only `screenshotBase64`, which resolves to a constant placeholder data URL. */
+  interface = {
+    screenshotBase64: async () => 'data:image/png;base64,SIMULATED',
+  };
+  /** Type-level cast to the core `Agent`; no members beyond those defined on this class are added. */
+  asAgent(): Agent {
+    return this as unknown as Agent;
+  }
+  /** One-line summary of the simulated role, cart contents and total. */
+  describeState(): string {
+    return `role=${this.sim.role ?? 'anonymous'}, cart=${this.sim.inCart ? 'Trail Backpack' : 'empty'}, total=$${this.sim.total.toFixed(2)}`;
+  }
+}
+/** Offline general-agent fake: picks a canned text + verdict by regex on the instruction (first match wins). */
+export class ScriptedGeneralAgent implements GeneralAgentAdapter {
+  async run(input: GeneralAgentInput): Promise<GeneralAgentResult> {
+    const i = input.instruction;
+    // The simulated shop has no promo banner — the @soft scenario warns.
+    if (/promo banner/i.test(i)) {
+      return {
+        text: 'I looked at the top of the page and found no promo banner.',
+        verdict: {
+          pass: false,
+          reason: 'No promo banner is present on the simulated shop page.',
+        },
+      };
+    }
+    if (/coupon discount/i.test(i)) { // checked before "cart total": a prompt can contain both phrases
+      return {
+        text: 'The cart shows the discounted total.',
+        verdict: {
+          pass: true,
+          reason: `$${(PRICE * 0.9).toFixed(2)} equals $${PRICE.toFixed(2)} minus the 10% coupon.`,
+        },
+      };
+    }
+    if (/cart total/i.test(i)) {
+      return {
+        text: 'The cart total matches the captured price.',
+        verdict: {
+          pass: true,
+          reason: `The cart shows $${PRICE.toFixed(2)}, matching the remembered price.`,
+        },
+      };
+    }
+    return { // default: every other instruction passes
+      text: 'Confirmed against the simulated screen.',
+      verdict: { pass: true, reason: 'Confirmed on the simulated screen.' },
+    };
+  }
+}
diff --git a/packages/testing-framework/src/flow-ir/index.ts b/packages/testing-framework/src/flow-ir/index.ts
new file mode 100644
index 0000000000..dc5e65275d
--- /dev/null
+++ b/packages/testing-framework/src/flow-ir/index.ts
@@ -0,0 +1,25 @@
+/** POC: shared flow-IR — see `types.ts` for the design notes. */
+export {
+  MAX_FLOW_CALL_DEPTH,
+  assertIdentifier,
+} from './types';
+export type {
+  PromptRole,
+  PromptStepIR,
+  CaptureStepIR,
+  CallFlowStepIR,
+  FlowIRStep,
+  ScenarioIR,
+  ScenarioConfigIR,
+  FlowDefIR,
+  FeatureIR,
+} from './types';
+export { FlowRegistry, createFlowRegistry } from './registry';
+export { substitute, listPlaceholders } from './substitute';
+export type { VariableScope } from './substitute';
+export { runScenario } from './run-scenario';
+export type {
+  RunScenarioOptions,
+  ScenarioRunResult,
+  ScenarioRunEvent,
+} from './run-scenario';
diff --git a/packages/testing-framework/src/flow-ir/registry.ts b/packages/testing-framework/src/flow-ir/registry.ts
new file mode 100644
index 0000000000..5f6db7c43e
--- /dev/null
+++ b/packages/testing-framework/src/flow-ir/registry.ts
@@ -0,0 +1,64 @@
+/**
+ * POC: registry of named flows. Both front-ends register {@link FlowDefIR}s
+ * here; the IR executor resolves `callFlow` steps against it.
+ */
+import { type FlowDefIR, assertIdentifier } from './types';
+
+export class FlowRegistry { // name → flow lookup; every flow is validated when it is registered
+  private readonly flows = new Map<string, FlowDefIR>(); // keyed by flow.name
+
+  register(flow: FlowDefIR): void { // throws on blank/duplicate name, empty steps, or non-identifier params/returns
+    if (!flow.name.trim()) {
+      throw new Error('[midscene] FlowRegistry: a flow must have a name.');
+    }
+    if (this.flows.has(flow.name)) {
+      throw new Error(
+        `[midscene] FlowRegistry: flow "${flow.name}" is already registered.`,
+      );
+    }
+    if (flow.steps.length === 0) {
+      throw new Error(
+        `[midscene] FlowRegistry: flow "${flow.name}" has no steps.`,
+      );
+    }
+    for (const param of flow.params) {
+      assertIdentifier(param, `flow "${flow.name}" params`);
+    }
+    for (const ret of flow.returns) {
+      assertIdentifier(ret, `flow "${flow.name}" returns`);
+    }
+    this.flows.set(flow.name, flow); // only reached when every check above passed
+  }
+
+  registerAll(flows: Iterable<FlowDefIR>): void { // stops at the first invalid flow; flows registered before it stay registered
+    for (const flow of flows) {
+      this.register(flow);
+    }
+  }
+
+  has(name: string): boolean {
+    return this.flows.has(name);
+  }
+
+  get(name: string): FlowDefIR { // throws for an unknown name, listing the registered names
+    const flow = this.flows.get(name);
+    if (!flow) {
+      const known = [...this.flows.keys()].join(', ') || '(none)';
+      throw new Error(
+        `[midscene] Unknown flow "${name}". Registered flows: ${known}.`,
+      );
+    }
+    return flow;
+  }
+
+  names(): string[] { // a fresh array, in registration (Map insertion) order
+    return [...this.flows.keys()];
+  }
+}
+
+/** Builds a FlowRegistry pre-populated with `flows` (none by default). */
+export function createFlowRegistry(flows: FlowDefIR[] = []): FlowRegistry {
+  const populated = new FlowRegistry();
+  for (const flowDef of flows) populated.register(flowDef);
+  return populated;
+}
diff --git a/packages/testing-framework/src/flow-ir/run-scenario.ts b/packages/testing-framework/src/flow-ir/run-scenario.ts
new file mode 100644
index 0000000000..b6b2471193
--- /dev/null
+++ b/packages/testing-framework/src/flow-ir/run-scenario.ts
@@ -0,0 +1,449 @@
+/**
+ * POC: the flow-IR executor. Walks a {@link ScenarioIR} and lowers each IR
+ * step onto the existing Phase 0 engine:
+ *
+ *  - `prompt` steps → `runNode` with the engine's ui / verify / soft / agent
+ *    semantics (verify gates fail-closed, soft only warns, agent is advisory);
+ *  - `capture` steps → a structured extraction via the UI agent
+ *    (`aiString`), with the result written into the machine-owned variable
+ *    table for the current scope;
+ *  - `callFlow` steps → recursive execution of the registered flow with a
+ *    fresh variable scope (declared args only), declared returns copied back
+ *    to the caller scope, and a hard cap on call depth.
+ *
+ * All templates go through mechanical `{varName}` substitution before any
+ * model sees them.
+ */
+import type { Agent } from '@midscene/core/agent';
+import { OutputStoreImpl } from '../engine/output-store';
+import { type RunNodeDeps, runNode } from '../engine/run-node';
+import type { GeneralAgentAdapter } from '../general-agent/types';
+import type { RuntimeNode } from '../runtime';
+import type { CaseResult, StepResult } from '../types';
+import { FlowRegistry } from './registry';
+import { type VariableScope, substitute } from './substitute';
+import {
+  type CallFlowStepIR,
+  type CaptureStepIR,
+  type FlowIRStep,
+  MAX_FLOW_CALL_DEPTH,
+  type PromptStepIR,
+  type ScenarioIR,
+} from './types';
+
+/**
+ * Observability events emitted while a scenario runs (e.g. for the demo's
+ * narrated walkthrough). Purely informational — handlers cannot alter
+ * execution.
+ */
+export type ScenarioRunEvent =
+  | {
+      type: 'stepStart';
+      index: number;
+      node: string;
+      /** Resolved input (after `{var}` substitution). */
+      input: string;
+      /** The authored template, when it differs from the resolved input. */
+      template?: string;
+      depth: number;
+    }
+  | { type: 'stepEnd'; result: StepResult; depth: number }
+  | {
+      type: 'varSet';
+      name: string;
+      value: string;
+      source: 'seed' | 'capture' | 'return';
+      depth: number;
+    }
+  | {
+      type: 'flowEnter';
+      flowName: string;
+      args: Record<string, string>;
+      depth: number;
+    }
+  | {
+      type: 'flowExit';
+      flowName: string;
+      returns: Record<string, string>;
+      depth: number;
+    };
+
+export interface RunScenarioOptions {
+  scenario: ScenarioIR;
+  /** Resolves `callFlow` steps. Defaults to an empty registry. */
+  registry?: FlowRegistry;
+  /** Source file the scenario came from, for reporting. */
+  file?: string;
+  uiAgent: Agent;
+  generalAgent: GeneralAgentAdapter;
+  runtimeNodes?: Record<string, RuntimeNode>;
+  projectRoot?: string;
+  env?: NodeJS.ProcessEnv;
+  /** Optional observer for narration/debugging. */
+  onEvent?: (event: ScenarioRunEvent) => void;
+}
+
+/** A {@link CaseResult} plus the final machine-owned variable table. */
+export interface ScenarioRunResult extends CaseResult {
+  /** Top-level scope after the run (captures + seed vars + flow returns). */
+  variables: Record<string, string>;
+}
+
+interface ExecCtx {
+  registry: FlowRegistry;
+  uiAgent: Agent;
+  generalAgent: GeneralAgentAdapter;
+  runtimeNodes: Record<string, RuntimeNode>;
+  projectRoot: string;
+  env: NodeJS.ProcessEnv;
+  caseName: string;
+  caseFile: string;
+  outputs: OutputStoreImpl;
+  state: Record<string, unknown>;
+  steps: StepResult[];
+  warnings: string[];
+  emit: (event: ScenarioRunEvent) => void;
+}
+
+/**
+ * Execute one compiled scenario. Mirrors the engine's `runCase` contract:
+ * never throws for step-level failures; a gating failure stops the flow.
+ */
+export async function runScenario(
+  options: RunScenarioOptions,
+): Promise<ScenarioRunResult> {
+  const { scenario } = options;
+  const ctx: ExecCtx = {
+    registry: options.registry ?? new FlowRegistry(), // with the empty default, every callFlow step fails as an unknown flow
+    uiAgent: options.uiAgent,
+    generalAgent: options.generalAgent,
+    runtimeNodes: options.runtimeNodes ?? {},
+    projectRoot: options.projectRoot ?? process.cwd(),
+    env: options.env ?? process.env,
+    caseName: scenario.name,
+    caseFile: options.file ?? '<ir>',
+    outputs: new OutputStoreImpl(),
+    state: {},
+    steps: [], // flat list: steps executed inside flows are appended here too
+    warnings: [],
+    emit: options.onEvent ?? (() => {}), // no observer → events are dropped
+  };
+
+  const scope: VariableScope = new Map(Object.entries(scenario.vars ?? {})); // top-level scope, seeded from scenario.vars
+  for (const [name, value] of scope) {
+    ctx.emit({ type: 'varSet', name, value, source: 'seed', depth: 0 });
+  }
+  const startedAt = Date.now();
+
+  const ok = await execSteps(scenario.steps, scope, 0, ctx); // false ⇒ a step failed and the remaining steps were skipped
+
+  return {
+    name: scenario.name,
+    file: ctx.caseFile,
+    status: ok ? 'passed' : 'failed',
+    steps: ctx.steps,
+    warnings: ctx.warnings,
+    durationMs: Date.now() - startedAt,
+    reportFile: getReportFile(ctx.uiAgent),
+    variables: Object.fromEntries(scope), // snapshot of the top-level scope only; callee scopes are not included
+  };
+}
+
+/** Runs `steps` in order; resolves false as soon as one of them fails. */
+async function execSteps(
+  steps: FlowIRStep[],
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  for (const current of steps) {
+    // A failed step gates everything that comes after it.
+    if (!(await execStep(current, scope, depth, ctx))) return false;
+  }
+  return true;
+}
+
+async function execStep( // dispatches one IR step to the executor for its kind
+  step: FlowIRStep,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  switch (step.kind) { // no default branch: the three cases cover every member of the FlowIRStep union
+    case 'prompt':
+      return execPromptStep(step, scope, depth, ctx);
+    case 'capture':
+      return execCaptureStep(step, scope, depth, ctx);
+    case 'callFlow':
+      return execCallFlowStep(step, scope, depth, ctx);
+  }
+}
+
+/** Runs one prompt step through `runNode`; thrown errors become a failed step. */
+async function execPromptStep(
+  step: PromptStepIR,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  const index = ctx.steps.length;
+  const stepStart = Date.now();
+  const where = `${ctx.caseName} step ${index + 1} (${step.node})`;
+
+  let stepResult: StepResult;
+  try {
+    // Substitution happens here, mechanically, before any model call.
+    const resolved = substitute(step.template, scope, where);
+    ctx.emit({
+      type: 'stepStart',
+      index,
+      node: step.node,
+      input: resolved,
+      template: resolved === step.template ? undefined : step.template,
+      depth,
+    });
+    const outcome = await runNode(step.node, resolved, nodeDeps(ctx));
+    stepResult = {
+      index,
+      node: step.node,
+      input: resolved,
+      status: outcome.status,
+      output: outcome.output,
+      verdict: outcome.verdict,
+      error: outcome.error,
+      durationMs: Date.now() - stepStart,
+    };
+  } catch (err) {
+    stepResult = {
+      index,
+      node: step.node,
+      input: step.template,
+      status: 'failed',
+      // Non-Error throwables carry no `.message`; stringify them so the
+      // failed step always records a reason.
+      error: err instanceof Error ? err.message : String(err),
+      durationMs: Date.now() - stepStart,
+    };
+  }
+
+  recordStep(stepResult, depth, ctx);
+  return stepResult.status !== 'failed';
+}
+
+/** Captures a value via `uiAgent.aiString` into `scope`; thrown errors become a failed step. */
+async function execCaptureStep(
+  step: CaptureStepIR,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  const index = ctx.steps.length;
+  const stepStart = Date.now();
+  const where = `${ctx.caseName} step ${index + 1} (capture ${step.varName})`;
+
+  let stepResult: StepResult;
+  try {
+    const resolved = substitute(step.template, scope, where);
+    ctx.emit({
+      type: 'stepStart',
+      index,
+      node: 'capture',
+      input: resolved,
+      template: resolved === step.template ? undefined : step.template,
+      depth,
+    });
+    // Lower to a structured extraction on the UI agent. The value is
+    // machine-owned: it goes into the variable table, not into model prose.
+    const value = String(await ctx.uiAgent.aiString(resolved));
+    scope.set(step.varName, value);
+    ctx.emit({
+      type: 'varSet',
+      name: step.varName,
+      value,
+      source: 'capture',
+      depth,
+    });
+
+    stepResult = {
+      index,
+      node: 'capture',
+      input: resolved,
+      status: 'info',
+      output: {
+        text: `Captured variable {${step.varName}} = ${JSON.stringify(value)} (${resolved}).`,
+        structured: { [step.varName]: value },
+      },
+      durationMs: Date.now() - stepStart,
+    };
+  } catch (err) {
+    stepResult = {
+      index,
+      node: 'capture',
+      input: step.template,
+      status: 'failed',
+      // Non-Error throwables carry no `.message`; stringify them so the
+      // failed step always records a reason.
+      error: err instanceof Error ? err.message : String(err),
+      durationMs: Date.now() - stepStart,
+    };
+  }
+
+  recordStep(stepResult, depth, ctx);
+  return stepResult.status !== 'failed';
+}
+
+/** Runs a registered flow in a fresh scope seeded with its declared args; declared returns are copied back. */
+async function execCallFlowStep(
+  step: CallFlowStepIR,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  const index = ctx.steps.length;
+  const stepStart = Date.now();
+  const where = `${ctx.caseName} step ${index + 1} (flow "${step.flowName}")`;
+
+  try {
+    if (depth + 1 > MAX_FLOW_CALL_DEPTH) {
+      throw new Error(
+        `[midscene] ${where}: flow call depth exceeds the cap of ${MAX_FLOW_CALL_DEPTH}. Flatten the composition instead of nesting deeper.`,
+      );
+    }
+    const flow = ctx.registry.get(step.flowName);
+
+    for (const arg of Object.keys(step.args)) {
+      if (!flow.params.includes(arg)) {
+        throw new Error(
+          `[midscene] ${where}: unknown argument "${arg}". Declared params: ${flow.params.join(', ') || '(none)'}.`,
+        );
+      }
+    }
+    const resolvedArgs: Record<string, string> = {};
+    const childScope: VariableScope = new Map();
+    for (const param of flow.params) {
+      const template = step.args[param];
+      if (template === undefined) {
+        throw new Error(
+          `[midscene] ${where}: missing argument "${param}" (declared params: ${flow.params.join(', ')}).`,
+        );
+      }
+      // Args are resolved against the CALLER scope; the callee scope is fresh.
+      const value = substitute(template, scope, `${where} arg "${param}"`);
+      resolvedArgs[param] = value;
+      childScope.set(param, value);
+    }
+
+    // TODO(POC): flow.memo === 'once-per-run' should look up a per-run memo
+    // table keyed by (flowName, resolvedArgs) and replay returns on a hit.
+    // For now every call executes.
+
+    ctx.emit({
+      type: 'flowEnter',
+      flowName: step.flowName,
+      args: resolvedArgs,
+      depth: depth + 1,
+    });
+    recordStep(
+      {
+        index,
+        node: 'flow',
+        input: formatCall(step.flowName, resolvedArgs),
+        status: 'info',
+        output: {
+          text: `Entering flow "${step.flowName}" with ${formatArgs(resolvedArgs)}.`,
+        },
+        durationMs: Date.now() - stepStart,
+      },
+      depth,
+      ctx,
+    );
+
+    const ok = await execSteps(flow.steps, childScope, depth + 1, ctx);
+    if (!ok) return false;
+
+    // Only declared returns flow back; everything else in the callee scope
+    // is discarded.
+    const returns: Record<string, string> = {};
+    for (const ret of flow.returns) {
+      const value = childScope.get(ret);
+      if (value === undefined) {
+        throw new Error(
+          `[midscene] ${where}: flow declares return "${ret}" but never captured it.`,
+        );
+      }
+      scope.set(ret, value);
+      returns[ret] = value;
+      ctx.emit({ type: 'varSet', name: ret, value, source: 'return', depth });
+    }
+    ctx.emit({
+      type: 'flowExit',
+      flowName: step.flowName,
+      returns,
+      depth: depth + 1,
+    });
+    return true;
+  } catch (err) {
+    recordStep(
+      {
+        index: ctx.steps.length,
+        node: 'flow',
+        input: formatCall(step.flowName, step.args),
+        status: 'failed',
+        // Non-Error throwables carry no `.message`; stringify them instead.
+        error: err instanceof Error ? err.message : String(err),
+        durationMs: Date.now() - stepStart,
+      },
+      depth,
+      ctx,
+    );
+    return false;
+  }
+}
+
+function nodeDeps(ctx: ExecCtx): RunNodeDeps { // builds the dependency bag handed to runNode from the per-run context
+  return {
+    uiAgent: ctx.uiAgent,
+    generalAgent: ctx.generalAgent,
+    runtimeNodes: ctx.runtimeNodes,
+    outputs: ctx.outputs,
+    state: ctx.state,
+    projectRoot: ctx.projectRoot,
+    caseName: ctx.caseName,
+    caseFile: ctx.caseFile,
+    pastSteps: ctx.steps, // the live array, not a copy: it grows as recordStep pushes results
+    env: ctx.env,
+  };
+}
+
+/** Mirror `runCase`'s bookkeeping for outputs and warnings. */
+function recordStep(stepResult: StepResult, depth: number, ctx: ExecCtx): void {
+  ctx.emit({ type: 'stepEnd', result: stepResult, depth }); // observers are notified before the step is appended to ctx.steps
+  ctx.steps.push(stepResult);
+  if (stepResult.output) {
+    ctx.outputs.add(stepResult.node, stepResult.index, stepResult.output);
+  }
+  if (stepResult.status === 'warning' && stepResult.error) {
+    ctx.warnings.push(stepResult.error);
+  }
+  if (stepResult.status === 'warning' && stepResult.verdict) { // NOTE(review): a warning step carrying both `error` and `verdict` adds two warnings — confirm this matches runCase
+    ctx.warnings.push(
+      `soft check failed at step ${stepResult.index + 1} (${stepResult.node}): ${stepResult.verdict.reason}`,
+    );
+  }
+}
+
+function formatCall(flowName: string, args: Record<string, string>): string { // renders e.g. Login(role="admin"), or Login(no arguments)
+  return `${flowName}(${formatArgs(args)})`;
+}
+
+/** Renders args as comma-separated `name="value"` pairs, or 'no arguments' when empty. */
+function formatArgs(args: Record<string, string>): string {
+  const rendered = Object.keys(args).map((key) => `${key}=${JSON.stringify(args[key])}`);
+  return rendered.length > 0 ? rendered.join(', ') : 'no arguments';
+}
+
+function getReportFile(agent: Agent): string | undefined { // reads `reportFile` through a cast; the Agent type is not relied on to declare it
+  const candidate = (agent as unknown as { reportFile?: string | null })
+    .reportFile;
+  return candidate ?? undefined; // null is normalized to undefined
+}
diff --git a/packages/testing-framework/src/flow-ir/substitute.ts b/packages/testing-framework/src/flow-ir/substitute.ts
new file mode 100644
index 0000000000..95ace24600
--- /dev/null
+++ b/packages/testing-framework/src/flow-ir/substitute.ts
@@ -0,0 +1,36 @@
+/**
+ * POC: mechanical `{varName}` substitution against the scenario-scoped
+ * variable table. This runs BEFORE any prompt is sent to a model — the model
+ * only ever sees the resolved text. Unknown placeholders throw (fail fast on
+ * typos rather than letting the model guess).
+ */
+
+const PLACEHOLDER = /\{([A-Za-z_][A-Za-z0-9_]*)\}/g;
+
+export type VariableScope = Map<string, string>;
+
+export function substitute( // replaces every {name} in `template` with its value from `vars`; throws on an unknown name
+  template: string,
+  vars: ReadonlyMap<string, string>,
+  where: string, // location label used only as the prefix of the error message
+): string {
+  return template.replace(PLACEHOLDER, (_match, name: string) => { // `name` is capture group 1: the identifier between the braces
+    const value = vars.get(name);
+    if (value === undefined) { // only a missing key fails; an empty-string value is substituted as-is
+      const known = [...vars.keys()].join(', ') || '(none)';
+      throw new Error(
+        `[midscene] ${where}: unknown variable {${name}}. Variables in scope: ${known}.`,
+      );
+    }
+    return value; // inserted verbatim: the result of a replacer function is not re-scanned for placeholders
+  });
+}
+
+/**
+ * Names of every `{varName}` placeholder found in `template`, in order of
+ * appearance (repeated names are kept).
+ */
+export function listPlaceholders(template: string): string[] {
+  const found = template.matchAll(PLACEHOLDER);
+  return Array.from(found, (hit) => hit[1]);
+}
diff --git a/packages/testing-framework/src/flow-ir/types.ts b/packages/testing-framework/src/flow-ir/types.ts
new file mode 100644
index 0000000000..acbfd5b283
--- /dev/null
+++ b/packages/testing-framework/src/flow-ir/types.ts
@@ -0,0 +1,121 @@
+/**
+ * POC: shared flow intermediate representation (flow-IR).
+ *
+ * Both authoring front-ends (the JS/TS fluent API in `frontends/js` and the
+ * Gherkin compiler in `frontends/gherkin`) compile to this IR. The IR executor
+ * (`run-scenario.ts`) then lowers each IR step onto the engine's existing node
+ * kinds (ui / verify / soft / agent via `runNode`), adding two capabilities on
+ * top of the Phase 0 engine:
+ *
+ *  - a scenario-scoped VARIABLE TABLE: `capture` steps extract machine-owned
+ *    values through the UI agent (`aiString`), and `{varName}` placeholders in
+ *    later step templates are substituted mechanically BEFORE the prompt is
+ *    sent to any model. Model-owned prose conclusions keep flowing through the
+ *    existing `StepOutput` channel — the two channels never mix.
+ *  - NAMED FLOWS: parameterized, reusable prompt sequences registered in a
+ *    {@link FlowRegistry}-shaped registry. A `callFlow` step runs the callee
+ *    with a fresh variable scope (seeded only with the declared args); only
+ *    the callee's declared `returns` flow back into the caller scope. UI /
+ *    browser state is naturally shared (same UI agent).
+ */
+import type { BuiltinNodeType } from '../types';
+
+/** Keyword→policy mapping: what authoring role a prompt step plays. */
+export type PromptRole = 'setup' | 'action' | 'assertion' | 'advisory';
+
+/**
+ * A natural-language prompt step. Lowers 1:1 onto an engine node:
+ * given-like → `ui` (setup), when-like → `ui` (action), then-like → `verify`
+ * (fail-closed), soft variants → `soft`, advisory → `agent`.
+ */
+export interface PromptStepIR {
+  kind: 'prompt';
+  node: BuiltinNodeType;
+  role: PromptRole;
+  /** Natural-language template; may contain `{varName}` placeholders. */
+  template: string;
+}
+
+/**
+ * Variable capture ("remember ... as varName"). Lowers to a structured
+ * extraction via the UI agent (`aiString`), storing the result in the current
+ * variable scope under {@link CaptureStepIR.varName}.
+ */
+export interface CaptureStepIR {
+  kind: 'capture';
+  /** What to extract, as natural language; may contain `{varName}` placeholders. */
+  template: string;
+  /** Machine-owned variable name the captured value is stored under. */
+  varName: string;
+}
+
+/** Invocation of a named flow from the registry. */
+export interface CallFlowStepIR {
+  kind: 'callFlow';
+  flowName: string;
+  /**
+   * Arguments by declared param name. Values are templates: `{varName}`
+   * placeholders are substituted against the CALLER scope before the call.
+   */
+  args: Record<string, string>;
+}
+
+export type FlowIRStep = PromptStepIR | CaptureStepIR | CallFlowStepIR;
+
+/**
+ * Per-scenario execution config attached at the IR level (e.g. by a
+ * `bindFeature` overlay). The IR executor itself ignores these — they are a
+ * contract for the runner layer (which is out of scope for this POC).
+ */
+export interface ScenarioConfigIR {
+  skip?: boolean;
+  only?: boolean;
+}
+
+/** A runnable scenario compiled from either front-end. */
+export interface ScenarioIR {
+  name: string;
+  steps: FlowIRStep[];
+  /** Seed variables (e.g. computed at build time by the JS front-end). */
+  vars?: Record<string, string>;
+  /** Front-end tags (e.g. Gherkin `@soft`), kept for reporting. */
+  tags?: string[];
+  /** Runner-facing flags (skip/only); absent unless explicitly attached. */
+  config?: ScenarioConfigIR;
+}
+
+/** A named, parameterized, reusable prompt sequence. */
+export interface FlowDefIR {
+  name: string;
+  /** Declared argument names; the fresh callee scope is seeded with exactly these. */
+  params: string[];
+  /** Variable names copied back into the caller scope after the flow finishes. */
+  returns: string[];
+  steps: FlowIRStep[];
+  /**
+   * Memoization tier. Only 'none' is implemented.
+   * TODO(POC): 'once-per-run' should skip re-execution and replay the
+   * memoized returns when the flow is called again with identical args.
+   */
+  memo?: 'none' | 'once-per-run';
+}
+
+/** A group of scenarios (Gherkin Feature / JS `feature()` builder). */
+export interface FeatureIR {
+  name: string;
+  scenarios: ScenarioIR[];
+}
+
+/** Flow calls may nest at most this deep (scenario itself is depth 0). */
+export const MAX_FLOW_CALL_DEPTH = 2;
+
+const IDENTIFIER = /^[A-Za-z_][A-Za-z0-9_]*$/;
+
+/** Variable / param names must be simple identifiers so `{name}` is unambiguous. */
+export function assertIdentifier(name: string, where: string): void { // `where` only prefixes the error message
+  if (!IDENTIFIER.test(name)) { // IDENTIFIER has no `g` flag, so test() keeps no state between calls
+    throw new Error(
+      `[midscene] ${where}: "${name}" is not a valid variable name (expected /^[A-Za-z_][A-Za-z0-9_]*$/).`,
+    );
+  }
+}
diff --git a/packages/testing-framework/src/frontends/gherkin/index.ts b/packages/testing-framework/src/frontends/gherkin/index.ts
new file mode 100644
index 0000000000..8cb86c0529
--- /dev/null
+++ b/packages/testing-framework/src/frontends/gherkin/index.ts
@@ -0,0 +1,237 @@
+/**
+ * POC: Gherkin authoring front-end over the shared flow-IR.
+ *
+ * `.feature` files are parsed with `@cucumber/gherkin` and compiled through
+ * its pickles API — Scenario Outline expansion (example values substituted
+ * into step text), Background merging (leading steps) and tag inheritance all
+ * come for free. Each pickle is then compiled to the same {@link ScenarioIR}
+ * the JS front-end produces.
+ *
+ * Keyword→policy mapping (pickle step types already resolve And/But to the
+ * last primary keyword):
+ *  - Given (Context)  → ui node, setup semantics
+ *  - When  (Action)   → ui node, action
+ *  - Then  (Outcome)  → verify node (fail-closed), or soft when the scenario
+ *    carries the `@soft` tag
+ *  - `*`   (Unknown)  → ui node, action
+ *
+ * Step conventions:
+ *  - `I remember <description> as "varName"` → variable capture
+ *  - `I run the "FlowName" flow with arg "value" and other "value"` → flow
+ *    invocation
+ *
+ * Flow definitions: a Scenario tagged `@flow` is registered as a named flow
+ * instead of a runnable scenario. Params and returns are declared as tags:
+ * `@param:role`, `@returns:greeting`.
+ */
+import { readFileSync } from 'node:fs';
+import {
+  AstBuilder,
+  GherkinClassicTokenMatcher,
+  Parser,
+  compile,
+} from '@cucumber/gherkin';
+import {
+  type GherkinDocument,
+  IdGenerator,
+  type Pickle,
+  type PickleStep,
+  PickleStepType,
+} from '@cucumber/messages';
+import type {
+  FlowDefIR,
+  FlowIRStep,
+  PromptStepIR,
+  ScenarioIR,
+} from '../../flow-ir';
+import { assertIdentifier } from '../../flow-ir';
+
+export interface CompiledFeature {
+  name: string;
+  /** Runnable scenarios (everything not tagged `@flow`). */
+  scenarios: ScenarioIR[];
+  /** Flow definitions (scenarios tagged `@flow`), ready for a FlowRegistry. */
+  flows: FlowDefIR[];
+}
+
+const REMEMBER_STEP = /^I remember (.+?) as "([A-Za-z_][A-Za-z0-9_]*)"$/i;
+const CALL_FLOW_STEP = /^I run the "([^"]+)" flow(?: with (.+))?$/i;
+const CALL_FLOW_ARG = /([A-Za-z_][A-Za-z0-9_]*)\s+"([^"]*)"/g;
+const PARAM_TAG = /^@param:([A-Za-z_][A-Za-z0-9_]*)$/;
+const RETURNS_TAG = /^@returns?:([A-Za-z_][A-Za-z0-9_]*)$/;
+
+/** Compile Gherkin source text into IR scenarios and flow definitions; parse failures are rethrown as a `[midscene]` Error naming `uri`. */
+export function compileFeature(
+  source: string,
+  uri = '<inline>',
+): CompiledFeature {
+  const newId = IdGenerator.uuid(); // one id generator shared by the AST builder and the pickle compiler
+  const parser = new Parser(
+    new AstBuilder(newId),
+    new GherkinClassicTokenMatcher(),
+  );
+
+  let pickles: readonly Pickle[];
+  let featureName: string;
+  let backgroundStepIds: Set<string>;
+  try {
+    const document = parser.parse(source);
+    featureName = document.feature?.name ?? uri;
+    backgroundStepIds = collectBackgroundStepIds(document);
+    pickles = compile(document, uri, newId);
+  } catch (err) {
+    // Non-Error throwables carry no `.message`; stringify them instead.
+    const reason = err instanceof Error ? err.message : String(err);
+    throw new Error(`[midscene] Failed to parse Gherkin in ${uri}: ${reason}`);
+  }
+
+  const scenarios: ScenarioIR[] = [];
+  const flows: FlowDefIR[] = [];
+
+  for (const pickle of pickles) {
+    const tags = pickle.tags.map((t) => t.name);
+    if (tags.includes('@flow')) { // @flow pickles become flow definitions instead of runnable scenarios
+      flows.push(compileFlowDef(pickle, tags, uri, backgroundStepIds));
+    } else {
+      scenarios.push(compileScenario(pickle, tags, uri));
+    }
+  }
+
+  return { name: featureName, scenarios, flows };
+}
+
+/** Convenience wrapper: read and compile a `.feature` file. */
+export function compileFeatureFile(file: string): CompiledFeature {
+  return compileFeature(readFileSync(file, 'utf-8'), file); // the file path doubles as the `uri` used in error messages
+}
+
+function compileScenario( // turns one non-@flow pickle into a runnable ScenarioIR
+  pickle: Pickle,
+  tags: string[],
+  uri: string,
+): ScenarioIR {
+  const isSoft = tags.includes('@soft'); // @soft turns the scenario's Outcome (Then) steps into soft checks
+  return {
+    name: pickle.name,
+    steps: pickle.steps.map((step) =>
+      compileStep(step, { isSoft, where: `${uri}: "${pickle.name}"` }),
+    ),
+    tags, // every tag is kept, including @soft
+  };
+}
+
+function compileFlowDef( // turns one @flow pickle into a FlowDefIR
+  pickle: Pickle,
+  tags: string[],
+  uri: string,
+  backgroundStepIds: Set<string>,
+): FlowDefIR {
+  const where = `${uri}: flow "${pickle.name}"`;
+  const params: string[] = [];
+  const returns: string[] = [];
+  for (const tag of tags) { // "@param:name" feeds params; "@returns:name" (or "@return:name") feeds returns; other tags are ignored here
+    const param = PARAM_TAG.exec(tag);
+    if (param) params.push(param[1]);
+    const ret = RETURNS_TAG.exec(tag);
+    if (ret) returns.push(ret[1]);
+  }
+  const isSoft = tags.includes('@soft'); // @soft on a flow turns its Outcome (Then) steps into soft checks
+  // Background steps belong to runnable scenarios, not to reusable flows:
+  // a flow invoked mid-scenario must not replay the feature's setup.
+  const steps = pickle.steps.filter(
+    (step) => !step.astNodeIds.some((id) => backgroundStepIds.has(id)),
+  );
+  return { // `memo` is left unset
+    name: pickle.name,
+    params,
+    returns,
+    steps: steps.map((step) => compileStep(step, { isSoft, where })),
+  };
+}
+
+/** Collects the id of every Background step, at feature level or inside a Rule. */
+function collectBackgroundStepIds(document: GherkinDocument): Set<string> {
+  const stepIds = new Set<string>();
+  const featureChildren = document.feature?.children ?? [];
+  for (const featureChild of featureChildren) {
+    // A child holding its own background wins; otherwise look inside its rule.
+    const found = featureChild.background
+      ? [featureChild.background]
+      : (featureChild.rule?.children ?? []).flatMap((ruleChild) =>
+          ruleChild.background === undefined ? [] : [ruleChild.background],
+        );
+    for (const step of found.flatMap((background) => background.steps)) {
+      stepIds.add(step.id);
+    }
+  }
+  return stepIds;
+}
+
+function compileStep( // classifies a pickle step: remember-capture first, then flow call, else a plain prompt
+  step: PickleStep,
+  opts: { isSoft: boolean; where: string },
+): FlowIRStep {
+  const text = step.text.trim();
+
+  const remember = REMEMBER_STEP.exec(text);
+  if (remember) {
+    const [, description, varName] = remember;
+    assertIdentifier(varName, opts.where); // REMEMBER_STEP's second group already uses the same identifier pattern; kept as a guard
+    return { kind: 'capture', template: description.trim(), varName };
+  }
+
+  const call = CALL_FLOW_STEP.exec(text);
+  if (call) {
+    const [, flowName, argClause] = call; // argClause is undefined when the step has no "with …" part
+    return {
+      kind: 'callFlow',
+      flowName,
+      args: parseCallArgs(argClause, flowName, opts.where),
+    };
+  }
+
+  return promptFromPickleType(step, text, opts);
+}
+
+function parseCallArgs( // parses the `name "value"` pairs of a flow-call step's "with …" clause
+  argClause: string | undefined,
+  flowName: string,
+  where: string,
+): Record<string, string> {
+  const args: Record<string, string> = {};
+  if (argClause === undefined) return args; // no "with …" clause → no arguments
+
+  const matches = [...argClause.matchAll(CALL_FLOW_ARG)];
+  if (matches.length === 0) { // NOTE(review): only a clause with zero pairs fails; text around valid pairs is silently ignored
+    throw new Error(
+      `[midscene] ${where}: could not parse arguments for flow "${flowName}" from "${argClause}". Expected: with name "value" and other "value".`,
+    );
+  }
+  for (const [, name, value] of matches) {
+    args[name] = value; // NOTE(review): a repeated name keeps the last value without any error
+  }
+  return args;
+}
+
+function promptFromPickleType( // maps the pickle step type to the node kind and role of a prompt step
+  step: PickleStep,
+  text: string,
+  opts: { isSoft: boolean; where: string }, // only `isSoft` is read in this function
+): PromptStepIR {
+  // Pickle step types come from the Gherkin compiler, which already resolves
+  // And/But (conjunctions) to the last primary keyword.
+  switch (step.type) {
+    case PickleStepType.CONTEXT:
+      return { kind: 'prompt', node: 'ui', role: 'setup', template: text };
+    case PickleStepType.OUTCOME:
+      return {
+        kind: 'prompt',
+        node: opts.isSoft ? 'soft' : 'verify', // soft when the caller flagged the scenario/flow as @soft
+        role: 'assertion',
+        template: text,
+      };
+    default:
+      // ACTION and UNKNOWN (`*` bullets) both run as plain UI actions.
+      return { kind: 'prompt', node: 'ui', role: 'action', template: text };
+  }
+}
diff --git a/packages/testing-framework/src/frontends/js/bind-feature.ts b/packages/testing-framework/src/frontends/js/bind-feature.ts
new file mode 100644
index 0000000000..8ce886758a
--- /dev/null
+++ b/packages/testing-framework/src/frontends/js/bind-feature.ts
@@ -0,0 +1,386 @@
+import type {
+  CallFlowStepIR,
+  FlowIRStep,
+  ScenarioConfigIR,
+  ScenarioIR,
+} from '../../flow-ir';
+import { assertIdentifier } from '../../flow-ir';
+/**
+ * POC: hybrid authoring mode — `bindFeature(featurePathOrSource, overlay)`.
+ *
+ * Inspired by jest-cucumber's inverted model, with one deliberate inversion of
+ * its inversion: jest-cucumber requires every step to be restated in JS
+ * because steps need somewhere to put code. AI execution removes that need,
+ * so the overlay here is SPARSE — the `.feature` file stays the source of
+ * truth, scenarios/steps not mentioned in the overlay run as pure Gherkin,
+ * and JS only attaches where it adds something:
+ *
+ *  - override an anchored step's prompt template or node kind;
+ *  - inject computed variables into the scenario's variable table, or extra
+ *    args into an anchored flow call;
+ *  - insert extra IR steps before/after an anchored step;
+ *  - attach per-scenario runner config (skip/only) at the IR level.
+ *
+ * Binding glue is title + anchor (exact step text or index), and drift is
+ * validated at bind time with jest-cucumber-style errors: closest matches
+ * plus a ready-to-paste corrected overlay snippet.
+ */
+import type { BuiltinNodeType } from '../../types';
+import {
+  type CompiledFeature,
+  compileFeature,
+  compileFeatureFile,
+} from '../gherkin';
+import { type StepInput, When } from './index';
+
+/** Anchor a step by its exact text (see {@link anchorText}) or by its 0-based index in the scenario's original step list. */
+export type StepAnchor = string | number;
+
+export interface StepOverlay {
+  /**
+   * Which step this overlay binds to. Text anchors match the step's
+   * "anchor text": the prompt template for prompt steps, the capture
+   * description for `remember` steps, and the flow name for flow calls.
+   */
+  at: StepAnchor;
+  /** Override the step's natural-language template (prompt/capture steps). */
+  template?: string;
+  /** Override the node kind (prompt steps only), e.g. verify → soft. */
+  node?: BuiltinNodeType;
+  /** Merge computed args into an anchored flow call; values are stringified and win over same-named feature args. */
+  args?: Record<string, string | number | boolean>;
+  /** Extra steps inserted before/after the anchored step; bare strings become `When` steps. */
+  before?: StepInput[];
+  after?: StepInput[];
+}
+
+export interface ScenarioOverlay {
+  /** Computed variables merged into the scenario's variable table (stringified; overlay values win on a name clash). */
+  vars?: Record<string, string | number | boolean>;
+  steps?: StepOverlay[]; // per-step patches, each bound by its `at` anchor
+  /** Runner-facing flags, attached to the IR as `scenario.config`. */
+  skip?: boolean;
+  only?: boolean;
+}
+
+export interface FeatureOverlay {
+  /** Keyed by scenario title. Unmentioned scenarios run as pure Gherkin; a title matching no scenario throws at bind time. */
+  scenarios?: Record<string, ScenarioOverlay>;
+}
+
+/**
+ * Bind a sparse JS overlay onto a compiled `.feature`. The first argument is
+ * treated as inline Gherkin source when it contains a newline, and as a file
+ * path otherwise. Any drift between overlay and feature throws at bind time.
+ */
+export function bindFeature(
+  featurePathOrSource: string,
+  overlay: FeatureOverlay = {},
+): CompiledFeature {
+  let uri = featurePathOrSource;
+  let compiled: CompiledFeature;
+  if (featurePathOrSource.includes('\n')) {
+    uri = '<inline>';
+    compiled = compileFeature(featurePathOrSource, uri);
+  } else {
+    compiled = compileFeatureFile(featurePathOrSource);
+  }
+  const byTitle = overlay.scenarios ?? {};
+
+  // Fail fast on the first overlay key that matches no scenario title.
+  const known = new Set(compiled.scenarios.map((sc) => sc.name));
+  const stray = Object.keys(byTitle).find((title) => !known.has(title));
+  if (stray !== undefined) throw unknownScenarioError(stray, compiled, uri);
+
+  // Scenario Outline expansions share one title, so a single overlay entry
+  // is applied to each of them; unmentioned scenarios pass through as-is.
+  const scenarios = compiled.scenarios.map((sc) => {
+    const match = byTitle[sc.name];
+    return match ? applyScenarioOverlay(sc, match, uri) : sc;
+  });
+
+  return { ...compiled, scenarios };
+}
+
+/** Text that a step overlay's string `at:` anchor is compared with. */
+export function anchorText(step: FlowIRStep): string {
+  // Flow calls anchor on the flow name; prompt and capture steps anchor on
+  // their template.
+  if (step.kind === 'callFlow') {
+    return step.flowName;
+  }
+
+  return step.template;
+}
+
+// ———————————————————————— overlay application ————————————————————————
+
+/**
+ * Apply one scenario's overlay: patch/insert steps at their anchors, merge
+ * overlay vars over the scenario's vars, and merge skip/only flags into its
+ * config. Returns a new ScenarioIR; anchor or field drift throws.
+ */
+function applyScenarioOverlay(
+  scenario: ScenarioIR,
+  overlay: ScenarioOverlay,
+  uri: string,
+): ScenarioIR {
+  const where = `bindFeature(${uri}): scenario "${scenario.name}"`;
+  interface Patch {
+    overlays: StepOverlay[];
+    before: FlowIRStep[];
+    after: FlowIRStep[];
+  }
+  const patches = new Map<number, Patch>();
+
+  for (const stepOverlay of overlay.steps ?? []) {
+    // All anchors resolve against the ORIGINAL step list, so several
+    // overlays never shift each other's positions.
+    const index = resolveAnchor(stepOverlay.at, scenario, uri);
+    validateStepOverlay(stepOverlay, scenario.steps[index], index, where);
+
+    const patch = patches.get(index) ?? { overlays: [], before: [], after: [] };
+    patch.overlays.push(stepOverlay);
+    patch.before.push(...normalizeInserts(stepOverlay.before));
+    patch.after.push(...normalizeInserts(stepOverlay.after));
+    patches.set(index, patch);
+  }
+
+  const steps: FlowIRStep[] = [];
+  for (let i = 0; i < scenario.steps.length; i++) {
+    const patch = patches.get(i);
+    if (!patch) {
+      steps.push(scenario.steps[i]);
+      continue;
+    }
+    steps.push(...patch.before);
+    steps.push(patch.overlays.reduce(patchStep, scenario.steps[i]));
+    steps.push(...patch.after);
+  }
+
+  const result: ScenarioIR = { ...scenario, steps };
+  if (overlay.vars) {
+    const vars: Record<string, string> = { ...scenario.vars };
+    for (const [key, value] of Object.entries(overlay.vars)) {
+      assertIdentifier(key, `${where} overlay vars`);
+      vars[key] = String(value);
+    }
+    result.vars = vars;
+  }
+
+  if (overlay.skip !== undefined || overlay.only !== undefined) {
+    // Merge into any existing config so flags the overlay omits are kept.
+    const config: ScenarioConfigIR = { ...scenario.config };
+    if (overlay.skip !== undefined) config.skip = overlay.skip;
+    if (overlay.only !== undefined) config.only = overlay.only;
+    result.config = config;
+  }
+
+  return result;
+}
+
+function patchStep(step: FlowIRStep, overlay: StepOverlay): FlowIRStep {
+  // Flow calls: overlay args win over same-named args from the feature.
+  if (step.kind === 'callFlow') {
+    if (!overlay.args) return step;
+    const extra = stringifyArgs(overlay.args);
+    return { ...step, args: { ...step.args, ...extra } };
+  }
+  // Prompt and capture steps may both take a replacement template; only
+  // prompt steps additionally accept a node override.
+  const template = overlay.template ?? step.template;
+  if (step.kind === 'capture') {
+    return { ...step, template };
+  }
+  const node = overlay.node ?? step.node;
+  return { ...step, template, node };
+}
+
+function normalizeInserts(inserts: StepInput[] | undefined): FlowIRStep[] {
+  return (inserts ?? []).map((x) => (typeof x !== 'string' ? x : When(x))); // bare string → When step
+}
+
+function stringifyArgs(
+  args: Record<string, string | number | boolean>,
+): Record<string, string> {
+  const stringified: Record<string, string> = {};
+  for (const name of Object.keys(args)) {
+    assertIdentifier(name, 'bindFeature overlay args'); // validate the name first
+    stringified[name] = String(args[name]);
+  }
+  return stringified;
+}
+
+// ——————————————————— bind-time drift validation ———————————————————
+
+/**
+ * Turn an `at:` anchor into an index into the scenario's original step list.
+ */
+function resolveAnchor(
+  anchor: StepAnchor,
+  scenario: ScenarioIR,
+  uri: string,
+): number {
+  const where = `bindFeature(${uri}): scenario "${scenario.name}"`;
+  const total = scenario.steps.length;
+
+  // Index anchors must be whole numbers inside [0, total).
+  if (typeof anchor === 'number') {
+    const inRange = Number.isInteger(anchor) && anchor >= 0 && anchor < total;
+    if (inRange) return anchor;
+    throw new Error(
+      `[midscene] ${where}: step anchor ${anchor} is out of range (the scenario has ${total} steps, indices 0–${total - 1}).\n\n${anchorListing(scenario)}`,
+    );
+  }
+
+  // Text anchors: collect every step whose anchor text equals `anchor`.
+  const texts = scenario.steps.map((step) => anchorText(step));
+  const hits: number[] = [];
+  for (let i = 0; i < texts.length; i++) {
+    if (texts[i] === anchor) hits.push(i);
+  }
+  if (hits.length === 1) return hits[0];
+
+  if (hits.length === 0) {
+    // Nothing matched: suggest the nearest anchor text, then list them all.
+    const closest = closestMatch(anchor, texts);
+    const hint = closest ? `Did you mean ${JSON.stringify(closest)}?\n\n` : '';
+    throw new Error(
+      `[midscene] ${where}: no step matches anchor ${JSON.stringify(anchor)}. ${hint}${anchorListing(scenario)}`,
+    );
+  }
+
+  // Several steps share this text; only an index can tell them apart.
+  const byIndex = hits
+    .map((i) => `  { at: ${i} },  // ${describeStep(scenario.steps[i])}`)
+    .join('\n');
+  throw new Error(
+    `[midscene] ${where}: step anchor ${JSON.stringify(anchor)} is ambiguous (matches steps ${hits.join(', ')}). Anchor by index instead:\n\n${byIndex}`,
+  );
+}
+
+function validateStepOverlay(
+  overlay: StepOverlay,
+  step: FlowIRStep,
+  index: number,
+  where: string,
+): void {
+  // Checks run in a fixed order (node, template, args); the first overlay
+  // field that does not fit the anchored step's kind throws.
+  const target = `step ${index} (${describeStep(step)})`;
+  const isFlowCall = step.kind === 'callFlow';
+
+  let problem: string | undefined;
+  if (overlay.node !== undefined && step.kind !== 'prompt') {
+    problem = `\`node\` can only override prompt steps, but ${target} is a ${step.kind} step.`;
+  } else if (overlay.template !== undefined && isFlowCall) {
+    problem = `\`template\` cannot override ${target}; use \`args\` to adjust a flow call.`;
+  } else if (overlay.args !== undefined && !isFlowCall) {
+    problem = `\`args\` only applies to flow-call steps, but ${target} is a ${step.kind} step.`;
+  }
+  if (problem !== undefined) {
+    throw new Error(`[midscene] ${where}: ${problem}`);
+  }
+}
+
+function unknownScenarioError(
+  title: string,
+  compiled: CompiledFeature,
+  uri: string,
+): Error {
+  const quoted = JSON.stringify(title);
+  const head = `[midscene] bindFeature(${uri}): overlay references unknown scenario ${quoted}.`;
+  // Common drift: the title names a @flow definition, which is not runnable.
+  if (compiled.flows.some((flow) => flow.name === title)) {
+    return new Error(
+      `${head} ${quoted} is a @flow definition; overlays only target runnable scenarios.`,
+    );
+  }
+
+  // Suggest the closest real title and show a starter overlay for it (or for
+  // the first scenario when nothing is close enough).
+  const titles = Array.from(new Set(compiled.scenarios.map((sc) => sc.name)));
+  const closest = closestMatch(title, titles);
+  const hint = closest ? `Did you mean ${JSON.stringify(closest)}?\n` : '';
+  let sample: ScenarioIR | undefined = compiled.scenarios[0];
+  if (closest) sample = compiled.scenarios.find((sc) => sc.name === closest);
+  const listing = titles.map((t) => JSON.stringify(t)).join(', ');
+  return new Error(
+    `${head}\n${hint}Scenario titles in this feature: ${listing}.\n\nStarter overlay:\n\n${sample ? overlaySnippet(sample) : '(the feature has no runnable scenarios)'}`,
+  );
+}
+
+// ————————————————————— codegen for error messages —————————————————————
+
+/** Ready-to-paste overlay skeleton for one scenario (jest-cucumber style): one `{ at }` entry per step. */
+function overlaySnippet(scenario: ScenarioIR): string {
+  const entries = scenario.steps.map((step, i) => {
+    const anchor = JSON.stringify(anchorText(step));
+    return `      { at: ${anchor} },  // ${i}: ${describeStep(step)}`;
+  });
+  return [
+    'scenarios: {',
+    `  ${JSON.stringify(scenario.name)}: {`,
+    '    steps: [',
+    ...entries,
+    '    ],',
+    '  },',
+    '},',
+  ].join('\n');
+}
+
+function anchorListing(scenario: ScenarioIR): string {
+  return ['Available anchors:', '', overlaySnippet(scenario)].join('\n');
+}
+
+function describeStep(step: FlowIRStep): string {
+  // Short label used in error messages and in generated overlay snippets.
+  if (step.kind === 'prompt') {
+    return `${step.node} node`;
+  }
+  if (step.kind === 'capture') {
+    return `capture → {${step.varName}}`;
+  }
+  return `flow call ${formatCallShort(step)}`;
+}
+
+function formatCallShort(step: CallFlowStepIR): string {
+  return [step.flowName, '(', Object.keys(step.args).join(', '), ')'].join(''); // name(arg, arg)
+}
+
+function closestMatch(needle: string, haystack: string[]): string | undefined {
+  const target = needle.toLowerCase();
+  let best: string | undefined;
+  let bestDistance = Number.POSITIVE_INFINITY;
+  // Case-insensitive scan; on ties the earliest candidate is kept.
+  for (let i = 0; i < haystack.length; i++) {
+    const distance = levenshtein(target, haystack[i].toLowerCase());
+    if (distance >= bestDistance) continue;
+    bestDistance = distance;
+    best = haystack[i];
+  }
+  // Suggest only when the distance is at most half the longer string's length.
+  if (!best) return undefined;
+  const limit = Math.max(needle.length, best.length) / 2;
+  return bestDistance <= limit ? best : undefined;
+}
+
+/** Levenshtein edit distance between `a` and `b`, compared by UTF-16 code unit. */
+function levenshtein(a: string, b: string): number {
+  if (a === b) return 0;
+  // Single-row dynamic programme: before row `i` is processed, `row[j]` is
+  // the distance between the first `i` characters of `a` and the first `j`
+  // characters of `b`; `upLeft` carries the diagonal cell being overwritten.
+  const row = Array.from({ length: b.length + 1 }, (_, j) => j);
+  for (let i = 0; i < a.length; i++) {
+    let upLeft = row[0];
+    row[0] = i + 1;
+    for (let j = 0; j < b.length; j++) {
+      const substitution = upLeft + (a[i] === b[j] ? 0 : 1);
+      const insertOrDelete = Math.min(row[j + 1], row[j]) + 1;
+      upLeft = row[j + 1];
+      row[j + 1] = Math.min(insertOrDelete, substitution);
+    }
+  }
+  return row[b.length];
+}
diff --git a/packages/testing-framework/src/frontends/js/index.ts b/packages/testing-framework/src/frontends/js/index.ts
new file mode 100644
index 0000000000..1fff6e0078
--- /dev/null
+++ b/packages/testing-framework/src/frontends/js/index.ts
@@ -0,0 +1,223 @@
+/**
+ * POC: JS/TS authoring front-end over the shared flow-IR.
+ *
+ * A fluent, typed API in the spirit of `defineMidsceneConfig` /
+ * `defineRuntime`: steps are natural-language strings with `{var}`
+ * placeholders, flows are declared with `defineFlow({...})`, and scenarios /
+ * features are assembled with `scenario()` / `feature()`. Because everything
+ * is plain JS values, dynamic authoring (computed args, conditionals, mapping
+ * over data) happens naturally at build time — the output is always the same
+ * static IR the Gherkin front-end produces.
+ *
+ * Keyword→policy mapping:
+ *  - `Given(...)` → ui node (setup semantics)
+ *  - `When(...)` / bare string → ui node (action)
+ *  - `Then(...)` → verify node (fail-closed)
+ *  - `Soft(...)` → soft node (warns, never gates)
+ *  - `Advisory(...)` → agent node (free-form analysis)
+ *  - `remember(description, varName)` → variable capture
+ *  - `callFlow(name, args)` → named-flow invocation
+ *
+ * A third, hybrid mode lives in `./bind-feature`: `bindFeature()` compiles a
+ * `.feature` file and applies a sparse JS overlay (see that module's docs).
+ */
+import {
+  type CallFlowStepIR,
+  type CaptureStepIR,
+  type FeatureIR,
+  type FlowDefIR,
+  type FlowIRStep,
+  type PromptStepIR,
+  type ScenarioIR,
+  assertIdentifier,
+  listPlaceholders,
+} from '../../flow-ir';
+
+/** A step in the fluent API: an IR step, or a bare string (shorthand for a `When` step). */
+export type StepInput = FlowIRStep | string;
+
+// Note: keyword helpers are capitalized like cucumber-js (`Given`/`When`/
+// `Then`). A lowercase `then` export would also make the module namespace a
+// thenable, which breaks dynamic `import()` of this module.
+export function Given(template: string): PromptStepIR {
+  return promptStep('ui', 'setup', template, 'Given'); // ui node, setup role
+}
+
+export function When(template: string): PromptStepIR {
+  return promptStep('ui', 'action', template, 'When'); // ui node, action role
+}
+
+export function Then(template: string): PromptStepIR {
+  return promptStep('verify', 'assertion', template, 'Then'); // verify node, assertion role
+}
+
+export function Soft(template: string): PromptStepIR {
+  return promptStep('soft', 'assertion', template, 'Soft'); // soft node, assertion role
+}
+
+export function Advisory(template: string): PromptStepIR {
+  return promptStep('agent', 'advisory', template, 'Advisory'); // agent node, advisory role
+}
+
+/** "Remember <description> as {varName}" — machine-owned variable capture. The stored template is trimmed, as in the Gherkin front-end; a blank description throws. */
+export function remember(description: string, varName: string): CaptureStepIR {
+  if (!description.trim()) {
+    throw new Error('[midscene] remember(): description must not be empty.');
+  }
+  assertIdentifier(varName, 'remember()');
+  return { kind: 'capture', template: description.trim(), varName };
+}
+
+/** Build a step that invokes a registered named flow; arg values may use `{var}` placeholders and are stringified. */
+export function callFlow(
+  flowName: string,
+  args: Record<string, string | number | boolean> = {},
+): CallFlowStepIR {
+  if (flowName.trim() === '') {
+    throw new Error('[midscene] callFlow(): flow name must not be empty.');
+  }
+  const normalized: Record<string, string> = {};
+  for (const name of Object.keys(args)) {
+    assertIdentifier(name, `callFlow("${flowName}") args`);
+    normalized[name] = String(args[name]);
+  }
+  return { kind: 'callFlow', flowName, args: normalized };
+}
+
+export interface DefineFlowInput {
+  name: string; // must not be blank
+  params?: string[]; // declared parameter names, validated as identifiers
+  returns?: string[]; // declared return names, validated as identifiers
+  steps: StepInput[]; // must be non-empty; bare strings become `When` steps
+  /** TODO(POC): only 'none' is implemented; 'once-per-run' is accepted but ignored. */
+  memo?: 'none' | 'once-per-run';
+}
+
+/** Build the IR for a named, parameterized, reusable prompt flow; validates names, steps and scoping. */
+export function defineFlow(input: DefineFlowInput): FlowDefIR {
+  const { name, memo } = input;
+  if (!name?.trim()) {
+    throw new Error('[midscene] defineFlow(): a flow must have a name.');
+  }
+  const where = `defineFlow("${name}")`;
+  const params = input.params ?? [];
+  const returns = input.returns ?? [];
+
+  // Param names are checked before return names, then the steps.
+  params.forEach((param) => assertIdentifier(param, `${where} params`));
+  returns.forEach((ret) => assertIdentifier(ret, `${where} returns`));
+  const steps = normalizeSteps(input.steps, where);
+
+  // Static placeholder/return scoping check (lenient once flows call flows).
+  validateFlowScoping(name, params, returns, steps);
+  return { name, params, returns, steps, memo };
+}
+
+export interface ScenarioOptions {
+  /** Seed variables available to `{var}` placeholders from the first step. */
+  vars?: Record<string, string | number | boolean>;
+  tags?: string[]; // stored on the scenario IR; defaults to []
+}
+
+/** Build a runnable ScenarioIR from fluent steps, seed vars and tags. */
+export function scenario(
+  name: string,
+  steps: StepInput[],
+  options: ScenarioOptions = {},
+): ScenarioIR {
+  if (name.trim().length === 0) {
+    throw new Error('[midscene] scenario(): a scenario must have a name.');
+  }
+  const where = `scenario("${name}")`;
+  const seed = options.vars ?? {};
+
+  // Seed variables are validated and stringified before the steps are checked.
+  const vars: Record<string, string> = {};
+  for (const key of Object.keys(seed)) {
+    assertIdentifier(key, `${where} vars`);
+    vars[key] = String(seed[key]);
+  }
+  const normalized = normalizeSteps(steps, where);
+  return { name, steps: normalized, vars, tags: options.tags ?? [] };
+}
+
+/** Group scenarios under a feature name, mirroring a Gherkin Feature; whitespace-only names are rejected. */
+export function feature(name: string, scenarios: ScenarioIR[]): FeatureIR {
+  if (name.trim().length === 0) {
+    throw new Error('[midscene] feature(): a feature must have a name.');
+  }
+  return { name, scenarios };
+}
+
+function promptStep(
+  node: PromptStepIR['node'],
+  role: PromptStepIR['role'],
+  template: string,
+  helper: string, // public keyword name, used only in the error message
+): PromptStepIR {
+  if (template.trim() === '') {
+    throw new Error(`[midscene] ${helper}(): the prompt must not be empty.`);
+  }
+  return { kind: 'prompt', node, role, template }; // template kept as given
+}
+
+function normalizeSteps(steps: StepInput[], where: string): FlowIRStep[] {
+  if (Array.isArray(steps) && steps.length > 0) {
+    return steps.map((item) => (typeof item === 'string' ? When(item) : item));
+  }
+  throw new Error(`[midscene] ${where}: steps must be a non-empty array.`);
+}
+
+// Hybrid mode (Gherkin source of truth + sparse JS overlay). Re-exported
+// last: bind-feature imports `When` from this module, and keeping the cycle
+// edge at the bottom makes the load order explicit.
+export { bindFeature, anchorText } from './bind-feature';
+export type {
+  FeatureOverlay,
+  ScenarioOverlay,
+  StepOverlay,
+  StepAnchor,
+} from './bind-feature';
+
+/**
+ * Cheap static scoping checks for a flow. Without a registry, a flow that
+ * calls other flows cannot be analysed statically, so the check is skipped
+ * entirely as soon as a `callFlow` step appears; the executor still
+ * enforces everything at runtime.
+ *
+ * Throws on the first unknown placeholder or unsatisfied return.
+ */
+function validateFlowScoping(
+  name: string,
+  params: string[],
+  returns: string[],
+  steps: FlowIRStep[],
+): void {
+  // Lenient mode: a single flow call anywhere in the steps disables every
+  // check below.
+  if (steps.some((step) => step.kind === 'callFlow')) return;
+
+  // `visible` starts as the declared params and grows with each capture, so
+  // a placeholder may only refer to a param or to an EARLIER capture.
+  const visible = new Set(params);
+  for (const step of steps) {
+    // Never true after the guard above; it narrows the step type.
+    if (step.kind === 'callFlow') continue;
+    for (const placeholder of listPlaceholders(step.template)) {
+      if (visible.has(placeholder)) continue;
+      throw new Error(
+        `[midscene] defineFlow("${name}"): {${placeholder}} is not a param and is not captured by an earlier step. Flows get a fresh scope — only declared params and earlier captures are visible.`,
+      );
+    }
+    // A capture becomes visible only after its own template was checked.
+    if (step.kind === 'capture') visible.add(step.varName);
+  }
+
+  // Every declared return must be a param or a captured variable.
+  for (const ret of returns) {
+    if (visible.has(ret)) continue;
+    throw new Error(
+      `[midscene] defineFlow("${name}"): return "${ret}" is neither a param nor captured by any step.`,
+    );
+  }
+}
diff --git a/packages/testing-framework/src/index.ts b/packages/testing-framework/src/index.ts
index 0acad2d162..d0c27bc629 100644
--- a/packages/testing-framework/src/index.ts
+++ b/packages/testing-framework/src/index.ts
@@ -75,3 +75,56 @@ export { loadConfig, resolveConfigPath } from './runner/load-config';
 export { discoverCases } from './runner/glob';
 export { createUIAgent } from './ui-agent/factory';
 export type { ResolvedUIAgent } from './ui-agent/factory';
+
+// —— POC: shared flow-IR + authoring front-ends ——
+// A flow IR with a scenario-scoped variable table and named, parameterized
+// flows. Three authoring surfaces compile to it: a fluent JS/TS API, a Gherkin
+// (.feature) compiler, and the hybrid `bindFeature` overlay. See POC-GHERKIN.md.
+export {
+  FlowRegistry,
+  createFlowRegistry,
+  runScenario,
+  substitute,
+  listPlaceholders,
+  MAX_FLOW_CALL_DEPTH,
+} from './flow-ir';
+export type {
+  FlowIRStep,
+  PromptStepIR,
+  CaptureStepIR,
+  CallFlowStepIR,
+  PromptRole,
+  ScenarioIR,
+  ScenarioConfigIR,
+  FlowDefIR,
+  FeatureIR,
+  RunScenarioOptions,
+  ScenarioRunResult,
+  ScenarioRunEvent,
+  VariableScope,
+} from './flow-ir';
+export {
+  defineFlow,
+  scenario,
+  feature,
+  Given,
+  When,
+  Then,
+  Soft,
+  Advisory,
+  remember,
+  callFlow,
+  bindFeature,
+  anchorText,
+} from './frontends/js';
+export type {
+  DefineFlowInput,
+  ScenarioOptions,
+  StepInput,
+  FeatureOverlay,
+  ScenarioOverlay,
+  StepOverlay,
+  StepAnchor,
+} from './frontends/js';
+export { compileFeature, compileFeatureFile } from './frontends/gherkin';
+export type { CompiledFeature } from './frontends/gherkin';
diff --git a/packages/testing-framework/tests/unit-test/bind-feature.test.ts b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
new file mode 100644
index 0000000000..fb4c369483
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
@@ -0,0 +1,342 @@
+import { describe, expect, it } from 'vitest';
+import { createFlowRegistry, runScenario } from '../../src/flow-ir';
+import { compileFeature } from '../../src/frontends/gherkin';
+import { Soft, bindFeature, remember } from '../../src/frontends/js';
+import { FakeGeneralAgent, FakeUiAgent } from './helpers/fake-agents';
+
+const FEATURE = `
+Feature: Checkout
+
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I sign in as the "{role}" user
+    When I remember the greeting shown in the header as "greeting"
+
+  Scenario: Checkout as admin
+    When I run the "Login" flow with role "admin"
+    And I remember the price of the "Trail Backpack" product as "price"
+    When I add the "Trail Backpack" to the cart
+    Then the cart total equals {price}
+
+  Scenario: Browse anonymously
+    When I open the catalog page
+    Then the product grid is visible
+`;
+
+describe('bindFeature: parity and sparseness', () => {
+  it('an empty (or omitted) overlay produces IR identical to the plain compile', () => {
+    const plain = compileFeature(FEATURE, '<inline>');
+    expect(bindFeature(FEATURE)).toEqual(plain);
+    expect(bindFeature(FEATURE, {})).toEqual(plain);
+    expect(bindFeature(FEATURE, { scenarios: {} })).toEqual(plain);
+  });
+
+  it('leaves unmentioned scenarios byte-identical to pure Gherkin', () => {
+    const plain = compileFeature(FEATURE, '<inline>');
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Checkout as admin': { vars: { coupon: 'SAVE10' } }, // the only overlaid scenario
+      },
+    });
+    const untouched = bound.scenarios.find(
+      (s) => s.name === 'Browse anonymously',
+    );
+    expect(untouched).toEqual(
+      plain.scenarios.find((s) => s.name === 'Browse anonymously'),
+    );
+    // Flows are never touched by overlays.
+    expect(bound.flows).toEqual(plain.flows);
+  });
+});
+
+describe('bindFeature: overlay application', () => {
+  it('overrides a step template and node kind by text anchor', () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Checkout as admin': {
+          steps: [
+            {
+              at: 'the cart total equals {price}',
+              node: 'soft',
+              template: 'the cart total equals {price} within $0.01',
+            },
+          ],
+        },
+      },
+    });
+    const checkout = bound.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    expect(checkout?.steps.at(-1)).toEqual({
+      kind: 'prompt',
+      node: 'soft',
+      role: 'assertion',
+      template: 'the cart total equals {price} within $0.01',
+    });
+    // Earlier steps untouched.
+    expect(checkout?.steps[0]).toMatchObject({ kind: 'callFlow' });
+  });
+
+  it('injects computed variables and flow-call args', () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Checkout as admin': {
+          vars: { coupon: 'SAVE10', qty: 2 },
+          steps: [{ at: 'Login', args: { role: 'auditor' } }],
+        },
+      },
+    });
+    const checkout = bound.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    expect(checkout?.vars).toEqual({ coupon: 'SAVE10', qty: '2' }); // numeric qty is stringified
+    expect(checkout?.steps[0]).toEqual({
+      kind: 'callFlow',
+      flowName: 'Login',
+      args: { role: 'auditor' },
+    });
+  });
+
+  it('inserts extra steps before/after anchored steps without shifting other anchors', () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Checkout as admin': {
+          steps: [
+            {
+              at: 'I add the "Trail Backpack" to the cart',
+              before: [remember('the current cart badge count', 'badgeBefore')],
+              after: ['apply the coupon code {coupon} in the cart'],
+            },
+            // Anchors resolve against the ORIGINAL list: this index is the
+            // Then step pre-insertion.
+            { at: 3, node: 'soft' },
+          ],
+        },
+      },
+    });
+    const steps = bound.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    )?.steps;
+    expect(steps?.map((s) => (s.kind === 'prompt' ? s.node : s.kind))).toEqual([
+      'callFlow',
+      'capture',
+      'capture', // inserted before
+      'ui',
+      'ui', // inserted after (bare string → When)
+      'soft', // index-anchored override of the original Then
+    ]);
+    expect(steps?.[4]).toMatchObject({
+      template: 'apply the coupon code {coupon} in the cart',
+      role: 'action',
+    });
+  });
+
+  it('attaches per-scenario config flags at the IR level', () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Browse anonymously': { skip: true },
+        'Checkout as admin': { only: true },
+      },
+    });
+    expect(
+      bound.scenarios.find((s) => s.name === 'Browse anonymously')?.config,
+    ).toEqual({ skip: true });
+    expect(
+      bound.scenarios.find((s) => s.name === 'Checkout as admin')?.config,
+    ).toEqual({ only: true });
+    // No config attached unless asked for.
+    for (const s of bindFeature(FEATURE).scenarios) {
+      expect(s.config).toBeUndefined();
+    }
+  });
+
+  it('executes a bound scenario with overrides and injected vars (fake agents)', async () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Checkout as admin': {
+          vars: { coupon: 'SAVE10' },
+          steps: [
+            {
+              at: 'I add the "Trail Backpack" to the cart',
+              after: ['apply the coupon code {coupon} in the cart'],
+            },
+            { at: 'the cart total equals {price}', node: 'soft' },
+          ],
+        },
+      },
+    });
+    const ui = new FakeUiAgent(['Hello, Admin!', '129.00']);
+    const general = new FakeGeneralAgent(() => ({
+      text: 'mismatch',
+      verdict: { pass: false, reason: 'totals differ' },
+    }));
+
+    const checkout = bound.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    if (!checkout) throw new Error('scenario not found');
+    const result = await runScenario({
+      scenario: checkout,
+      registry: createFlowRegistry(bound.flows),
+      uiAgent: ui.asAgent(),
+      generalAgent: general,
+      env: {},
+    });
+
+    // Injected variable was substituted into the inserted step's prompt.
+    expect(ui.actCalls).toContain('apply the coupon code SAVE10 in the cart');
+    // The verify→soft override means the failing verdict only warns.
+    expect(result.status).toBe('passed');
+    expect(result.warnings[0]).toMatch(/totals differ/);
+  });
+});
+
+describe('bindFeature: drift validation with codegen', () => {
+  it('rejects unknown scenario titles with closest match and a starter overlay', () => {
+    let error: Error | undefined;
+    try {
+      bindFeature(FEATURE, {
+        scenarios: { 'Checkout as admn': { skip: true } },
+      });
+    } catch (err) {
+      error = err as Error;
+    }
+    expect(error?.message).toMatch(/unknown scenario "Checkout as admn"/);
+    expect(error?.message).toMatch(/Did you mean "Checkout as admin"\?/);
+    // Codegen: a ready-to-paste overlay skeleton with real anchors.
+    expect(error?.message).toContain('"Checkout as admin": {');
+    expect(error?.message).toContain(
+      '{ at: "the cart total equals {price}" },',
+    );
+  });
+
+  it('explains when the overlay targets a @flow definition', () => {
+    expect(() =>
+      bindFeature(FEATURE, { scenarios: { Login: { skip: true } } }),
+    ).toThrow(/@flow definition; overlays only target runnable scenarios/);
+  });
+
+  it('rejects unknown step anchors with closest match and the anchor listing', () => {
+    let error: Error | undefined;
+    try {
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Checkout as admin': {
+            steps: [{ at: 'the cart total equals {prce}', node: 'soft' }],
+          },
+        },
+      });
+    } catch (err) {
+      error = err as Error;
+    }
+    expect(error?.message).toMatch(/no step matches anchor/);
+    expect(error?.message).toMatch(
+      /Did you mean "the cart total equals \{price\}"\?/,
+    );
+    expect(error?.message).toContain('Available anchors:');
+    expect(error?.message).toContain('// 0: flow call Login(role)');
+  });
+
+  it('rejects out-of-range index anchors', () => {
+    expect(() =>
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Browse anonymously': { steps: [{ at: 9, node: 'soft' }] }, // this scenario has only 2 steps
+        },
+      }),
+    ).toThrow(/anchor 9 is out of range.*indices 0–1/s);
+  });
+
+  it('rejects ambiguous text anchors and suggests index anchors', () => {
+    const duplicated = `
+Feature: dup
+  Scenario: twice
+    When I click "Next"
+    When I click "Next"
+`;
+    expect(() =>
+      bindFeature(duplicated, {
+        scenarios: {
+          twice: { steps: [{ at: 'I click "Next"', template: 'x' }] },
+        },
+      }),
+    ).toThrow(/ambiguous \(matches steps 0, 1\).*\{ at: 0 \}/s);
+  });
+
+  it('rejects overlay fields that do not fit the anchored step kind', () => {
+    expect(() =>
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Checkout as admin': {
+            steps: [{ at: 'Login', node: 'soft' }],
+          },
+        },
+      }),
+    ).toThrow(/`node` can only override prompt steps/);
+
+    expect(() =>
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Checkout as admin': {
+            steps: [{ at: 'Login', template: 'nope' }],
+          },
+        },
+      }),
+    ).toThrow(/use `args` to adjust a flow call/);
+
+    expect(() =>
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Browse anonymously': {
+            steps: [{ at: 'I open the catalog page', args: { x: '1' } }],
+          },
+        },
+      }),
+    ).toThrow(/`args` only applies to flow-call steps/);
+  });
+
+  it('applies an overlay to every expansion of a Scenario Outline title', () => {
+    const outline = `
+Feature: outline
+  Scenario Outline: visit page
+    When I open the "<page>" page
+    Then the "<page>" page is visible
+
+    Examples:
+      | page  |
+      | home  |
+      | about |
+`;
+    const bound = bindFeature(outline, {
+      scenarios: { 'visit page': { steps: [{ at: 1, node: 'soft' }] } }, // index 1 is the Then step
+    });
+    expect(bound.scenarios).toHaveLength(2);
+    for (const s of bound.scenarios) {
+      expect(s.steps[1]).toMatchObject({ node: 'soft' });
+    }
+  });
+});
+
+describe('bindFeature: Soft helper interop', () => {
+  it('accepts IR steps from the fluent API as inserts', () => {
+    const bound = bindFeature(FEATURE, {
+      scenarios: {
+        'Browse anonymously': {
+          steps: [
+            {
+              at: 'the product grid is visible',
+              before: [Soft('no broken product images are visible')],
+            },
+          ],
+        },
+      },
+    });
+    const steps = bound.scenarios.find(
+      (s) => s.name === 'Browse anonymously',
+    )?.steps;
+    expect(steps?.[1]).toMatchObject({ // the insert lands just before the anchored Then step
+      node: 'soft',
+      template: 'no broken product images are visible',
+    });
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/example-parity.test.ts b/packages/testing-framework/tests/unit-test/example-parity.test.ts
new file mode 100644
index 0000000000..61c7a2f17c
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/example-parity.test.ts
@@ -0,0 +1,121 @@
+/**
+ * Parity check for the two example authoring surfaces: the Gherkin feature
+ * (example/flows/shop.feature) and its JS counterpart
+ * (example/flows/shop.flows.ts) must compile to equivalent IR and produce the
+ * same execution trace against the same fake agents.
+ */
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+import {
+  checkoutAsAdmin,
+  loginFlow,
+  registry,
+  shopFeature,
+} from '../../example/flows/shop.flows';
+import { createFlowRegistry, runScenario } from '../../src/flow-ir';
+import type { ScenarioIR } from '../../src/flow-ir';
+import type { FlowRegistry } from '../../src/flow-ir';
+import { compileFeatureFile } from '../../src/frontends/gherkin';
+import { FakeGeneralAgent, FakeUiAgent } from './helpers/fake-agents';
+
+const FEATURE_FILE = join(__dirname, '../../example/flows/shop.feature');
+const gherkin = compileFeatureFile(FEATURE_FILE);
+
+describe('example overlay: shop.overlay.ts binds without drift', () => {
+  it('applies the sparse overlay on top of the plain compile', async () => {
+    const { bound } = await import('../../example/flows/shop.overlay');
+    const checkout = bound.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    expect(checkout?.vars?.couponCode).toMatch(/^E2E-\d{4}-\d{2}-\d{2}$/);
+    expect(
+      checkout?.steps.some(
+        (s) =>
+          s.kind === 'prompt' &&
+          s.template === 'apply the coupon code {couponCode} in the cart',
+      ),
+    ).toBe(true);
+    expect(checkout?.steps.at(-2)).toMatchObject({ node: 'soft' });
+    expect(
+      bound.scenarios.find((s) => s.name === 'Promo banner is advisory')
+        ?.config,
+    ).toEqual({ skip: true });
+    // Sparse: the outline-expanded scenarios are untouched pure Gherkin.
+    expect(
+      bound.scenarios.filter((s) => s.name === 'Login greets every role'),
+    ).toEqual(
+      gherkin.scenarios.filter((s) => s.name === 'Login greets every role'),
+    );
+  });
+});
+
+describe('example parity: Gherkin vs JS front-end', () => {
+  it('compiles the same Login flow signature', () => {
+    expect(gherkin.flows).toHaveLength(1);
+    const gherkinLogin = gherkin.flows[0];
+    expect(gherkinLogin.name).toBe(loginFlow.name);
+    expect(gherkinLogin.params).toEqual(loginFlow.params);
+    expect(gherkinLogin.returns).toEqual(loginFlow.returns);
+    // Background steps are excluded from @flow pickles, so the two surfaces
+    // compile to the exact same flow body.
+    expect(gherkinLogin.steps).toEqual(loginFlow.steps);
+  });
+
+  it('compiles the same checkout scenario steps', () => {
+    const gherkinCheckout = gherkin.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    expect(gherkinCheckout).toBeDefined();
+    expect(gherkinCheckout?.steps).toEqual(checkoutAsAdmin.steps);
+  });
+
+  it('expands the outline to the same per-role scenarios as the JS map()', () => {
+    const gherkinRoles = gherkin.scenarios
+      .filter((s) => s.name === 'Login greets every role')
+      .map((s) => s.steps);
+    const jsRoles = shopFeature.scenarios
+      .filter((s) => s.name.startsWith('Login greets every role'))
+      .map((s) => s.steps);
+    expect(gherkinRoles).toEqual(jsRoles);
+  });
+
+  it('produces identical execution traces through the shared IR executor', async () => {
+    const gherkinCheckout = gherkin.scenarios.find(
+      (s) => s.name === 'Checkout as admin',
+    );
+    if (!gherkinCheckout) throw new Error('missing "Checkout as admin"');
+    const runWith = async (s: ScenarioIR, reg: FlowRegistry) => {
+      const ui = new FakeUiAgent(['Hello, Admin!', '129.00']); // Scripted aiString replies, consumed in order.
+      const general = new FakeGeneralAgent();
+      const result = await runScenario({
+        scenario: s,
+        registry: reg,
+        uiAgent: ui.asAgent(),
+        generalAgent: general,
+        env: {},
+      });
+      return {
+        status: result.status,
+        variables: result.variables,
+        actCalls: ui.actCalls,
+        stringCalls: ui.stringCalls,
+        verifyPrompts: general.calls.map((c) => c.instruction),
+      };
+    };
+
+    const fromGherkin = await runWith(
+      gherkinCheckout,
+      createFlowRegistry(gherkin.flows),
+    );
+    const fromJs = await runWith(checkoutAsAdmin, registry);
+
+    // Same prompts hit the "models", same variables end up in the table.
+    expect(fromGherkin.actCalls).toEqual(fromJs.actCalls);
+    expect(fromGherkin.stringCalls).toEqual(fromJs.stringCalls);
+    expect(fromGherkin.verifyPrompts).toEqual(fromJs.verifyPrompts);
+    expect(fromGherkin.variables).toEqual(fromJs.variables);
+    expect(fromGherkin.status).toBe('passed');
+    expect(fromJs.status).toBe('passed');
+    expect(fromJs.verifyPrompts).toContain('the cart total equals 129.00');
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/flow-ir.test.ts b/packages/testing-framework/tests/unit-test/flow-ir.test.ts
new file mode 100644
index 0000000000..13f93c1752
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/flow-ir.test.ts
@@ -0,0 +1,89 @@
+import { describe, expect, it } from 'vitest';
+import {
+  FlowRegistry,
+  createFlowRegistry,
+  listPlaceholders,
+  substitute,
+} from '../../src/flow-ir';
+import type { FlowDefIR } from '../../src/flow-ir';
+
+const loginFlow: FlowDefIR = {
+  name: 'Login',
+  params: ['role'],
+  returns: [],
+  steps: [
+    {
+      kind: 'prompt',
+      node: 'ui',
+      role: 'action',
+      template: 'log in as {role}',
+    },
+  ],
+};
+
+describe('substitute', () => {
+  it('replaces known placeholders mechanically', () => {
+    const vars = new Map([
+      ['role', 'admin'],
+      ['price', '42.00'],
+    ]);
+    expect(
+      substitute('sign in as {role}; the total is {price}', vars, 'test'),
+    ).toBe('sign in as admin; the total is 42.00');
+  });
+
+  it('replaces repeated placeholders', () => {
+    const vars = new Map([['x', 'A']]);
+    expect(substitute('{x} and {x}', vars, 'test')).toBe('A and A');
+  });
+
+  it('throws on unknown placeholders (fail fast on typos)', () => {
+    const vars = new Map([['role', 'admin']]);
+    expect(() => substitute('sign in as {rolle}', vars, 'step 3')).toThrow(
+      /step 3: unknown variable \{rolle\}.*role/,
+    );
+  });
+
+  it('leaves non-placeholder braces alone', () => {
+    const vars = new Map<string, string>();
+    expect(substitute('json like {"a": 1} stays', vars, 'test')).toBe(
+      'json like {"a": 1} stays',
+    );
+  });
+});
+
+describe('listPlaceholders', () => {
+  it('lists placeholder names in order', () => {
+    expect(listPlaceholders('the {a} of {b} and {a}')).toEqual(['a', 'b', 'a']);
+  });
+});
+
+describe('FlowRegistry', () => {
+  it('registers and resolves flows', () => {
+    const registry = createFlowRegistry([loginFlow]);
+    expect(registry.has('Login')).toBe(true);
+    expect(registry.get('Login').params).toEqual(['role']);
+  });
+
+  it('rejects duplicate registration', () => {
+    const registry = createFlowRegistry([loginFlow]);
+    expect(() => registry.register(loginFlow)).toThrow(/already registered/);
+  });
+
+  it('throws for unknown flows with the registered names listed', () => {
+    const registry = createFlowRegistry([loginFlow]);
+    expect(() => registry.get('Checkout')).toThrow(
+      /Unknown flow "Checkout".*Login/,
+    );
+  });
+
+  it('rejects empty flows and invalid identifiers', () => {
+    const registry = new FlowRegistry();
+    expect(() =>
+      registry.register({ ...loginFlow, name: 'Empty', steps: [] }),
+    ).toThrow(/no steps/);
+    expect(() =>
+      registry.register({ ...loginFlow, name: 'Bad', params: ['not ok'] }),
+    ).toThrow(/not a valid variable name/);
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts b/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
new file mode 100644
index 0000000000..e56dab6c57
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
@@ -0,0 +1,159 @@
+import { describe, expect, it } from 'vitest';
+import { compileFeature } from '../../src/frontends/gherkin';
+
+const FEATURE = `
+Feature: Checkout
+
+  Background:
+    Given the demo shop is open on the home page
+
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I open the login page
+    And I sign in as the "{role}" user
+    Then the dashboard for the "{role}" role is visible
+    When I remember the greeting shown in the header as "greeting"
+
+  Scenario: Checkout as admin
+    When I run the "Login" flow with role "admin"
+    And I remember the price of the "Trail Backpack" product as "price"
+    When I add the "Trail Backpack" to the cart
+    Then the cart total equals {price}
+    But the cart does not show an error banner
+
+  @soft
+  Scenario: Promo banner
+    Then a promo banner is visible at the top of the page
+
+  Scenario Outline: Login works for each role
+    When I run the "Login" flow with role "<role>"
+    Then the header greets the user with {greeting}
+
+    Examples:
+      | role  |
+      | admin |
+      | guest |
+`;
+
+describe('Gherkin front-end', () => {
+  const compiled = compileFeature(FEATURE, 'checkout.feature');
+
+  it('separates @flow definitions from runnable scenarios', () => {
+    expect(compiled.name).toBe('Checkout');
+    expect(compiled.flows.map((f) => f.name)).toEqual(['Login']);
+    expect(compiled.scenarios.map((s) => s.name)).toEqual([
+      'Checkout as admin',
+      'Promo banner',
+      'Login works for each role',
+      'Login works for each role',
+    ]);
+  });
+
+  it('reads params and returns from @param:/@returns: tags', () => {
+    const login = compiled.flows[0];
+    expect(login.params).toEqual(['role']);
+    expect(login.returns).toEqual(['greeting']);
+    // Background steps are excluded from flow definitions: a reusable flow
+    // invoked mid-scenario must not replay the feature's setup.
+    expect(login.steps[0]).toMatchObject({
+      kind: 'prompt',
+      node: 'ui',
+      role: 'action',
+      template: 'I open the login page',
+    });
+    // `I remember ... as "greeting"` becomes a capture step.
+    expect(login.steps.at(-1)).toEqual({
+      kind: 'capture',
+      template: 'the greeting shown in the header',
+      varName: 'greeting',
+    });
+  });
+
+  it('maps keywords to node kinds, with And/But inheriting the last primary keyword', () => {
+    const checkout = compiled.scenarios[0];
+    // Background Given → ui/setup leading step.
+    expect(checkout.steps[0]).toMatchObject({ node: 'ui', role: 'setup' });
+    // `And I remember ...` after a When still parses as capture.
+    expect(checkout.steps[2]).toMatchObject({
+      kind: 'capture',
+      varName: 'price',
+    });
+    // Then → verify (fail-closed), template keeps the {price} placeholder.
+    expect(checkout.steps[4]).toMatchObject({
+      kind: 'prompt',
+      node: 'verify',
+      template: 'the cart total equals {price}',
+    });
+    // `But` after a Then inherits Outcome → verify.
+    expect(checkout.steps[5]).toMatchObject({
+      kind: 'prompt',
+      node: 'verify',
+      template: 'the cart does not show an error banner',
+    });
+  });
+
+  it('compiles flow invocation steps with parsed args', () => {
+    const checkout = compiled.scenarios[0];
+    expect(checkout.steps[1]).toEqual({
+      kind: 'callFlow',
+      flowName: 'Login',
+      args: { role: 'admin' },
+    });
+  });
+
+  it('turns Then into soft nodes for @soft scenarios', () => {
+    const promo = compiled.scenarios[1];
+    expect(promo.tags).toContain('@soft');
+    expect(promo.steps.at(-1)).toMatchObject({ kind: 'prompt', node: 'soft' });
+  });
+
+  it('expands Scenario Outline examples into the step text', () => {
+    const [adminRun, guestRun] = compiled.scenarios.slice(2);
+    expect(adminRun.steps[1]).toMatchObject({
+      kind: 'callFlow',
+      args: { role: 'admin' },
+    });
+    expect(guestRun.steps[1]).toMatchObject({
+      kind: 'callFlow',
+      args: { role: 'guest' },
+    });
+    // `{greeting}` (curly braces) is left for the runtime variable table.
+    expect(guestRun.steps[2]).toMatchObject({
+      node: 'verify',
+      template: 'the header greets the user with {greeting}',
+    });
+  });
+
+  it('parses multiple flow args joined with "and"', () => {
+    const multi = compileFeature(
+      `Feature: f
+  Scenario: s
+    When I run the "Login" flow with role "admin" and region "eu-west"
+`,
+      'multi.feature',
+    );
+    expect(multi.scenarios[0].steps[0]).toEqual({
+      kind: 'callFlow',
+      flowName: 'Login',
+      args: { role: 'admin', region: 'eu-west' },
+    });
+  });
+
+  it('throws on an unparseable arg clause', () => {
+    expect(() =>
+      compileFeature(
+        `Feature: f
+  Scenario: s
+    When I run the "Login" flow with gibberish
+`,
+        'bad.feature',
+      ),
+    ).toThrow(/could not parse arguments/);
+  });
+
+  it('throws on invalid Gherkin', () => {
+    expect(() => compileFeature('Feature broken\n  nonsense')).toThrow(
+      /Failed to parse Gherkin/,
+    );
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts b/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
new file mode 100644
index 0000000000..2cb611709f
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
@@ -0,0 +1,78 @@
+/**
+ * Fakes for the POC unit tests. No browser, no model calls.
+ *
+ * - `FakeUiAgent` stands in for the Midscene UI Agent at the `runNode` /
+ *   `runScenario` boundary (aiAct / aiAsk / aiString / screenshot).
+ * - `FakeGeneralAgent` implements the swappable `GeneralAgentAdapter` used by
+ *   verify / soft / agent nodes.
+ *
+ * Both record every call so tests can assert that `{var}` substitution
+ * happened mechanically BEFORE the prompt reached the "model".
+ */
+import type { Agent } from '@midscene/core/agent';
+import type {
+  GeneralAgentAdapter,
+  GeneralAgentInput,
+  GeneralAgentResult,
+} from '../../../src/general-agent/types';
+
+export class FakeUiAgent {
+  /** Instructions received by aiAct (ui nodes), post-substitution. */
+  actCalls: string[] = [];
+  /** Extraction prompts received by aiString (capture steps). */
+  stringCalls: string[] = [];
+  askCalls: string[] = []; // Prompts received by aiAsk, in call order.
+
+  private readonly stringResults: string[]; // Scripted aiString replies, consumed front-to-back.
+
+  constructor(stringResults: string[] = []) {
+    this.stringResults = [...stringResults]; // Copy: shift() below must not mutate the caller's array.
+  }
+
+  async aiAct(instruction: string): Promise<string> {
+    this.actCalls.push(instruction);
+    return `did: ${instruction}`; // Echoes the instruction back as the fake result.
+  }
+
+  async aiAsk(prompt: string): Promise<string> {
+    this.askCalls.push(prompt);
+    return 'ok'; // Fixed canned answer.
+  }
+
+  async aiString(prompt: string): Promise<string> {
+    this.stringCalls.push(prompt); // Recorded even when no scripted result is left.
+    const next = this.stringResults.shift();
+    if (next === undefined) {
+      throw new Error('FakeUiAgent: no scripted aiString result left.');
+    }
+    return next;
+  }
+
+  interface = {
+    screenshotBase64: async () => 'data:image/png;base64,FAKE', // Placeholder screenshot string.
+  };
+
+  asAgent(): Agent {
+    return this as unknown as Agent; // Cast via unknown so tests can pass the fake where an Agent is expected.
+  }
+}
+
+export type GeneralAgentScript = (
+  input: GeneralAgentInput,
+) => GeneralAgentResult; // Synchronous canned reply computed from one run() input.
+
+export class FakeGeneralAgent implements GeneralAgentAdapter {
+  calls: GeneralAgentInput[] = []; // Every input passed to run(), in call order.
+
+  constructor(
+    private readonly script: GeneralAgentScript = () => ({
+      text: 'looks good',
+      verdict: { pass: true, reason: 'fake pass' },
+    }), // Default script: every call gets a passing verdict.
+  ) {}
+
+  async run(input: GeneralAgentInput): Promise<GeneralAgentResult> {
+    this.calls.push(input); // Logged first, so the call is recorded even if the script throws.
+    return this.script(input);
+  }
+}
diff --git a/packages/testing-framework/tests/unit-test/js-frontend.test.ts b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
new file mode 100644
index 0000000000..0fa5883fbe
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
@@ -0,0 +1,149 @@
+import { describe, expect, it } from 'vitest';
+import {
+  Advisory,
+  Given,
+  Soft,
+  Then,
+  When,
+  callFlow,
+  defineFlow,
+  feature,
+  remember,
+  scenario,
+} from '../../src/frontends/js';
+
+describe('JS front-end: keyword→node mapping', () => {
+  it('maps given/when/then/soft/advisory to engine node kinds', () => {
+    expect(Given('the shop is open')).toEqual({
+      kind: 'prompt',
+      node: 'ui',
+      role: 'setup',
+      template: 'the shop is open',
+    });
+    expect(When('I add the item to the cart').node).toBe('ui');
+    expect(When('I add the item to the cart').role).toBe('action');
+    expect(Then('the cart shows 1 item').node).toBe('verify');
+    expect(Soft('a promo banner is visible').node).toBe('soft');
+    expect(Advisory('summarize risks').node).toBe('agent');
+  });
+
+  it('rejects empty prompts', () => {
+    expect(() => When('   ')).toThrow(/must not be empty/);
+  });
+});
+
+describe('JS front-end: remember / callFlow', () => {
+  it('builds capture steps', () => {
+    expect(remember('the order id shown in the banner', 'orderId')).toEqual({
+      kind: 'capture',
+      template: 'the order id shown in the banner',
+      varName: 'orderId',
+    });
+  });
+
+  it('rejects invalid variable names', () => {
+    expect(() => remember('something', 'not a name')).toThrow(
+      /not a valid variable name/,
+    );
+  });
+
+  it('builds callFlow steps and stringifies arg values', () => {
+    expect(callFlow('Login', { role: 'admin', retries: 2 })).toEqual({
+      kind: 'callFlow',
+      flowName: 'Login',
+      args: { role: 'admin', retries: '2' },
+    });
+  });
+});
+
+describe('JS front-end: scenario / feature builders', () => {
+  it('normalizes bare strings to when (ui action) steps', () => {
+    const s = scenario('quick', ['open the home page', Then('it loaded')]);
+    expect(s.steps[0]).toEqual({
+      kind: 'prompt',
+      node: 'ui',
+      role: 'action',
+      template: 'open the home page',
+    });
+    expect(s.steps[1].kind).toBe('prompt');
+  });
+
+  it('stringifies seed vars', () => {
+    const s = scenario('seeded', ['x'], { vars: { qty: 3, flag: true } });
+    expect(s.vars).toEqual({ qty: '3', flag: 'true' });
+  });
+
+  it('supports dynamic build-time authoring (map over data)', () => {
+    const roles = ['admin', 'guest'];
+    const f = feature(
+      'login matrix',
+      roles.map((role) =>
+        scenario(`login as ${role}`, [
+          callFlow('Login', { role }),
+          Then(`the dashboard for the "${role}" role is visible`),
+        ]),
+      ),
+    );
+    expect(f.scenarios).toHaveLength(2);
+    expect(f.scenarios[1].steps[0]).toMatchObject({
+      kind: 'callFlow',
+      args: { role: 'guest' },
+    });
+  });
+
+  it('rejects empty step lists', () => {
+    expect(() => scenario('empty', [])).toThrow(/non-empty/);
+  });
+});
+
+describe('JS front-end: defineFlow static checks', () => {
+  it('builds a flow definition', () => {
+    const flow = defineFlow({
+      name: 'Login',
+      params: ['role'],
+      returns: ['greeting'],
+      steps: [
+        When('open the login page'),
+        When('sign in as the {role} user'),
+        remember('the greeting in the header', 'greeting'),
+      ],
+    });
+    expect(flow.params).toEqual(['role']);
+    expect(flow.returns).toEqual(['greeting']);
+    expect(flow.steps).toHaveLength(3);
+  });
+
+  it('rejects placeholders that are neither params nor earlier captures', () => {
+    expect(() =>
+      defineFlow({
+        name: 'Broken',
+        params: ['role'],
+        steps: [When('sign in as {role} with {password}')],
+      }),
+    ).toThrow(/\{password\}.*fresh scope/);
+  });
+
+  it('rejects returns that are never produced', () => {
+    expect(() =>
+      defineFlow({
+        name: 'Broken',
+        params: ['role'],
+        returns: ['token'],
+        steps: [When('sign in as {role}')],
+      }),
+    ).toThrow(/return "token"/);
+  });
+
+  it('goes lenient when the flow calls other flows', () => {
+    const flow = defineFlow({
+      name: 'Composite',
+      params: [],
+      returns: ['greeting'],
+      steps: [
+        callFlow('Login', { role: 'admin' }),
+        Then('the greeting {greeting} is shown'),
+      ],
+    });
+    expect(flow.steps[0].kind).toBe('callFlow');
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/run-scenario.test.ts b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
new file mode 100644
index 0000000000..1259d12d94
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
@@ -0,0 +1,393 @@
+import { describe, expect, it } from 'vitest';
+import { createFlowRegistry, runScenario } from '../../src/flow-ir';
+import type { FlowDefIR, ScenarioIR } from '../../src/flow-ir';
+import {
+  Soft,
+  Then,
+  When,
+  callFlow,
+  defineFlow,
+  remember,
+  scenario,
+} from '../../src/frontends/js';
+import { FakeGeneralAgent, FakeUiAgent } from './helpers/fake-agents';
+
+const loginFlow: FlowDefIR = defineFlow({
+  name: 'Login',
+  params: ['role'], // Referenced as {role} in the sign-in step.
+  returns: ['greeting'], // Set by the remember() step below.
+  steps: [
+    When('open the login page'),
+    When('sign in as the "{role}" user'),
+    remember('the greeting shown in the header', 'greeting'),
+  ],
+});
+
+function run(
+  s: ScenarioIR,
+  opts: {
+    flows?: FlowDefIR[];
+    ui?: FakeUiAgent;
+    general?: FakeGeneralAgent;
+  } = {},
+) {
+  const ui = opts.ui ?? new FakeUiAgent(); // Fresh fake unless the test supplies one.
+  const general = opts.general ?? new FakeGeneralAgent();
+  return runScenario({
+    scenario: s,
+    registry: createFlowRegistry(opts.flows ?? []), // No flows registered by default.
+    uiAgent: ui.asAgent(),
+    generalAgent: general,
+    env: {},
+  }).then((result) => ({ result, ui, general })); // Hand the fakes back so tests can inspect recorded calls.
+}
+
+describe('runScenario: variable capture and substitution', () => {
+  it('captures via the UI agent and substitutes before the model sees the prompt', async () => {
+    const ui = new FakeUiAgent(['A-123']);
+    const general = new FakeGeneralAgent();
+    const { result } = await run(
+      scenario('order confirmation', [
+        When('place the order'),
+        remember('the order id shown in the banner', 'orderId'),
+        Then('the confirmation page shows order {orderId}'),
+      ]),
+      { ui, general },
+    );
+
+    expect(result.status).toBe('passed');
+    expect(ui.stringCalls).toEqual(['the order id shown in the banner']);
+    // The verify prompt reached the general agent already-resolved.
+    expect(general.calls).toHaveLength(1);
+    expect(general.calls[0].instruction).toBe(
+      'the confirmation page shows order A-123',
+    );
+    // Machine-owned channel: the variable table holds the captured value.
+    expect(result.variables.orderId).toBe('A-123');
+  });
+
+  it('seeds the scope from scenario vars and substitutes into ui prompts', async () => {
+    const { result, ui } = await run(
+      scenario('seeded', [When('search for {term}')], {
+        vars: { term: 'backpack' },
+      }),
+    );
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toEqual(['search for backpack']);
+  });
+
+  it('fails the step (and case) on an unknown variable, before any model call', async () => {
+    const { result, ui, general } = await run(
+      scenario('typo', [Then('the total is {totl}')]),
+    );
+    expect(result.status).toBe('failed');
+    expect(result.steps[0].error).toMatch(/unknown variable \{totl\}/);
+    expect(ui.actCalls).toEqual([]);
+    expect(general.calls).toEqual([]);
+  });
+});
+
+describe('runScenario: named flows', () => {
+  it('runs a flow with a fresh scope and flows declared returns back', async () => {
+    const ui = new FakeUiAgent(['Hello, Admin!']);
+    const general = new FakeGeneralAgent();
+    const { result } = await run(
+      scenario('checkout', [
+        callFlow('Login', { role: 'admin' }),
+        Then('the header shows {greeting}'),
+      ]),
+      { flows: [loginFlow], ui, general },
+    );
+
+    expect(result.status).toBe('passed');
+    // Args were substituted into the callee's prompts.
+    expect(ui.actCalls).toContain('sign in as the "admin" user');
+    // Declared return came back into the caller scope.
+    expect(result.variables.greeting).toBe('Hello, Admin!');
+    expect(general.calls[0].instruction).toBe('the header shows Hello, Admin!');
+    // The call itself is visible in the step record.
+    expect(result.steps[0]).toMatchObject({
+      node: 'flow',
+      input: 'Login(role="admin")',
+    });
+  });
+
+  it('resolves arg templates against the caller scope', async () => {
+    const ui = new FakeUiAgent(['Hi']);
+    const { result } = await run(
+      scenario('computed arg', [callFlow('Login', { role: '{whoami}' })], {
+        vars: { whoami: 'guest' },
+      }),
+      { flows: [loginFlow], ui },
+    );
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toContain('sign in as the "guest" user');
+  });
+
+  it('does not leak caller variables into the flow scope', async () => {
+    const leaky: FlowDefIR = {
+      name: 'Leaky',
+      params: [],
+      returns: [],
+      steps: [
+        {
+          kind: 'prompt',
+          node: 'ui',
+          role: 'action',
+          template: 'use {secret}',
+        },
+      ],
+    };
+    const { result } = await run(
+      scenario('caller', [callFlow('Leaky')], { vars: { secret: 'hunter2' } }),
+      { flows: [leaky] },
+    );
+    expect(result.status).toBe('failed');
+    expect(result.steps.at(-1)?.error).toMatch(/unknown variable \{secret\}/);
+  });
+
+  it('discards callee variables that are not declared returns', async () => {
+    const flow = defineFlow({
+      name: 'Capture2',
+      params: [],
+      returns: ['kept'],
+      steps: [
+        remember('the kept value', 'kept'),
+        remember('the discarded value', 'dropped'),
+      ],
+    });
+    const ui = new FakeUiAgent(['K', 'D']);
+    const { result } = await run(scenario('scoping', [callFlow('Capture2')]), {
+      flows: [flow],
+      ui,
+    });
+    expect(result.status).toBe('passed');
+    expect(result.variables.kept).toBe('K');
+    expect(result.variables).not.toHaveProperty('dropped');
+  });
+
+  it('fails when a declared return was never captured', async () => {
+    const flow: FlowDefIR = {
+      name: 'NoCapture',
+      params: [],
+      returns: ['token'],
+      steps: [
+        { kind: 'prompt', node: 'ui', role: 'action', template: 'do nothing' },
+      ],
+    };
+    const { result } = await run(
+      scenario('missing return', [callFlow('NoCapture')]),
+      {
+        flows: [flow],
+      },
+    );
+    expect(result.status).toBe('failed');
+    expect(result.steps.at(-1)?.error).toMatch(/return "token"/);
+  });
+
+  it('fails on missing or undeclared arguments', async () => {
+    const { result: missing } = await run(
+      scenario('missing arg', [callFlow('Login')]), // Login declares param "role"; none is passed.
+      { flows: [loginFlow] },
+    );
+    expect(missing.status).toBe('failed');
+    expect(missing.steps[0].error).toMatch(/missing argument "role"/);
+
+    const { result: undeclared } = await run(
+      scenario('unknown arg', [callFlow('Login', { role: 'a', nope: 'b' })]),
+      { flows: [loginFlow] },
+    );
+    expect(undeclared.status).toBe('failed');
+    expect(undeclared.steps[0].error).toMatch(/unknown argument "nope"/);
+  });
+
+  it('fails on unregistered flows', async () => {
+    const { result } = await run(scenario('nope', [callFlow('Ghost')]));
+    expect(result.status).toBe('failed');
+    expect(result.steps[0].error).toMatch(/Unknown flow "Ghost"/);
+  });
+});
+
+describe('runScenario: call-depth cap', () => {
+  const leaf: FlowDefIR = {
+    name: 'Leaf',
+    params: [],
+    returns: [],
+    steps: [
+      { kind: 'prompt', node: 'ui', role: 'action', template: 'leaf action' },
+    ],
+  };
+  const mid: FlowDefIR = {
+    name: 'Mid',
+    params: [],
+    returns: [],
+    steps: [{ kind: 'callFlow', flowName: 'Leaf', args: {} }],
+  };
+  const top: FlowDefIR = {
+    name: 'Top',
+    params: [],
+    returns: [],
+    steps: [{ kind: 'callFlow', flowName: 'Mid', args: {} }],
+  };
+
+  it('allows two levels of nesting (scenario → flow → flow)', async () => {
+    const { result, ui } = await run(scenario('ok depth', [callFlow('Mid')]), {
+      flows: [leaf, mid],
+    });
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toEqual(['leaf action']);
+  });
+
+  it('rejects a third level of nesting', async () => {
+    const { result } = await run(scenario('too deep', [callFlow('Top')]), {
+      flows: [leaf, mid, top],
+    });
+    expect(result.status).toBe('failed');
+    expect(result.steps.at(-1)?.error).toMatch(/depth exceeds the cap of 2/);
+  });
+});
+
+describe('runScenario: soft vs verify gating', () => {
+  const failingGeneral = () =>
+    new FakeGeneralAgent(() => ({
+      text: 'nope',
+      verdict: { pass: false, reason: 'not visible' },
+    }));
+
+  it('soft failures warn but the case passes and continues', async () => {
+    const general = failingGeneral();
+    const { result, ui } = await run(
+      scenario('soft path', [
+        Soft('a promo banner is visible'),
+        When('continue browsing'),
+      ]),
+      { general },
+    );
+    expect(result.status).toBe('passed');
+    expect(result.steps[0].status).toBe('warning');
+    expect(result.warnings[0]).toMatch(/soft check failed.*not visible/);
+    expect(ui.actCalls).toEqual(['continue browsing']);
+  });
+
+  it('verify failures gate the case and stop execution', async () => {
+    const general = failingGeneral();
+    const { result, ui } = await run(
+      scenario('hard path', [
+        Then('the cart shows 1 item'),
+        When('never reached'),
+      ]),
+      { general },
+    );
+    expect(result.status).toBe('failed');
+    expect(result.steps).toHaveLength(1);
+    expect(result.steps[0].verdict).toEqual({
+      pass: false,
+      reason: 'not visible',
+    });
+    expect(ui.actCalls).toEqual([]);
+  });
+
+  it('a verify failure inside a flow stops the whole scenario', async () => {
+    const flow = defineFlow({
+      name: 'Guard',
+      params: [],
+      returns: [],
+      steps: [Then('precondition holds')],
+    });
+    const general = failingGeneral();
+    const { result, ui } = await run(
+      scenario('gated', [callFlow('Guard'), When('never reached')]),
+      { flows: [flow], general },
+    );
+    expect(result.status).toBe('failed');
+    expect(ui.actCalls).toEqual([]);
+  });
+});
+
+describe('runScenario: observability events', () => {
+  it('emits stepStart/varSet/flowEnter/flowExit in execution order', async () => {
+    const ui = new FakeUiAgent(['Hello, Admin!']);
+    const events: string[] = [];
+    await runScenario({
+      scenario: scenario(
+        'observed',
+        [
+          callFlow('Login', { role: '{whoami}' }),
+          Then('header shows {greeting}'),
+        ],
+        { vars: { whoami: 'admin' } },
+      ),
+      registry: createFlowRegistry([loginFlow]),
+      uiAgent: ui.asAgent(),
+      generalAgent: new FakeGeneralAgent(),
+      env: {},
+      onEvent: (e) => {
+        switch (e.type) {
+          case 'stepStart':
+            events.push(`start:${e.node}@${e.depth}:${e.input}`);
+            break;
+          case 'stepEnd':
+            events.push(`end:${e.result.node}:${e.result.status}`);
+            break;
+          case 'varSet':
+            events.push(`var:${e.name}=${e.value}:${e.source}`);
+            break;
+          case 'flowEnter':
+            events.push(`enter:${e.flowName}(${e.args.role})@${e.depth}`);
+            break;
+          case 'flowExit':
+            events.push(`exit:${e.flowName}@${e.depth}`);
+            break;
+        }
+      },
+    });
+
+    expect(events).toEqual([
+      'var:whoami=admin:seed',
+      'enter:Login(admin)@1',
+      'end:flow:info',
+      'start:ui@1:open the login page',
+      'end:ui:info',
+      'start:ui@1:sign in as the "admin" user',
+      'end:ui:info',
+      'start:capture@1:the greeting shown in the header',
+      'var:greeting=Hello, Admin!:capture',
+      'end:capture:info',
+      'var:greeting=Hello, Admin!:return',
+      'exit:Login@1',
+      'start:verify@0:header shows Hello, Admin!',
+      'end:verify:passed',
+    ]);
+  });
+});
+
+describe('runScenario: end-to-end with the Gherkin front-end', () => {
+  it('runs a compiled .feature scenario against the fake agents', async () => {
+    const { compileFeature } = await import('../../src/frontends/gherkin');
+    const compiled = compileFeature(
+      `Feature: Mini checkout
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I sign in as the "{role}" user
+    When I remember the greeting shown in the header as "greeting"
+
+  Scenario: Greet
+    When I run the "Login" flow with role "admin"
+    Then the header shows {greeting}
+`,
+      'mini.feature',
+    );
+
+    const ui = new FakeUiAgent(['Hello, Admin!']);
+    const general = new FakeGeneralAgent();
+    const { result } = await run(compiled.scenarios[0], {
+      flows: compiled.flows,
+      ui,
+      general,
+    });
+
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toEqual(['I sign in as the "admin" user']);
+    expect(general.calls[0].instruction).toBe('the header shows Hello, Admin!');
+  });
+});
diff --git a/packages/testing-framework/vitest.config.ts b/packages/testing-framework/vitest.config.ts
index 613747d241..59ef99589d 100644
--- a/packages/testing-framework/vitest.config.ts
+++ b/packages/testing-framework/vitest.config.ts
@@ -13,6 +13,9 @@ export default defineConfig({
   resolve: {
     alias: {
       '@': path.resolve(__dirname, 'src'),
+      // Let tests import the example authoring files (which use the package
+      // name) without requiring a dist build.
+      '@midscene/testing-framework': path.resolve(__dirname, 'src/index.ts'),
     },
   },
   test: {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 91d122b4d0..ef63eae36a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1382,7 +1382,7 @@ importers:
         version: link:../web-integration
       '@modelcontextprotocol/inspector':
         specifier: ^0.16.3
-        version: 0.16.3(@types/node@18.19.62)(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(bufferutil@4.0.9)(typescript@5.8.3)(utf-8-validate@6.0.5)
+        version: 0.16.3(@types/node@18.19.62)(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(typescript@5.8.3)
       '@modelcontextprotocol/sdk':
         specifier: 1.10.2
         version: 1.10.2
@@ -1606,6 +1606,12 @@ importers:
 
   packages/testing-framework:
     dependencies:
+      '@cucumber/gherkin':
+        specifier: ^39.1.0
+        version: 39.1.0
+      '@cucumber/messages':
+        specifier: ^32.3.1
+        version: 32.3.1
       '@earendil-works/pi-ai':
         specifier: ^0.78.0
         version: 0.78.0(bufferutil@4.0.9)(utf-8-validate@6.0.5)(ws@8.20.0(bufferutil@4.0.9)(utf-8-validate@6.0.5))(zod@3.25.76)
@@ -1903,12 +1909,6 @@ importers:
         specifier: 3.0.5
         version: 3.0.5(@types/debug@4.1.12)(@types/node@18.19.118)(jsdom@29.0.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
 
-  tmp-20332-jvOvjDBiByix:
-    devDependencies:
-      nx:
-        specifier: 22.7.5
-        version: 22.7.5
-
 packages:
 
   '@alloc/quick-lru@5.2.0':
@@ -2533,6 +2533,12 @@ packages:
     resolution: {integrity: sha512-SITSV6aIXsuVNV3f3O0f2n/cgyEDWoSqtZMYiAmcsYHydcKrOz3gUxB/iXd/Qf08+IZX4KpgNbvUdMBmWz+kcA==}
     engines: {node: '>=10'}
 
+  '@cucumber/gherkin@39.1.0':
+    resolution: {integrity: sha512-pqmSO2bUWxJm3TbNrKXlDaHjL6c77+ez9kWmfCd9oRPeTRPEVH3spZvpAqdXYWOZYSNYwWFCAAeZ4RGpkauNoQ==}
+
+  '@cucumber/messages@32.3.1':
+    resolution: {integrity: sha512-yNQq1KoXRYaEKrWMFmpUQX7TdeQuU9jeGgJAZ3dArTsC/T4NpJ6DnqaJIIgwPnz/wtQIQTNX7/h0rOuF5xY4qQ==}
+
   '@devicefarmer/adbkit-logcat@2.1.3':
     resolution: {integrity: sha512-yeaGFjNBc/6+svbDeul1tNHtNChw6h8pSHAt5D+JsedUrMTN7tla7B15WLDyekxsuS2XlZHRxpuC6m92wiwCNw==}
     engines: {node: '>= 4'}
@@ -2607,24 +2613,15 @@ packages:
   '@emnapi/core@1.10.0':
     resolution: {integrity: sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==}
 
-  '@emnapi/core@1.4.5':
-    resolution: {integrity: sha512-XsLw1dEOpkSX/WucdqUhPWP7hDxSvZiY+fsUC14h+FtQ2Ifni4znbBt8punRX+Uj2JG/uDb8nEHVKvrVlvdZ5Q==}
-
   '@emnapi/core@1.7.1':
     resolution: {integrity: sha512-o1uhUASyo921r2XtHYOHy7gdkGLge8ghBEQHMWmyJFoXlpU58kIrhhN3w26lpQb6dspetweapMn2CSNwQ8I4wg==}
 
   '@emnapi/runtime@1.10.0':
     resolution: {integrity: sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==}
 
-  '@emnapi/runtime@1.4.5':
-    resolution: {integrity: sha512-++LApOtY0pEEz1zrd9vy1/zXVaVJJ/EbAF3u0fXIzPJEDtnITsBGbbK0EkM72amhl/R5b+5xx0Y/QhcVOpuulg==}
-
   '@emnapi/runtime@1.7.1':
     resolution: {integrity: sha512-PVtJr5CmLwYAU9PZDMITZoR5iAOShYREoR45EyyLrbntV50mdePTgUn4AmOw90Ifcj+x2kRjdzr1HP3RrNiHGA==}
 
-  '@emnapi/wasi-threads@1.0.4':
-    resolution: {integrity: sha512-PJR+bOmMOPH8AtcTGAyYNiuJ3/Fcoj2XN/gBEWzDIKh254XO+mM9XoXHk5GNEhodxeMznbg7BlRojVbKN+gC6g==}
-
   '@emnapi/wasi-threads@1.1.0':
     resolution: {integrity: sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ==}
 
@@ -3882,101 +3879,51 @@ packages:
     cpu: [arm64]
     os: [darwin]
 
-  '@nx/nx-darwin-arm64@22.7.5':
-    resolution: {integrity: sha512-eoPtwx0qZqvRUD+VVOHm150AlSYwYoPxkDHBBGqKCn5nzPspb0lLWw8q83crM/L1M928YgK0WmGf3C++7eqsTA==}
-    cpu: [arm64]
-    os: [darwin]
-
   '@nx/nx-darwin-x64@22.1.3':
     resolution: {integrity: sha512-XmdccOBp1Lx9DXUzYDX65mkFqFvXaxUKm1d63bfA43vxIYUpR59SASB81KRQ/Q4dgvvU27C0EJuxSJbXsSkSYw==}
     cpu: [x64]
     os: [darwin]
 
-  '@nx/nx-darwin-x64@22.7.5':
-    resolution: {integrity: sha512-VLOn/ZoEn3HfjSj+yIHLCM56/el79r+9I28CkZNHaSXJQWZ3edSkcgcfYjVxCurpN2VEwDQHLBeFCH8M+lQ7wQ==}
-    cpu: [x64]
-    os: [darwin]
-
   '@nx/nx-freebsd-x64@22.1.3':
     resolution: {integrity: sha512-O+o4mqPwhKxfdsri4KxDbXbjwIwr04GfTSfA0TwgXs6hFf68qmc45FAmPGrPSvxIJg9+mUVDeFirdS8GcUE0jQ==}
     cpu: [x64]
     os: [freebsd]
 
-  '@nx/nx-freebsd-x64@22.7.5':
-    resolution: {integrity: sha512-LEVer/E2xfGvK9Go+imMQoEninOoq/38Z2bhV1SD3AThXrp1xaLFVkW5jQ6juebeVkAeztEoMLFlr576egS0vw==}
-    cpu: [x64]
-    os: [freebsd]
-
   '@nx/nx-linux-arm-gnueabihf@22.1.3':
     resolution: {integrity: sha512-ZIPDgzLq8qmvrZ3Bp+bWXam5uKwahjcChBNtORVtrHQfm4mxov2RMUMKTg2ZsVAWVP64zK+gmzG5LuoZjPMm4Q==}
     cpu: [arm]
     os: [linux]
 
-  '@nx/nx-linux-arm-gnueabihf@22.7.5':
-    resolution: {integrity: sha512-NP27EFGpmFJM6RL1Ey/AFJ7gA2xuqtIHaw6jjSNGvfrnZRUNaway30GrVaGGeODf0DsvAty/unqoBMPy6kDHbw==}
-    cpu: [arm]
-    os: [linux]
-
   '@nx/nx-linux-arm64-gnu@22.1.3':
     resolution: {integrity: sha512-wgpPaTpQKl+cCkSuE5zamTVrg14mRvT+bLAeN/yHSUgMztvGxwl3Ll+K9DgEcktBo1PLECTWNkVaW8IAsJm4Rg==}
     cpu: [arm64]
     os: [linux]
 
-  '@nx/nx-linux-arm64-gnu@22.7.5':
-    resolution: {integrity: sha512-QLnkJl3HkHsPfpLiNiAiMfpfAeFpic0U1diAxF8RqChOkCpQ7ulvyBVgE1UrQxvhd+gFQ3ed5RNDxtCRw8nTiw==}
-    cpu: [arm64]
-    os: [linux]
-
   '@nx/nx-linux-arm64-musl@22.1.3':
     resolution: {integrity: sha512-o9XmQehSPR2y0RD4evD+Ob3lNFuwsFOL5upVJqZ3rcE6GkJIFPg8SwEP5FaRIS5MwS04fxnek20NZ18BHjjV/g==}
     cpu: [arm64]
     os: [linux]
 
-  '@nx/nx-linux-arm64-musl@22.7.5':
-    resolution: {integrity: sha512-cEP6KmwBgnb38+jTTaibWCjwXcHmigqhTfy0tN1be7WZr6bHxbqNLsXqKRN70PSNA3HouZcxw1cdRL8tqbPBBA==}
-    cpu: [arm64]
-    os: [linux]
-
   '@nx/nx-linux-x64-gnu@22.1.3':
     resolution: {integrity: sha512-ekcinyDNTa2huVe02T2SFMR8oArohozRbMGO19zftbObXXI4dLdoAuLNb3vK9Pe4vYOpkhfxBVkZvcWMmx7JdA==}
     cpu: [x64]
     os: [linux]
 
-  '@nx/nx-linux-x64-gnu@22.7.5':
-    resolution: {integrity: sha512-tbaX1tZCSpGifDNBfDdEZAMxVF3Yg4bhFP/bm1needc0diqb+Zflc0u5tM5/6BWDMITQDwenJVsNiQ8ZdtJURA==}
-    cpu: [x64]
-    os: [linux]
-
   '@nx/nx-linux-x64-musl@22.1.3':
     resolution: {integrity: sha512-CqpRIJeIgELCqIgjtSsYnnLi6G0uqjbp/Pw9d7w4im4/NmJXqaE9gxpdHA1eowXLgAy9W1LkfzCPS8Q2IScPuQ==}
     cpu: [x64]
     os: [linux]
 
-  '@nx/nx-linux-x64-musl@22.7.5':
-    resolution: {integrity: sha512-H0M7csOZIgPT822LqjxSXzf4MXRND15vIkAQe3F3Jlr3Si8LC3tzbL52aVcRfgb8MF/xOB5U47mSwxWt1M2bPQ==}
-    cpu: [x64]
-    os: [linux]
-
   '@nx/nx-win32-arm64-msvc@22.1.3':
     resolution: {integrity: sha512-YbuWb8KQsAR9G0+7b4HA16GV962/VWtRcdS7WY2yaScmPT2W5rObl528Y2j4DuB0j/MVZj12qJKrYfUyjL+UJA==}
     cpu: [arm64]
     os: [win32]
 
-  '@nx/nx-win32-arm64-msvc@22.7.5':
-    resolution: {integrity: sha512-JTcZch9YAnDL1gbhqePz3DZ4x7iYemLn1yJzrjbbXAmXju2eiiJiZvJJHbV06+SP9HKXDT8RjTKuAWTdVxnHug==}
-    cpu: [arm64]
-    os: [win32]
-
   '@nx/nx-win32-x64-msvc@22.1.3':
     resolution: {integrity: sha512-G90Sp409ypeOUbmj6nmEbdy043KJUKaZ7pffxmM6i63yEe2F2WdmMgdi525vUEgmq+pfB9zQQOX1sDR/rPFvtg==}
     cpu: [x64]
     os: [win32]
 
-  '@nx/nx-win32-x64-msvc@22.7.5':
-    resolution: {integrity: sha512-ngcMyHdBJ9FSz2nHdbZ7gtJlFq0O2b05sPAsVMkZ18CKzdaA1qrBDJfsMO49hPCny505eiT766+CkKdaCDl5kA==}
-    cpu: [x64]
-    os: [win32]
-
   '@opentelemetry/api-logs@0.210.0':
     resolution: {integrity: sha512-CMtLxp+lYDriveZejpBND/2TmadrrhUfChyxzmkFtHaMDdSKfP59MAYyA0ICBvEBdm3iXwLcaj/8Ic/pnGw9Yg==}
     engines: {node: '>=8.0.0'}
@@ -4067,30 +4014,18 @@ packages:
   '@protobufjs/base64@1.1.2':
     resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
 
-  '@protobufjs/codegen@2.0.4':
-    resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==}
-
   '@protobufjs/codegen@2.0.5':
     resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==}
 
-  '@protobufjs/eventemitter@1.1.0':
-    resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==}
-
   '@protobufjs/eventemitter@1.1.1':
     resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==}
 
-  '@protobufjs/fetch@1.1.0':
-    resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==}
-
   '@protobufjs/fetch@1.1.1':
     resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==}
 
   '@protobufjs/float@1.0.2':
     resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
 
-  '@protobufjs/inquire@1.1.0':
-    resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==}
-
   '@protobufjs/inquire@1.1.2':
     resolution: {integrity: sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==}
 
@@ -4100,9 +4035,6 @@ packages:
   '@protobufjs/pool@1.1.0':
     resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
 
-  '@protobufjs/utf8@1.1.0':
-    resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==}
-
   '@protobufjs/utf8@1.1.1':
     resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
 
@@ -6123,9 +6055,6 @@ packages:
   axios@1.13.2:
     resolution: {integrity: sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==}
 
-  axios@1.16.0:
-    resolution: {integrity: sha512-6hp5CwvTPlN2A31g5dxnwAX0orzM7pmCRDLnZSX772mv8WDqICwFjowHuPs04Mc8deIld1+ejhtaMn5vp6b+1w==}
-
   axios@1.8.3:
     resolution: {integrity: sha512-iP4DebzoNlP/YN2dpwCgb8zoCmhtkajzS48JvwmkSkXvPI3DHc7m+XYL5tGnSlJtR6nImXZmdCuN5aP8dh1d8A==}
 
@@ -6149,10 +6078,6 @@ packages:
   balanced-match@1.0.2:
     resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==}
 
-  balanced-match@4.0.3:
-    resolution: {integrity: sha512-1pHv8LX9CpKut1Zp4EXey7Z8OfH11ONNH6Dhi2WDUt31VVZFXZzKwXcysBgqSumFCmR+0dqjMK5v5JiFHzi0+g==}
-    engines: {node: 20 || >=22}
-
   balanced-match@4.0.4:
     resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
     engines: {node: 18 || 20 || >=22}
@@ -6571,6 +6496,9 @@ packages:
   cipher-base@1.0.4:
     resolution: {integrity: sha512-Kkht5ye6ZGmwv40uUDZztayT2ThLQGfnj/T71N/XzeZeo3nf8foyW7zGTsPYkEya3m5f3cAypH+qe7YOrM1U2Q==}
 
+  class-transformer@0.5.1:
+    resolution: {integrity: sha512-SQa1Ws6hUbfC98vKGxZH3KFY0Y1lm5Zm0SY8XX9zbK7FJCyVEac3ATW0RIpwzW+oOfmHE5PMPufDG9hCfoEOMw==}
+
   class-variance-authority@0.7.1:
     resolution: {integrity: sha512-Ka+9Trutv7G8M6WT6SeiRWz792K5qEqIGEGzXKhAE6xOWAY6pPH8U+9IY3oCMv6kqTmLsv7Xh/2w2RigkePMsg==}
 
@@ -6585,10 +6513,6 @@ packages:
     resolution: {integrity: sha512-I/zHAwsKf9FqGoXM4WWRACob9+SNukZTd94DWF57E4toouRulbCxcUh6RKUEOQlYTHJnzkPMySvPNaaSLNfLZw==}
     engines: {node: '>=8'}
 
-  cli-progress@3.12.0:
-    resolution: {integrity: sha512-tRkV3HJ1ASwm19THiiLIXLO7Im7wlTuKnvkYaTkyoAPefqjNg7W7DHKUlGRxy9vxDvbyCYQkQozvptuMkGCg8A==}
-    engines: {node: '>=4'}
-
   cli-spinners@2.6.1:
     resolution: {integrity: sha512-x/5fWmGMnbKQAaNwN+UZlV79qBLM9JFnJuJ03gIi5whrob0xV0ofNVHy9DhwGdsMJQc2OKv0oGmLzvaqvAVv+g==}
     engines: {node: '>=6'}
@@ -7232,10 +7156,6 @@ packages:
     resolution: {integrity: sha512-zIHwmZPRshsCdpMDyVsqGmgyP0yT8GAgXUnkdAoJisxvf33k7yO6OuoKmcTGuXPWSsm8Oh88nZicRLA9Y0rUeA==}
     engines: {node: '>=12'}
 
-  dotenv-expand@12.0.3:
-    resolution: {integrity: sha512-uc47g4b+4k/M/SeaW1y4OApx+mtLWl92l5LMPP0GNXctZqELk+YGgOPIIC5elYmUH4OuoK3JLhuRUYegeySiFA==}
-    engines: {node: '>=12'}
-
   dotenv@16.4.5:
     resolution: {integrity: sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==}
     engines: {node: '>=12'}
@@ -7266,11 +7186,6 @@ packages:
   ee-first@1.1.1:
     resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==}
 
-  ejs@5.0.1:
-    resolution: {integrity: sha512-COqBPFMxuPTPspXl2DkVYaDS3HtrD1GpzOGkNTJ1IYkifq/r9h8SVEFrjA3D9/VJGOEoMQcrlhpntcSUrM8k6A==}
-    engines: {node: '>=0.12.18'}
-    hasBin: true
-
   electron-to-chromium@1.5.182:
     resolution: {integrity: sha512-Lv65Btwv9W4J9pyODI6EWpdnhfvrve/us5h1WspW8B2Fb0366REPtY3hX7ounk1CkV/TBjWCEvCBBbYbmV0qCA==}
 
@@ -9884,18 +9799,6 @@ packages:
       '@swc/core':
         optional: true
 
-  nx@22.7.5:
-    resolution: {integrity: sha512-zoxsJabb33jl1QYnalDn0bicryrEBgSzdKp90d7VGGv/jDgzKrcLg/hw2ZxeYiOjWPIT/o8QNT9G9vTs4dv3AQ==}
-    hasBin: true
-    peerDependencies:
-      '@swc-node/register': ^1.11.1
-      '@swc/core': ^1.15.8
-    peerDependenciesMeta:
-      '@swc-node/register':
-        optional: true
-      '@swc/core':
-        optional: true
-
   object-assign@4.1.1:
     resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
     engines: {node: '>=0.10.0'}
@@ -10481,10 +10384,6 @@ packages:
   proxy-from-env@1.1.0:
     resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==}
 
-  proxy-from-env@2.1.0:
-    resolution: {integrity: sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==}
-    engines: {node: '>=10'}
-
   prr@1.0.1:
     resolution: {integrity: sha512-yPw4Sng1gWghHQWj0B3ZggWUm4qVbPwPFcRG8KyxiU7J2OHFSoEHKS+EZ3fv5l1t9CyCiop6l/ZYeWbrgoQejw==}
 
@@ -10989,6 +10888,9 @@ packages:
   reduce-configs@1.1.1:
     resolution: {integrity: sha512-EYtsVGAQarE8daT54cnaY1PIknF2VB78ug6Zre2rs36EsJfC40EG6hmTU2A2P1ZuXnKAt2KI0fzOGHcX7wzdPw==}
 
+  reflect-metadata@0.2.2:
+    resolution: {integrity: sha512-urBwgfrvVP/eAyXx4hluJivBKzuEbSQs9rKWCrCkbSxNv8mxPcUZKeuoF3Uy4mJl3Lwprp6yy5/39VWigZ4K6Q==}
+
   reflect.getprototypeof@1.0.10:
     resolution: {integrity: sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==}
     engines: {node: '>= 0.4'}
@@ -11448,11 +11350,6 @@ packages:
     engines: {node: '>=10'}
     hasBin: true
 
-  semver@7.7.4:
-    resolution: {integrity: sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==}
-    engines: {node: '>=10'}
-    hasBin: true
-
   send@0.19.0:
     resolution: {integrity: sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==}
     engines: {node: '>= 0.8.0'}
@@ -11611,10 +11508,6 @@ packages:
     engines: {node: '>=6'}
     hasBin: true
 
-  smol-toml@1.6.1:
-    resolution: {integrity: sha512-dWUG8F5sIIARXih1DTaQAX4SsiTXhInKf1buxdY9DIg4ZYPZK5nGM1VRIYmEbDbsHt7USo99xSLFu5Q1IqTmsg==}
-    engines: {node: '>= 18'}
-
   snake-case@3.0.4:
     resolution: {integrity: sha512-LAOh4z89bGQvl9pFfNF8V146i7o7/CqFPbqzYgP+yYzDIDeS9HaNFtXABamRW+AQzEVODcvE79ljJ+8a9YSdMg==}
 
@@ -12083,10 +11976,6 @@ packages:
     resolution: {integrity: sha512-voyz6MApa1rQGUxT3E+BK7/ROe8itEx7vD8/HEvt4xwXucvQ5G5oeEiHkmHZJuBO21RpOf+YYm9MOivj709jow==}
     engines: {node: '>=14.14'}
 
-  tmp@0.2.6:
-    resolution: {integrity: sha512-5sJPdPjfI5Kx+qbrDesxkglRBxW//g7hCsqspEjwkewGvBMGIKMOTKzLt1hFVJzyadba3lDUN20O9qhvbQUSTA==}
-    engines: {node: '>=14.14'}
-
   tn1150@0.1.0:
     resolution: {integrity: sha512-DbplOfQFkqG5IHcDyyrs/lkvSr3mPUVsFf/RbDppOshs22yTPnSJWEe6FkYd1txAwU/zcnR905ar2fi4kwF29w==}
     engines: {node: '>=0.12'}
@@ -13908,6 +13797,15 @@ snapshots:
 
   '@ctrl/tinycolor@3.6.1': {}
 
+  '@cucumber/gherkin@39.1.0':
+    dependencies:
+      '@cucumber/messages': 32.3.1
+
+  '@cucumber/messages@32.3.1':
+    dependencies:
+      class-transformer: 0.5.1
+      reflect-metadata: 0.2.2
+
   '@devicefarmer/adbkit-logcat@2.1.3': {}
 
   '@devicefarmer/adbkit-monkey@1.2.1': {}
@@ -14093,11 +13991,6 @@ snapshots:
       '@emnapi/wasi-threads': 1.2.1
       tslib: 2.8.1
 
-  '@emnapi/core@1.4.5':
-    dependencies:
-      '@emnapi/wasi-threads': 1.0.4
-      tslib: 2.8.1
-
   '@emnapi/core@1.7.1':
     dependencies:
       '@emnapi/wasi-threads': 1.1.0
@@ -14108,19 +14001,11 @@ snapshots:
     dependencies:
       tslib: 2.8.1
 
-  '@emnapi/runtime@1.4.5':
-    dependencies:
-      tslib: 2.8.1
-
   '@emnapi/runtime@1.7.1':
     dependencies:
       tslib: 2.8.1
     optional: true
 
-  '@emnapi/wasi-threads@1.0.4':
-    dependencies:
-      tslib: 2.8.1
-
   '@emnapi/wasi-threads@1.1.0':
     dependencies:
       tslib: 2.8.1
@@ -15442,7 +15327,7 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@modelcontextprotocol/inspector@0.16.3(@types/node@18.19.62)(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(bufferutil@4.0.9)(typescript@5.8.3)(utf-8-validate@6.0.5)':
+  '@modelcontextprotocol/inspector@0.16.3(@types/node@18.19.62)(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(typescript@5.8.3)':
     dependencies:
       '@modelcontextprotocol/inspector-cli': 0.16.3
       '@modelcontextprotocol/inspector-client': 0.16.3(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)
@@ -15567,63 +15452,33 @@ snapshots:
   '@nx/nx-darwin-arm64@22.1.3':
     optional: true
 
-  '@nx/nx-darwin-arm64@22.7.5':
-    optional: true
-
   '@nx/nx-darwin-x64@22.1.3':
     optional: true
 
-  '@nx/nx-darwin-x64@22.7.5':
-    optional: true
-
   '@nx/nx-freebsd-x64@22.1.3':
     optional: true
 
-  '@nx/nx-freebsd-x64@22.7.5':
-    optional: true
-
   '@nx/nx-linux-arm-gnueabihf@22.1.3':
     optional: true
 
-  '@nx/nx-linux-arm-gnueabihf@22.7.5':
-    optional: true
-
   '@nx/nx-linux-arm64-gnu@22.1.3':
     optional: true
 
-  '@nx/nx-linux-arm64-gnu@22.7.5':
-    optional: true
-
   '@nx/nx-linux-arm64-musl@22.1.3':
     optional: true
 
-  '@nx/nx-linux-arm64-musl@22.7.5':
-    optional: true
-
   '@nx/nx-linux-x64-gnu@22.1.3':
     optional: true
 
-  '@nx/nx-linux-x64-gnu@22.7.5':
-    optional: true
-
   '@nx/nx-linux-x64-musl@22.1.3':
     optional: true
 
-  '@nx/nx-linux-x64-musl@22.7.5':
-    optional: true
-
   '@nx/nx-win32-arm64-msvc@22.1.3':
     optional: true
 
-  '@nx/nx-win32-arm64-msvc@22.7.5':
-    optional: true
-
   '@nx/nx-win32-x64-msvc@22.1.3':
     optional: true
 
-  '@nx/nx-win32-x64-msvc@22.7.5':
-    optional: true
-
   '@opentelemetry/api-logs@0.210.0':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -15725,40 +15580,22 @@ snapshots:
 
   '@protobufjs/base64@1.1.2': {}
 
-  '@protobufjs/codegen@2.0.4':
-    optional: true
-
   '@protobufjs/codegen@2.0.5': {}
 
-  '@protobufjs/eventemitter@1.1.0':
-    optional: true
-
   '@protobufjs/eventemitter@1.1.1': {}
 
-  '@protobufjs/fetch@1.1.0':
-    dependencies:
-      '@protobufjs/aspromise': 1.1.2
-      '@protobufjs/inquire': 1.1.0
-    optional: true
-
   '@protobufjs/fetch@1.1.1':
     dependencies:
       '@protobufjs/aspromise': 1.1.2
 
   '@protobufjs/float@1.0.2': {}
 
-  '@protobufjs/inquire@1.1.0':
-    optional: true
-
   '@protobufjs/inquire@1.1.2': {}
 
   '@protobufjs/path@1.1.2': {}
 
   '@protobufjs/pool@1.1.0': {}
 
-  '@protobufjs/utf8@1.1.0':
-    optional: true
-
   '@protobufjs/utf8@1.1.1': {}
 
   '@puppeteer/browsers@2.9.0':
@@ -17680,14 +17517,6 @@ snapshots:
     optionalDependencies:
       vite: 5.4.10(@types/node@25.5.2)(less@4.2.2)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
 
-  '@vitest/mocker@3.0.5(vite@5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1))':
-    dependencies:
-      '@vitest/spy': 3.0.5
-      estree-walker: 3.0.3
-      magic-string: 0.30.17
-    optionalDependencies:
-      vite: 5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
-
   '@vitest/pretty-format@3.0.5':
     dependencies:
       tinyrainbow: 2.0.0
@@ -18319,14 +18148,6 @@ snapshots:
     transitivePeerDependencies:
       - debug
 
-  axios@1.16.0:
-    dependencies:
-      follow-redirects: 1.16.0
-      form-data: 4.0.5
-      proxy-from-env: 2.1.0
-    transitivePeerDependencies:
-      - debug
-
   axios@1.8.3:
     dependencies:
       follow-redirects: 1.15.9
@@ -18352,8 +18173,6 @@ snapshots:
 
   balanced-match@1.0.2: {}
 
-  balanced-match@4.0.3: {}
-
   balanced-match@4.0.4: {}
 
   bare-events@2.5.0:
@@ -18724,7 +18543,7 @@ snapshots:
 
   centra@2.7.0:
     dependencies:
-      follow-redirects: 1.15.11
+      follow-redirects: 1.16.0
     transitivePeerDependencies:
       - debug
 
@@ -18841,6 +18660,8 @@ snapshots:
       inherits: 2.0.4
       safe-buffer: 5.2.1
 
+  class-transformer@0.5.1: {}
+
   class-variance-authority@0.7.1:
     dependencies:
       clsx: 2.1.1
@@ -18853,10 +18674,6 @@ snapshots:
     dependencies:
       restore-cursor: 3.1.0
 
-  cli-progress@3.12.0:
-    dependencies:
-      string-width: 4.2.3
-
   cli-spinners@2.6.1: {}
 
   cli-spinners@2.9.2: {}
@@ -19532,10 +19349,6 @@ snapshots:
     dependencies:
       dotenv: 16.4.7
 
-  dotenv-expand@12.0.3:
-    dependencies:
-      dotenv: 16.4.7
-
   dotenv@16.4.5: {}
 
   dotenv@16.4.7: {}
@@ -19571,8 +19384,6 @@ snapshots:
 
   ee-first@1.1.1: {}
 
-  ejs@5.0.1: {}
-
   electron-to-chromium@1.5.182: {}
 
   electron-to-chromium@1.5.260: {}
@@ -22850,132 +22661,6 @@ snapshots:
     transitivePeerDependencies:
       - debug
 
-  nx@22.7.5:
-    dependencies:
-      '@emnapi/core': 1.4.5
-      '@emnapi/runtime': 1.4.5
-      '@emnapi/wasi-threads': 1.0.4
-      '@jest/diff-sequences': 30.0.1
-      '@napi-rs/wasm-runtime': 0.2.4
-      '@tybys/wasm-util': 0.9.0
-      '@yarnpkg/lockfile': 1.1.0
-      '@zkochan/js-yaml': 0.0.7
-      ansi-colors: 4.1.3
-      ansi-regex: 5.0.1
-      ansi-styles: 4.3.0
-      argparse: 2.0.1
-      asynckit: 0.4.0
-      axios: 1.16.0
-      balanced-match: 4.0.3
-      base64-js: 1.5.1
-      bl: 4.1.0
-      brace-expansion: 5.0.6
-      buffer: 5.7.1
-      call-bind-apply-helpers: 1.0.2
-      chalk: 4.1.2
-      cli-cursor: 3.1.0
-      cli-spinners: 2.6.1
-      cliui: 8.0.1
-      clone: 1.0.4
-      color-convert: 2.0.1
-      color-name: 1.1.4
-      combined-stream: 1.0.8
-      defaults: 1.0.4
-      define-lazy-prop: 2.0.0
-      delayed-stream: 1.0.0
-      dotenv: 16.4.7
-      dotenv-expand: 12.0.3
-      dunder-proto: 1.0.1
-      ejs: 5.0.1
-      emoji-regex: 8.0.0
-      end-of-stream: 1.4.5
-      enquirer: 2.3.6
-      es-define-property: 1.0.1
-      es-errors: 1.3.0
-      es-object-atoms: 1.1.1
-      es-set-tostringtag: 2.1.0
-      escalade: 3.2.0
-      escape-string-regexp: 1.0.5
-      figures: 3.2.0
-      flat: 5.0.2
-      follow-redirects: 1.16.0
-      form-data: 4.0.5
-      fs-constants: 1.0.0
-      function-bind: 1.1.2
-      get-caller-file: 2.0.5
-      get-intrinsic: 1.3.0
-      get-proto: 1.0.1
-      gopd: 1.2.0
-      has-flag: 4.0.0
-      has-symbols: 1.1.0
-      has-tostringtag: 1.0.2
-      hasown: 2.0.2
-      ieee754: 1.2.1
-      ignore: 7.0.5
-      inherits: 2.0.4
-      is-docker: 2.2.1
-      is-fullwidth-code-point: 3.0.0
-      is-interactive: 1.0.0
-      is-unicode-supported: 0.1.0
-      is-wsl: 2.2.0
-      json5: 2.2.3
-      jsonc-parser: 3.2.0
-      lines-and-columns: 2.0.3
-      log-symbols: 4.1.0
-      math-intrinsics: 1.1.0
-      mime-db: 1.52.0
-      mime-types: 2.1.35
-      mimic-fn: 2.1.0
-      minimatch: 10.2.5
-      minimist: 1.2.8
-      npm-run-path: 4.0.1
-      once: 1.4.0
-      onetime: 5.1.2
-      open: 8.4.2
-      ora: 5.3.0
-      path-key: 3.1.1
-      picocolors: 1.1.1
-      proxy-from-env: 2.1.0
-      readable-stream: 3.6.2
-      require-directory: 2.1.1
-      resolve.exports: 2.0.3
-      restore-cursor: 3.1.0
-      safe-buffer: 5.2.1
-      semver: 7.7.4
-      signal-exit: 3.0.7
-      smol-toml: 1.6.1
-      string-width: 4.2.3
-      string_decoder: 1.3.0
-      strip-ansi: 6.0.1
-      strip-bom: 3.0.0
-      supports-color: 7.2.0
-      tar-stream: 2.2.0
-      tmp: 0.2.6
-      tree-kill: 1.2.2
-      tsconfig-paths: 4.2.0
-      tslib: 2.8.1
-      util-deprecate: 1.0.2
-      wcwidth: 1.0.1
-      wrap-ansi: 7.0.0
-      wrappy: 1.0.2
-      y18n: 5.0.8
-      yaml: 2.9.0
-      yargs: 17.7.2
-      yargs-parser: 21.1.1
-    optionalDependencies:
-      '@nx/nx-darwin-arm64': 22.7.5
-      '@nx/nx-darwin-x64': 22.7.5
-      '@nx/nx-freebsd-x64': 22.7.5
-      '@nx/nx-linux-arm-gnueabihf': 22.7.5
-      '@nx/nx-linux-arm64-gnu': 22.7.5
-      '@nx/nx-linux-arm64-musl': 22.7.5
-      '@nx/nx-linux-x64-gnu': 22.7.5
-      '@nx/nx-linux-x64-musl': 22.7.5
-      '@nx/nx-win32-arm64-msvc': 22.7.5
-      '@nx/nx-win32-x64-msvc': 22.7.5
-    transitivePeerDependencies:
-      - debug
-
   object-assign@4.1.1: {}
 
   object-inspect@1.13.4: {}
@@ -23539,14 +23224,14 @@ snapshots:
     dependencies:
       '@protobufjs/aspromise': 1.1.2
       '@protobufjs/base64': 1.1.2
-      '@protobufjs/codegen': 2.0.4
-      '@protobufjs/eventemitter': 1.1.0
-      '@protobufjs/fetch': 1.1.0
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.1
+      '@protobufjs/fetch': 1.1.1
       '@protobufjs/float': 1.0.2
-      '@protobufjs/inquire': 1.1.0
+      '@protobufjs/inquire': 1.1.2
       '@protobufjs/path': 1.1.2
       '@protobufjs/pool': 1.1.0
-      '@protobufjs/utf8': 1.1.0
+      '@protobufjs/utf8': 1.1.1
       '@types/node': 18.19.130
       long: 5.3.2
     optional: true
@@ -23571,8 +23256,6 @@ snapshots:
 
   proxy-from-env@1.1.0: {}
 
-  proxy-from-env@2.1.0: {}
-
   prr@1.0.1:
     optional: true
 
@@ -24229,6 +23912,8 @@ snapshots:
 
   reduce-configs@1.1.1: {}
 
+  reflect-metadata@0.2.2: {}
+
   reflect.getprototypeof@1.0.10:
     dependencies:
       call-bind: 1.0.8
@@ -24719,8 +24404,6 @@ snapshots:
 
   semver@7.7.3: {}
 
-  semver@7.7.4: {}
-
   send@0.19.0:
     dependencies:
       debug: 2.6.9
@@ -24980,8 +24663,6 @@ snapshots:
       wcwidth: 1.0.1
       yargs: 15.4.1
 
-  smol-toml@1.6.1: {}
-
   snake-case@3.0.4:
     dependencies:
       dot-case: 3.0.4
@@ -25488,8 +25169,6 @@ snapshots:
 
   tmp@0.2.5: {}
 
-  tmp@0.2.6: {}
-
   tn1150@0.1.0:
     dependencies:
       unorm: 1.6.0
@@ -25967,24 +25646,6 @@ snapshots:
       - supports-color
       - terser
 
-  vite-node@3.0.5(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1):
-    dependencies:
-      cac: 6.7.14
-      debug: 4.4.0
-      es-module-lexer: 1.7.0
-      pathe: 2.0.3
-      vite: 5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
-    transitivePeerDependencies:
-      - '@types/node'
-      - less
-      - lightningcss
-      - sass
-      - sass-embedded
-      - stylus
-      - sugarss
-      - supports-color
-      - terser
-
   vite@5.4.10(@types/node@18.19.118)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1):
     dependencies:
       esbuild: 0.21.5
@@ -26037,19 +25698,6 @@ snapshots:
       sass-embedded: 1.86.3
       terser: 5.46.1
 
-  vite@5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1):
-    dependencies:
-      esbuild: 0.21.5
-      postcss: 8.5.6
-      rollup: 4.24.3
-    optionalDependencies:
-      '@types/node': 25.5.2
-      fsevents: 2.3.3
-      less: 4.3.0
-      lightningcss: 1.30.1
-      sass-embedded: 1.86.3
-      terser: 5.46.1
-
   vitest@3.0.5(@types/debug@4.1.12)(@types/node@18.19.118)(jsdom@29.0.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1):
     dependencies:
       '@vitest/expect': 3.0.5
@@ -26198,43 +25846,6 @@ snapshots:
       - supports-color
       - terser
 
-  vitest@3.0.5(@types/debug@4.1.12)(@types/node@25.5.2)(jsdom@29.0.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1):
-    dependencies:
-      '@vitest/expect': 3.0.5
-      '@vitest/mocker': 3.0.5(vite@5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1))
-      '@vitest/pretty-format': 3.1.1
-      '@vitest/runner': 3.0.5
-      '@vitest/snapshot': 3.0.5
-      '@vitest/spy': 3.0.5
-      '@vitest/utils': 3.0.5
-      chai: 5.2.0
-      debug: 4.4.0
-      expect-type: 1.2.1
-      magic-string: 0.30.17
-      pathe: 2.0.3
-      std-env: 3.9.0
-      tinybench: 2.9.0
-      tinyexec: 0.3.2
-      tinypool: 1.1.1
-      tinyrainbow: 2.0.0
-      vite: 5.4.10(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
-      vite-node: 3.0.5(@types/node@25.5.2)(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.46.1)
-      why-is-node-running: 2.3.0
-    optionalDependencies:
-      '@types/debug': 4.1.12
-      '@types/node': 25.5.2
-      jsdom: 29.0.2
-    transitivePeerDependencies:
-      - less
-      - lightningcss
-      - msw
-      - sass
-      - sass-embedded
-      - stylus
-      - sugarss
-      - supports-color
-      - terser
-
   vm-browserify@1.1.2: {}
 
   w-json@1.3.10: {}

From b67ed17587f003749059ec35bfd56993cc66bc0b Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:13:38 +0200
Subject: [PATCH 2/9] feat(testing-framework): add runnable three-mode demo for
 flow-IR POC

Offline-by-default demo narrating the login/checkout journey through
pure Gherkin, pure JS, and bound overlay modes with scripted fake
agents, proving identical traces across front-ends and diffing overlay
changes. Experimental --live mode runs against the static demo shop.
---
 packages/testing-framework/POC-GHERKIN.md     |  42 ++
 packages/testing-framework/package.json       |   3 +-
 packages/testing-framework/scripts/demo.mjs   |  30 ++
 .../testing-framework/scripts/demo/live.ts    |  42 ++
 .../testing-framework/scripts/demo/main.ts    | 399 ++++++++++++++++++
 5 files changed, 515 insertions(+), 1 deletion(-)
 create mode 100644 packages/testing-framework/scripts/demo.mjs
 create mode 100644 packages/testing-framework/scripts/demo/live.ts
 create mode 100644 packages/testing-framework/scripts/demo/main.ts

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
index 385ced409d..06f6fb5e2c 100644
--- a/packages/testing-framework/POC-GHERKIN.md
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -8,6 +8,48 @@ flows" authored in **two surfaces** — a fluent JS/TS API and Gherkin
 step is natural language executed by the AI agents. A third, **hybrid** mode
 (`bindFeature`) layers a sparse JS overlay over a `.feature` file.
 
+## Run the demo
+
+```bash
+pnpm --filter @midscene/testing-framework demo
+```
+
+Runs the login/checkout journey through **all three authoring modes** with a
+narrated walkthrough — offline by default (scripted fake agents simulate the
+shop; no model keys, no browser). Expected output (excerpt):
+
+```
+━━━ Mode 1/3: Pure Gherkin ━━━
+  ▶ Scenario: Checkout as admin
+    [ui]      the demo shop is open on the home page
+      → flow Login(role="admin")
+      [ui]      I sign in as the "admin" user with the saved test credentials   (template: "I sign in as the \"{role}\" user ...")
+      [capture] the greeting message shown in the header
+        {greeting} = "Hello, Admin!" (capture)
+      ← Login returned greeting="Hello, Admin!"
+    [verify]  the cart total equals $129.00   (template: "the cart total equals {price}")
+      ✔ PASS — The cart shows $129.00, matching the remembered price.
+    ✔ scenario passed
+...
+━━━ Comparison: three modes, one IR ━━━
+  Gherkin vs JS — "Checkout as admin": identical execution trace ✔ (24 events)
+  Bound overlay vs pure Gherkin:
+    "Checkout as admin":
+      - [verify] the cart total equals {price}
+      + [ui] apply the coupon code {couponCode} in the cart
+      + [soft] the cart total equals {price} minus the "{couponCode}" coupon discount
+      + injected var {couponCode} = "E2E-2026-06-09"
+```
+
+**Live mode (experimental, unverified in CI):** with model env configured
+(`MIDSCENE_MODEL_BASE_URL` etc., as for the AI tests) and puppeteer
+available, `pnpm --filter @midscene/testing-framework demo -- --live` drives
+a real web UI agent against the self-contained static shop in
+`example/demo-app/index.html` (override with `DEMO_URL`), with the default
+Pi-backed general agent issuing the verdicts. Each scenario gets a fresh
+browser. Implemented in `scripts/demo/live.ts`; the offline path is the
+verified reference.
+
 ```
  .feature files          .flows.ts files
       │       └─────┐          │
diff --git a/packages/testing-framework/package.json b/packages/testing-framework/package.json
index 0729ff5610..fb60cb2221 100644
--- a/packages/testing-framework/package.json
+++ b/packages/testing-framework/package.json
@@ -35,7 +35,8 @@
     "build": "rslib build",
     "build:watch": "rslib build --watch --no-clean",
     "test": "vitest --run",
-    "test:u": "vitest --run -u"
+    "test:u": "vitest --run -u",
+    "demo": "node scripts/demo.mjs"
   },
   "dependencies": {
     "@cucumber/gherkin": "^39.1.0",
diff --git a/packages/testing-framework/scripts/demo.mjs b/packages/testing-framework/scripts/demo.mjs
new file mode 100644
index 0000000000..8df92b4d3d
--- /dev/null
+++ b/packages/testing-framework/scripts/demo.mjs
@@ -0,0 +1,30 @@
+#!/usr/bin/env node
+/**
+ * Entry point for `pnpm --filter @midscene/testing-framework demo`.
+ *
+ * Boots the TypeScript demo through jiti (no build step needed) and aliases
+ * the package name to src/ so the example authoring files resolve without a
+ * dist build.
+ */
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { createJiti } from 'jiti';
+
+const here = dirname(fileURLToPath(import.meta.url));
+
+const jiti = createJiti(import.meta.url, {
+  alias: {
+    '@midscene/testing-framework': join(here, '../src/index.ts'),
+  },
+});
+
+const { main } = await jiti.import(join(here, 'demo/main.ts'));
+
+main(process.argv.slice(2))
+  .then((code) => {
+    process.exitCode = code;
+  })
+  .catch((err) => {
+    console.error(err);
+    process.exitCode = 1;
+  });
diff --git a/packages/testing-framework/scripts/demo/live.ts b/packages/testing-framework/scripts/demo/live.ts
new file mode 100644
index 0000000000..6ab1ab69a2
--- /dev/null
+++ b/packages/testing-framework/scripts/demo/live.ts
@@ -0,0 +1,42 @@
+/**
+ * EXPERIMENTAL live mode for the demo: a real Midscene web UI agent
+ * (puppeteer) on the self-contained static shop in example/demo-app, plus the
+ * default Pi-backed general agent for verify/soft verdicts.
+ *
+ * Requires model configuration (at least MIDSCENE_MODEL_BASE_URL — same env
+ * the package's AI tests use) and a working puppeteer install. Each scenario
+ * gets a fresh browser so login/cart state never leaks between runs. Override
+ * the page with DEMO_URL to point at your own app.
+ */
+import { join } from 'node:path';
+import { pathToFileURL } from 'node:url';
+import { PiGeneralAgent } from '../../src/general-agent/pi-general-agent';
+import { createUIAgent } from '../../src/ui-agent/factory';
+
+export async function createLiveBundle() {
+  if (!process.env.MIDSCENE_MODEL_BASE_URL) {
+    throw new Error(
+      '[midscene] demo --live needs model configuration (MIDSCENE_MODEL_BASE_URL etc., see the repo .env conventions). Run without --live for the offline reference demo.',
+    );
+  }
+
+  const url =
+    process.env.DEMO_URL ??
+    pathToFileURL(join(__dirname, '../../example/demo-app/index.html')).href;
+
+  const { agent, cleanup } = await createUIAgent(
+    { type: 'web', options: { url } },
+    { generateReport: true },
+    process.env,
+  );
+  const general = new PiGeneralAgent();
+
+  return {
+    uiAgent: agent,
+    generalAgent: general,
+    cleanup: async () => {
+      await cleanup?.();
+      await general.dispose?.();
+    },
+  };
+}
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
new file mode 100644
index 0000000000..4d8238bb11
--- /dev/null
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -0,0 +1,399 @@
+/**
+ * Narrated end-to-end demo of the POC: runs the login/checkout journey
+ * through all three authoring modes — pure Gherkin, pure JS, and the bound
+ * overlay — over the one shared flow-IR, printing each resolved prompt, the
+ * variable table as it evolves, flow entry/exit, and verdicts.
+ *
+ * Offline by default (scripted fake agents, no model keys / no browser).
+ * Pass `--live` to drive a real browser + model against the static shop in
+ * example/demo-app (experimental; needs MIDSCENE_MODEL_* env vars).
+ */
+import { join } from 'node:path';
+import type { Agent } from '@midscene/core/agent';
+import {
+  type CompiledFeature,
+  type FlowRegistry,
+  type ScenarioIR,
+  type ScenarioRunEvent,
+  type ScenarioRunResult,
+  compileFeatureFile,
+  createFlowRegistry,
+  runScenario,
+} from '@midscene/testing-framework';
+import {
+  checkoutAsAdmin,
+  registry as jsRegistry,
+  promoBanner,
+} from '../../example/flows/shop.flows';
+import { bound } from '../../example/flows/shop.overlay';
+import type { GeneralAgentAdapter } from '../../src/general-agent/types';
+import { ScriptedGeneralAgent, ScriptedUiAgent } from './scripted-agents';
+
+const FEATURE_FILE = join(__dirname, '../../example/flows/shop.feature');
+const SCENARIO_NAMES = ['Checkout as admin', 'Promo banner is advisory'];
+
+// —— tiny ANSI helpers (plain escapes; off when NO_COLOR is non-empty) ——
+const useColor = !process.env.NO_COLOR;
+const paint = (code: number) => (s: string) =>
+  useColor ? `\u001b[${code}m${s}\u001b[0m` : s;
+const bold = paint(1);
+const dim = paint(2);
+const red = paint(31);
+const green = paint(32);
+const yellow = paint(33);
+const cyan = paint(36);
+const magenta = paint(35);
+
+interface AgentBundle {
+  uiAgent: Agent;
+  generalAgent: GeneralAgentAdapter;
+  cleanup?: () => Promise<void>;
+  describeState?: () => string;
+}
+
+type AgentFactory = () => Promise<AgentBundle>;
+
+interface ScenarioOutcome {
+  name: string;
+  skipped: boolean;
+  result?: ScenarioRunResult;
+  /** Canonical event trace, used to prove cross-mode equivalence. */
+  trace: string[];
+}
+
+interface ModeOutcome {
+  label: string;
+  scenarios: ScenarioOutcome[];
+}
+
+export async function main(argv: string[]): Promise<number> {
+  const live = argv.includes('--live');
+  if (live && !process.env.MIDSCENE_MODEL_BASE_URL) {
+    console.error(
+      red(
+        '[midscene] demo --live needs model configuration (MIDSCENE_MODEL_BASE_URL etc., the same env the AI tests use). Run without --live for the offline reference demo.',
+      ),
+    );
+    return 2;
+  }
+
+  const agentFactory: AgentFactory = live
+    ? (await import('./live')).createLiveBundle
+    : async () => {
+        const ui = new ScriptedUiAgent();
+        return {
+          uiAgent: ui.asAgent(),
+          generalAgent: new ScriptedGeneralAgent(),
+          describeState: () => ui.describeState(),
+        };
+      };
+
+  console.log('');
+  console.log(
+    bold('Midscene testing-framework POC — three authoring modes, one flow-IR'),
+  );
+  console.log(
+    dim(
+      live
+        ? 'LIVE mode: real UI agent + model against example/demo-app (experimental).'
+        : 'Offline mode: scripted fake agents simulate the shop. No API keys, no browser.',
+    ),
+  );
+
+  const gherkin = compileFeatureFile(FEATURE_FILE);
+  const gherkinRegistry = createFlowRegistry(gherkin.flows);
+
+  const modes: Array<{
+    label: string;
+    source: string;
+    scenarios: ScenarioIR[];
+    registry: FlowRegistry;
+  }> = [
+    {
+      label: 'Pure Gherkin',
+      source: 'example/flows/shop.feature → compileFeatureFile()',
+      scenarios: pickScenarios(gherkin),
+      registry: gherkinRegistry,
+    },
+    {
+      label: 'Pure JS',
+      source: 'example/flows/shop.flows.ts → defineFlow()/scenario()',
+      scenarios: [checkoutAsAdmin, promoBanner],
+      registry: jsRegistry,
+    },
+    {
+      label: 'Bound overlay',
+      source:
+        'example/flows/shop.overlay.ts → bindFeature(shop.feature, overlay)',
+      scenarios: pickScenarios(bound),
+      registry: createFlowRegistry(bound.flows),
+    },
+  ];
+
+  const outcomes: ModeOutcome[] = [];
+  for (let i = 0; i < modes.length; i++) {
+    const mode = modes[i];
+    console.log('');
+    console.log(bold(cyan(`━━━ Mode ${i + 1}/3: ${mode.label} ━━━`)));
+    console.log(dim(`    ${mode.source}`));
+
+    const scenarios: ScenarioOutcome[] = [];
+    for (const scenario of mode.scenarios) {
+      scenarios.push(await runOne(scenario, mode.registry, agentFactory));
+    }
+    outcomes.push({ label: mode.label, scenarios });
+  }
+
+  printComparison(outcomes, gherkin);
+
+  const failed = outcomes
+    .flatMap((m) => m.scenarios)
+    .some((s) => s.result?.status === 'failed');
+  return failed ? 1 : 0;
+}
+
+function pickScenarios(compiled: CompiledFeature): ScenarioIR[] {
+  return SCENARIO_NAMES.map((name) => {
+    const found = compiled.scenarios.find((s) => s.name === name);
+    if (!found) {
+      throw new Error(`demo: scenario "${name}" not found in the feature.`);
+    }
+    return found;
+  });
+}
+
+async function runOne(
+  scenario: ScenarioIR,
+  registry: FlowRegistry,
+  agentFactory: AgentFactory,
+): Promise<ScenarioOutcome> {
+  console.log('');
+  console.log(`  ${bold(`▶ Scenario: ${scenario.name}`)}`);
+
+  if (scenario.config?.skip) {
+    console.log(`    ${yellow('↷ skipped')} ${dim('(overlay config.skip)')}`);
+    return { name: scenario.name, skipped: true, trace: [] };
+  }
+
+  const bundle = await agentFactory();
+  const trace: string[] = [];
+  try {
+    const result = await runScenario({
+      scenario,
+      registry,
+      uiAgent: bundle.uiAgent,
+      generalAgent: bundle.generalAgent,
+      env: process.env,
+      onEvent: (event) => {
+        narrate(event);
+        trace.push(canonical(event));
+      },
+    });
+
+    const vars = Object.entries(result.variables);
+    if (vars.length > 0) {
+      console.log(
+        `    ${dim('final variables:')} ${vars
+          .map(([k, v]) => `${magenta(`{${k}}`)}=${JSON.stringify(v)}`)
+          .join(', ')}`,
+      );
+    }
+    if (bundle.describeState) {
+      console.log(`    ${dim(`simulated shop: ${bundle.describeState()}`)}`);
+    }
+    for (const warning of result.warnings) {
+      console.log(`    ${yellow(`⚠ warning: ${warning}`)}`);
+    }
+    console.log(
+      `    ${result.status === 'passed' ? green('✔ scenario passed') : red('✘ scenario failed')}`,
+    );
+    return { name: scenario.name, skipped: false, result, trace };
+  } finally {
+    await bundle.cleanup?.();
+  }
+}
+
+// —— narration ——
+
+function narrate(event: ScenarioRunEvent): void {
+  const pad = `    ${'  '.repeat('depth' in event ? event.depth : 0)}`;
+  switch (event.type) {
+    case 'flowEnter':
+      console.log(
+        `${pad}${cyan(`→ flow ${event.flowName}(${formatArgs(event.args)})`)}`,
+      );
+      break;
+    case 'flowExit':
+      console.log(
+        `${pad}${cyan(`← ${event.flowName} returned ${formatArgs(event.returns)}`)}`,
+      );
+      break;
+    case 'stepStart': {
+      const tag = nodeTag(event.node);
+      const from = event.template
+        ? dim(`   (template: ${JSON.stringify(event.template)})`)
+        : '';
+      console.log(`${pad}${tag} ${event.input}${from}`);
+      break;
+    }
+    case 'varSet':
+      if (event.source === 'return') break; // flowExit already shows it
+      console.log(
+        `${pad}  ${magenta(`{${event.name}}`)} = ${JSON.stringify(event.value)} ${dim(`(${event.source})`)}`,
+      );
+      break;
+    case 'stepEnd': {
+      const { result } = event;
+      if (result.verdict) {
+        const mark = result.verdict.pass
+          ? green('✔ PASS')
+          : result.status === 'warning'
+            ? yellow('⚠ SOFT FAIL')
+            : red('✘ FAIL');
+        console.log(`${pad}  ${mark} ${dim(`— ${result.verdict.reason}`)}`);
+      } else if (result.error) {
+        console.log(`${pad}  ${red(`✘ error — ${result.error}`)}`);
+      } else if (result.node === 'ui' && result.output?.text) {
+        console.log(`${pad}  ${dim(`↳ ${result.output.text}`)}`);
+      }
+      break;
+    }
+  }
+}
+
+function nodeTag(node: string): string {
+  const label = `[${node}]`.padEnd(9);
+  switch (node) {
+    case 'verify':
+      return green(label);
+    case 'soft':
+      return yellow(label);
+    case 'capture':
+      return magenta(label);
+    case 'flow':
+      return cyan(label);
+    default:
+      return label;
+  }
+}
+
+function formatArgs(args: Record<string, string>): string {
+  const entries = Object.entries(args);
+  if (entries.length === 0) return '';
+  return entries.map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(', ');
+}
+
+/** Mode-independent fingerprint of an event, for cross-mode comparison. */
+function canonical(event: ScenarioRunEvent): string {
+  switch (event.type) {
+    case 'stepStart':
+      return `${event.node}@${event.depth}: ${event.input}`;
+    case 'varSet':
+      return `var {${event.name}}=${event.value} (${event.source})`;
+    case 'flowEnter':
+      return `enter ${event.flowName}(${formatArgs(event.args)})`;
+    case 'flowExit':
+      return `exit ${event.flowName}`;
+    case 'stepEnd':
+      return `end ${event.result.node}:${event.result.status}`;
+  }
+}
+
+// —— final comparison ——
+
+function printComparison(
+  outcomes: ModeOutcome[],
+  gherkin: CompiledFeature,
+): void {
+  const [gherkinMode, jsMode, boundMode] = outcomes;
+
+  console.log('');
+  console.log(bold(cyan('━━━ Comparison: three modes, one IR ━━━')));
+
+  // 1. Gherkin vs JS: identical traces prove the two front-ends compile to
+  //    the same IR and drive the engine identically.
+  console.log('');
+  for (let i = 0; i < SCENARIO_NAMES.length; i++) {
+    const a = gherkinMode.scenarios[i];
+    const b = jsMode.scenarios[i];
+    const identical =
+      a.trace.length === b.trace.length &&
+      a.trace.every((line, j) => line === b.trace[j]);
+    const outcome = identical
+      ? green(`identical execution trace ✔ (${a.trace.length} events)`)
+      : red('traces DIFFER ✘');
+    console.log(`  Gherkin vs JS — "${SCENARIO_NAMES[i]}": ${outcome}`);
+  }
+
+  // 2. What the overlay changed, derived from the IR itself.
+  console.log('');
+  console.log(`  ${bold('Bound overlay vs pure Gherkin:')}`);
+  for (const name of SCENARIO_NAMES) {
+    const plain = gherkin.scenarios.find((s) => s.name === name);
+    const overlaid = bound.scenarios.find((s) => s.name === name);
+    if (!plain || !overlaid) continue;
+
+    const fingerprint = (s: ScenarioIR) =>
+      s.steps.map((step) =>
+        step.kind === 'prompt'
+          ? `[${step.node}] ${step.template}`
+          : step.kind === 'capture'
+            ? `[capture] ${step.template} → {${step.varName}}`
+            : `[flow] ${step.flowName}`,
+      );
+    const before = fingerprint(plain);
+    const after = fingerprint(overlaid);
+    const removed = before.filter((l) => !after.includes(l));
+    const added = after.filter((l) => !before.includes(l));
+    const injectedVars = Object.keys(overlaid.vars ?? {}).filter(
+      (k) => !(plain.vars && k in plain.vars),
+    );
+
+    if (
+      removed.length === 0 &&
+      added.length === 0 &&
+      injectedVars.length === 0 &&
+      !overlaid.config
+    ) {
+      console.log(`    "${name}": ${dim('untouched (pure Gherkin)')}`);
+      continue;
+    }
+    console.log(`    "${name}":`);
+    for (const line of removed) console.log(`      ${red(`- ${line}`)}`);
+    for (const line of added) console.log(`      ${green(`+ ${line}`)}`);
+    for (const k of injectedVars) {
+      console.log(
+        `      ${green(`+ injected var {${k}} = ${JSON.stringify(overlaid.vars?.[k])}`)}`,
+      );
+    }
+    if (overlaid.config) {
+      console.log(
+        `      ${yellow(`~ config: ${JSON.stringify(overlaid.config)}`)}`,
+      );
+    }
+  }
+
+  // 3. Status summary.
+  console.log('');
+  console.log(`  ${bold('Run summary:')}`);
+  for (const mode of outcomes) {
+    const cells = mode.scenarios.map((s) => {
+      if (s.skipped) return yellow(`${s.name}: skipped`);
+      const status =
+        s.result?.status === 'passed' ? green('passed') : red('failed');
+      const warn =
+        s.result && s.result.warnings.length > 0
+          ? yellow(` (+${s.result.warnings.length} warning)`)
+          : '';
+      return `${s.name}: ${status}${warn}`;
+    });
+    console.log(`    ${mode.label.padEnd(14)} ${cells.join(dim('  |  '))}`);
+  }
+  console.log('');
+  console.log(
+    dim(
+      '  (The login-matrix Scenario Outline runs too — omitted here for brevity; see example/flows/.)',
+    ),
+  );
+  console.log('');
+}

From ab2b22fc710d8a12c1b7945f357f1c799bc24bd2 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:23:27 +0200
Subject: [PATCH 3/9] feat(testing-framework): wire codex app-server agent into
 live demo (wip)

In-progress increment: codex-backed general agent for the POC demo's
live mode; validation and real-run verification still pending.
---
 .../testing-framework/scripts/demo/live.ts    |  79 ++++++++--
 .../testing-framework/scripts/demo/main.ts    |  50 ++++--
 .../src/general-agent/codex-general-agent.ts  | 149 ++++++++++++++++++
 packages/testing-framework/src/index.ts       |   1 +
 4 files changed, 258 insertions(+), 21 deletions(-)
 create mode 100644 packages/testing-framework/src/general-agent/codex-general-agent.ts

diff --git a/packages/testing-framework/scripts/demo/live.ts b/packages/testing-framework/scripts/demo/live.ts
index 6ab1ab69a2..c55977d09f 100644
--- a/packages/testing-framework/scripts/demo/live.ts
+++ b/packages/testing-framework/scripts/demo/live.ts
@@ -1,24 +1,81 @@
 /**
- * EXPERIMENTAL live mode for the demo: a real Midscene web UI agent
- * (puppeteer) on the self-contained static shop in example/demo-app, plus the
- * default Pi-backed general agent for verify/soft verdicts.
+ * Live mode for the demo: a real Midscene web UI agent (puppeteer) on the
+ * self-contained static shop in example/demo-app, with real model calls.
  *
- * Requires model configuration (at least MIDSCENE_MODEL_BASE_URL — same env
- * the package's AI tests use) and a working puppeteer install. Each scenario
+ * Default model path: Midscene's CODEX APP-SERVER provider. When
+ * MIDSCENE_MODEL_BASE_URL is unset and the `codex` CLI is on PATH, the demo
+ * configures itself with:
+ *
+ *   MIDSCENE_MODEL_BASE_URL="codex://app-server"   (spawns `codex app-server`,
+ *                                                   JSON-RPC over stdio, uses
+ *                                                   the Codex CLI OAuth login —
+ *                                                   no API key)
+ *   MIDSCENE_MODEL_NAME="gpt-5.5"                  (override with env)
+ *   MIDSCENE_MODEL_FAMILY="gpt-5"
+ *
+ * Prerequisites: `codex login` once (check `codex login status`). Any other
+ * OpenAI-compatible endpoint still works by setting MIDSCENE_MODEL_* yourself.
+ *
+ * verify/soft/agent nodes use CodexGeneralAgent on the codex path (the Pi
+ * default needs an HTTP endpoint and cannot speak codex://). Each scenario
  * gets a fresh browser so login/cart state never leaks between runs. Override
- * the page with DEMO_URL to point at your own app.
+ * the page with DEMO_URL.
  */
+import { spawnSync } from 'node:child_process';
 import { join } from 'node:path';
 import { pathToFileURL } from 'node:url';
+import { CodexGeneralAgent } from '../../src/general-agent/codex-general-agent';
 import { PiGeneralAgent } from '../../src/general-agent/pi-general-agent';
+import type { GeneralAgentAdapter } from '../../src/general-agent/types';
 import { createUIAgent } from '../../src/ui-agent/factory';
 
-export async function createLiveBundle() {
-  if (!process.env.MIDSCENE_MODEL_BASE_URL) {
+const CODEX_BASE_URL = 'codex://app-server';
+const CODEX_DEFAULT_MODEL = 'gpt-5.5';
+const CODEX_DEFAULT_FAMILY = 'gpt-5';
+
+/**
+ * Ensure model env is configured, preferring the codex app-server path.
+ * Throws with concrete setup steps when nothing usable is found.
+ */
+export function ensureLiveModelEnv(env: NodeJS.ProcessEnv = process.env): {
+  baseURL: string;
+  isCodex: boolean;
+} {
+  if (!env.MIDSCENE_MODEL_BASE_URL) {
+    if (!codexCliAvailable()) {
+      throw new Error(
+        [
+          '[midscene] demo --live: no model configured and the `codex` CLI is not on PATH.',
+          'Easiest path (no API key): install the Codex CLI, run `codex login`, and re-run.',
+          'Alternative: export MIDSCENE_MODEL_BASE_URL / MIDSCENE_MODEL_API_KEY / MIDSCENE_MODEL_NAME / MIDSCENE_MODEL_FAMILY for an OpenAI-compatible endpoint.',
+        ].join('\n'),
+      );
+    }
+    env.MIDSCENE_MODEL_BASE_URL = CODEX_BASE_URL;
+    env.MIDSCENE_MODEL_NAME ??= CODEX_DEFAULT_MODEL;
+    env.MIDSCENE_MODEL_FAMILY ??= CODEX_DEFAULT_FAMILY;
+    console.log(
+      `[demo] using Midscene's codex app-server provider (model ${env.MIDSCENE_MODEL_NAME}, Codex CLI OAuth session — no API key).`,
+    );
+  }
+
+  const baseURL = env.MIDSCENE_MODEL_BASE_URL;
+  const isCodex = baseURL.trim().toLowerCase().startsWith('codex://');
+  if (isCodex && !codexCliAvailable()) {
     throw new Error(
-      '[midscene] demo --live needs model configuration (MIDSCENE_MODEL_BASE_URL etc., see the repo .env conventions). Run without --live for the offline reference demo.',
+      '[midscene] demo --live: MIDSCENE_MODEL_BASE_URL points at codex:// but the `codex` CLI is not on PATH. Install it and run `codex login`.',
     );
   }
+  return { baseURL, isCodex };
+}
+
+function codexCliAvailable(): boolean {
+  const probe = spawnSync('codex', ['--version'], { stdio: 'ignore' });
+  return probe.status === 0;
+}
+
+export async function createLiveBundle() {
+  const { isCodex } = ensureLiveModelEnv();
 
   const url =
     process.env.DEMO_URL ??
@@ -29,7 +86,9 @@ export async function createLiveBundle() {
     { generateReport: true },
     process.env,
   );
-  const general = new PiGeneralAgent();
+  const general: GeneralAgentAdapter = isCodex
+    ? new CodexGeneralAgent()
+    : new PiGeneralAgent();
 
   return {
     uiAgent: agent,
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
index 4d8238bb11..a1d024adb5 100644
--- a/packages/testing-framework/scripts/demo/main.ts
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -68,13 +68,17 @@ interface ModeOutcome {
 
 export async function main(argv: string[]): Promise<number> {
   const live = argv.includes('--live');
-  if (live && !process.env.MIDSCENE_MODEL_BASE_URL) {
-    console.error(
-      red(
-        '[midscene] demo --live needs model configuration (MIDSCENE_MODEL_BASE_URL etc., the same env the AI tests use). Run without --live for the offline reference demo.',
-      ),
-    );
-    return 2;
+  const modeFilter = parseModeFilter(argv);
+  if (live) {
+    // Fail fast (and self-configure the codex app-server path) before any
+    // mode banner is printed.
+    const { ensureLiveModelEnv } = await import('./live');
+    try {
+      ensureLiveModelEnv();
+    } catch (err) {
+      console.error(red((err as Error).message));
+      return 2;
+    }
   }
 
   const agentFactory: AgentFactory = live
@@ -130,11 +134,23 @@ export async function main(argv: string[]): Promise<number> {
     },
   ];
 
+  const selectedModes = modeFilter
+    ? modes.filter((m) => m.label.toLowerCase().includes(modeFilter))
+    : modes;
+  if (selectedModes.length === 0) {
+    console.error(red(`No mode matches --mode ${modeFilter}.`));
+    return 2;
+  }
+
   const outcomes: ModeOutcome[] = [];
-  for (let i = 0; i < modes.length; i++) {
-    const mode = modes[i];
+  for (let i = 0; i < selectedModes.length; i++) {
+    const mode = selectedModes[i];
     console.log('');
-    console.log(bold(cyan(`━━━ Mode ${i + 1}/3: ${mode.label} ━━━`)));
+    console.log(
+      bold(
+        cyan(`━━━ Mode ${i + 1}/${selectedModes.length}: ${mode.label} ━━━`),
+      ),
+    );
     console.log(dim(`    ${mode.source}`));
 
     const scenarios: ScenarioOutcome[] = [];
@@ -144,7 +160,9 @@ export async function main(argv: string[]): Promise<number> {
     outcomes.push({ label: mode.label, scenarios });
   }
 
-  printComparison(outcomes, gherkin);
+  if (selectedModes.length === modes.length) {
+    printComparison(outcomes, gherkin);
+  }
 
   const failed = outcomes
     .flatMap((m) => m.scenarios)
@@ -152,6 +170,16 @@ export async function main(argv: string[]): Promise<number> {
   return failed ? 1 : 0;
 }
 
+/** `--mode gherkin|js|bound` runs a single mode (handy for live runs). */
+function parseModeFilter(argv: string[]): string | undefined {
+  const index = argv.indexOf('--mode');
+  if (index === -1) return undefined;
+  const value = argv[index + 1]?.toLowerCase();
+  if (!value)
+    throw new Error('demo: --mode requires a value (gherkin|js|bound)');
+  return value === 'bound' ? 'overlay' : value;
+}
+
 function pickScenarios(compiled: CompiledFeature): ScenarioIR[] {
   return SCENARIO_NAMES.map((name) => {
     const found = compiled.scenarios.find((s) => s.name === name);
diff --git a/packages/testing-framework/src/general-agent/codex-general-agent.ts b/packages/testing-framework/src/general-agent/codex-general-agent.ts
new file mode 100644
index 0000000000..768615ee92
--- /dev/null
+++ b/packages/testing-framework/src/general-agent/codex-general-agent.ts
@@ -0,0 +1,149 @@
+/**
+ * Codex-backed implementation of the swappable {@link GeneralAgentAdapter}.
+ *
+ * The default Pi general agent needs an OpenAI-compatible HTTP endpoint, so
+ * it cannot use Midscene's codex app-server provider
+ * (`MIDSCENE_MODEL_BASE_URL="codex://app-server"`, which spawns `codex
+ * app-server` and speaks JSON-RPC over stdio using the Codex CLI's OAuth
+ * session — see `@midscene/core`'s `service-caller/codex-app-server`). This
+ * adapter routes `verify` / `soft` / `agent` nodes through the same provider
+ * via core's public `callAI`, so the whole framework can run on a single
+ * `codex login` with no API key.
+ *
+ * Differences from the Pi adapter (POC scope):
+ *  - no tool runtime: the verdict is requested as a strict JSON object in the
+ *    reply and parsed fail-closed (no `report_verdict` tool, no `$skill`
+ *    loading — referenced skills are only named in the prompt);
+ *  - the screenshot is written to a temp file and passed as a `file://`
+ *    image_url, which the codex provider maps to a localImage input.
+ */
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { callAI, getModelRuntime } from '@midscene/core/ai-model';
+import { globalModelConfigManager } from '@midscene/shared/env';
+import { getDebug } from '@midscene/shared/logger';
+import type { Verdict } from '../types';
+import type {
+  GeneralAgentAdapter,
+  GeneralAgentInput,
+  GeneralAgentResult,
+} from './types';
+
+const debug = getDebug('testing-framework:codex-general-agent');
+const warn = getDebug('testing-framework:codex-general-agent', {
+  console: true,
+});
+
+const VERDICT_INSTRUCTIONS = `
+You have no tools in this environment. After your analysis, end your reply
+with the verdict as a single JSON object on its own line, exactly in this
+shape (no markdown fence around it):
+
+{"pass": true|false, "reason": "<human-readable rationale>"}
+
+If you cannot confidently determine the result, report "pass": false.`;
+
+export class CodexGeneralAgent implements GeneralAgentAdapter {
+  private tempDir?: string;
+  private screenshotCount = 0;
+
+  async run(input: GeneralAgentInput): Promise<GeneralAgentResult> {
+    const needsVerdict = input.kind === 'verify' || input.kind === 'soft';
+
+    const userContent: Array<
+      | { type: 'text'; text: string }
+      | { type: 'image_url'; image_url: { url: string } }
+    > = [{ type: 'text', text: this.buildPrompt(input, needsVerdict) }];
+
+    if (input.screenshotBase64) {
+      const file = this.writeScreenshot(
+        input.screenshotBase64,
+        input.screenshotMediaType,
+      );
+      userContent.push({
+        type: 'image_url',
+        image_url: { url: `file://${file}` },
+      });
+    }
+
+    const modelRuntime = getModelRuntime(
+      globalModelConfigManager.getModelConfig('default'),
+    );
+    const result = await callAI(
+      [{ role: 'user', content: userContent }],
+      modelRuntime,
+    );
+
+    const text = result.content?.trim() ?? '';
+    debug('codex run finished', { kind: input.kind, chars: text.length });
+
+    if (!needsVerdict) {
+      return { text };
+    }
+    const verdict = extractVerdict(text);
+    if (!verdict) {
+      warn(
+        `codex general agent reply contained no parseable verdict JSON (kind=${input.kind}); the engine treats this as fail-closed.`,
+      );
+    }
+    return { text, verdict };
+  }
+
+  async dispose(): Promise<void> {
+    if (this.tempDir) {
+      rmSync(this.tempDir, { recursive: true, force: true });
+      this.tempDir = undefined;
+    }
+  }
+
+  private buildPrompt(input: GeneralAgentInput, needsVerdict: boolean): string {
+    const parts = [input.context];
+    if (input.referencedSkills.length > 0) {
+      parts.push(
+        `\nThis task references the following skills (not loadable in this environment, judge from the screenshot and history): ${input.referencedSkills.map((s) => `$${s}`).join(', ')}.`,
+      );
+    }
+    if (needsVerdict) {
+      parts.push(VERDICT_INSTRUCTIONS);
+    }
+    return parts.join('\n');
+  }
+
+  private writeScreenshot(base64: string, mediaType?: string): string {
+    if (!this.tempDir) {
+      this.tempDir = mkdtempSync(join(tmpdir(), 'midscene-codex-ga-'));
+    }
+    const ext = mediaType === 'image/jpeg' ? 'jpg' : 'png';
+    const file = join(
+      this.tempDir,
+      `screenshot-${++this.screenshotCount}.${ext}`,
+    );
+    writeFileSync(file, Buffer.from(base64, 'base64'));
+    return file;
+  }
+}
+
+/** Parse the last `{"pass": ..., "reason": ...}` object in the reply. */
+export function extractVerdict(text: string): Verdict | undefined {
+  const candidates = text.match(/\{[^{}]*"pass"[^{}]*\}/g);
+  if (!candidates) return undefined;
+  for (let i = candidates.length - 1; i >= 0; i--) {
+    try {
+      const parsed = JSON.parse(candidates[i]);
+      if (typeof parsed.pass === 'boolean') {
+        return {
+          pass: parsed.pass,
+          reason:
+            typeof parsed.reason === 'string' && parsed.reason.trim()
+              ? parsed.reason
+              : '(no reason given)',
+          evidence: parsed.evidence,
+        };
+      }
+    } catch {
+      // try the previous candidate
+    }
+  }
+  return undefined;
+}
diff --git a/packages/testing-framework/src/index.ts b/packages/testing-framework/src/index.ts
index d0c27bc629..025f760f2f 100644
--- a/packages/testing-framework/src/index.ts
+++ b/packages/testing-framework/src/index.ts
@@ -55,6 +55,7 @@ export type {
 } from './general-agent/types';
 export { PiGeneralAgent } from './general-agent/pi-general-agent';
 export type { PiGeneralAgentOptions } from './general-agent/pi-general-agent';
+export { CodexGeneralAgent } from './general-agent/codex-general-agent';
 export { extractSkillReferences } from './general-agent/skills';
 
 // —— YAML ——

From c363d1452dd727e4f006bf4d37479fc336c4d5da Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:32:13 +0200
Subject: [PATCH 4/9] feat(testing-framework): verify codex app-server live
 demo end to end

Completes the codex live-mode increment: lazy-load @midscene/core/ai-model
in CodexGeneralAgent (keeps the package index importable under vitest),
fail fast when a capture extracts an empty value, add the missing
back-to-shop step the real journey exposed, note live-mode verdict
nondeterminism in the trace comparison, and document the codex setup
(codex login, auto-configured MIDSCENE_MODEL_* env) in POC-GHERKIN.md.
Verified live against codex gpt-5.5: all three modes pass.
---
 packages/testing-framework/POC-GHERKIN.md     | 44 +++++++++++++++----
 .../example/flows/shop.feature                |  1 +
 .../example/flows/shop.flows.ts               |  1 +
 .../testing-framework/scripts/demo/main.ts    | 12 ++++-
 .../src/flow-ir/run-scenario.ts               |  7 +++
 .../src/general-agent/codex-general-agent.ts  | 10 ++++-
 .../tests/unit-test/run-scenario.test.ts      | 17 +++++++
 7 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
index 06f6fb5e2c..fe4f1f3579 100644
--- a/packages/testing-framework/POC-GHERKIN.md
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -41,14 +41,38 @@ shop; no model keys, no browser). Expected output (excerpt):
       + injected var {couponCode} = "E2E-2026-06-09"
 ```
 
-**Live mode (experimental, unverified in CI):** with model env configured
-(`MIDSCENE_MODEL_BASE_URL` etc., as for the AI tests) and puppeteer
-available, `pnpm --filter @midscene/testing-framework demo -- --live` drives
-a real web UI agent against the self-contained static shop in
-`example/demo-app/index.html` (override with `DEMO_URL`), with the default
-Pi-backed general agent issuing the verdicts. Each scenario gets a fresh
-browser. Implemented in `scripts/demo/live.ts`; the offline path is the
-verified reference.
+**Live mode** — `pnpm --filter @midscene/testing-framework demo -- --live`
+drives a real puppeteer web agent against the self-contained static shop in
+`example/demo-app/index.html` (override with `DEMO_URL`), with real model
+calls. The default/easy path is **Midscene's codex app-server provider**
+(no API key — it spawns `codex app-server` and reuses the Codex CLI OAuth
+session via JSON-RPC over stdio, see
+`packages/core/src/ai-model/service-caller/codex-app-server.ts`):
+
+```bash
+# one-time setup
+codex login              # verify with: codex login status
+
+# run — the demo auto-configures when MIDSCENE_MODEL_BASE_URL is unset:
+#   MIDSCENE_MODEL_BASE_URL="codex://app-server"
+#   MIDSCENE_MODEL_NAME="gpt-5.5"      (override with env)
+#   MIDSCENE_MODEL_FAMILY="gpt-5"
+pnpm --filter @midscene/testing-framework demo -- --live
+
+# optional: run a single mode (faster)
+pnpm --filter @midscene/testing-framework demo -- --live --mode gherkin   # or js | bound
+```
+
+On the codex path, `verify`/`soft` verdicts run through `CodexGeneralAgent`
+(`src/general-agent/codex-general-agent.ts`), which routes the same
+provider via core's `callAI` and parses a JSON verdict fail-closed — the
+default Pi general agent needs an OpenAI-compatible HTTP endpoint and
+cannot speak `codex://`. Any such endpoint still works by setting
+`MIDSCENE_MODEL_*` yourself (Pi is used for verdicts then). Each scenario
+gets a fresh browser; Midscene HTML reports land in `midscene_run/report/`.
+Verified end to end against codex `gpt-5.5`: all three modes pass (one
+expected nondeterminism: the advisory promo-banner soft check may PASS or
+SOFT-FAIL depending on whether the model counts the header as a banner).
 
 ```
  .feature files          .flows.ts files
@@ -86,7 +110,9 @@ Three step kinds (`types.ts`):
 machine-owned. `capture` steps ("remember … as varName") extract values
 through `aiString`; later templates get **mechanical** `{varName}`
 substitution *before* any prompt is sent to a model. Unknown placeholders
-fail the step immediately (typo safety) without a model call. Model-owned
+fail the step immediately (typo safety) without a model call, and a capture
+that extracts an empty value fails fast instead of poisoning later prompts
+with a blank. Model-owned
 prose conclusions keep flowing through the existing `StepOutput` channel —
 the two channels never mix.
 
diff --git a/packages/testing-framework/example/flows/shop.feature b/packages/testing-framework/example/flows/shop.feature
index 15349c0907..9b1c456d47 100644
--- a/packages/testing-framework/example/flows/shop.feature
+++ b/packages/testing-framework/example/flows/shop.feature
@@ -18,6 +18,7 @@ Feature: Checkout with a reusable login flow
 
   Scenario: Checkout as admin
     When I run the "Login" flow with role "admin"
+    And I go back to the shop home page
     And I remember the price of the "Trail Backpack" product as "price"
     When I add the "Trail Backpack" to the cart and open the cart
     Then the cart total equals {price}
diff --git a/packages/testing-framework/example/flows/shop.flows.ts b/packages/testing-framework/example/flows/shop.flows.ts
index 6d8ca7ff9d..1a0ce0723c 100644
--- a/packages/testing-framework/example/flows/shop.flows.ts
+++ b/packages/testing-framework/example/flows/shop.flows.ts
@@ -36,6 +36,7 @@ const background = Given('the demo shop is open on the home page');
 export const checkoutAsAdmin = scenario('Checkout as admin', [
   background,
   callFlow('Login', { role: 'admin' }),
+  When('I go back to the shop home page'),
   remember('the price of the "Trail Backpack" product', 'price'),
   When('I add the "Trail Backpack" to the cart and open the cart'),
   Then('the cart total equals {price}'),
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
index a1d024adb5..6a17072b7f 100644
--- a/packages/testing-framework/scripts/demo/main.ts
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -161,7 +161,7 @@ export async function main(argv: string[]): Promise<number> {
   }
 
   if (selectedModes.length === modes.length) {
-    printComparison(outcomes, gherkin);
+    printComparison(outcomes, gherkin, live);
   }
 
   const failed = outcomes
@@ -332,8 +332,9 @@ function canonical(event: ScenarioRunEvent): string {
 function printComparison(
   outcomes: ModeOutcome[],
   gherkin: CompiledFeature,
+  live: boolean,
 ): void {
-  const [gherkinMode, jsMode, boundMode] = outcomes;
+  const [gherkinMode, jsMode] = outcomes;
 
   console.log('');
   console.log(bold(cyan('━━━ Comparison: three modes, one IR ━━━')));
@@ -341,6 +342,13 @@ function printComparison(
   // 1. Gherkin vs JS: identical traces prove the two front-ends compile to
   //    the same IR and drive the engine identically.
   console.log('');
+  if (live) {
+    console.log(
+      dim(
+        '  (live mode: traces include real model verdicts, which are nondeterministic — exact trace identity is only guaranteed offline)',
+      ),
+    );
+  }
   for (let i = 0; i < SCENARIO_NAMES.length; i++) {
     const a = gherkinMode.scenarios[i];
     const b = jsMode.scenarios[i];
diff --git a/packages/testing-framework/src/flow-ir/run-scenario.ts b/packages/testing-framework/src/flow-ir/run-scenario.ts
index b6b2471193..49de0a6e7b 100644
--- a/packages/testing-framework/src/flow-ir/run-scenario.ts
+++ b/packages/testing-framework/src/flow-ir/run-scenario.ts
@@ -257,6 +257,13 @@ async function execCaptureStep(
     // Lower to a structured extraction on the UI agent. The value is
     // machine-owned: it goes into the variable table, not into model prose.
     const value = await ctx.uiAgent.aiString(resolved);
+    if (!String(value).trim()) {
+      // Fail fast instead of letting a blank variable poison later prompts
+      // (e.g. the value is not visible on the current screen).
+      throw new Error(
+        `[midscene] capture {${step.varName}}: the extraction "${resolved}" returned an empty value. Is it visible on the current screen?`,
+      );
+    }
     scope.set(step.varName, String(value));
     ctx.emit({
       type: 'varSet',
diff --git a/packages/testing-framework/src/general-agent/codex-general-agent.ts b/packages/testing-framework/src/general-agent/codex-general-agent.ts
index 768615ee92..7c90c22059 100644
--- a/packages/testing-framework/src/general-agent/codex-general-agent.ts
+++ b/packages/testing-framework/src/general-agent/codex-general-agent.ts
@@ -20,8 +20,6 @@
 import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
-import { callAI, getModelRuntime } from '@midscene/core/ai-model';
-import { globalModelConfigManager } from '@midscene/shared/env';
 import { getDebug } from '@midscene/shared/logger';
 import type { Verdict } from '../types';
 import type {
@@ -67,6 +65,14 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
       });
     }
 
+    // Lazy imports: `@midscene/core/ai-model` pulls in heavy image/runtime
+    // dependencies that callers of this package should not pay for unless a
+    // codex-backed general agent is actually used.
+    const [{ callAI, getModelRuntime }, { globalModelConfigManager }] =
+      await Promise.all([
+        import('@midscene/core/ai-model'),
+        import('@midscene/shared/env'),
+      ]);
     const modelRuntime = getModelRuntime(
       globalModelConfigManager.getModelConfig('default'),
     );
diff --git a/packages/testing-framework/tests/unit-test/run-scenario.test.ts b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
index 1259d12d94..0cccb9298f 100644
--- a/packages/testing-framework/tests/unit-test/run-scenario.test.ts
+++ b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
@@ -76,6 +76,23 @@ describe('runScenario: variable capture and substitution', () => {
     expect(ui.actCalls).toEqual(['search for backpack']);
   });
 
+  it('fails the capture step when the extraction returns an empty value', async () => {
+    const ui = new FakeUiAgent(['   ']);
+    const { result, general } = await run(
+      scenario('blank capture', [
+        remember('the order id shown in the banner', 'orderId'),
+        Then('the confirmation page shows order {orderId}'),
+      ]),
+      { ui },
+    );
+    expect(result.status).toBe('failed');
+    expect(result.steps[0].error).toMatch(
+      /capture \{orderId\}.*returned an empty value/,
+    );
+    expect(result.variables).not.toHaveProperty('orderId');
+    expect(general.calls).toEqual([]);
+  });
+
   it('fails the step (and case) on an unknown variable, before any model call', async () => {
     const { result, ui, general } = await run(
       scenario('typo', [Then('the total is {totl}')]),

From 9d6496c7b1d97264458782c1278150849dfa0864 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:40:47 +0200
Subject: [PATCH 5/9] refactor(testing-framework): remove AI slop from flow-IR
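 POC

Drop the String(value) coercions around the aiString result in
execCaptureStep, remove the comment in execPromptStep that only restated
the substitute() call, and call result.content.trim() directly in
CodexGeneralAgent instead of going through the `?.`/`?? ''` fallback.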
---
 .../testing-framework/src/flow-ir/run-scenario.ts     | 11 +++++------
 .../src/general-agent/codex-general-agent.ts          |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/packages/testing-framework/src/flow-ir/run-scenario.ts b/packages/testing-framework/src/flow-ir/run-scenario.ts
index 49de0a6e7b..772f8a72a8 100644
--- a/packages/testing-framework/src/flow-ir/run-scenario.ts
+++ b/packages/testing-framework/src/flow-ir/run-scenario.ts
@@ -190,7 +190,6 @@ async function execPromptStep(
 
   let stepResult: StepResult;
   try {
-    // Substitution happens here, mechanically, before any model call.
     const resolved = substitute(
       step.template,
       scope,
@@ -257,18 +256,18 @@ async function execCaptureStep(
     // Lower to a structured extraction on the UI agent. The value is
     // machine-owned: it goes into the variable table, not into model prose.
     const value = await ctx.uiAgent.aiString(resolved);
-    if (!String(value).trim()) {
+    if (!value.trim()) {
       // Fail fast instead of letting a blank variable poison later prompts
       // (e.g. the value is not visible on the current screen).
       throw new Error(
         `[midscene] capture {${step.varName}}: the extraction "${resolved}" returned an empty value. Is it visible on the current screen?`,
       );
     }
-    scope.set(step.varName, String(value));
+    scope.set(step.varName, value);
     ctx.emit({
       type: 'varSet',
       name: step.varName,
-      value: String(value),
+      value,
       source: 'capture',
       depth,
     });
@@ -279,8 +278,8 @@ async function execCaptureStep(
       input: resolved,
       status: 'info',
       output: {
-        text: `Captured variable {${step.varName}} = ${JSON.stringify(String(value))} (${resolved}).`,
-        structured: { [step.varName]: String(value) },
+        text: `Captured variable {${step.varName}} = ${JSON.stringify(value)} (${resolved}).`,
+        structured: { [step.varName]: value },
       },
       durationMs: Date.now() - stepStart,
     };
diff --git a/packages/testing-framework/src/general-agent/codex-general-agent.ts b/packages/testing-framework/src/general-agent/codex-general-agent.ts
index 7c90c22059..a2865a90c9 100644
--- a/packages/testing-framework/src/general-agent/codex-general-agent.ts
+++ b/packages/testing-framework/src/general-agent/codex-general-agent.ts
@@ -81,7 +81,7 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
       modelRuntime,
     );
 
-    const text = result.content?.trim() ?? '';
+    const text = result.content.trim();
     debug('codex run finished', { kind: input.kind, chars: text.length });
 
     if (!needsVerdict) {

From f237c5313fc55405f5f815fbe9c3df1fabb4c6f8 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Tue, 9 Jun 2026 23:58:21 +0200
Subject: [PATCH 6/9] refactor(testing-framework): simplify flow-IR POC after
 review pass

Share engine step bookkeeping and getReportFile between runCase and the
IR executor, merge prompt/capture step scaffolding, dedupe var-record
stringification and identifier regexes, drop dead API (PromptStepIR.role,
unused executor options, FlowRegistry.names), clean up codex screenshot
temp files per call, fix nested-JSON verdict parsing with a regression
test, and memoize the demo's codex CLI probe.
---
 .../testing-framework/scripts/demo/live.ts    |   8 +-
 .../testing-framework/scripts/demo/main.ts    |   1 -
 .../testing-framework/src/engine/run-case.ts  |  38 ++--
 .../testing-framework/src/flow-ir/index.ts    |   3 +-
 .../testing-framework/src/flow-ir/registry.ts |   4 -
 .../src/flow-ir/run-scenario.ts               | 195 ++++++++----------
 .../testing-framework/src/flow-ir/types.ts    |  32 ++-
 .../src/frontends/gherkin/index.ts            |  18 +-
 .../src/frontends/js/bind-feature.ts          |  35 ++--
 .../src/frontends/js/index.ts                 |  43 ++--
 .../src/general-agent/codex-general-agent.ts  |  57 ++++-
 packages/testing-framework/src/index.ts       |   1 -
 .../tests/unit-test/bind-feature.test.ts      |   3 +-
 .../tests/unit-test/codex-verdict.test.ts     |  45 ++++
 .../tests/unit-test/flow-ir.test.ts           |   1 -
 .../tests/unit-test/gherkin-frontend.test.ts  |   5 +-
 .../tests/unit-test/js-frontend.test.ts       |   3 -
 .../tests/unit-test/run-scenario.test.ts      |   9 +-
 18 files changed, 289 insertions(+), 212 deletions(-)
 create mode 100644 packages/testing-framework/tests/unit-test/codex-verdict.test.ts

diff --git a/packages/testing-framework/scripts/demo/live.ts b/packages/testing-framework/scripts/demo/live.ts
index c55977d09f..c4cd4c2e60 100644
--- a/packages/testing-framework/scripts/demo/live.ts
+++ b/packages/testing-framework/scripts/demo/live.ts
@@ -69,9 +69,13 @@ export function ensureLiveModelEnv(env: NodeJS.ProcessEnv = process.env): {
   return { baseURL, isCodex };
 }
 
+let codexAvailable: boolean | undefined;
 function codexCliAvailable(): boolean {
-  const probe = spawnSync('codex', ['--version'], { stdio: 'ignore' });
-  return probe.status === 0;
+  // Memoized: ensureLiveModelEnv runs once per scenario bundle, and the CLI
+  // probe spawns a subprocess.
+  codexAvailable ??=
+    spawnSync('codex', ['--version'], { stdio: 'ignore' }).status === 0;
+  return codexAvailable;
 }
 
 export async function createLiveBundle() {
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
index 6a17072b7f..dda1dbab9d 100644
--- a/packages/testing-framework/scripts/demo/main.ts
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -211,7 +211,6 @@ async function runOne(
       registry,
       uiAgent: bundle.uiAgent,
       generalAgent: bundle.generalAgent,
-      env: process.env,
       onEvent: (event) => {
         narrate(event);
         trace.push(canonical(event));
diff --git a/packages/testing-framework/src/engine/run-case.ts b/packages/testing-framework/src/engine/run-case.ts
index a4fc67f2ff..7a51d55066 100644
--- a/packages/testing-framework/src/engine/run-case.ts
+++ b/packages/testing-framework/src/engine/run-case.ts
@@ -81,18 +81,7 @@ export async function runCase(options: RunCaseOptions): Promise<CaseResult> {
       };
     }
 
-    steps.push(stepResult);
-    if (stepResult.output) {
-      outputs.add(step.node, index, stepResult.output);
-    }
-    if (stepResult.status === 'warning' && stepResult.error) {
-      warnings.push(stepResult.error);
-    }
-    if (stepResult.status === 'warning' && stepResult.verdict) {
-      warnings.push(
-        `soft check failed at step ${index + 1} (${step.node}): ${stepResult.verdict.reason}`,
-      );
-    }
+    recordStepResult(stepResult, { steps, outputs, warnings });
 
     if (stepResult.status === 'failed') {
       // A gating failure stops the flow; later steps depend on prior ones.
@@ -112,7 +101,30 @@ export async function runCase(options: RunCaseOptions): Promise<CaseResult> {
   };
 }
 
-function getReportFile(agent: Agent): string | undefined {
+/** Shared step bookkeeping, also used by the flow-IR executor. */
+export function recordStepResult(
+  stepResult: StepResult,
+  sink: {
+    steps: StepResult[];
+    outputs: OutputStoreImpl;
+    warnings: string[];
+  },
+): void {
+  sink.steps.push(stepResult);
+  if (stepResult.output) {
+    sink.outputs.add(stepResult.node, stepResult.index, stepResult.output);
+  }
+  if (stepResult.status === 'warning' && stepResult.error) {
+    sink.warnings.push(stepResult.error);
+  }
+  if (stepResult.status === 'warning' && stepResult.verdict) {
+    sink.warnings.push(
+      `soft check failed at step ${stepResult.index + 1} (${stepResult.node}): ${stepResult.verdict.reason}`,
+    );
+  }
+}
+
+export function getReportFile(agent: Agent): string | undefined {
   const candidate = (agent as unknown as { reportFile?: string | null })
     .reportFile;
   return candidate ?? undefined;
diff --git a/packages/testing-framework/src/flow-ir/index.ts b/packages/testing-framework/src/flow-ir/index.ts
index dc5e65275d..84f97795d5 100644
--- a/packages/testing-framework/src/flow-ir/index.ts
+++ b/packages/testing-framework/src/flow-ir/index.ts
@@ -1,10 +1,11 @@
 /** POC: shared flow-IR — see `types.ts` for the design notes. */
 export {
+  IDENTIFIER_PATTERN,
   MAX_FLOW_CALL_DEPTH,
   assertIdentifier,
+  stringifyVarRecord,
 } from './types';
 export type {
-  PromptRole,
   PromptStepIR,
   CaptureStepIR,
   CallFlowStepIR,
diff --git a/packages/testing-framework/src/flow-ir/registry.ts b/packages/testing-framework/src/flow-ir/registry.ts
index 5f6db7c43e..45d75999e8 100644
--- a/packages/testing-framework/src/flow-ir/registry.ts
+++ b/packages/testing-framework/src/flow-ir/registry.ts
@@ -50,10 +50,6 @@ export class FlowRegistry {
     }
     return flow;
   }
-
-  names(): string[] {
-    return [...this.flows.keys()];
-  }
 }
 
 /** Convenience: build a registry from a list of flow definitions. */
diff --git a/packages/testing-framework/src/flow-ir/run-scenario.ts b/packages/testing-framework/src/flow-ir/run-scenario.ts
index 772f8a72a8..0372c46226 100644
--- a/packages/testing-framework/src/flow-ir/run-scenario.ts
+++ b/packages/testing-framework/src/flow-ir/run-scenario.ts
@@ -16,9 +16,9 @@
  */
 import type { Agent } from '@midscene/core/agent';
 import { OutputStoreImpl } from '../engine/output-store';
+import { getReportFile, recordStepResult } from '../engine/run-case';
 import { type RunNodeDeps, runNode } from '../engine/run-node';
 import type { GeneralAgentAdapter } from '../general-agent/types';
-import type { RuntimeNode } from '../runtime';
 import type { CaseResult, StepResult } from '../types';
 import { FlowRegistry } from './registry';
 import { type VariableScope, substitute } from './substitute';
@@ -76,9 +76,7 @@ export interface RunScenarioOptions {
   file?: string;
   uiAgent: Agent;
   generalAgent: GeneralAgentAdapter;
-  runtimeNodes?: Record<string, RuntimeNode>;
   projectRoot?: string;
-  env?: NodeJS.ProcessEnv;
   /** Optional observer for narration/debugging. */
   onEvent?: (event: ScenarioRunEvent) => void;
 }
@@ -93,13 +91,10 @@ interface ExecCtx {
   registry: FlowRegistry;
   uiAgent: Agent;
   generalAgent: GeneralAgentAdapter;
-  runtimeNodes: Record<string, RuntimeNode>;
   projectRoot: string;
-  env: NodeJS.ProcessEnv;
   caseName: string;
   caseFile: string;
   outputs: OutputStoreImpl;
-  state: Record<string, unknown>;
   steps: StepResult[];
   warnings: string[];
   emit: (event: ScenarioRunEvent) => void;
@@ -117,13 +112,10 @@ export async function runScenario(
     registry: options.registry ?? new FlowRegistry(),
     uiAgent: options.uiAgent,
     generalAgent: options.generalAgent,
-    runtimeNodes: options.runtimeNodes ?? {},
     projectRoot: options.projectRoot ?? process.cwd(),
-    env: options.env ?? process.env,
     caseName: scenario.name,
     caseFile: options.file ?? '<ir>',
     outputs: new OutputStoreImpl(),
-    state: {},
     steps: [],
     warnings: [],
     emit: options.onEvent ?? (() => {}),
@@ -179,11 +171,21 @@ async function execStep(
   }
 }
 
-async function execPromptStep(
-  step: PromptStepIR,
+/**
+ * Shared scaffolding for templated steps (prompt and capture): substitution,
+ * stepStart emission, try/catch into a {@link StepResult}, and recording.
+ * Only the per-kind body differs.
+ */
+async function runTemplatedStep(
+  template: string,
+  node: string,
+  whereDetail: string,
   scope: VariableScope,
   depth: number,
   ctx: ExecCtx,
+  body: (
+    resolved: string,
+  ) => Promise<Pick<StepResult, 'status' | 'output' | 'verdict' | 'error'>>,
 ): Promise<boolean> {
   const index = ctx.steps.length;
   const stepStart = Date.now();
@@ -191,34 +193,30 @@ async function execPromptStep(
   let stepResult: StepResult;
   try {
     const resolved = substitute(
-      step.template,
+      template,
       scope,
-      `${ctx.caseName} step ${index + 1} (${step.node})`,
+      `${ctx.caseName} step ${index + 1} (${whereDetail})`,
     );
     ctx.emit({
       type: 'stepStart',
       index,
-      node: step.node,
+      node,
       input: resolved,
-      template: resolved === step.template ? undefined : step.template,
+      template: resolved === template ? undefined : template,
       depth,
     });
-    const outcome = await runNode(step.node, resolved, nodeDeps(ctx));
     stepResult = {
       index,
-      node: step.node,
+      node,
       input: resolved,
-      status: outcome.status,
-      output: outcome.output,
-      verdict: outcome.verdict,
-      error: outcome.error,
+      ...(await body(resolved)),
       durationMs: Date.now() - stepStart,
     };
   } catch (err) {
     stepResult = {
       index,
-      node: step.node,
-      input: step.template,
+      node,
+      input: template,
       status: 'failed',
       error: (err as Error).message,
       durationMs: Date.now() - stepStart,
@@ -229,73 +227,72 @@ async function execPromptStep(
   return stepResult.status !== 'failed';
 }
 
-async function execCaptureStep(
-  step: CaptureStepIR,
+function execPromptStep(
+  step: PromptStepIR,
   scope: VariableScope,
   depth: number,
   ctx: ExecCtx,
 ): Promise<boolean> {
-  const index = ctx.steps.length;
-  const stepStart = Date.now();
-
-  let stepResult: StepResult;
-  try {
-    const resolved = substitute(
-      step.template,
-      scope,
-      `${ctx.caseName} step ${index + 1} (capture ${step.varName})`,
-    );
-    ctx.emit({
-      type: 'stepStart',
-      index,
-      node: 'capture',
-      input: resolved,
-      template: resolved === step.template ? undefined : step.template,
-      depth,
-    });
-    // Lower to a structured extraction on the UI agent. The value is
-    // machine-owned: it goes into the variable table, not into model prose.
-    const value = await ctx.uiAgent.aiString(resolved);
-    if (!value.trim()) {
-      // Fail fast instead of letting a blank variable poison later prompts
-      // (e.g. the value is not visible on the current screen).
-      throw new Error(
-        `[midscene] capture {${step.varName}}: the extraction "${resolved}" returned an empty value. Is it visible on the current screen?`,
-      );
-    }
-    scope.set(step.varName, value);
-    ctx.emit({
-      type: 'varSet',
-      name: step.varName,
-      value,
-      source: 'capture',
-      depth,
-    });
-
-    stepResult = {
-      index,
-      node: 'capture',
-      input: resolved,
-      status: 'info',
-      output: {
-        text: `Captured variable {${step.varName}} = ${JSON.stringify(value)} (${resolved}).`,
-        structured: { [step.varName]: value },
-      },
-      durationMs: Date.now() - stepStart,
-    };
-  } catch (err) {
-    stepResult = {
-      index,
-      node: 'capture',
-      input: step.template,
-      status: 'failed',
-      error: (err as Error).message,
-      durationMs: Date.now() - stepStart,
-    };
-  }
+  return runTemplatedStep(
+    step.template,
+    step.node,
+    step.node,
+    scope,
+    depth,
+    ctx,
+    async (resolved) => {
+      const outcome = await runNode(step.node, resolved, nodeDeps(ctx));
+      return {
+        status: outcome.status,
+        output: outcome.output,
+        verdict: outcome.verdict,
+        error: outcome.error,
+      };
+    },
+  );
+}
 
-  recordStep(stepResult, depth, ctx);
-  return stepResult.status !== 'failed';
+function execCaptureStep(
+  step: CaptureStepIR,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): Promise<boolean> {
+  return runTemplatedStep(
+    step.template,
+    'capture',
+    `capture ${step.varName}`,
+    scope,
+    depth,
+    ctx,
+    async (resolved) => {
+      // Lower to a structured extraction on the UI agent. The value is
+      // machine-owned: it goes into the variable table, not into model prose.
+      const value = await ctx.uiAgent.aiString(resolved);
+      if (!value.trim()) {
+        // Fail fast instead of letting a blank variable poison later prompts
+        // (e.g. the value is not visible on the current screen).
+        throw new Error(
+          `[midscene] capture {${step.varName}}: the extraction "${resolved}" returned an empty value. Is it visible on the current screen?`,
+        );
+      }
+      scope.set(step.varName, value);
+      ctx.emit({
+        type: 'varSet',
+        name: step.varName,
+        value,
+        source: 'capture',
+        depth,
+      });
+      return {
+        status: 'info',
+        output: {
+          text: `Captured variable {${step.varName}} = ${JSON.stringify(value)} (${resolved}).`,
+          structured: { [step.varName]: value },
+        },
+      };
+    },
+  );
 }
 
 async function execCallFlowStep(
@@ -410,32 +407,22 @@ function nodeDeps(ctx: ExecCtx): RunNodeDeps {
   return {
     uiAgent: ctx.uiAgent,
     generalAgent: ctx.generalAgent,
-    runtimeNodes: ctx.runtimeNodes,
+    // The IR only emits builtin node kinds, so the custom-runtime deps are
+    // inert placeholders here.
+    runtimeNodes: {},
     outputs: ctx.outputs,
-    state: ctx.state,
+    state: {},
     projectRoot: ctx.projectRoot,
     caseName: ctx.caseName,
     caseFile: ctx.caseFile,
     pastSteps: ctx.steps,
-    env: ctx.env,
+    env: process.env,
   };
 }
 
-/** Mirror `runCase`'s bookkeeping for outputs and warnings. */
 function recordStep(stepResult: StepResult, depth: number, ctx: ExecCtx): void {
   ctx.emit({ type: 'stepEnd', result: stepResult, depth });
-  ctx.steps.push(stepResult);
-  if (stepResult.output) {
-    ctx.outputs.add(stepResult.node, stepResult.index, stepResult.output);
-  }
-  if (stepResult.status === 'warning' && stepResult.error) {
-    ctx.warnings.push(stepResult.error);
-  }
-  if (stepResult.status === 'warning' && stepResult.verdict) {
-    ctx.warnings.push(
-      `soft check failed at step ${stepResult.index + 1} (${stepResult.node}): ${stepResult.verdict.reason}`,
-    );
-  }
+  recordStepResult(stepResult, ctx);
 }
 
 function formatCall(flowName: string, args: Record<string, string>): string {
@@ -447,9 +434,3 @@ function formatArgs(args: Record<string, string>): string {
   if (entries.length === 0) return 'no arguments';
   return entries.map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(', ');
 }
-
-function getReportFile(agent: Agent): string | undefined {
-  const candidate = (agent as unknown as { reportFile?: string | null })
-    .reportFile;
-  return candidate ?? undefined;
-}
diff --git a/packages/testing-framework/src/flow-ir/types.ts b/packages/testing-framework/src/flow-ir/types.ts
index acbfd5b283..00bd4962b2 100644
--- a/packages/testing-framework/src/flow-ir/types.ts
+++ b/packages/testing-framework/src/flow-ir/types.ts
@@ -20,18 +20,15 @@
  */
 import type { BuiltinNodeType } from '../types';
 
-/** Keyword→policy mapping: what authoring role a prompt step plays. */
-export type PromptRole = 'setup' | 'action' | 'assertion' | 'advisory';
-
 /**
  * A natural-language prompt step. Lowers 1:1 onto an engine node:
- * given-like → `ui` (setup), when-like → `ui` (action), then-like → `verify`
- * (fail-closed), soft variants → `soft`, advisory → `agent`.
+ * given/when-like → `ui`, then-like → `verify` (fail-closed), soft variants →
+ * `soft`, advisory → `agent`. The authoring keyword fully determines `node`,
+ * so the keyword itself is not stored.
  */
 export interface PromptStepIR {
   kind: 'prompt';
   node: BuiltinNodeType;
-  role: PromptRole;
   /** Natural-language template; may contain `{varName}` placeholders. */
   template: string;
 }
@@ -109,13 +106,32 @@ export interface FeatureIR {
 /** Flow calls may nest at most this deep (scenario itself is depth 0). */
 export const MAX_FLOW_CALL_DEPTH = 2;
 
-const IDENTIFIER = /^[A-Za-z_][A-Za-z0-9_]*$/;
+/** Source pattern for identifiers, for composing into larger regexes. */
+export const IDENTIFIER_PATTERN = '[A-Za-z_][A-Za-z0-9_]*';
+
+const IDENTIFIER = new RegExp(`^${IDENTIFIER_PATTERN}$`);
 
 /** Variable / param names must be simple identifiers so `{name}` is unambiguous. */
 export function assertIdentifier(name: string, where: string): void {
   if (!IDENTIFIER.test(name)) {
     throw new Error(
-      `[midscene] ${where}: "${name}" is not a valid variable name (expected /^[A-Za-z_][A-Za-z0-9_]*$/).`,
+      `[midscene] ${where}: "${name}" is not a valid variable name (expected /^${IDENTIFIER_PATTERN}$/).`,
     );
   }
 }
+
+/**
+ * Validate keys as identifiers and stringify values — the normalization every
+ * front-end applies to user-supplied vars/args records.
+ */
+export function stringifyVarRecord(
+  record: Record<string, string | number | boolean>,
+  where: string,
+): Record<string, string> {
+  const out: Record<string, string> = {};
+  for (const [key, value] of Object.entries(record)) {
+    assertIdentifier(key, where);
+    out[key] = String(value);
+  }
+  return out;
+}
diff --git a/packages/testing-framework/src/frontends/gherkin/index.ts b/packages/testing-framework/src/frontends/gherkin/index.ts
index 8cb86c0529..35bb44f657 100644
--- a/packages/testing-framework/src/frontends/gherkin/index.ts
+++ b/packages/testing-framework/src/frontends/gherkin/index.ts
@@ -44,7 +44,7 @@ import type {
   PromptStepIR,
   ScenarioIR,
 } from '../../flow-ir';
-import { assertIdentifier } from '../../flow-ir';
+import { IDENTIFIER_PATTERN, assertIdentifier } from '../../flow-ir';
 
 export interface CompiledFeature {
   name: string;
@@ -54,11 +54,14 @@ export interface CompiledFeature {
   flows: FlowDefIR[];
 }
 
-const REMEMBER_STEP = /^I remember (.+?) as "([A-Za-z_][A-Za-z0-9_]*)"$/i;
+const REMEMBER_STEP = new RegExp(
+  `^I remember (.+?) as "(${IDENTIFIER_PATTERN})"$`,
+  'i',
+);
 const CALL_FLOW_STEP = /^I run the "([^"]+)" flow(?: with (.+))?$/i;
-const CALL_FLOW_ARG = /([A-Za-z_][A-Za-z0-9_]*)\s+"([^"]*)"/g;
-const PARAM_TAG = /^@param:([A-Za-z_][A-Za-z0-9_]*)$/;
-const RETURNS_TAG = /^@returns?:([A-Za-z_][A-Za-z0-9_]*)$/;
+const CALL_FLOW_ARG = new RegExp(`(${IDENTIFIER_PATTERN})\\s+"([^"]*)"`, 'g');
+const PARAM_TAG = new RegExp(`^@param:(${IDENTIFIER_PATTERN})$`);
+const RETURNS_TAG = new RegExp(`^@returns?:(${IDENTIFIER_PATTERN})$`);
 
 /** Compile Gherkin source text into IR scenarios and flow definitions. */
 export function compileFeature(
@@ -222,16 +225,15 @@ function promptFromPickleType(
   // And/But (conjunctions) to the last primary keyword.
   switch (step.type) {
     case PickleStepType.CONTEXT:
-      return { kind: 'prompt', node: 'ui', role: 'setup', template: text };
+      return { kind: 'prompt', node: 'ui', template: text };
     case PickleStepType.OUTCOME:
       return {
         kind: 'prompt',
         node: opts.isSoft ? 'soft' : 'verify',
-        role: 'assertion',
         template: text,
       };
     default:
       // ACTION and UNKNOWN (`*` bullets) both run as plain UI actions.
-      return { kind: 'prompt', node: 'ui', role: 'action', template: text };
+      return { kind: 'prompt', node: 'ui', template: text };
   }
 }
diff --git a/packages/testing-framework/src/frontends/js/bind-feature.ts b/packages/testing-framework/src/frontends/js/bind-feature.ts
index 8ce886758a..79fcbae58e 100644
--- a/packages/testing-framework/src/frontends/js/bind-feature.ts
+++ b/packages/testing-framework/src/frontends/js/bind-feature.ts
@@ -4,7 +4,7 @@ import type {
   ScenarioConfigIR,
   ScenarioIR,
 } from '../../flow-ir';
-import { assertIdentifier } from '../../flow-ir';
+import { stringifyVarRecord } from '../../flow-ir';
 /**
  * POC: hybrid authoring mode — `bindFeature(featurePathOrSource, overlay)`.
  *
@@ -31,7 +31,7 @@ import {
   compileFeature,
   compileFeatureFile,
 } from '../gherkin';
-import { type StepInput, When } from './index';
+import { type StepInput, normalizeStep } from './index';
 
 /** Anchor a step by its exact text (see {@link anchorText}) or its index. */
 export type StepAnchor = string | number;
@@ -161,12 +161,10 @@ function applyScenarioOverlay(
   const result: ScenarioIR = { ...scenario, steps };
 
   if (overlay.vars) {
-    const vars: Record<string, string> = { ...scenario.vars };
-    for (const [key, value] of Object.entries(overlay.vars)) {
-      assertIdentifier(key, `${where} overlay vars`);
-      vars[key] = String(value);
-    }
-    result.vars = vars;
+    result.vars = {
+      ...scenario.vars,
+      ...stringifyVarRecord(overlay.vars, `${where} overlay vars`),
+    };
   }
 
   if (overlay.skip !== undefined || overlay.only !== undefined) {
@@ -191,24 +189,19 @@ function patchStep(step: FlowIRStep, overlay: StepOverlay): FlowIRStep {
       return { ...step, template: overlay.template ?? step.template };
     case 'callFlow':
       return overlay.args
-        ? { ...step, args: { ...step.args, ...stringifyArgs(overlay.args) } }
+        ? {
+            ...step,
+            args: {
+              ...step.args,
+              ...stringifyVarRecord(overlay.args, 'bindFeature overlay args'),
+            },
+          }
         : step;
   }
 }
 
 function normalizeInserts(inserts: StepInput[] | undefined): FlowIRStep[] {
-  return (inserts ?? []).map((s) => (typeof s === 'string' ? When(s) : s));
-}
-
-function stringifyArgs(
-  args: Record<string, string | number | boolean>,
-): Record<string, string> {
-  const out: Record<string, string> = {};
-  for (const [key, value] of Object.entries(args)) {
-    assertIdentifier(key, 'bindFeature overlay args');
-    out[key] = String(value);
-  }
-  return out;
+  return (inserts ?? []).map(normalizeStep);
 }
 
 // ——————————————————— bind-time drift validation ———————————————————
diff --git a/packages/testing-framework/src/frontends/js/index.ts b/packages/testing-framework/src/frontends/js/index.ts
index 1fff6e0078..7afddad70f 100644
--- a/packages/testing-framework/src/frontends/js/index.ts
+++ b/packages/testing-framework/src/frontends/js/index.ts
@@ -31,6 +31,7 @@ import {
   type ScenarioIR,
   assertIdentifier,
   listPlaceholders,
+  stringifyVarRecord,
 } from '../../flow-ir';
 
 /** A step in the fluent API: an IR step, or a bare string (= `when`). */
@@ -40,23 +41,23 @@ export type StepInput = FlowIRStep | string;
 // `Then`). A lowercase `then` export would also make the module namespace a
 // thenable, which breaks dynamic `import()` of this module.
 export function Given(template: string): PromptStepIR {
-  return promptStep('ui', 'setup', template, 'Given');
+  return promptStep('ui', template, 'Given');
 }
 
 export function When(template: string): PromptStepIR {
-  return promptStep('ui', 'action', template, 'When');
+  return promptStep('ui', template, 'When');
 }
 
 export function Then(template: string): PromptStepIR {
-  return promptStep('verify', 'assertion', template, 'Then');
+  return promptStep('verify', template, 'Then');
 }
 
 export function Soft(template: string): PromptStepIR {
-  return promptStep('soft', 'assertion', template, 'Soft');
+  return promptStep('soft', template, 'Soft');
 }
 
 export function Advisory(template: string): PromptStepIR {
-  return promptStep('agent', 'advisory', template, 'Advisory');
+  return promptStep('agent', template, 'Advisory');
 }
 
 /** "Remember <description> as {varName}" — machine-owned variable capture. */
@@ -76,12 +77,11 @@ export function callFlow(
   if (!flowName.trim()) {
     throw new Error('[midscene] callFlow(): flow name must not be empty.');
   }
-  const normalized: Record<string, string> = {};
-  for (const [key, value] of Object.entries(args)) {
-    assertIdentifier(key, `callFlow("${flowName}") args`);
-    normalized[key] = String(value);
-  }
-  return { kind: 'callFlow', flowName, args: normalized };
+  return {
+    kind: 'callFlow',
+    flowName,
+    args: stringifyVarRecord(args, `callFlow("${flowName}") args`),
+  };
 }
 
 export interface DefineFlowInput {
@@ -128,15 +128,10 @@ export function scenario(
   if (!name.trim()) {
     throw new Error('[midscene] scenario(): a scenario must have a name.');
   }
-  const vars: Record<string, string> = {};
-  for (const [key, value] of Object.entries(options.vars ?? {})) {
-    assertIdentifier(key, `scenario("${name}") vars`);
-    vars[key] = String(value);
-  }
   return {
     name,
     steps: normalizeSteps(steps, `scenario("${name}")`),
-    vars,
+    vars: stringifyVarRecord(options.vars ?? {}, `scenario("${name}") vars`),
     tags: options.tags ?? [],
   };
 }
@@ -151,26 +146,30 @@ export function feature(name: string, scenarios: ScenarioIR[]): FeatureIR {
 
 function promptStep(
   node: PromptStepIR['node'],
-  role: PromptStepIR['role'],
   template: string,
   helper: string,
 ): PromptStepIR {
   if (!template.trim()) {
     throw new Error(`[midscene] ${helper}(): the prompt must not be empty.`);
   }
-  return { kind: 'prompt', node, role, template };
+  return { kind: 'prompt', node, template };
+}
+
+/** A bare string in a step list is shorthand for `When(...)`. */
+export function normalizeStep(step: StepInput): FlowIRStep {
+  return typeof step === 'string' ? When(step) : step;
 }
 
 function normalizeSteps(steps: StepInput[], where: string): FlowIRStep[] {
   if (!Array.isArray(steps) || steps.length === 0) {
     throw new Error(`[midscene] ${where}: steps must be a non-empty array.`);
   }
-  return steps.map((step) => (typeof step === 'string' ? When(step) : step));
+  return steps.map(normalizeStep);
 }
 
 // Hybrid mode (Gherkin source of truth + sparse JS overlay). Re-exported
-// last: bind-feature imports `When` from this module, and keeping the cycle
-// edge at the bottom makes the load order explicit.
+// last: bind-feature imports `normalizeStep` from this module, and keeping
+// the cycle edge at the bottom makes the load order explicit.
 export { bindFeature, anchorText } from './bind-feature';
 export type {
   FeatureOverlay,
diff --git a/packages/testing-framework/src/general-agent/codex-general-agent.ts b/packages/testing-framework/src/general-agent/codex-general-agent.ts
index a2865a90c9..c90a84a8fd 100644
--- a/packages/testing-framework/src/general-agent/codex-general-agent.ts
+++ b/packages/testing-framework/src/general-agent/codex-general-agent.ts
@@ -54,14 +54,15 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
       | { type: 'image_url'; image_url: { url: string } }
     > = [{ type: 'text', text: this.buildPrompt(input, needsVerdict) }];
 
+    let screenshotFile: string | undefined;
     if (input.screenshotBase64) {
-      const file = this.writeScreenshot(
+      screenshotFile = this.writeScreenshot(
         input.screenshotBase64,
         input.screenshotMediaType,
       );
       userContent.push({
         type: 'image_url',
-        image_url: { url: `file://${file}` },
+        image_url: { url: `file://${screenshotFile}` },
       });
     }
 
@@ -76,10 +77,17 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
     const modelRuntime = getModelRuntime(
       globalModelConfigManager.getModelConfig('default'),
     );
-    const result = await callAI(
-      [{ role: 'user', content: userContent }],
-      modelRuntime,
-    );
+    let result: Awaited<ReturnType<typeof callAI>>;
+    try {
+      result = await callAI(
+        [{ role: 'user', content: userContent }],
+        modelRuntime,
+      );
+    } finally {
+      // The provider has consumed the image once the call settles; delete it
+      // so long runs don't accumulate one file per step until dispose().
+      if (screenshotFile) rmSync(screenshotFile, { force: true });
+    }
 
     const text = result.content.trim();
     debug('codex run finished', { kind: input.kind, chars: text.length });
@@ -132,9 +140,9 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
 
 /** Parse the last `{"pass": ..., "reason": ...}` object in the reply. */
 export function extractVerdict(text: string): Verdict | undefined {
-  const candidates = text.match(/\{[^{}]*"pass"[^{}]*\}/g);
-  if (!candidates) return undefined;
+  const candidates = jsonObjectCandidates(text);
   for (let i = candidates.length - 1; i >= 0; i--) {
+    if (!candidates[i].includes('"pass"')) continue;
     try {
       const parsed = JSON.parse(candidates[i]);
       if (typeof parsed.pass === 'boolean') {
@@ -153,3 +161,36 @@ export function extractVerdict(text: string): Verdict | undefined {
   }
   return undefined;
 }
+
+/**
+ * Brace-balanced scan for top-level `{...}` substrings. Unlike a
+ * `[^{}]*`-style regex, this matches verdicts whose fields contain nested
+ * objects (e.g. structured `evidence`).
+ */
+function jsonObjectCandidates(text: string): string[] {
+  const out: string[] = [];
+  for (let i = 0; i < text.length; i++) {
+    if (text[i] !== '{') continue;
+    let depth = 0;
+    let inString = false;
+    for (let j = i; j < text.length; j++) {
+      const ch = text[j];
+      if (inString) {
+        if (ch === '\\') j++;
+        else if (ch === '"') inString = false;
+      } else if (ch === '"') {
+        inString = true;
+      } else if (ch === '{') {
+        depth++;
+      } else if (ch === '}') {
+        depth--;
+        if (depth === 0) {
+          out.push(text.slice(i, j + 1));
+          i = j; // resume after this object; nested braces stay inside it
+          break;
+        }
+      }
+    }
+  }
+  return out;
+}
diff --git a/packages/testing-framework/src/index.ts b/packages/testing-framework/src/index.ts
index 025f760f2f..065aa1d499 100644
--- a/packages/testing-framework/src/index.ts
+++ b/packages/testing-framework/src/index.ts
@@ -94,7 +94,6 @@ export type {
   PromptStepIR,
   CaptureStepIR,
   CallFlowStepIR,
-  PromptRole,
   ScenarioIR,
   ScenarioConfigIR,
   FlowDefIR,
diff --git a/packages/testing-framework/tests/unit-test/bind-feature.test.ts b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
index fb4c369483..d2913f0c17 100644
--- a/packages/testing-framework/tests/unit-test/bind-feature.test.ts
+++ b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
@@ -70,7 +70,6 @@ describe('bindFeature: overlay application', () => {
     expect(checkout?.steps.at(-1)).toEqual({
       kind: 'prompt',
       node: 'soft',
-      role: 'assertion',
       template: 'the cart total equals {price} within $0.01',
     });
     // Earlier steps untouched.
@@ -127,7 +126,7 @@ describe('bindFeature: overlay application', () => {
     ]);
     expect(steps?.[4]).toMatchObject({
       template: 'apply the coupon code {coupon} in the cart',
-      role: 'action',
+      node: 'ui',
     });
   });
 
diff --git a/packages/testing-framework/tests/unit-test/codex-verdict.test.ts b/packages/testing-framework/tests/unit-test/codex-verdict.test.ts
new file mode 100644
index 0000000000..42d9023c18
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/codex-verdict.test.ts
@@ -0,0 +1,45 @@
+import { describe, expect, it } from 'vitest';
+import { extractVerdict } from '../../src/general-agent/codex-general-agent';
+
+describe('extractVerdict', () => {
+  it('parses a trailing verdict object after prose', () => {
+    const verdict = extractVerdict(
+      'The cart shows the right total.\n{"pass": true, "reason": "total matches"}',
+    );
+    expect(verdict).toEqual({
+      pass: true,
+      reason: 'total matches',
+      evidence: undefined,
+    });
+  });
+
+  it('parses verdicts with nested evidence objects', () => {
+    const verdict = extractVerdict(
+      'Analysis done.\n{"pass": false, "reason": "missing banner", "evidence": {"selector": "header", "seen": ["logo"]}}',
+    );
+    expect(verdict).toMatchObject({
+      pass: false,
+      reason: 'missing banner',
+      evidence: { selector: 'header', seen: ['logo'] },
+    });
+  });
+
+  it('prefers the last verdict and skips non-verdict JSON', () => {
+    const verdict = extractVerdict(
+      '{"note": "scratch"}\n{"pass": false, "reason": "first"}\nrevised:\n{"pass": true, "reason": "second"}',
+    );
+    expect(verdict).toMatchObject({ pass: true, reason: 'second' });
+  });
+
+  it('fails closed on replies without a parseable verdict', () => {
+    expect(extractVerdict('all good, trust me')).toBeUndefined();
+    expect(extractVerdict('{"pass": "yes"}')).toBeUndefined();
+  });
+
+  it('ignores braces inside JSON strings', () => {
+    const verdict = extractVerdict(
+      '{"pass": true, "reason": "shows {price} placeholder literally"}',
+    );
+    expect(verdict).toMatchObject({ pass: true });
+  });
+});
diff --git a/packages/testing-framework/tests/unit-test/flow-ir.test.ts b/packages/testing-framework/tests/unit-test/flow-ir.test.ts
index 13f93c1752..a68524feea 100644
--- a/packages/testing-framework/tests/unit-test/flow-ir.test.ts
+++ b/packages/testing-framework/tests/unit-test/flow-ir.test.ts
@@ -15,7 +15,6 @@ const loginFlow: FlowDefIR = {
     {
       kind: 'prompt',
       node: 'ui',
-      role: 'action',
       template: 'log in as {role}',
     },
   ],
diff --git a/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts b/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
index e56dab6c57..5a499062d6 100644
--- a/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
+++ b/packages/testing-framework/tests/unit-test/gherkin-frontend.test.ts
@@ -58,7 +58,6 @@ describe('Gherkin front-end', () => {
     expect(login.steps[0]).toMatchObject({
       kind: 'prompt',
       node: 'ui',
-      role: 'action',
       template: 'I open the login page',
     });
     // `I remember ... as "greeting"` becomes a capture step.
@@ -71,8 +70,8 @@ describe('Gherkin front-end', () => {
 
   it('maps keywords to node kinds, with And/But inheriting the last primary keyword', () => {
     const checkout = compiled.scenarios[0];
-    // Background Given → ui/setup leading step.
-    expect(checkout.steps[0]).toMatchObject({ node: 'ui', role: 'setup' });
+    // Background Given → ui leading step.
+    expect(checkout.steps[0]).toMatchObject({ kind: 'prompt', node: 'ui' });
     // `And I remember ...` after a When still parses as capture.
     expect(checkout.steps[2]).toMatchObject({
       kind: 'capture',
diff --git a/packages/testing-framework/tests/unit-test/js-frontend.test.ts b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
index 0fa5883fbe..d45cfde9d6 100644
--- a/packages/testing-framework/tests/unit-test/js-frontend.test.ts
+++ b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
@@ -17,11 +17,9 @@ describe('JS front-end: keyword→node mapping', () => {
     expect(Given('the shop is open')).toEqual({
       kind: 'prompt',
       node: 'ui',
-      role: 'setup',
       template: 'the shop is open',
     });
     expect(When('I add the item to the cart').node).toBe('ui');
-    expect(When('I add the item to the cart').role).toBe('action');
     expect(Then('the cart shows 1 item').node).toBe('verify');
     expect(Soft('a promo banner is visible').node).toBe('soft');
     expect(Advisory('summarize risks').node).toBe('agent');
@@ -62,7 +60,6 @@ describe('JS front-end: scenario / feature builders', () => {
     expect(s.steps[0]).toEqual({
       kind: 'prompt',
       node: 'ui',
-      role: 'action',
       template: 'open the home page',
     });
     expect(s.steps[1].kind).toBe('prompt');
diff --git a/packages/testing-framework/tests/unit-test/run-scenario.test.ts b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
index 0cccb9298f..6308d10053 100644
--- a/packages/testing-framework/tests/unit-test/run-scenario.test.ts
+++ b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
@@ -150,7 +150,6 @@ describe('runScenario: named flows', () => {
         {
           kind: 'prompt',
           node: 'ui',
-          role: 'action',
           template: 'use {secret}',
         },
       ],
@@ -188,9 +187,7 @@ describe('runScenario: named flows', () => {
       name: 'NoCapture',
       params: [],
       returns: ['token'],
-      steps: [
-        { kind: 'prompt', node: 'ui', role: 'action', template: 'do nothing' },
-      ],
+      steps: [{ kind: 'prompt', node: 'ui', template: 'do nothing' }],
     };
     const { result } = await run(
       scenario('missing return', [callFlow('NoCapture')]),
@@ -230,9 +227,7 @@ describe('runScenario: call-depth cap', () => {
     name: 'Leaf',
     params: [],
     returns: [],
-    steps: [
-      { kind: 'prompt', node: 'ui', role: 'action', template: 'leaf action' },
-    ],
+    steps: [{ kind: 'prompt', node: 'ui', template: 'leaf action' }],
   };
   const mid: FlowDefIR = {
     name: 'Mid',

From c431eaeff4c62da8aa46443337c480b0c11c4443 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Wed, 10 Jun 2026 00:16:43 +0200
Subject: [PATCH 7/9] feat(testing-framework): address flow-IR POC review
 findings

- implement memo: 'once-per-run' flow memoization with a shareable
  memoStore on RunScenarioOptions; only fully successful completions are
  cached, and cache hits replay the declared returns with a narrated
  info step
- make verdict-channel instructions adapter-supplied (verdictInstructions
  on GeneralAgentAdapter) so Pi keeps report_verdict wording while codex
  prompts demand its JSON reply channel; adapter-neutral fail-closed reason
- bindFeature now throws on duplicate anchors targeting the same step
  instead of silently merging overlays
- introduce structural UiAgentLike and use it across the engine/executor,
  removing the `as unknown as Agent` casts from fakes and demo agents
- write codex screenshot temp files under midscene_run/tmp
  (getMidsceneRunSubDir) instead of mkdtemp, keeping per-call deletion
- make feature()/FeatureIR symmetric with the Gherkin CompiledFeature
  ({ name, scenarios, flows }); CompiledFeature is now an alias
- update POC-GHERKIN.md to match
---
 packages/testing-framework/POC-GHERKIN.md     |  42 +++--
 .../example/flows/shop.flows.ts               |  31 ++--
 .../testing-framework/scripts/demo/main.ts    |   6 +-
 .../scripts/demo/scripted-agents.ts           |   8 +-
 .../src/context/assembler.ts                  |  22 ++-
 .../testing-framework/src/engine/run-case.ts  |  11 +-
 .../testing-framework/src/engine/run-node.ts  |  18 +-
 .../testing-framework/src/flow-ir/index.ts    |   2 +
 .../src/flow-ir/run-scenario.ts               |  90 ++++++++--
 .../testing-framework/src/flow-ir/types.ts    |  36 +++-
 .../src/frontends/gherkin/index.ts            |  14 +-
 .../src/frontends/js/bind-feature.ts          |  32 ++--
 .../src/frontends/js/index.ts                 |  16 +-
 .../src/general-agent/codex-general-agent.ts  |  50 +++---
 .../src/general-agent/pi-general-agent.ts     |   6 +
 .../src/general-agent/types.ts                |   8 +
 packages/testing-framework/src/index.ts       |   2 +
 packages/testing-framework/src/types.ts       |  18 ++
 .../tests/unit-test/bind-feature.test.ts      |  20 ++-
 .../tests/unit-test/codex-verdict.test.ts     |  16 +-
 .../unit-test/context-and-skills.test.ts      |  31 +++-
 .../tests/unit-test/engine.test.ts            |  23 +++
 .../tests/unit-test/example-parity.test.ts    |   3 +-
 .../tests/unit-test/helpers/fake-agents.ts    |   8 +-
 .../tests/unit-test/js-frontend.test.ts       |  17 ++
 .../tests/unit-test/run-scenario.test.ts      | 168 +++++++++++++++++-
 26 files changed, 563 insertions(+), 135 deletions(-)

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
index fe4f1f3579..fd261fa351 100644
--- a/packages/testing-framework/POC-GHERKIN.md
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -67,7 +67,11 @@ On the codex path, `verify`/`soft` verdicts run through `CodexGeneralAgent`
 (`src/general-agent/codex-general-agent.ts`), which routes the same
 provider via core's `callAI` and parses a JSON verdict fail-closed — the
 default Pi general agent needs an OpenAI-compatible HTTP endpoint and
-cannot speak `codex://`. Any such endpoint still works by setting
+cannot speak `codex://`. Each adapter supplies its own
+`verdictInstructions` (Pi: "call the `report_verdict` tool"; codex: "end
+your reply with a JSON verdict object"), which the engine embeds into the
+assembled context so the prompt always matches the verdict channel the
+adapter actually supports. Any OpenAI-compatible endpoint still works by setting
 `MIDSCENE_MODEL_*` yourself (Pi is used for verdicts then). Each scenario
 gets a fresh browser; Midscene HTML reports land in `midscene_run/report/`.
 Verified end to end against codex `gpt-5.5`: all three modes pass (one
@@ -125,8 +129,16 @@ the two channels never mix.
 - only declared `returns` are copied back into the caller scope;
 - UI/browser state is naturally shared (same UI agent);
 - call depth is capped at 2 (`MAX_FLOW_CALL_DEPTH`); deeper nesting fails;
-- `memo: 'once-per-run'` is accepted but stubbed (TODO in
-  `run-scenario.ts`).
+- `memo: 'once-per-run'` memoizes a **fully successful** completion (keyed
+  by flow name + resolved args) and replays the declared returns on the next
+  identical call instead of re-running the steps. The trace stays narratable
+  (flowEnter/flowExit still fire, plus an info "memo hit" step); failures
+  are never memoized, and different args miss. By default the memo table lives
+  for one `runScenario` call — pass one `memoStore` (a `FlowMemoStore` Map) to
+  several `runScenario` calls to share login-type flows across the
+  scenarios of a run. Note: replay assumes the flow's UI side effects (e.g.
+  an authenticated session) still hold; that judgment stays with the author
+  who opts a flow into memoization.
 
 **Keyword→policy mapping**: given-like → `ui` (setup), when-like → `ui`
 (action), then-like → `verify` (fail-closed), soft variants → `soft`
@@ -172,6 +184,12 @@ export would make the module namespace a thenable and break dynamic
 over data, computed args, build-time conditionals) just works —
 `defineFlow` additionally runs cheap static scoping checks.
 
+`feature(name, scenarios, flows)` returns the same
+`{ name, scenarios, flows }` shape as the Gherkin compiler
+(`CompiledFeature` is an alias of `FeatureIR`), so both front-ends hand
+callers an identical bundle: build a registry from `.flows`, run
+`.scenarios`.
+
 ## Gherkin front-end (`src/frontends/gherkin/`)
 
 `.feature` files are parsed with `@cucumber/gherkin` and compiled through
@@ -229,7 +247,9 @@ Binding glue is **title + anchor**: scenarios are keyed by title (a Scenario
 Outline title patches every expansion), steps by exact anchor text (prompt
 template, capture description, or flow name) or by index. Anchors always
 resolve against the *original* step list, so inserts never shift one
-another. `template`/`node` apply to prompt steps, `template` to captures,
+another, and each step may be targeted by **at most one** overlay entry —
+two entries anchoring the same step throw at bind time instead of merging
+silently. `template`/`node` apply to prompt steps, `template` to captures,
 `args` to flow calls — mismatches fail at bind time.
 
 **Drift validation with codegen**: every overlay reference is checked at
@@ -290,10 +310,11 @@ for (const s of scenarios) {
 
 ## Validation
 
-- `pnpm --filter @midscene/testing-framework test` — 100 tests, all green
-  (63 new across `flow-ir.test.ts`, `js-frontend.test.ts`,
+- `pnpm --filter @midscene/testing-framework test` — 119 tests, all green
+  (across `flow-ir.test.ts`, `js-frontend.test.ts`,
   `gherkin-frontend.test.ts`, `run-scenario.test.ts`, `bind-feature.test.ts`,
-  `example-parity.test.ts`; fakes only, no browsers / no model calls).
+  `example-parity.test.ts` and the Phase 0 suites; fakes only, no browsers /
+  no model calls).
 
 ## Open questions / next steps
 
@@ -303,9 +324,10 @@ for (const s of scenarios) {
 - **Typed captures**: `capture` always extracts strings (`aiString`); add
   number/boolean/structured (`aiQuery`) tiers and maybe a declared type in
   the "remember" convention.
-- **Memoization**: implement `once-per-run` (memo table keyed by flow name +
-  resolved args, replaying returns) — useful for login-type flows; decide
-  whether UI state divergence makes replay unsafe by default.
+- **Memo safety**: `once-per-run` is implemented (see Named flows above),
+  but replay trusts that the flow's UI side effects still hold. Decide
+  whether some flows need a cheap "still valid?" probe before replaying
+  (e.g. a session check for login flows).
 - **Flow-call reporting**: inner flow steps are flattened into the case's
   step list after an `info` "Entering flow …" marker; reports may want a
   nested view instead.
diff --git a/packages/testing-framework/example/flows/shop.flows.ts b/packages/testing-framework/example/flows/shop.flows.ts
index 1a0ce0723c..4855f1408c 100644
--- a/packages/testing-framework/example/flows/shop.flows.ts
+++ b/packages/testing-framework/example/flows/shop.flows.ts
@@ -29,8 +29,6 @@ export const loginFlow = defineFlow({
   ],
 });
 
-export const registry = createFlowRegistry([loginFlow]);
-
 const background = Given('the demo shop is open on the home page');
 
 export const checkoutAsAdmin = scenario('Checkout as admin', [
@@ -51,14 +49,21 @@ export const promoBanner = scenario('Promo banner is advisory', [
 // Dynamic authoring: plain JS replaces Scenario Outline examples.
 const roles = ['admin', 'guest'];
 
-export const shopFeature = feature('Checkout with a reusable login flow', [
-  checkoutAsAdmin,
-  promoBanner,
-  ...roles.map((role) =>
-    scenario(`Login greets every role (${role})`, [
-      background,
-      callFlow('Login', { role }),
-      Then('the header greets the user with {greeting}'),
-    ]),
-  ),
-]);
+// Same { name, scenarios, flows } shape as the Gherkin compiler's output.
+export const shopFeature = feature(
+  'Checkout with a reusable login flow',
+  [
+    checkoutAsAdmin,
+    promoBanner,
+    ...roles.map((role) =>
+      scenario(`Login greets every role (${role})`, [
+        background,
+        callFlow('Login', { role }),
+        Then('the header greets the user with {greeting}'),
+      ]),
+    ),
+  ],
+  [loginFlow],
+);
+
+export const registry = createFlowRegistry(shopFeature.flows);
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
index dda1dbab9d..a27fbf5651 100644
--- a/packages/testing-framework/scripts/demo/main.ts
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -9,13 +9,13 @@
  * example/demo-app (experimental; needs MIDSCENE_MODEL_* env vars).
  */
 import { join } from 'node:path';
-import type { Agent } from '@midscene/core/agent';
 import {
   type CompiledFeature,
   type FlowRegistry,
   type ScenarioIR,
   type ScenarioRunEvent,
   type ScenarioRunResult,
+  type UiAgentLike,
   compileFeatureFile,
   createFlowRegistry,
   runScenario,
@@ -45,7 +45,7 @@ const cyan = paint(36);
 const magenta = paint(35);
 
 interface AgentBundle {
-  uiAgent: Agent;
+  uiAgent: UiAgentLike;
   generalAgent: GeneralAgentAdapter;
   cleanup?: () => Promise<void>;
   describeState?: () => string;
@@ -86,7 +86,7 @@ export async function main(argv: string[]): Promise<number> {
     : async () => {
         const ui = new ScriptedUiAgent();
         return {
-          uiAgent: ui.asAgent(),
+          uiAgent: ui,
           generalAgent: new ScriptedGeneralAgent(),
           describeState: () => ui.describeState(),
         };
diff --git a/packages/testing-framework/scripts/demo/scripted-agents.ts b/packages/testing-framework/scripts/demo/scripted-agents.ts
index 437c5666b6..6fdac055ac 100644
--- a/packages/testing-framework/scripts/demo/scripted-agents.ts
+++ b/packages/testing-framework/scripts/demo/scripted-agents.ts
@@ -5,12 +5,12 @@
  * in tests/unit-test/helpers, but behavior-driven instead of queue-driven so
  * all three authoring modes can run against one simulation.
  */
-import type { Agent } from '@midscene/core/agent';
 import type {
   GeneralAgentAdapter,
   GeneralAgentInput,
   GeneralAgentResult,
 } from '../../src/general-agent/types';
+import type { UiAgentLike } from '../../src/types';
 
 const PRICE = 129.0;
 
@@ -30,7 +30,7 @@ class ShopSimulation {
   }
 }
 
-export class ScriptedUiAgent {
+export class ScriptedUiAgent implements UiAgentLike {
   private readonly sim = new ShopSimulation();
 
   async aiAct(instruction: string): Promise<string> {
@@ -71,10 +71,6 @@ export class ScriptedUiAgent {
     screenshotBase64: async () => 'data:image/png;base64,SIMULATED',
   };
 
-  asAgent(): Agent {
-    return this as unknown as Agent;
-  }
-
   describeState(): string {
     return `role=${this.sim.role ?? 'anonymous'}, cart=${this.sim.inCart ? 'Trail Backpack' : 'empty'}, total=$${this.sim.total.toFixed(2)}`;
   }
diff --git a/packages/testing-framework/src/context/assembler.ts b/packages/testing-framework/src/context/assembler.ts
index a1261cf055..53dc6aef71 100644
--- a/packages/testing-framework/src/context/assembler.ts
+++ b/packages/testing-framework/src/context/assembler.ts
@@ -25,8 +25,24 @@ export interface AssembleContextInput {
   instruction: string;
   /** The current node's kind, for framing. */
   kind: 'verify' | 'soft' | 'agent';
+  /**
+   * Adapter-specific verdict-reporting instruction for verify/soft nodes
+   * (see {@link GeneralAgentAdapter.verdictInstructions}). Falls back to an
+   * adapter-neutral instruction when the adapter does not supply one.
+   */
+  verdictInstructions?: string;
 }
 
+/**
+ * Adapter-neutral fallback when a general agent does not say how verdicts
+ * must be reported. Deliberately mechanism-free: it demands a clear verdict
+ * without naming a tool or output format the adapter may not have.
+ */
+const NEUTRAL_VERDICT_INSTRUCTIONS =
+  'Make a judgment. You MUST finish by reporting a clear pass/fail verdict ' +
+  'with a reason. If you cannot confidently determine the result, report a ' +
+  'failure.';
+
 export function assembleContext(input: AssembleContextInput): string {
   const { caseName, pastSteps, instruction, kind } = input;
   const lines: string[] = [];
@@ -74,11 +90,7 @@ export function assembleContext(input: AssembleContextInput): string {
         'screenshot. Your output is advisory and does NOT decide pass/fail.',
     );
   } else {
-    lines.push(
-      'Make a judgment. You MUST finish by calling the `report_verdict` tool ' +
-        'with `pass`, `reason`, and optional `evidence`. If you cannot ' +
-        'confidently determine the result, report `pass: false`.',
-    );
+    lines.push(input.verdictInstructions ?? NEUTRAL_VERDICT_INSTRUCTIONS);
   }
   lines.push('');
   lines.push(instruction.trim());
diff --git a/packages/testing-framework/src/engine/run-case.ts b/packages/testing-framework/src/engine/run-case.ts
index 7a51d55066..590385f230 100644
--- a/packages/testing-framework/src/engine/run-case.ts
+++ b/packages/testing-framework/src/engine/run-case.ts
@@ -1,7 +1,6 @@
-import type { Agent } from '@midscene/core/agent';
 import type { GeneralAgentAdapter } from '../general-agent/types';
 import type { RuntimeNode } from '../runtime';
-import type { CaseResult, StepResult } from '../types';
+import type { CaseResult, StepResult, UiAgentLike } from '../types';
 import type { ParsedCase } from '../yaml/types';
 import { OutputStoreImpl } from './output-store';
 import { type RunNodeDeps, runNode } from './run-node';
@@ -9,7 +8,7 @@ import { type RunNodeDeps, runNode } from './run-node';
 export interface RunCaseOptions {
   parsed: ParsedCase;
   file: string;
-  uiAgent: Agent;
+  uiAgent: UiAgentLike;
   generalAgent: GeneralAgentAdapter;
   runtimeNodes: Record<string, RuntimeNode>;
   projectRoot: string;
@@ -124,8 +123,6 @@ export function recordStepResult(
   }
 }
 
-export function getReportFile(agent: Agent): string | undefined {
-  const candidate = (agent as unknown as { reportFile?: string | null })
-    .reportFile;
-  return candidate ?? undefined;
+export function getReportFile(agent: UiAgentLike): string | undefined {
+  return agent.reportFile ?? undefined;
 }
diff --git a/packages/testing-framework/src/engine/run-node.ts b/packages/testing-framework/src/engine/run-node.ts
index 659e1004d2..7e5a49ac40 100644
--- a/packages/testing-framework/src/engine/run-node.ts
+++ b/packages/testing-framework/src/engine/run-node.ts
@@ -3,12 +3,12 @@ import { assembleContext } from '../context/assembler';
 import { extractSkillReferences } from '../general-agent/skills';
 import type { GeneralAgentAdapter } from '../general-agent/types';
 import type { RuntimeNode, RuntimeNodeContext } from '../runtime';
-import type { StepOutput, StepResult, Verdict } from '../types';
+import type { StepOutput, StepResult, UiAgentLike, Verdict } from '../types';
 import { isBuiltinNode } from '../yaml/types';
 import type { OutputStoreImpl } from './output-store';
 
 export interface RunNodeDeps {
-  uiAgent: Agent;
+  uiAgent: UiAgentLike;
   generalAgent: GeneralAgentAdapter;
   runtimeNodes: Record<string, RuntimeNode>;
   outputs: OutputStoreImpl;
@@ -84,6 +84,10 @@ async function runJudgmentNode(
     pastSteps: deps.pastSteps,
     instruction,
     kind,
+    // The verdict channel is the adapter's business (tool call vs. JSON
+    // reply); let it phrase the instruction so the prompt never contradicts
+    // the mechanism it actually supports.
+    verdictInstructions: deps.generalAgent.verdictInstructions,
   });
 
   const result = await deps.generalAgent.run({
@@ -100,7 +104,7 @@ async function runJudgmentNode(
   const verdict: Verdict = result.verdict ?? {
     pass: false,
     reason:
-      'The agent did not report a verdict via report_verdict; treated as failure (fail-closed).',
+      'The general agent did not report a verdict; treated as failure (fail-closed).',
   };
 
   const output: StepOutput = {
@@ -164,7 +168,11 @@ async function runCustomNode(
   }
 
   const ctx: RuntimeNodeContext = {
-    uiAgent: deps.uiAgent,
+    // The runtime-node contract promises a full Agent (custom nodes may use
+    // any of its methods). Runtime nodes are only registered through the
+    // YAML runner, whose uiAgent is always a real core Agent; the flow-IR
+    // executor passes `runtimeNodes: {}` and never reaches this code.
+    uiAgent: deps.uiAgent as Agent,
     outputs: deps.outputs,
     state: deps.state,
     result: {
@@ -184,7 +192,7 @@ async function runCustomNode(
 }
 
 async function captureScreenshot(
-  agent: Agent,
+  agent: UiAgentLike,
 ): Promise<{ data?: string; mediaType: string }> {
   try {
     const raw = await agent.interface.screenshotBase64();
diff --git a/packages/testing-framework/src/flow-ir/index.ts b/packages/testing-framework/src/flow-ir/index.ts
index 84f97795d5..2b25c0a102 100644
--- a/packages/testing-framework/src/flow-ir/index.ts
+++ b/packages/testing-framework/src/flow-ir/index.ts
@@ -3,6 +3,7 @@ export {
   IDENTIFIER_PATTERN,
   MAX_FLOW_CALL_DEPTH,
   assertIdentifier,
+  flowMemoKey,
   stringifyVarRecord,
 } from './types';
 export type {
@@ -10,6 +11,7 @@ export type {
   CaptureStepIR,
   CallFlowStepIR,
   FlowIRStep,
+  FlowMemoStore,
   ScenarioIR,
   ScenarioConfigIR,
   FlowDefIR,
diff --git a/packages/testing-framework/src/flow-ir/run-scenario.ts b/packages/testing-framework/src/flow-ir/run-scenario.ts
index 0372c46226..4b670b615f 100644
--- a/packages/testing-framework/src/flow-ir/run-scenario.ts
+++ b/packages/testing-framework/src/flow-ir/run-scenario.ts
@@ -14,21 +14,22 @@
  * All templates go through mechanical `{varName}` substitution before any
  * model sees them.
  */
-import type { Agent } from '@midscene/core/agent';
 import { OutputStoreImpl } from '../engine/output-store';
 import { getReportFile, recordStepResult } from '../engine/run-case';
 import { type RunNodeDeps, runNode } from '../engine/run-node';
 import type { GeneralAgentAdapter } from '../general-agent/types';
-import type { CaseResult, StepResult } from '../types';
+import type { CaseResult, StepResult, UiAgentLike } from '../types';
 import { FlowRegistry } from './registry';
 import { type VariableScope, substitute } from './substitute';
 import {
   type CallFlowStepIR,
   type CaptureStepIR,
   type FlowIRStep,
+  type FlowMemoStore,
   MAX_FLOW_CALL_DEPTH,
   type PromptStepIR,
   type ScenarioIR,
+  flowMemoKey,
 } from './types';
 
 /**
@@ -74,9 +75,15 @@ export interface RunScenarioOptions {
   registry?: FlowRegistry;
   /** Source file the scenario came from, for reporting. */
   file?: string;
-  uiAgent: Agent;
+  uiAgent: UiAgentLike;
   generalAgent: GeneralAgentAdapter;
   projectRoot?: string;
+  /**
+   * Memo table for `memo: 'once-per-run'` flows. Defaults to a fresh store
+   * per call; pass one Map to several `runScenario` calls to share memoized
+   * flow completions across the scenarios of one run.
+   */
+  memoStore?: FlowMemoStore;
   /** Optional observer for narration/debugging. */
   onEvent?: (event: ScenarioRunEvent) => void;
 }
@@ -89,7 +96,7 @@ export interface ScenarioRunResult extends CaseResult {
 
 interface ExecCtx {
   registry: FlowRegistry;
-  uiAgent: Agent;
+  uiAgent: UiAgentLike;
   generalAgent: GeneralAgentAdapter;
   projectRoot: string;
   caseName: string;
@@ -97,6 +104,7 @@ interface ExecCtx {
   outputs: OutputStoreImpl;
   steps: StepResult[];
   warnings: string[];
+  memoStore: FlowMemoStore;
   emit: (event: ScenarioRunEvent) => void;
 }
 
@@ -118,6 +126,7 @@ export async function runScenario(
     outputs: new OutputStoreImpl(),
     steps: [],
     warnings: [],
+    memoStore: options.memoStore ?? new Map(),
     emit: options.onEvent ?? (() => {}),
   };
 
@@ -305,8 +314,6 @@ async function execCallFlowStep(
   const stepStart = Date.now();
   const where = `${ctx.caseName} step ${index + 1} (flow "${step.flowName}")`;
 
-  let childScope: VariableScope;
-  let resolvedArgs: Record<string, string>;
   try {
     if (depth + 1 > MAX_FLOW_CALL_DEPTH) {
       throw new Error(
@@ -322,8 +329,8 @@ async function execCallFlowStep(
         );
       }
     }
-    resolvedArgs = {};
-    childScope = new Map();
+    const resolvedArgs: Record<string, string> = {};
+    const childScope: VariableScope = new Map();
     for (const param of flow.params) {
       const template = step.args[param];
       if (template === undefined) {
@@ -337,9 +344,17 @@ async function execCallFlowStep(
       childScope.set(param, value);
     }
 
-    // TODO(POC): flow.memo === 'once-per-run' should look up a per-run memo
-    // table keyed by (flowName, resolvedArgs) and replay returns on a hit.
-    // For now every call executes.
+    const memoKey =
+      flow.memo === 'once-per-run'
+        ? flowMemoKey(step.flowName, resolvedArgs)
+        : undefined;
+    if (memoKey !== undefined) {
+      const memoized = ctx.memoStore.get(memoKey);
+      if (memoized) {
+        replayMemoizedFlow(step, resolvedArgs, memoized, scope, depth, ctx);
+        return true;
+      }
+    }
 
     ctx.emit({
       type: 'flowEnter',
@@ -379,6 +394,11 @@ async function execCallFlowStep(
       returns[ret] = value;
       ctx.emit({ type: 'varSet', name: ret, value, source: 'return', depth });
     }
+    // Memoize only fully successful completions (all steps green, all
+    // declared returns captured); failures above never reach this point.
+    if (memoKey !== undefined) {
+      ctx.memoStore.set(memoKey, returns);
+    }
     ctx.emit({
       type: 'flowExit',
       flowName: step.flowName,
@@ -403,6 +423,54 @@ async function execCallFlowStep(
   }
 }
 
+/**
+ * Memo hit for a `memo: 'once-per-run'` flow: instead of running its steps,
+ * record one `info` step and replay the memoized returns into the caller
+ * scope (a `varSet` each), bracketed by flowEnter/flowExit events.
+ * NOTE(review): `returns` reaches flowExit observers un-copied; do not mutate.
+ */
+function replayMemoizedFlow(
+  step: CallFlowStepIR,
+  resolvedArgs: Record<string, string>,
+  returns: Record<string, string>,
+  scope: VariableScope,
+  depth: number,
+  ctx: ExecCtx,
+): void {
+  ctx.emit({
+    type: 'flowEnter',
+    flowName: step.flowName,
+    args: resolvedArgs,
+    depth: depth + 1,
+  });
+  recordStep(
+    {
+      index: ctx.steps.length,
+      node: 'flow',
+      input: formatCall(step.flowName, resolvedArgs),
+      status: 'info',
+      output: {
+        text: `Memo hit for flow "${step.flowName}" (memo: once-per-run): steps were not re-executed; replayed ${
+          Object.keys(returns).length > 0 ? formatArgs(returns) : 'no returns'
+        } from an earlier successful run.`,
+      },
+      durationMs: 0,
+    },
+    depth,
+    ctx,
+  );
+  for (const [name, value] of Object.entries(returns)) {
+    scope.set(name, value);
+    ctx.emit({ type: 'varSet', name, value, source: 'return', depth });
+  }
+  ctx.emit({
+    type: 'flowExit',
+    flowName: step.flowName,
+    returns,
+    depth: depth + 1,
+  });
+}
+
 function nodeDeps(ctx: ExecCtx): RunNodeDeps {
   return {
     uiAgent: ctx.uiAgent,
diff --git a/packages/testing-framework/src/flow-ir/types.ts b/packages/testing-framework/src/flow-ir/types.ts
index 00bd4962b2..db51dfbb49 100644
--- a/packages/testing-framework/src/flow-ir/types.ts
+++ b/packages/testing-framework/src/flow-ir/types.ts
@@ -90,17 +90,45 @@ export interface FlowDefIR {
   returns: string[];
   steps: FlowIRStep[];
   /**
-   * Memoization tier. Only 'none' is implemented.
-   * TODO(POC): 'once-per-run' should skip re-execution and replay the
-   * memoized returns when the flow is called again with identical args.
+   * Memoization tier. With 'once-per-run', a successful completion is stored
+   * in the run's {@link FlowMemoStore} (keyed by flow name + resolved args);
+   * a later call with identical args skips the flow's steps and replays the
+   * memoized returns into the caller scope. Failed runs are never memoized.
+   * Defaults to 'none' (every call executes).
    */
   memo?: 'none' | 'once-per-run';
 }
 
-/** A group of scenarios (Gherkin Feature / JS `feature()` builder). */
+/**
+ * Memo table for `memo: 'once-per-run'` flows: cache key → the returns of a
+ * fully successful completion. `runScenario` defaults to a fresh per-call
+ * store; pass one Map to several `runScenario` calls to share memoized flows
+ * (e.g. a login) across the scenarios of one run.
+ */
+export type FlowMemoStore = Map<string, Record<string, string>>;
+
+/** Cache key for a memoized flow call: flow name, NUL, then the args as JSON. */
+export function flowMemoKey(
+  flowName: string,
+  resolvedArgs: Record<string, string>,
+): string {
+  // Order-sensitive: stable because args are built in declared-param order.
+  return `${flowName}\u0000${JSON.stringify(resolvedArgs)}`;
+}
+
+/**
+ * A compiled feature: runnable scenarios plus the flow definitions authored
+ * alongside them. Both front-ends return this exact shape — the Gherkin
+ * compiler (`compileFeature`, where `CompiledFeature` is an alias of this
+ * type) and the JS `feature()` builder — so callers can build a registry
+ * from `.flows` and run `.scenarios` without caring about the surface.
+ */
 export interface FeatureIR {
   name: string;
+  /** Runnable scenarios (in Gherkin: everything not tagged `@flow`). */
   scenarios: ScenarioIR[];
+  /** Flow definitions, ready for a {@link FlowRegistry}. */
+  flows: FlowDefIR[];
 }
 
 /** Flow calls may nest at most this deep (scenario itself is depth 0). */
diff --git a/packages/testing-framework/src/frontends/gherkin/index.ts b/packages/testing-framework/src/frontends/gherkin/index.ts
index 35bb44f657..1a37fad2c1 100644
--- a/packages/testing-framework/src/frontends/gherkin/index.ts
+++ b/packages/testing-framework/src/frontends/gherkin/index.ts
@@ -39,6 +39,7 @@ import {
   PickleStepType,
 } from '@cucumber/messages';
 import type {
+  FeatureIR,
   FlowDefIR,
   FlowIRStep,
   PromptStepIR,
@@ -46,13 +47,12 @@ import type {
 } from '../../flow-ir';
 import { IDENTIFIER_PATTERN, assertIdentifier } from '../../flow-ir';
 
-export interface CompiledFeature {
-  name: string;
-  /** Runnable scenarios (everything not tagged `@flow`). */
-  scenarios: ScenarioIR[];
-  /** Flow definitions (scenarios tagged `@flow`), ready for a FlowRegistry. */
-  flows: FlowDefIR[];
-}
+/**
+ * The Gherkin compiler's output — the shared {@link FeatureIR} shape that the
+ * JS `feature()` builder also returns: runnable scenarios (everything not
+ * tagged `@flow`) plus flow definitions (scenarios tagged `@flow`).
+ */
+export type CompiledFeature = FeatureIR;
 
 const REMEMBER_STEP = new RegExp(
   `^I remember (.+?) as "(${IDENTIFIER_PATTERN})"$`,
diff --git a/packages/testing-framework/src/frontends/js/bind-feature.ts b/packages/testing-framework/src/frontends/js/bind-feature.ts
index 79fcbae58e..f9b7d85833 100644
--- a/packages/testing-framework/src/frontends/js/bind-feature.ts
+++ b/packages/testing-framework/src/frontends/js/bind-feature.ts
@@ -122,12 +122,7 @@ function applyScenarioOverlay(
 ): ScenarioIR {
   const where = `bindFeature(${uri}): scenario "${scenario.name}"`;
 
-  interface Patch {
-    overlays: StepOverlay[];
-    before: FlowIRStep[];
-    after: FlowIRStep[];
-  }
-  const patches = new Map<number, Patch>();
+  const patches = new Map<number, StepOverlay>();
 
   for (const stepOverlay of overlay.steps ?? []) {
     // All anchors resolve against the ORIGINAL step list, so several
@@ -135,15 +130,16 @@ function applyScenarioOverlay(
     const index = resolveAnchor(stepOverlay.at, scenario, uri);
     validateStepOverlay(stepOverlay, scenario.steps[index], index, where);
 
-    const patch = patches.get(index) ?? {
-      overlays: [],
-      before: [],
-      after: [],
-    };
-    patch.overlays.push(stepOverlay);
-    patch.before.push(...normalizeInserts(stepOverlay.before));
-    patch.after.push(...normalizeInserts(stepOverlay.after));
-    patches.set(index, patch);
+    // One overlay per step: silently merging two entries that target the
+    // same step would hide a conflict (whose template wins? in what order do
+    // inserts land?), so duplicates fail loudly like every other drift.
+    const previous = patches.get(index);
+    if (previous) {
+      throw new Error(
+        `[midscene] ${where}: overlays \`at: ${JSON.stringify(previous.at)}\` and \`at: ${JSON.stringify(stepOverlay.at)}\` both target step ${index} (${describeStep(scenario.steps[index])}). Merge them into a single overlay entry.`,
+      );
+    }
+    patches.set(index, stepOverlay);
   }
 
   const steps: FlowIRStep[] = [];
@@ -153,9 +149,9 @@ function applyScenarioOverlay(
       steps.push(scenario.steps[i]);
       continue;
     }
-    steps.push(...patch.before);
-    steps.push(patch.overlays.reduce(patchStep, scenario.steps[i]));
-    steps.push(...patch.after);
+    steps.push(...normalizeInserts(patch.before));
+    steps.push(patchStep(scenario.steps[i], patch));
+    steps.push(...normalizeInserts(patch.after));
   }
 
   const result: ScenarioIR = { ...scenario, steps };
diff --git a/packages/testing-framework/src/frontends/js/index.ts b/packages/testing-framework/src/frontends/js/index.ts
index 7afddad70f..06e96ac613 100644
--- a/packages/testing-framework/src/frontends/js/index.ts
+++ b/packages/testing-framework/src/frontends/js/index.ts
@@ -89,7 +89,7 @@ export interface DefineFlowInput {
   params?: string[];
   returns?: string[];
   steps: StepInput[];
-  /** TODO(POC): only 'none' is implemented; 'once-per-run' is accepted but ignored. */
+  /** See {@link FlowDefIR.memo}: 'once-per-run' replays a successful completion on identical args. */
   memo?: 'none' | 'once-per-run';
 }
 
@@ -136,12 +136,20 @@ export function scenario(
   };
 }
 
-/** Group scenarios, mirroring a Gherkin Feature. */
-export function feature(name: string, scenarios: ScenarioIR[]): FeatureIR {
+/**
+ * Group scenarios (and the flows they use) into the {@link FeatureIR} shape
+ * `compileFeature` also returns, so `createFlowRegistry(feature.flows)` works
+ * for both surfaces. `flows` defaults to `[]`; a blank `name` throws.
+ */
+export function feature(
+  name: string,
+  scenarios: ScenarioIR[],
+  flows: FlowDefIR[] = [],
+): FeatureIR {
   if (!name.trim()) {
     throw new Error('[midscene] feature(): a feature must have a name.');
   }
-  return { name, scenarios };
+  return { name, scenarios, flows };
 }
 
 function promptStep(
diff --git a/packages/testing-framework/src/general-agent/codex-general-agent.ts b/packages/testing-framework/src/general-agent/codex-general-agent.ts
index c90a84a8fd..1f9091ab42 100644
--- a/packages/testing-framework/src/general-agent/codex-general-agent.ts
+++ b/packages/testing-framework/src/general-agent/codex-general-agent.ts
@@ -14,12 +14,13 @@
  *  - no tool runtime: the verdict is requested as a strict JSON object in the
  *    reply and parsed fail-closed (no `report_verdict` tool, no `$skill`
  *    loading — referenced skills are only named in the prompt);
- *  - the screenshot is written to a temp file and passed as a `file://`
- *    image_url, which the codex provider maps to a localImage input.
+ *  - the screenshot is written to a temp file under `midscene_run/tmp` and
+ *    passed as a `file://` image_url, which the codex provider maps to a
+ *    localImage input; the file is deleted as soon as the call settles.
  */
-import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
+import { rmSync, writeFileSync } from 'node:fs';
 import { join } from 'node:path';
+import { getMidsceneRunSubDir } from '@midscene/shared/common';
 import { getDebug } from '@midscene/shared/logger';
 import type { Verdict } from '../types';
 import type {
@@ -33,17 +34,23 @@ const warn = getDebug('testing-framework:codex-general-agent', {
   console: true,
 });
 
-const VERDICT_INSTRUCTIONS = `
-You have no tools in this environment. After your analysis, end your reply
-with the verdict as a single JSON object on its own line, exactly in this
-shape (no markdown fence around it):
+const VERDICT_INSTRUCTIONS = `Make a judgment. You have no tools in this environment. After your analysis,
+end your reply with the verdict as a single JSON object on its own line,
+exactly in this shape (no markdown fence around it):
 
 {"pass": true|false, "reason": "<human-readable rationale>"}
 
 If you cannot confidently determine the result, report "pass": false.`;
 
 export class CodexGeneralAgent implements GeneralAgentAdapter {
-  private tempDir?: string;
+  /**
+   * Codex has no tool runtime here, so the verdict travels as a trailing
+   * JSON object in the reply (parsed fail-closed by {@link extractVerdict}).
+   * Supplying this to the engine keeps the assembled context consistent with
+   * that mechanism instead of demanding a `report_verdict` tool call.
+   */
+  readonly verdictInstructions = VERDICT_INSTRUCTIONS;
+
   private screenshotCount = 0;
 
   async run(input: GeneralAgentInput): Promise<GeneralAgentResult> {
@@ -52,7 +59,7 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
     const userContent: Array<
       | { type: 'text'; text: string }
       | { type: 'image_url'; image_url: { url: string } }
-    > = [{ type: 'text', text: this.buildPrompt(input, needsVerdict) }];
+    > = [{ type: 'text', text: this.buildPrompt(input) }];
 
     let screenshotFile: string | undefined;
     if (input.screenshotBase64) {
@@ -104,34 +111,25 @@ export class CodexGeneralAgent implements GeneralAgentAdapter {
     return { text, verdict };
   }
 
-  async dispose(): Promise<void> {
-    if (this.tempDir) {
-      rmSync(this.tempDir, { recursive: true, force: true });
-      this.tempDir = undefined;
-    }
-  }
-
-  private buildPrompt(input: GeneralAgentInput, needsVerdict: boolean): string {
+  private buildPrompt(input: GeneralAgentInput): string {
+    // Verdict-reporting instructions are NOT appended here: the engine puts
+    // `verdictInstructions` into the assembled context for verify/soft nodes.
     const parts = [input.context];
     if (input.referencedSkills.length > 0) {
       parts.push(
         `\nThis task references the following skills (not loadable in this environment, judge from the screenshot and history): ${input.referencedSkills.map((s) => `$${s}`).join(', ')}.`,
       );
     }
-    if (needsVerdict) {
-      parts.push(VERDICT_INSTRUCTIONS);
-    }
     return parts.join('\n');
   }
 
   private writeScreenshot(base64: string, mediaType?: string): string {
-    if (!this.tempDir) {
-      this.tempDir = mkdtempSync(join(tmpdir(), 'midscene-codex-ga-'));
-    }
     const ext = mediaType === 'image/jpeg' ? 'jpg' : 'png';
+    // Transient artifact under the shared midscene_run/tmp (deleted by run()
+    // once consumed); the random token keeps concurrent agents from colliding.
     const file = join(
-      this.tempDir,
-      `screenshot-${++this.screenshotCount}.${ext}`,
+      getMidsceneRunSubDir('tmp'),
+      `codex-general-agent-${process.pid}-${++this.screenshotCount}-${Math.random().toString(36).slice(2, 10)}.${ext}`,
     );
     writeFileSync(file, Buffer.from(base64, 'base64'));
     return file;
diff --git a/packages/testing-framework/src/general-agent/pi-general-agent.ts b/packages/testing-framework/src/general-agent/pi-general-agent.ts
index ca13c77025..c3771e41ca 100644
--- a/packages/testing-framework/src/general-agent/pi-general-agent.ts
+++ b/packages/testing-framework/src/general-agent/pi-general-agent.ts
@@ -58,6 +58,12 @@ interface PreparedModel {
  * Pi-backed implementation of {@link GeneralAgentAdapter}.
  */
 export class PiGeneralAgent implements GeneralAgentAdapter {
+  /** Pi registers a `report_verdict` tool, so the prompt demands it. */
+  readonly verdictInstructions =
+    'Make a judgment. You MUST finish by calling the `report_verdict` tool ' +
+    'with `pass`, `reason`, and optional `evidence`. If you cannot ' +
+    'confidently determine the result, report `pass: false`.';
+
   private prepared?: PreparedModel;
   private readonly loaderCache = new Map<string, DefaultResourceLoader>();
 
diff --git a/packages/testing-framework/src/general-agent/types.ts b/packages/testing-framework/src/general-agent/types.ts
index a327cad79c..b3e23b7537 100644
--- a/packages/testing-framework/src/general-agent/types.ts
+++ b/packages/testing-framework/src/general-agent/types.ts
@@ -52,4 +52,12 @@ export interface GeneralAgentAdapter {
   run(input: GeneralAgentInput): Promise<GeneralAgentResult>;
   /** Release any underlying resources. */
   dispose?(): Promise<void>;
+  /**
+   * How the model must report its verdict for verify/soft nodes, in this
+   * adapter's own channel (e.g. Pi's `report_verdict` tool vs. Codex's
+   * trailing JSON object). Included verbatim in the assembled context so the
+   * prompt never contradicts the adapter's actual verdict mechanism. When
+   * omitted, the engine uses an adapter-neutral instruction.
+   */
+  verdictInstructions?: string;
 }
diff --git a/packages/testing-framework/src/index.ts b/packages/testing-framework/src/index.ts
index 065aa1d499..78d2796a2c 100644
--- a/packages/testing-framework/src/index.ts
+++ b/packages/testing-framework/src/index.ts
@@ -45,6 +45,7 @@ export type {
   OutputStore,
   TestResultSoFar,
   BuiltinNodeType,
+  UiAgentLike,
 } from './types';
 
 // —— general agent (swappable) ——
@@ -97,6 +98,7 @@ export type {
   ScenarioIR,
   ScenarioConfigIR,
   FlowDefIR,
+  FlowMemoStore,
   FeatureIR,
   RunScenarioOptions,
   ScenarioRunResult,
diff --git a/packages/testing-framework/src/types.ts b/packages/testing-framework/src/types.ts
index ec224eb176..aa641bb476 100644
--- a/packages/testing-framework/src/types.ts
+++ b/packages/testing-framework/src/types.ts
@@ -10,6 +10,24 @@ import type { Agent } from '@midscene/core/agent';
 /** Built-in node types plus the open-ended custom (runtime) node name. */
 export type BuiltinNodeType = 'ui' | 'verify' | 'soft' | 'agent';
 
+/**
+ * The structural slice of the Midscene UI Agent the engine actually uses:
+ * `aiAct` (ui nodes), `aiAsk` (ui-node conclusions), `aiString` (capture
+ * steps), the current screenshot, and the optional report path. A real core
+ * {@link Agent} satisfies this as-is; tests and offline demos can supply a
+ * plain object instead of casting fakes to `Agent`.
+ */
+export interface UiAgentLike {
+  aiAct(instruction: string): Promise<string | undefined>;
+  aiAsk(prompt: string): Promise<string>;
+  aiString(prompt: string): Promise<string>;
+  interface: {
+    screenshotBase64(): Promise<string>;
+  };
+  /** Path to the Midscene HTML report, when the agent generates one. */
+  reportFile?: string | null;
+}
+
 /**
  * A verify/soft verdict. `verify` gates the case; `soft` only records a warning.
  * See RFC §6.
diff --git a/packages/testing-framework/tests/unit-test/bind-feature.test.ts b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
index d2913f0c17..112db2710e 100644
--- a/packages/testing-framework/tests/unit-test/bind-feature.test.ts
+++ b/packages/testing-framework/tests/unit-test/bind-feature.test.ts
@@ -177,9 +177,8 @@ describe('bindFeature: overlay application', () => {
     const result = await runScenario({
       scenario: checkout,
       registry: createFlowRegistry(bound.flows),
-      uiAgent: ui.asAgent(),
+      uiAgent: ui,
       generalAgent: general,
-      env: {},
     });
 
     // Injected variable was substituted into the inserted step's prompt.
@@ -246,6 +245,23 @@ describe('bindFeature: drift validation with codegen', () => {
     ).toThrow(/anchor 9 is out of range.*indices 0–1/s);
   });
 
+  it('rejects two overlays targeting the same step', () => {
+    expect(() =>
+      bindFeature(FEATURE, {
+        scenarios: {
+          'Checkout as admin': {
+            steps: [
+              { at: 'the cart total equals {price}', node: 'soft' },
+              { at: 3, template: 'the cart total roughly equals {price}' },
+            ],
+          },
+        },
+      }),
+    ).toThrow(
+      /overlays `at: "the cart total equals \{price\}"` and `at: 3` both target step 3.*single overlay entry/s,
+    );
+  });
+
   it('rejects ambiguous text anchors and suggests index anchors', () => {
     const duplicated = `
 Feature: dup
diff --git a/packages/testing-framework/tests/unit-test/codex-verdict.test.ts b/packages/testing-framework/tests/unit-test/codex-verdict.test.ts
index 42d9023c18..c7d500a5e6 100644
--- a/packages/testing-framework/tests/unit-test/codex-verdict.test.ts
+++ b/packages/testing-framework/tests/unit-test/codex-verdict.test.ts
@@ -1,5 +1,19 @@
 import { describe, expect, it } from 'vitest';
-import { extractVerdict } from '../../src/general-agent/codex-general-agent';
+import {
+  CodexGeneralAgent,
+  extractVerdict,
+} from '../../src/general-agent/codex-general-agent';
+
+describe('CodexGeneralAgent verdict channel', () => {
+  it('advertises the JSON-reply channel, not a report_verdict tool', () => {
+    const instructions = new CodexGeneralAgent().verdictInstructions;
+    // Self-consistent prompt: codex has no tools, so the instruction must
+    // describe the trailing-JSON channel that extractVerdict parses.
+    expect(instructions).toContain('You have no tools');
+    expect(instructions).toContain('"pass": true|false');
+    expect(instructions).not.toContain('report_verdict');
+  });
+});
 
 describe('extractVerdict', () => {
   it('parses a trailing verdict object after prose', () => {
diff --git a/packages/testing-framework/tests/unit-test/context-and-skills.test.ts b/packages/testing-framework/tests/unit-test/context-and-skills.test.ts
index 4724a3397f..731876118b 100644
--- a/packages/testing-framework/tests/unit-test/context-and-skills.test.ts
+++ b/packages/testing-framework/tests/unit-test/context-and-skills.test.ts
@@ -51,10 +51,39 @@ describe('assembleContext', () => {
     expect(ctx).toContain('Created order #123');
     expect(ctx).toContain('"orderId":"123"');
     expect(ctx).toContain('PASS — found in db');
-    expect(ctx).toContain('report_verdict');
     expect(ctx).toContain('Use $database to verify orderId');
   });
 
+  it('uses the adapter-supplied verdict instructions verbatim', () => {
+    const ctx = assembleContext({
+      caseName: 'c',
+      pastSteps: [],
+      instruction: 'check it',
+      kind: 'verify',
+      verdictInstructions:
+        'End your reply with a single JSON verdict object on its own line.',
+    });
+    expect(ctx).toContain(
+      'End your reply with a single JSON verdict object on its own line.',
+    );
+    expect(ctx).not.toContain('report_verdict');
+  });
+
+  it('falls back to an adapter-neutral verdict instruction', () => {
+    for (const kind of ['verify', 'soft'] as const) {
+      const ctx = assembleContext({
+        caseName: 'c',
+        pastSteps: [],
+        instruction: 'check it',
+        kind,
+      });
+      // Neutral: demands a verdict without naming any reporting mechanism.
+      expect(ctx).toContain('pass/fail verdict');
+      expect(ctx).not.toContain('report_verdict');
+      expect(ctx).not.toContain('JSON');
+    }
+  });
+
   it('frames agent nodes as advisory', () => {
     const ctx = assembleContext({
       caseName: 'c',
diff --git a/packages/testing-framework/tests/unit-test/engine.test.ts b/packages/testing-framework/tests/unit-test/engine.test.ts
index d2371937e3..f2f30e36ae 100644
--- a/packages/testing-framework/tests/unit-test/engine.test.ts
+++ b/packages/testing-framework/tests/unit-test/engine.test.ts
@@ -81,6 +81,29 @@ describe('runCase node semantics', () => {
     expect(result.steps).toHaveLength(1); // stopped before ui
   });
 
+  it('verify prompts carry the adapter-supplied verdict instructions', async () => {
+    const parsed = parseCaseYaml('flow:\n  - verify: ok?');
+    const seen: string[] = [];
+    const result = await runCase({
+      ...base,
+      parsed,
+      file: 'c.yaml',
+      uiAgent: fakeAgent(),
+      generalAgent: {
+        verdictInstructions: 'Reply with VERDICT: pass or VERDICT: fail.',
+        run: async (input) => {
+          seen.push(input.context);
+          return { text: 'ok', verdict: { pass: true, reason: 'fine' } };
+        },
+      },
+    });
+    expect(result.status).toBe('passed');
+    // The adapter's own verdict channel is what the prompt demands — no
+    // hardcoded report_verdict wording for adapters without that tool.
+    expect(seen[0]).toContain('Reply with VERDICT: pass or VERDICT: fail.');
+    expect(seen[0]).not.toContain('report_verdict');
+  });
+
   it('verify with NO verdict is fail-closed', async () => {
     const parsed = parseCaseYaml('flow:\n  - verify: ok?');
     const result = await runCase({
diff --git a/packages/testing-framework/tests/unit-test/example-parity.test.ts b/packages/testing-framework/tests/unit-test/example-parity.test.ts
index 61c7a2f17c..a19e8da541 100644
--- a/packages/testing-framework/tests/unit-test/example-parity.test.ts
+++ b/packages/testing-framework/tests/unit-test/example-parity.test.ts
@@ -90,9 +90,8 @@ describe('example parity: Gherkin vs JS front-end', () => {
       const result = await runScenario({
         scenario: s,
         registry: reg,
-        uiAgent: ui.asAgent(),
+        uiAgent: ui,
         generalAgent: general,
-        env: {},
       });
       return {
         status: result.status,
diff --git a/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts b/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
index 2cb611709f..81ae379b32 100644
--- a/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
+++ b/packages/testing-framework/tests/unit-test/helpers/fake-agents.ts
@@ -9,14 +9,14 @@
  * Both record every call so tests can assert that `{var}` substitution
  * happened mechanically BEFORE the prompt reached the "model".
  */
-import type { Agent } from '@midscene/core/agent';
 import type {
   GeneralAgentAdapter,
   GeneralAgentInput,
   GeneralAgentResult,
 } from '../../../src/general-agent/types';
+import type { UiAgentLike } from '../../../src/types';
 
-export class FakeUiAgent {
+export class FakeUiAgent implements UiAgentLike {
   /** Instructions received by aiAct (ui nodes), post-substitution. */
   actCalls: string[] = [];
   /** Extraction prompts received by aiString (capture steps). */
@@ -51,10 +51,6 @@ export class FakeUiAgent {
   interface = {
     screenshotBase64: async () => 'data:image/png;base64,FAKE',
   };
-
-  asAgent(): Agent {
-    return this as unknown as Agent;
-  }
 }
 
 export type GeneralAgentScript = (
diff --git a/packages/testing-framework/tests/unit-test/js-frontend.test.ts b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
index d45cfde9d6..521234183a 100644
--- a/packages/testing-framework/tests/unit-test/js-frontend.test.ts
+++ b/packages/testing-framework/tests/unit-test/js-frontend.test.ts
@@ -86,6 +86,23 @@ describe('JS front-end: scenario / feature builders', () => {
       kind: 'callFlow',
       args: { role: 'guest' },
     });
+    // `flows` defaults to an empty array; the shape still matches the Gherkin compiler.
+    expect(f.flows).toEqual([]);
+  });
+
+  it('returns the same { name, scenarios, flows } shape as compileFeature', () => {
+    const login = defineFlow({
+      name: 'Login',
+      params: ['role'],
+      steps: [When('sign in as {role}')],
+    });
+    const f = feature(
+      'shop',
+      [scenario('s', [callFlow('Login', { role: 'admin' })])],
+      [login],
+    );
+    expect(Object.keys(f).sort()).toEqual(['flows', 'name', 'scenarios']);
+    expect(f.flows).toEqual([login]);
   });
 
   it('rejects empty step lists', () => {
diff --git a/packages/testing-framework/tests/unit-test/run-scenario.test.ts b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
index 6308d10053..be25b47d53 100644
--- a/packages/testing-framework/tests/unit-test/run-scenario.test.ts
+++ b/packages/testing-framework/tests/unit-test/run-scenario.test.ts
@@ -36,9 +36,8 @@ function run(
   return runScenario({
     scenario: s,
     registry: createFlowRegistry(opts.flows ?? []),
-    uiAgent: ui.asAgent(),
+    uiAgent: ui,
     generalAgent: general,
-    env: {},
   }).then((result) => ({ result, ui, general }));
 }
 
@@ -222,6 +221,168 @@ describe('runScenario: named flows', () => {
   });
 });
 
+describe('runScenario: flow memoization (once-per-run)', () => {
+  const memoLogin = defineFlow({
+    name: 'Login',
+    params: ['role'],
+    returns: ['greeting'],
+    memo: 'once-per-run',
+    steps: [
+      When('open the login page'),
+      When('sign in as the "{role}" user'),
+      remember('the greeting shown in the header', 'greeting'),
+    ],
+  });
+
+  it('replays returns on a hit without re-running the flow steps', async () => {
+    // Only ONE scripted aiString result: a re-run of the capture would throw.
+    const ui = new FakeUiAgent(['Hello, Admin!']);
+    const events: string[] = [];
+    const result = await runScenario({
+      scenario: scenario('memo hit', [
+        callFlow('Login', { role: 'admin' }),
+        callFlow('Login', { role: 'admin' }),
+        Then('the header shows {greeting}'),
+      ]),
+      registry: createFlowRegistry([memoLogin]),
+      uiAgent: ui,
+      generalAgent: new FakeGeneralAgent(),
+      onEvent: (e) => events.push(`${e.type}@${'depth' in e ? e.depth : '?'}`),
+    });
+
+    expect(result.status).toBe('passed');
+    // The flow body executed exactly once.
+    expect(ui.actCalls).toEqual([
+      'open the login page',
+      'sign in as the "admin" user',
+    ]);
+    expect(ui.stringCalls).toEqual(['the greeting shown in the header']);
+    // The replay still delivered the declared return into the caller scope.
+    expect(result.variables.greeting).toBe('Hello, Admin!');
+    // The hit stays narratable: an info step records the replay...
+    const memoSteps = result.steps.filter((s) =>
+      s.output?.text.includes('Memo hit'),
+    );
+    expect(memoSteps).toHaveLength(1);
+    expect(memoSteps[0]).toMatchObject({
+      node: 'flow',
+      status: 'info',
+      input: 'Login(role="admin")',
+    });
+    // ...and flowEnter/flowExit still fire for the replayed call.
+    expect(
+      events.filter((e) => e === 'flowEnter@1' || e === 'flowExit@1'),
+    ).toEqual(['flowEnter@1', 'flowExit@1', 'flowEnter@1', 'flowExit@1']);
+  });
+
+  it('misses when the resolved args differ', async () => {
+    const ui = new FakeUiAgent(['Hello, Admin!', 'Hello, Guest!']);
+    const { result } = await run(
+      scenario('memo miss', [
+        callFlow('Login', { role: 'admin' }),
+        callFlow('Login', { role: 'guest' }),
+      ]),
+      { flows: [memoLogin], ui },
+    );
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toEqual([
+      'open the login page',
+      'sign in as the "admin" user',
+      'open the login page',
+      'sign in as the "guest" user',
+    ]);
+    expect(result.variables.greeting).toBe('Hello, Guest!');
+  });
+
+  it('shares hits across scenarios through a caller-provided memoStore', async () => {
+    const memoStore = new Map<string, Record<string, string>>();
+    const runWith = (ui: FakeUiAgent) =>
+      runScenario({
+        scenario: scenario('login once', [
+          callFlow('Login', { role: 'admin' }),
+          Then('the header shows {greeting}'),
+        ]),
+        registry: createFlowRegistry([memoLogin]),
+        uiAgent: ui,
+        generalAgent: new FakeGeneralAgent(),
+        memoStore,
+      });
+
+    const first = new FakeUiAgent(['Hello, Admin!']);
+    expect((await runWith(first)).status).toBe('passed');
+
+    // No scripted aiString results: any flow re-execution would fail.
+    const second = new FakeUiAgent();
+    const replayed = await runWith(second);
+    expect(replayed.status).toBe('passed');
+    expect(second.actCalls).toEqual([]);
+    expect(second.stringCalls).toEqual([]);
+    expect(replayed.variables.greeting).toBe('Hello, Admin!');
+  });
+
+  it('defaults to a per-call store (no sharing across runScenario calls)', async () => {
+    const loginOnce = scenario('login once', [
+      callFlow('Login', { role: 'admin' }),
+    ]);
+    const first = new FakeUiAgent(['Hello, Admin!']);
+    await run(loginOnce, { flows: [memoLogin], ui: first });
+
+    const second = new FakeUiAgent(['Hello, Admin!']);
+    const { result } = await run(loginOnce, { flows: [memoLogin], ui: second });
+    expect(result.status).toBe('passed');
+    // Without a shared store the flow executed again.
+    expect(second.actCalls).toContain('sign in as the "admin" user');
+  });
+
+  it('never memoizes a failed flow run', async () => {
+    const guard = defineFlow({
+      name: 'Guard',
+      params: [],
+      returns: [],
+      memo: 'once-per-run',
+      steps: [When('prepare'), Then('precondition holds')],
+    });
+    const memoStore = new Map<string, Record<string, string>>();
+    const runWith = (ui: FakeUiAgent, general: FakeGeneralAgent) =>
+      runScenario({
+        scenario: scenario('guarded', [callFlow('Guard')]),
+        registry: createFlowRegistry([guard]),
+        uiAgent: ui,
+        generalAgent: general,
+        memoStore,
+      });
+
+    const failing = await runWith(
+      new FakeUiAgent(),
+      new FakeGeneralAgent(() => ({
+        text: 'nope',
+        verdict: { pass: false, reason: 'not ready' },
+      })),
+    );
+    expect(failing.status).toBe('failed');
+
+    // The failure was not cached: the next run re-executes the flow.
+    const retryUi = new FakeUiAgent();
+    const retried = await runWith(retryUi, new FakeGeneralAgent());
+    expect(retried.status).toBe('passed');
+    expect(retryUi.actCalls).toEqual(['prepare']);
+  });
+
+  it('flows without memo always execute, even with a shared store', async () => {
+    const ui = new FakeUiAgent(['Hello, Admin!', 'Hello, Admin!']);
+    const { result } = await run(
+      scenario('no memo', [
+        callFlow('Login', { role: 'admin' }),
+        callFlow('Login', { role: 'admin' }),
+      ]),
+      { flows: [loginFlow], ui },
+    );
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toHaveLength(4);
+    expect(ui.stringCalls).toHaveLength(2);
+  });
+});
+
 describe('runScenario: call-depth cap', () => {
   const leaf: FlowDefIR = {
     name: 'Leaf',
@@ -330,9 +491,8 @@ describe('runScenario: observability events', () => {
         { vars: { whoami: 'admin' } },
       ),
       registry: createFlowRegistry([loginFlow]),
-      uiAgent: ui.asAgent(),
+      uiAgent: ui,
       generalAgent: new FakeGeneralAgent(),
-      env: {},
       onEvent: (e) => {
         switch (e.type) {
           case 'stepStart':

From 53c6a5c2554ff5ca78472addd0f394b240946069 Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Wed, 10 Jun 2026 03:54:50 +0200
Subject: [PATCH 8/9] feat(testing-framework): restructure example into a
 multi-file suite with three style folders

- example/ now shows one realistic suite authored three interchangeable
  ways: style-1-gherkin (shared flows/*.feature + independent feature
  modules), style-2-js (shared defineFlow module + per-module *.flows.ts),
  style-3-overlay (sparse bindFeature patch over style-1's checkout.feature)
- add compileSuite() to the gherkin front-end: glob a suite directory (or
  file list), merge all @flow definitions into one registry, fail loudly on
  duplicate flow names across files
- add a second shared flow ("Add product to cart") and cart-inspection
  scenarios; extend demo-app with a second product, quantity controls and a
  header cart badge; scripted agents cover the new steps
- demo runs the suite module-by-module per style, narrating each source
  file, and keeps the Gherkin-vs-JS trace parity proof and the overlay diff
- rich first-reader comments per style (what flows, captures and overlays
  are); example/README.md is the orientation point; POC-GHERKIN.md updated
---
 packages/testing-framework/POC-GHERKIN.md     | 132 +++++++-----
 packages/testing-framework/example/README.md  | 126 +++++++----
 .../example/demo-app/index.html               |  77 +++++--
 .../example/flows/shop.feature                |  42 ----
 .../example/flows/shop.flows.ts               |  69 ------
 .../example/flows/shop.overlay.ts             |  38 ----
 .../style-1-gherkin/features/cart.feature     |  30 +++
 .../style-1-gherkin/features/checkout.feature |  22 ++
 .../style-1-gherkin/features/smoke.feature    |  18 ++
 .../style-1-gherkin/flows/add-to-cart.feature |  12 ++
 .../style-1-gherkin/flows/login.feature       |  35 ++++
 .../example/style-2-js/features/cart.flows.ts |  37 ++++
 .../style-2-js/features/checkout.flows.ts     |  30 +++
 .../style-2-js/features/smoke.flows.ts        |  28 +++
 .../example/style-2-js/flows/index.ts         |  65 ++++++
 .../style-3-overlay/checkout.overlay.ts       |  62 ++++++
 .../testing-framework/scripts/demo/main.ts    | 198 +++++++++++-------
 .../scripts/demo/scripted-agents.ts           | 109 ++++++++--
 .../src/frontends/gherkin/index.ts            |   6 +
 .../src/frontends/gherkin/suite.ts            |  84 ++++++++
 packages/testing-framework/src/index.ts       |  12 +-
 .../tests/unit-test/example-parity.test.ts    | 195 ++++++++++-------
 .../tests/unit-test/suite.test.ts             | 104 +++++++++
 23 files changed, 1101 insertions(+), 430 deletions(-)
 delete mode 100644 packages/testing-framework/example/flows/shop.feature
 delete mode 100644 packages/testing-framework/example/flows/shop.flows.ts
 delete mode 100644 packages/testing-framework/example/flows/shop.overlay.ts
 create mode 100644 packages/testing-framework/example/style-1-gherkin/features/cart.feature
 create mode 100644 packages/testing-framework/example/style-1-gherkin/features/checkout.feature
 create mode 100644 packages/testing-framework/example/style-1-gherkin/features/smoke.feature
 create mode 100644 packages/testing-framework/example/style-1-gherkin/flows/add-to-cart.feature
 create mode 100644 packages/testing-framework/example/style-1-gherkin/flows/login.feature
 create mode 100644 packages/testing-framework/example/style-2-js/features/cart.flows.ts
 create mode 100644 packages/testing-framework/example/style-2-js/features/checkout.flows.ts
 create mode 100644 packages/testing-framework/example/style-2-js/features/smoke.flows.ts
 create mode 100644 packages/testing-framework/example/style-2-js/flows/index.ts
 create mode 100644 packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
 create mode 100644 packages/testing-framework/src/frontends/gherkin/suite.ts
 create mode 100644 packages/testing-framework/tests/unit-test/suite.test.ts

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
index fd261fa351..2221ad5632 100644
--- a/packages/testing-framework/POC-GHERKIN.md
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -14,31 +14,40 @@ step is natural language executed by the AI agents. A third, **hybrid** mode
 pnpm --filter @midscene/testing-framework demo
 ```
 
-Runs the login/checkout journey through **all three authoring modes** with a
-narrated walkthrough — offline by default (scripted fake agents simulate the
-shop; no model keys, no browser). Expected output (excerpt):
+Runs the multi-file example suite (see "Example" below) through **all three
+authoring styles**, module by module, with a narrated walkthrough — offline
+by default (scripted fake agents simulate the shop; no model keys, no
+browser). Expected output (excerpt):
 
 ```
-━━━ Mode 1/3: Pure Gherkin ━━━
-  ▶ Scenario: Checkout as admin
-    [ui]      the demo shop is open on the home page
-      → flow Login(role="admin")
-      [ui]      I sign in as the "admin" user with the saved test credentials   (template: "I sign in as the \"{role}\" user ...")
-      [capture] the greeting message shown in the header
-        {greeting} = "Hello, Admin!" (capture)
-      ← Login returned greeting="Hello, Admin!"
-    [verify]  the cart total equals $129.00   (template: "the cart total equals {price}")
-      ✔ PASS — The cart shows $129.00, matching the remembered price.
-    ✔ scenario passed
+━━━ Mode 1/3: Style 1 — pure Gherkin ━━━
+
+  ▣ Module: style-1-gherkin/features/checkout.feature
+    ▶ Scenario: Checkout as admin
+      [ui]      the demo shop is open on the home page
+        → flow Login(role="admin")
+        [ui]      I sign in as the "admin" user with the saved test credentials   (template: "I sign in as the \"{role}\" user ...")
+        [capture] the greeting message shown in the header
+          {greeting} = "Hello, Admin!" (capture)
+        ← Login returned greeting="Hello, Admin!"
+        → flow Add product to cart(product="Trail Backpack")
+        ...
+      [verify]  the cart total equals $129.00   (template: "the cart total equals {price}")
+        ✔ PASS — The cart shows $129.00, matching the remembered price.
+      ✔ scenario passed
+
+  ▣ Module: style-1-gherkin/flows/login.feature
+    registers shared flow: "Login"
+    (no runnable scenarios — flows only)
 ...
-━━━ Comparison: three modes, one IR ━━━
-  Gherkin vs JS — "Checkout as admin": identical execution trace ✔ (24 events)
-  Bound overlay vs pure Gherkin:
+━━━ Comparison: three styles, one IR ━━━
+  ...cart.feature vs ...cart.flows.ts — "Cart shows the added product with quantity and price": identical execution trace ✔ (30 events)
+  Style 3 overlay vs the style-1 checkout.feature it binds:
     "Checkout as admin":
       - [verify] the cart total equals {price}
       + [ui] apply the coupon code {couponCode} in the cart
       + [soft] the cart total equals {price} minus the "{couponCode}" coupon discount
-      + injected var {couponCode} = "E2E-2026-06-09"
+      + injected var {couponCode} = "E2E-2026-06-10"
 ```
 
 **Live mode** — `pnpm --filter @midscene/testing-framework demo -- --live`
@@ -208,6 +217,14 @@ free. Conventions on top:
   `@returns:greeting`. Background steps are excluded from `@flow` pickles so
   a reusable flow never replays the feature's setup.
 
+**Multi-file suites** (`suite.ts`): real suites keep shared flows in their
+own `.feature` files and call them from separate test modules.
+`compileSuite(dirOrFiles)` compiles every `.feature` under a directory (or
+an explicit file list), merges ALL `@flow` definitions into **one**
+`FlowRegistry` — duplicate flow names across files throw, naming both
+definition sites — and returns the compiled modules so each module's
+scenarios run against the shared registry. Flow names are suite-global.
+
 ## Hybrid mode: `bindFeature` (`src/frontends/js/bind-feature.ts`)
 
 Modeled on jest-cucumber's inverted binding (JS attaches to a loaded
@@ -220,13 +237,13 @@ scenarios/steps run as pure Gherkin, no restatement required.
 ```ts
 import { bindFeature } from '@midscene/testing-framework';
 
-const bound = bindFeature('flows/shop.feature', {
+const bound = bindFeature('features/checkout.feature', {
   scenarios: {
     'Checkout as admin': {
       vars: { couponCode: computeCoupon() },          // inject computed variables
       steps: [
         {
-          at: 'I add the "Trail Backpack" to the cart and open the cart',
+          at: 'Add product to cart',                  // a flow call, anchored by name
           after: ['apply the coupon code {couponCode} in the cart'], // insert
         },
         {
@@ -259,7 +276,7 @@ scenario or step throws an error that names the closest match
 real anchor — jest-cucumber's best trick, applied to a sparse overlay:
 
 ```
-[midscene] bindFeature(shop.feature): scenario "Checkout as admin" has no
+[midscene] bindFeature(checkout.feature): scenario "Checkout as admin" has no
 step matching anchor "the cart total equals {prce}".
 Did you mean "the cart total equals {price}"?
 Available anchors:
@@ -280,41 +297,60 @@ scenarios: {
 | Pure JS (`defineFlow`/`scenario`) | The suite is generated or heavily dynamic (loops, conditionals, computed prompts); no BDD stakeholders. |
 | Bound overlay (`bindFeature`) | Gherkin is the shared source of truth, but a few scenarios need computed variables, env-specific arg tweaks, inserted steps, or skip/only flags — without forking the feature file or restating it in JS. |
 
-## Example
-
-`example/flows/shop.feature` and `example/flows/shop.flows.ts` author the
-same suite — a `Login` flow reused by a checkout scenario, a `@soft` promo
-check, and a per-role login matrix (Scenario Outline vs `roles.map(...)`).
-The test `tests/unit-test/example-parity.test.ts` proves both compile to the
-same IR and produce identical execution traces (same prompts to the UI
-agent, same verify prompts to the general agent, same final variable table)
-through the shared executor.
+## Example: one suite, three style folders
+
+The example (`example/`, orientation in `example/README.md`) is a
+**multi-file suite** authored three times — one folder per style, all
+running against the static shop in `example/demo-app/`:
+
+```text
+example/
+  style-1-gherkin/          # pure Gherkin
+    flows/                  #   SHARED flow definitions (@flow scenarios):
+      login.feature         #     "Login" (@param:role @returns:greeting)
+      add-to-cart.feature   #     "Add product to cart" (@param:product @returns:price)
+    features/               #   independent test modules that CALL the shared
+      cart.feature          #   flows without defining them (cross-file
+      checkout.feature      #   resolution via compileSuite's merged registry)
+      smoke.feature
+  style-2-js/               # the SAME suite in the fluent JS API
+    flows/index.ts          #   defineFlow() twins + the shared registry
+    features/*.flows.ts     #   one module per .feature twin
+  style-3-overlay/          # hybrid: binds style-1's checkout.feature
+    checkout.overlay.ts     #   sparse patch (computed coupon, soft override,
+                            #   skip) — nothing duplicated from the .feature
+```
 
-`example/flows/shop.overlay.ts` shows the hybrid mode on the same feature: a
-computed coupon code injected into the checkout scenario's variable table,
-an inserted "apply the coupon" step that uses it, the exact-total verify
-downgraded to a reworded soft check, and the promo scenario skipped — while
-the login-matrix scenarios stay untouched pure Gherkin.
+The reuse story is the point: flows are written once (login,
+add-to-cart) and composed by every test module — the cart module inspects
+quantities/badges, the checkout module asserts totals, the smoke module is
+a per-role login matrix (Scenario Outline vs `roles.map(...)`). The tests
+in `tests/unit-test/example-parity.test.ts` prove styles 1 and 2 compile to
+the same IR and produce identical execution traces (same prompts to the UI
+agent, same verify prompts to the general agent, same final variable
+table); `tests/unit-test/suite.test.ts` covers `compileSuite` assembly
+(cross-file flow calls, duplicate-name errors).
 
 Run programmatically (no CLI wiring yet):
 
 ```ts
-import { compileFeatureFile, createFlowRegistry, runScenario } from '@midscene/testing-framework';
+import { compileSuite, runScenario } from '@midscene/testing-framework';
 
-const { scenarios, flows } = compileFeatureFile('flows/shop.feature');
-const registry = createFlowRegistry(flows);
-for (const s of scenarios) {
-  const result = await runScenario({ scenario: s, registry, uiAgent, generalAgent });
+const { modules, registry } = compileSuite('example/style-1-gherkin');
+for (const { feature } of modules) {
+  for (const s of feature.scenarios) {
+    const result = await runScenario({ scenario: s, registry, uiAgent, generalAgent });
+  }
 }
 ```
 
 ## Validation
 
-- `pnpm --filter @midscene/testing-framework test` — 119 tests, all green
+- `pnpm --filter @midscene/testing-framework test` — 125 tests, all green
   (across `flow-ir.test.ts`, `js-frontend.test.ts`,
-  `gherkin-frontend.test.ts`, `run-scenario.test.ts`, `bind-feature.test.ts`,
-  `example-parity.test.ts` and the Phase 0 suites; fakes only, no browsers /
-  no model calls).
+  `gherkin-frontend.test.ts`, `suite.test.ts`, `run-scenario.test.ts`,
+  `bind-feature.test.ts`, `example-parity.test.ts` and the Phase 0 suites;
+  fakes only, no browsers / no model calls).
 
 ## Open questions / next steps
 
@@ -331,9 +367,11 @@ for (const s of scenarios) {
 - **Flow-call reporting**: inner flow steps are flattened into the case's
   step list after an `info` "Entering flow …" marker; reports may want a
   nested view instead.
-- **Cross-file flow registries**: today a registry is built per
-  feature/module; decide on project-level registration (config field, glob
-  for `*.flows.ts`, shared between Gherkin and JS suites).
+- **Cross-file flow registries**: `compileSuite` solves this for Gherkin
+  (one merged registry per suite directory); JS suites share a registry by
+  importing one module. Still open: a *mixed* project-level registry
+  (config field or glob that merges `.feature` @flows AND `*.flows.ts`
+  definitions into one registry for both surfaces).
 - **Gherkin arg syntax**: the `with key "value" and key "value"` convention
   is regex-based; data tables (`PickleStepArgument`) would be a more
   Gherkin-native way to pass args (and to seed variables).
diff --git a/packages/testing-framework/example/README.md b/packages/testing-framework/example/README.md
index 637e075344..b47677ce08 100644
--- a/packages/testing-framework/example/README.md
+++ b/packages/testing-framework/example/README.md
@@ -1,55 +1,99 @@
-# Midscene v2 Testing Framework — Example
+# Midscene v2 Testing Framework — Examples
 
-A self-contained demo of [`@midscene/testing-framework`](..)
-(the AI-native v2 UI testing framework, Phase 0). Copy this folder out, install,
-set your model env vars, and run.
+Two related examples live here:
 
-## What it shows
+1. **Three authoring styles, one test suite** (`style-1-gherkin/`,
+   `style-2-js/`, `style-3-overlay/`) — the flow-IR POC. **Start here.**
+2. A copy-out **YAML runner** demo (`e2e/` + `midscene.config.ts`) — the
+   Phase 0 node engine. See [below](#the-phase-0-yaml-runner-example).
 
-- A **config-style** `uiAgent` (web) in `midscene.config.ts` — environment lives
-  in config, never in the case YAML.
-- The full node model in `e2e/*.yaml`:
-  - `ui` — natural-language UI actions (run by Midscene's UI Agent)
-  - `verify` — gating judgment with a forced pass/fail verdict
-  - `soft` — non-gating soft assertion (failure → warning only)
-  - `agent` — advisory free exploration (never gates)
-  - custom **runtime** nodes (`prepareCartFixture`, `notify`) via `defineRuntime`
-- A `$name` **skill** reference (`$catalog`) backed by `skills/catalog/SKILL.md`.
-- The **output contract**: steps record natural-language conclusions that later
-  `verify` / `agent` nodes reference by name.
+## Three interchangeable styles of the SAME suite
 
-## Run it
+The style folders author the **same multi-file test suite** for the static
+shop in `demo-app/`. They are *alternative surfaces*, not different suites:
+all three compile to one shared intermediate representation (flow-IR) and
+run through the same executor, so you pick a style per team — or mix them —
+without changing semantics. No step-definition code exists anywhere; every
+step is natural language executed by AI agents.
 
-```bash
-# 1. install
-pnpm install         # or npm install / yarn
+| Folder | Style | Read this first | Choose it when |
+| --- | --- | --- | --- |
+| `style-1-gherkin/` | Pure Gherkin `.feature` files | `flows/login.feature` | Non-engineers own the suite; specs are the shared language. |
+| `style-2-js/` | Pure JS/TS fluent API | `flows/index.ts` | The suite is generated or heavily dynamic (loops, computed prompts). |
+| `style-3-overlay/` | Gherkin source of truth + sparse JS overlay | `checkout.overlay.ts` | Gherkin stays canonical, but a few scenarios need computed values or env tweaks. Binds **style 1's** feature files — nothing is duplicated. |
+
+Inside each style the layout shows real-world modular reuse:
 
-# 2. configure the model (UI Agent + Pi share one endpoint)
-cp .env.example .env # then edit, or export the vars in your shell
+```text
+style-1-gherkin/
+  flows/                 # SHARED flow definitions (@flow scenarios)
+    login.feature        #   "Login"  — params/returns declared as tags
+    add-to-cart.feature  #   "Add product to cart"
+  features/              # independent test modules; they CALL the shared
+    cart.feature         # flows but do not define them
+    checkout.feature
+    smoke.feature
 
-# 3. run all cases
-pnpm test
+style-2-js/
+  flows/index.ts         # the same two flows, declared with defineFlow()
+  features/              # one module per .feature twin
+    cart.flows.ts
+    checkout.flows.ts
+    smoke.flows.ts
 
-# run a single case
-pnpm test:one
+style-3-overlay/
+  checkout.overlay.ts    # sparse patch over style-1's checkout.feature
 ```
 
-By default the demo runs against the bundled static page in `site/index.html`
-(offline). Set `DEMO_URL` to point at your own app.
+Cross-file resolution is the suite's job, not the file's: `compileSuite()`
+compiles every `.feature` under a directory and merges all `@flow`
+definitions into **one registry** (duplicate flow names across files fail
+loudly), then each module's scenarios run against it. The JS side gets the
+same effect by importing the shared registry from `flows/index.ts`.
 
-Results are written to `midscene_run/output/summary.json`, and Midscene HTML
-reports for the UI steps land in `midscene_run/report/`.
+Key concepts, explained in context in the "read this first" files:
 
-## Layout
+- **Flow** — a named, parameterized, reusable prompt sequence. It runs in a
+  fresh variable scope (only declared params are visible), and only the
+  declared `returns` flow back to the caller.
+- **Capture / `remember`** — the UI agent extracts a value from the screen
+  into a machine-owned variable table; later prompts use `{name}`
+  placeholders that are substituted mechanically *before* any model sees
+  the text.
+- **Keyword mapping** — Given/When → UI actions; Then → fail-closed
+  `verify` (a general agent must report a pass/fail verdict); `@soft` /
+  `Soft()` → warn-only checks.
 
-```text
-.
-  midscene.config.ts     # uiAgent + discovery + runtime nodes
-  e2e/
-    product-detail.yaml   # ui + verify + soft + agent
-    add-to-cart.yaml      # custom node + $catalog skill + verify + agent + notify
-  skills/
-    catalog/SKILL.md      # a $name skill (Pi discovers/loads it)
-  site/
-    index.html            # tiny static demo app
+### Run it
+
+```bash
+pnpm --filter @midscene/testing-framework demo            # offline, no keys
+pnpm --filter @midscene/testing-framework demo -- --live  # real browser+model
+```
+
+The demo runs the suite module-by-module in all three styles, narrates each
+prompt/variable/verdict, and proves the styles are equivalent by comparing
+execution traces. See `../POC-GHERKIN.md` for the full design.
+
+## The Phase 0 YAML runner example
+
+A self-contained demo of the YAML node engine: copy this folder out,
+install, set model env vars, and run.
+
+- A **config-style** `uiAgent` (web) in `midscene.config.ts`.
+- The full node model in `e2e/*.yaml`: `ui`, `verify`, `soft`, `agent`,
+  plus custom **runtime** nodes (`prepareCartFixture`, `notify`) via
+  `defineRuntime`.
+- A `$name` **skill** reference (`$catalog`) backed by
+  `skills/catalog/SKILL.md`.
+
+```bash
+pnpm install
+cp .env.example .env   # or export MIDSCENE_MODEL_* in your shell
+pnpm test              # midscene-tf run
+pnpm test:one          # single case
 ```
+
+By default it runs against the bundled static page in `site/index.html`;
+set `DEMO_URL` to point at your own app. Results land in
+`midscene_run/output/summary.json`, HTML reports in `midscene_run/report/`.
diff --git a/packages/testing-framework/example/demo-app/index.html b/packages/testing-framework/example/demo-app/index.html
index 8da4189e2e..7091f5af7b 100644
--- a/packages/testing-framework/example/demo-app/index.html
+++ b/packages/testing-framework/example/demo-app/index.html
@@ -9,6 +9,7 @@
       body { margin: 0; color: #1f2933; }
       header { display: flex; justify-content: space-between; align-items: center; padding: 16px 24px; background: #0b5fff; color: #fff; }
       header button { padding: 8px 14px; border: none; border-radius: 6px; cursor: pointer; }
+      header .right { display: flex; align-items: center; gap: 12px; }
       main { padding: 24px; max-width: 640px; margin: 0 auto; }
       section { display: none; }
       section.active { display: block; }
@@ -17,6 +18,8 @@
       button.primary { padding: 12px 20px; background: #0b5fff; color: #fff; border: none; border-radius: 6px; font-size: 16px; cursor: pointer; }
       select, input { padding: 10px; border: 1px solid #cbd2d9; border-radius: 6px; margin-right: 8px; }
       .row { display: flex; align-items: center; gap: 8px; margin: 12px 0; }
+      .qty { display: inline-flex; align-items: center; gap: 8px; }
+      .qty button { width: 32px; height: 32px; border: 1px solid #cbd2d9; background: #fff; border-radius: 6px; cursor: pointer; font-size: 16px; }
       #cartTotal { font-size: 20px; font-weight: 700; }
       .muted { color: #52606d; font-size: 14px; }
     </style>
@@ -24,19 +27,26 @@
   <body>
     <header>
       <strong>Midscene POC Shop</strong>
-      <div>
+      <div class="right">
         <span id="greeting" hidden></span>
+        <span id="cartBadge">Cart: 0 items</span>
         <button id="navLogin">Login</button>
       </div>
     </header>
     <main>
       <section id="home" class="active">
         <h2>Welcome to the POC shop</h2>
-        <div class="card" id="productCard">
+        <div class="card">
           <h3>Trail Backpack</h3>
-          <div class="price" id="productPrice">$129.00</div>
+          <div class="price">$129.00</div>
           <p class="muted">Lightweight 28L pack for day hikes.</p>
-          <button class="primary" id="addToCart">Add to cart</button>
+          <button class="primary" data-add="Trail Backpack">Add to cart</button>
+        </div>
+        <div class="card">
+          <h3>Camp Mug</h3>
+          <div class="price">$24.50</div>
+          <p class="muted">Enamel mug that survives the campfire.</p>
+          <button class="primary" data-add="Camp Mug">Add to cart</button>
         </div>
         <button id="openCart">Open cart</button>
       </section>
@@ -60,9 +70,7 @@ <h2 id="dashboardTitle"></h2>
 
       <section id="cart">
         <h2>Your cart</h2>
-        <div class="card" id="cartLine" hidden>
-          <span>Trail Backpack</span> — <span class="price">$129.00</span>
-        </div>
+        <div id="cartLines"></div>
         <div class="row">
           <input id="coupon" placeholder="Coupon code" />
           <button id="applyCoupon">Apply coupon</button>
@@ -74,16 +82,38 @@ <h2>Your cart</h2>
     </main>
 
     <script>
-      const state = { role: null, inCart: false, discount: 0 };
-      const price = 129.0;
+      const PRICES = { 'Trail Backpack': 129.0, 'Camp Mug': 24.5 };
+      const state = { role: null, items: {}, discount: 0 };
       const show = (id) => {
         document
           .querySelectorAll('section')
           .forEach((s) => s.classList.toggle('active', s.id === id));
       };
-      const renderTotal = () => {
-        const total = state.inCart ? price * (1 - state.discount) : 0;
-        document.getElementById('cartTotal').textContent = `$${total.toFixed(2)}`;
+      const money = (n) => `$${n.toFixed(2)}`;
+      const itemCount = () =>
+        Object.values(state.items).reduce((sum, qty) => sum + qty, 0);
+      const render = () => {
+        const lines = document.getElementById('cartLines');
+        lines.innerHTML = '';
+        let total = 0;
+        for (const [name, qty] of Object.entries(state.items)) {
+          if (qty <= 0) continue;
+          total += PRICES[name] * qty;
+          const line = document.createElement('div');
+          line.className = 'card row';
+          line.innerHTML =
+            `<span>${name}</span>` +
+            `<span class="qty"><button data-dec="${name}">−</button>` +
+            `<span>Qty: <strong>${qty}</strong></span>` +
+            `<button data-inc="${name}">+</button></span>` +
+            `<span class="price">${money(PRICES[name] * qty)}</span>`;
+          lines.appendChild(line);
+        }
+        total *= 1 - state.discount;
+        document.getElementById('cartTotal').textContent = money(total);
+        const n = itemCount();
+        document.getElementById('cartBadge').textContent =
+          `Cart: ${n} item${n === 1 ? '' : 's'}`;
       };
       document.getElementById('navLogin').addEventListener('click', () => show('login'));
       document.getElementById('signIn').addEventListener('click', () => {
@@ -95,13 +125,24 @@ <h2>Your cart</h2>
         document.getElementById('dashboardTitle').textContent = `Dashboard (${state.role})`;
         show('dashboard');
       });
-      document.getElementById('addToCart').addEventListener('click', () => {
-        state.inCart = true;
-        document.getElementById('cartLine').hidden = false;
-        renderTotal();
+      document.body.addEventListener('click', (event) => {
+        const t = event.target;
+        if (!(t instanceof HTMLElement)) return;
+        if (t.dataset.add) {
+          state.items[t.dataset.add] = (state.items[t.dataset.add] ?? 0) + 1;
+          render();
+        }
+        if (t.dataset.inc) {
+          state.items[t.dataset.inc] += 1;
+          render();
+        }
+        if (t.dataset.dec) {
+          state.items[t.dataset.dec] = Math.max(0, state.items[t.dataset.dec] - 1);
+          render();
+        }
       });
       document.getElementById('openCart').addEventListener('click', () => {
-        renderTotal();
+        render();
         show('cart');
       });
       document.getElementById('applyCoupon').addEventListener('click', () => {
@@ -111,7 +152,7 @@ <h2>Your cart</h2>
         const note = document.getElementById('couponNote');
         note.textContent = `Coupon "${code}" applied: 10% off.`;
         note.hidden = false;
-        renderTotal();
+        render();
       });
       for (const id of ['backHome', 'backHome2']) {
         document.getElementById(id).addEventListener('click', () => show('home'));
diff --git a/packages/testing-framework/example/flows/shop.feature b/packages/testing-framework/example/flows/shop.feature
deleted file mode 100644
index 9b1c456d47..0000000000
--- a/packages/testing-framework/example/flows/shop.feature
+++ /dev/null
@@ -1,42 +0,0 @@
-# POC: Gherkin front-end over the shared flow-IR.
-# Compile with `compileFeatureFile(...)` and execute with `runScenario(...)`.
-# The same flows + scenarios are authored in JS in ./shop.flows.ts.
-Feature: Checkout with a reusable login flow
-
-  Background:
-    Given the demo shop is open on the home page
-
-  # A named flow: registered in the FlowRegistry instead of run as a scenario.
-  # Params/returns are declared as tags; "{role}" is substituted mechanically
-  # from the caller's arguments before any prompt reaches the model.
-  @flow @param:role @returns:greeting
-  Scenario: Login
-    When I open the login page
-    And I sign in as the "{role}" user with the saved test credentials
-    Then the dashboard for the "{role}" role is visible
-    When I remember the greeting message shown in the header as "greeting"
-
-  Scenario: Checkout as admin
-    When I run the "Login" flow with role "admin"
-    And I go back to the shop home page
-    And I remember the price of the "Trail Backpack" product as "price"
-    When I add the "Trail Backpack" to the cart and open the cart
-    Then the cart total equals {price}
-    But the cart does not show any error banner
-
-  # @soft turns Then steps into soft nodes: failures warn, never gate.
-  @soft
-  Scenario: Promo banner is advisory
-    Then a promo banner is visible at the top of the page
-
-  # Scenario Outline examples are expanded by the Gherkin pickles compiler;
-  # "<role>" is replaced per example row, while "{greeting}" stays a runtime
-  # variable filled by the Login flow's declared return.
-  Scenario Outline: Login greets every role
-    When I run the "Login" flow with role "<role>"
-    Then the header greets the user with {greeting}
-
-    Examples:
-      | role  |
-      | admin |
-      | guest |
diff --git a/packages/testing-framework/example/flows/shop.flows.ts b/packages/testing-framework/example/flows/shop.flows.ts
deleted file mode 100644
index 4855f1408c..0000000000
--- a/packages/testing-framework/example/flows/shop.flows.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * POC: JS/TS front-end over the shared flow-IR — the exact counterpart of
- * ./shop.feature. Both compile to the same IR and run through `runScenario`.
- */
-import {
-  Given,
-  Soft,
-  Then,
-  When,
-  callFlow,
-  createFlowRegistry,
-  defineFlow,
-  feature,
-  remember,
-  scenario,
-} from '@midscene/testing-framework';
-
-// A named flow: parameterized, fresh variable scope inside (only `role` is
-// visible), and only the declared return (`greeting`) flows back to callers.
-export const loginFlow = defineFlow({
-  name: 'Login',
-  params: ['role'],
-  returns: ['greeting'],
-  steps: [
-    When('I open the login page'),
-    When('I sign in as the "{role}" user with the saved test credentials'),
-    Then('the dashboard for the "{role}" role is visible'),
-    remember('the greeting message shown in the header', 'greeting'),
-  ],
-});
-
-const background = Given('the demo shop is open on the home page');
-
-export const checkoutAsAdmin = scenario('Checkout as admin', [
-  background,
-  callFlow('Login', { role: 'admin' }),
-  When('I go back to the shop home page'),
-  remember('the price of the "Trail Backpack" product', 'price'),
-  When('I add the "Trail Backpack" to the cart and open the cart'),
-  Then('the cart total equals {price}'),
-  Then('the cart does not show any error banner'),
-]);
-
-export const promoBanner = scenario('Promo banner is advisory', [
-  background,
-  Soft('a promo banner is visible at the top of the page'),
-]);
-
-// Dynamic authoring: plain JS replaces Scenario Outline examples.
-const roles = ['admin', 'guest'];
-
-// Same { name, scenarios, flows } shape as the Gherkin compiler's output.
-export const shopFeature = feature(
-  'Checkout with a reusable login flow',
-  [
-    checkoutAsAdmin,
-    promoBanner,
-    ...roles.map((role) =>
-      scenario(`Login greets every role (${role})`, [
-        background,
-        callFlow('Login', { role }),
-        Then('the header greets the user with {greeting}'),
-      ]),
-    ),
-  ],
-  [loginFlow],
-);
-
-export const registry = createFlowRegistry(shopFeature.flows);
diff --git a/packages/testing-framework/example/flows/shop.overlay.ts b/packages/testing-framework/example/flows/shop.overlay.ts
deleted file mode 100644
index 8530a554e2..0000000000
--- a/packages/testing-framework/example/flows/shop.overlay.ts
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * POC: hybrid authoring mode — ./shop.feature stays the source of truth,
- * and this sparse overlay attaches JS only where it adds something. Every
- * scenario/step not mentioned here runs as pure Gherkin. Drift between this
- * overlay and the feature fails at bind time with a corrected starter
- * snippet in the error message (jest-cucumber style).
- */
-import { join } from 'node:path';
-import { bindFeature } from '@midscene/testing-framework';
-
-// Computed at bind time — exactly the kind of value Gherkin cannot express.
-const couponCode = `E2E-${new Date().toISOString().slice(0, 10)}`;
-
-export const bound = bindFeature(join(__dirname, 'shop.feature'), {
-  scenarios: {
-    'Checkout as admin': {
-      // (b) inject a computed variable into the scenario's variable table.
-      vars: { couponCode },
-      steps: [
-        {
-          // (c) insert an extra step that uses the injected variable.
-          at: 'I add the "Trail Backpack" to the cart and open the cart',
-          after: ['apply the coupon code {couponCode} in the cart'],
-        },
-        {
-          // (a) override: the total now includes the coupon discount, so
-          // downgrade the exact-total check to a non-gating soft node.
-          at: 'the cart total equals {price}',
-          node: 'soft',
-          template:
-            'the cart total equals {price} minus the "{couponCode}" coupon discount',
-        },
-      ],
-    },
-    // (d) per-scenario config at the IR level.
-    'Promo banner is advisory': { skip: true },
-  },
-});
diff --git a/packages/testing-framework/example/style-1-gherkin/features/cart.feature b/packages/testing-framework/example/style-1-gherkin/features/cart.feature
new file mode 100644
index 0000000000..54a707dc1d
--- /dev/null
+++ b/packages/testing-framework/example/style-1-gherkin/features/cart.feature
@@ -0,0 +1,30 @@
+# An independent test module. It calls the shared "Login" and "Add product
+# to cart" flows (defined under ../flows) and only authors what is specific
+# to cart management. Cross-file resolution is the suite's job: compile all
+# .feature files with `compileSuite`, run each module's scenarios against
+# the merged flow registry.
+#
+# Keyword → runtime mapping (no step definitions anywhere):
+#   Given/When → ui action performed by the Midscene UI Agent
+#   Then       → verify: a general agent must report a pass/fail verdict;
+#                fail (or no verdict) FAILS the scenario (fail-closed)
+#   And/But    → inherit the previous primary keyword
+Feature: Cart management
+
+  Background:
+    Given the demo shop is open on the home page
+
+  Scenario: Cart shows the added product with quantity and price
+    When I run the "Login" flow with role "guest"
+    # The flow's declared return {price} lands in this scenario's variable
+    # table; the Then steps below use it after mechanical substitution.
+    And I run the "Add product to cart" flow with product "Camp Mug"
+    Then the cart lists "Camp Mug" with quantity 1 at {price}
+    And the cart badge in the header shows 1 item
+
+  Scenario: Increasing the quantity updates the total
+    When I run the "Login" flow with role "guest"
+    And I run the "Add product to cart" flow with product "Camp Mug"
+    When I increase the "Camp Mug" quantity in the cart to 2
+    Then the cart total equals twice {price}
+    And the cart badge in the header shows 2 items
diff --git a/packages/testing-framework/example/style-1-gherkin/features/checkout.feature b/packages/testing-framework/example/style-1-gherkin/features/checkout.feature
new file mode 100644
index 0000000000..f15f4c6ae8
--- /dev/null
+++ b/packages/testing-framework/example/style-1-gherkin/features/checkout.feature
@@ -0,0 +1,22 @@
+# The checkout module reuses the same shared flows as cart.feature — flows
+# are written once under ../flows and called everywhere. This file is also
+# the binding target of style 3: ../../style-3-overlay/checkout.overlay.ts
+# patches the admin journey with a computed coupon code WITHOUT this file
+# changing — it stays the single human-readable source of truth.
+Feature: Checkout
+
+  Background:
+    Given the demo shop is open on the home page
+
+  Scenario: Checkout as admin
+    When I run the "Login" flow with role "admin"
+    And I run the "Add product to cart" flow with product "Trail Backpack"
+    Then the cart total equals {price}
+    But the cart does not show any error banner
+
+  # The @soft tag downgrades this scenario's Then steps from verify to soft:
+  # a failed soft check records a warning but never fails the scenario. Use
+  # it for advisory checks that should not gate a run.
+  @soft
+  Scenario: Promo banner is advisory
+    Then a promo banner is visible at the top of the page
diff --git a/packages/testing-framework/example/style-1-gherkin/features/smoke.feature b/packages/testing-framework/example/style-1-gherkin/features/smoke.feature
new file mode 100644
index 0000000000..e7bb33b06d
--- /dev/null
+++ b/packages/testing-framework/example/style-1-gherkin/features/smoke.feature
@@ -0,0 +1,18 @@
+# Smoke module: a login matrix over the shared "Login" flow. Scenario
+# Outline examples are expanded at compile time by the Gherkin pickles
+# compiler — "<role>" is replaced per example row. Note the two kinds of
+# placeholders: "<role>" is compile-time (Gherkin examples), "{greeting}"
+# is runtime (filled by the Login flow's declared return when it executes).
+Feature: Login smoke
+
+  Background:
+    Given the demo shop is open on the home page
+
+  Scenario Outline: Login greets every role
+    When I run the "Login" flow with role "<role>"
+    Then the header greets the user with {greeting}
+
+    Examples:
+      | role  |
+      | admin |
+      | guest |
diff --git a/packages/testing-framework/example/style-1-gherkin/flows/add-to-cart.feature b/packages/testing-framework/example/style-1-gherkin/flows/add-to-cart.feature
new file mode 100644
index 0000000000..4a2c3c8b15
--- /dev/null
+++ b/packages/testing-framework/example/style-1-gherkin/flows/add-to-cart.feature
@@ -0,0 +1,12 @@
+# A second shared flow (see login.feature for the full concept notes).
+# The cart and checkout test modules both compose this with "Login" —
+# neither module defines either flow. Its declared return {price} is how a
+# value observed mid-flow (the product's price on the shop page) travels
+# back to the calling scenario for later assertions.
+Feature: Shared cart flows
+
+  @flow @param:product @returns:price
+  Scenario: Add product to cart
+    When I go to the shop home page
+    And I remember the price of the "{product}" product as "price"
+    When I add the "{product}" to the cart and open the cart
diff --git a/packages/testing-framework/example/style-1-gherkin/flows/login.feature b/packages/testing-framework/example/style-1-gherkin/flows/login.feature
new file mode 100644
index 0000000000..b8a855afe2
--- /dev/null
+++ b/packages/testing-framework/example/style-1-gherkin/flows/login.feature
@@ -0,0 +1,35 @@
+# READ THIS FIRST (style 1: pure Gherkin).
+#
+# This suite has no step-definition code anywhere: every step is natural
+# language executed by AI agents (Given/When → UI actions performed by the
+# Midscene UI Agent; Then → a fail-closed verify judgment by a general
+# agent). What WOULD be helper code in classic Cucumber becomes a FLOW:
+# a named, parameterized, reusable prompt sequence.
+#
+# A Scenario tagged @flow is not a runnable test — it is registered in the
+# suite-wide flow registry under its title ("Login"). Any feature file in
+# the suite can call it ("I run the "Login" flow with role "admin"") without
+# knowing where it is defined: `compileSuite` compiles every .feature file
+# under this folder and merges all @flow definitions into one registry
+# (duplicate flow names across files fail loudly).
+#
+# Flows are scoped like functions, not macros:
+#   - @param:role     declares an argument; the flow runs in a FRESH variable
+#     scope seeded only with its declared params (caller variables are not
+#     visible inside).
+#   - @returns:greeting declares which captured variables flow back into the
+#     caller's scope when the flow finishes. Everything else is discarded.
+Feature: Shared login flow
+
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I open the login page
+    # "{role}" is a machine-owned variable placeholder. It is substituted
+    # mechanically from the caller's arguments BEFORE the prompt reaches any
+    # model — the model only ever sees the resolved text.
+    And I sign in as the "{role}" user with the saved test credentials
+    Then the dashboard for the "{role}" role is visible
+    # "I remember … as "x"" is a CAPTURE step: the UI agent extracts the
+    # value from the screen into the variable table (machine-owned, never
+    # model prose). Later steps reference it as {greeting}.
+    When I remember the greeting message shown in the header as "greeting"
diff --git a/packages/testing-framework/example/style-2-js/features/cart.flows.ts b/packages/testing-framework/example/style-2-js/features/cart.flows.ts
new file mode 100644
index 0000000000..8a45132115
--- /dev/null
+++ b/packages/testing-framework/example/style-2-js/features/cart.flows.ts
@@ -0,0 +1,37 @@
+/**
+ * Twin of style-1-gherkin/features/cart.feature: an independent test module
+ * that composes the shared Login and Add-product-to-cart flows without
+ * defining either (see ../flows for the flow definitions and concept notes).
+ *
+ * `callFlow('Add product to cart', …)` runs the flow's steps in a fresh
+ * scope and copies its declared return — {price} — back into this
+ * scenario's variable table, where the Then assertions use it.
+ */
+import {
+  Given,
+  Then,
+  callFlow,
+  feature,
+  scenario,
+} from '@midscene/testing-framework';
+
+const background = Given('the demo shop is open on the home page');
+
+export const cartFeature = feature('Cart management', [
+  scenario('Cart shows the added product with quantity and price', [
+    background,
+    callFlow('Login', { role: 'guest' }),
+    callFlow('Add product to cart', { product: 'Camp Mug' }),
+    Then('the cart lists "Camp Mug" with quantity 1 at {price}'),
+    Then('the cart badge in the header shows 1 item'),
+  ]),
+  scenario('Increasing the quantity updates the total', [
+    background,
+    callFlow('Login', { role: 'guest' }),
+    callFlow('Add product to cart', { product: 'Camp Mug' }),
+    // A bare string is shorthand for When(...) — a plain UI action.
+    'I increase the "Camp Mug" quantity in the cart to 2',
+    Then('the cart total equals twice {price}'),
+    Then('the cart badge in the header shows 2 items'),
+  ]),
+]);
diff --git a/packages/testing-framework/example/style-2-js/features/checkout.flows.ts b/packages/testing-framework/example/style-2-js/features/checkout.flows.ts
new file mode 100644
index 0000000000..a1c1728c75
--- /dev/null
+++ b/packages/testing-framework/example/style-2-js/features/checkout.flows.ts
@@ -0,0 +1,30 @@
+/**
+ * Twin of style-1-gherkin/features/checkout.feature, reusing the cart
+ * module's shared flows (called by name, not imported — see ../flows).
+ * `Soft(...)` is the JS spelling of the @soft tag: the check warns on
+ * failure but never fails the scenario.
+ */
+import {
+  Given,
+  Soft,
+  Then,
+  callFlow,
+  feature,
+  scenario,
+} from '@midscene/testing-framework';
+
+const background = Given('the demo shop is open on the home page');
+
+export const checkoutFeature = feature('Checkout', [
+  scenario('Checkout as admin', [
+    background,
+    callFlow('Login', { role: 'admin' }),
+    callFlow('Add product to cart', { product: 'Trail Backpack' }),
+    Then('the cart total equals {price}'),
+    Then('the cart does not show any error banner'),
+  ]),
+  scenario('Promo banner is advisory', [
+    background,
+    Soft('a promo banner is visible at the top of the page'),
+  ]),
+]);
diff --git a/packages/testing-framework/example/style-2-js/features/smoke.flows.ts b/packages/testing-framework/example/style-2-js/features/smoke.flows.ts
new file mode 100644
index 0000000000..102402b429
--- /dev/null
+++ b/packages/testing-framework/example/style-2-js/features/smoke.flows.ts
@@ -0,0 +1,28 @@
+/**
+ * Twin of style-1-gherkin/features/smoke.feature. Where Gherkin needs a
+ * Scenario Outline + Examples table, plain JS just maps over the data —
+ * dynamic authoring (loops, conditionals, computed prompts) is the main
+ * reason to pick this style.
+ */
+import {
+  Given,
+  Then,
+  callFlow,
+  feature,
+  scenario,
+} from '@midscene/testing-framework';
+
+const background = Given('the demo shop is open on the home page');
+
+const roles = ['admin', 'guest'];
+
+export const smokeFeature = feature(
+  'Login smoke',
+  roles.map((role) =>
+    scenario('Login greets every role', [
+      background,
+      callFlow('Login', { role }),
+      Then('the header greets the user with {greeting}'),
+    ]),
+  ),
+);
diff --git a/packages/testing-framework/example/style-2-js/flows/index.ts b/packages/testing-framework/example/style-2-js/flows/index.ts
new file mode 100644
index 0000000000..cce9783c5a
--- /dev/null
+++ b/packages/testing-framework/example/style-2-js/flows/index.ts
@@ -0,0 +1,65 @@
+/**
+ * READ THIS FIRST (style 2: pure JS/TS).
+ *
+ * This folder authors the SAME suite as ../style-1-gherkin, in a fluent
+ * typed API instead of .feature files. Both compile to the identical
+ * flow-IR and run through the same executor — pick the surface, not the
+ * semantics. There is still no step-definition code: every string below is
+ * a natural-language prompt executed by AI agents.
+ *
+ * Concepts demonstrated here:
+ *  - `defineFlow` declares a FLOW: a named, reusable, parameterized prompt
+ *    sequence. A flow runs in a FRESH variable scope seeded only with its
+ *    declared `params` (caller variables are invisible inside), and only
+ *    the variables listed in `returns` flow back to the caller.
+ *  - `remember(description, name)` is a CAPTURE step: the UI agent extracts
+ *    the described value from the screen into the machine-owned variable
+ *    table. Later prompts reference it as `{name}`, and the placeholder is
+ *    substituted mechanically BEFORE any model sees the prompt — typos in
+ *    `{placeholders}` fail immediately instead of confusing a model.
+ *  - Keyword helpers map to runtime semantics: `Given`/`When` → UI actions,
+ *    `Then` → fail-closed verify (a general agent must report a pass/fail
+ *    verdict), `Soft` → warn-only check, `Advisory` → non-gating analysis.
+ *
+ * Cross-file reuse works exactly like the Gherkin side: flows live in this
+ * one module, and the scenario modules under ../features call them by name,
+ * resolved via the registry built here. (In Gherkin, `compileSuite` merges.)
+ */
+import {
+  type FlowDefIR,
+  Then,
+  When,
+  createFlowRegistry,
+  defineFlow,
+  remember,
+} from '@midscene/testing-framework';
+
+/** Twin of style-1-gherkin/flows/login.feature. */
+export const loginFlow = defineFlow({
+  name: 'Login',
+  params: ['role'],
+  returns: ['greeting'],
+  steps: [
+    When('I open the login page'),
+    When('I sign in as the "{role}" user with the saved test credentials'),
+    Then('the dashboard for the "{role}" role is visible'),
+    remember('the greeting message shown in the header', 'greeting'),
+  ],
+});
+
+/** Twin of style-1-gherkin/flows/add-to-cart.feature. */
+export const addToCartFlow = defineFlow({
+  name: 'Add product to cart',
+  params: ['product'],
+  returns: ['price'],
+  steps: [
+    When('I go to the shop home page'),
+    remember('the price of the "{product}" product', 'price'),
+    When('I add the "{product}" to the cart and open the cart'),
+  ],
+});
+
+export const sharedFlows: FlowDefIR[] = [loginFlow, addToCartFlow];
+
+/** Suite-wide registry — every scenario module runs against this. */
+export const registry = createFlowRegistry(sharedFlows);
diff --git a/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts b/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
new file mode 100644
index 0000000000..8bb04b01fd
--- /dev/null
+++ b/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
@@ -0,0 +1,62 @@
+/**
+ * READ THIS FIRST (style 3: Gherkin + sparse JS overlay).
+ *
+ * WHAT AN OVERLAY IS: the .feature file (style 1's checkout.feature) stays
+ * the human-readable source of truth — this file is a sparse JS PATCH on
+ * top of it. The overlay is keyed by scenario title, and within a scenario
+ * by a step anchor (the step's exact text, or its index). With it you can:
+ *   - inject computed variables into a scenario's variable table (`vars`),
+ *   - insert extra steps before/after an anchored step (`before`/`after`),
+ *   - override an anchored step's prompt or downgrade its assertion kind
+ *     (`template`, `node` — e.g. verify → soft),
+ *   - skip or focus whole scenarios (`skip`/`only`).
+ *
+ * Everything NOT mentioned here runs as pure Gherkin: no restating of
+ * steps, no parallel JS suite to keep in sync. Drift is caught at BIND
+ * time — if the feature file is edited so a title or anchored step no
+ * longer matches, `bindFeature` throws immediately with the closest match
+ * and a ready-to-paste corrected overlay snippet (nothing silently
+ * no-ops at runtime).
+ *
+ * Use this style when Gherkin is the shared language with non-engineers,
+ * but a few scenarios need values or tweaks only code can provide.
+ *
+ * Flows stay shared: the bound feature defines none, so it runs against
+ * the same suite registry as style 1 (see scripts/demo/main.ts).
+ */
+import { join } from 'node:path';
+import { bindFeature } from '@midscene/testing-framework';
+
+// A bind-time computed value — exactly what Gherkin alone cannot express.
+const couponCode = `E2E-${new Date().toISOString().slice(0, 10)}`;
+
+export const bound = bindFeature(
+  join(__dirname, '../style-1-gherkin/features/checkout.feature'),
+  {
+    scenarios: {
+      'Checkout as admin': {
+        // Injected variable: available as {couponCode} from the first step.
+        vars: { couponCode },
+        steps: [
+          {
+            // Insert a step after the shared flow call. A flow-call step is
+            // anchored by its flow name.
+            at: 'Add product to cart',
+            after: ['apply the coupon code {couponCode} in the cart'],
+          },
+          {
+            // Override: the coupon changes the total, so the exact-total
+            // verify from the feature file is reworded and downgraded to a
+            // non-gating soft check.
+            at: 'the cart total equals {price}',
+            node: 'soft',
+            template:
+              'the cart total equals {price} minus the "{couponCode}" coupon discount',
+          },
+        ],
+      },
+      // Per-scenario runner config at the IR level.
+      'Promo banner is advisory': { skip: true },
+    },
+  },
+);
diff --git a/packages/testing-framework/scripts/demo/main.ts b/packages/testing-framework/scripts/demo/main.ts
index a27fbf5651..d23f1df5db 100644
--- a/packages/testing-framework/scripts/demo/main.ts
+++ b/packages/testing-framework/scripts/demo/main.ts
@@ -1,14 +1,15 @@
 /**
- * Narrated end-to-end demo of the POC: runs the login/checkout journey
- * through all three authoring modes — pure Gherkin, pure JS, and the bound
- * overlay — over the one shared flow-IR, printing each resolved prompt, the
- * variable table as it evolves, flow entry/exit, and verdicts.
+ * Narrated end-to-end demo of the POC: runs the multi-file example suite
+ * (example/style-*) through all three authoring styles — pure Gherkin,
+ * pure JS, and the sparse overlay — over the one shared flow-IR, printing
+ * each module, each resolved prompt, the variable table as it evolves,
+ * flow entry/exit, and verdicts.
  *
  * Offline by default (scripted fake agents, no model keys / no browser).
  * Pass `--live` to drive a real browser + model against the static shop in
  * example/demo-app (experimental; needs MIDSCENE_MODEL_* env vars).
  */
-import { join } from 'node:path';
+import { join, relative } from 'node:path';
 import {
   type CompiledFeature,
   type FlowRegistry,
@@ -16,21 +17,19 @@ import {
   type ScenarioRunEvent,
   type ScenarioRunResult,
   type UiAgentLike,
-  compileFeatureFile,
-  createFlowRegistry,
+  compileSuite,
   runScenario,
 } from '@midscene/testing-framework';
-import {
-  checkoutAsAdmin,
-  registry as jsRegistry,
-  promoBanner,
-} from '../../example/flows/shop.flows';
-import { bound } from '../../example/flows/shop.overlay';
+import { cartFeature } from '../../example/style-2-js/features/cart.flows';
+import { checkoutFeature } from '../../example/style-2-js/features/checkout.flows';
+import { smokeFeature } from '../../example/style-2-js/features/smoke.flows';
+import { registry as jsRegistry } from '../../example/style-2-js/flows';
+import { bound } from '../../example/style-3-overlay/checkout.overlay';
 import type { GeneralAgentAdapter } from '../../src/general-agent/types';
 import { ScriptedGeneralAgent, ScriptedUiAgent } from './scripted-agents';
 
-const FEATURE_FILE = join(__dirname, '../../example/flows/shop.feature');
-const SCENARIO_NAMES = ['Checkout as admin', 'Promo banner is advisory'];
+const EXAMPLE_DIR = join(__dirname, '../../example');
+const STYLE1_DIR = join(EXAMPLE_DIR, 'style-1-gherkin');
 
 // —— tiny ANSI helpers (plain escapes; disabled via NO_COLOR) ——
 const useColor = process.env.NO_COLOR === undefined;
@@ -53,11 +52,25 @@ interface AgentBundle {
 
 type AgentFactory = () => Promise<AgentBundle>;
 
+interface DemoModule {
+  /** Display path of the module's source file, relative to example/. */
+  label: string;
+  feature: CompiledFeature;
+}
+
+interface DemoMode {
+  label: string;
+  source: string;
+  modules: DemoModule[];
+  registry: FlowRegistry;
+}
+
 interface ScenarioOutcome {
+  module: string;
   name: string;
   skipped: boolean;
   result?: ScenarioRunResult;
-  /** Canonical event trace, used to prove cross-mode equivalence. */
+  /** Canonical event trace, used to prove cross-style equivalence. */
   trace: string[];
 }
 
@@ -94,7 +107,9 @@ export async function main(argv: string[]): Promise<number> {
 
   console.log('');
   console.log(
-    bold('Midscene testing-framework POC — three authoring modes, one flow-IR'),
+    bold(
+      'Midscene testing-framework POC — three authoring styles, one flow-IR',
+    ),
   );
   console.log(
     dim(
@@ -104,33 +119,47 @@ export async function main(argv: string[]): Promise<number> {
     ),
   );
 
-  const gherkin = compileFeatureFile(FEATURE_FILE);
-  const gherkinRegistry = createFlowRegistry(gherkin.flows);
+  // Style 1 (and 3) resolve flows suite-wide: compile every .feature under
+  // the style-1 folder and merge all @flow definitions into one registry.
+  const suite = compileSuite(STYLE1_DIR);
+  const suiteModules: DemoModule[] = suite.modules.map((m) => ({
+    label: relative(EXAMPLE_DIR, m.file),
+    feature: m.feature,
+  }));
 
-  const modes: Array<{
-    label: string;
-    source: string;
-    scenarios: ScenarioIR[];
-    registry: FlowRegistry;
-  }> = [
+  const modes: DemoMode[] = [
     {
-      label: 'Pure Gherkin',
-      source: 'example/flows/shop.feature → compileFeatureFile()',
-      scenarios: pickScenarios(gherkin),
-      registry: gherkinRegistry,
+      label: 'Style 1 — pure Gherkin',
+      source: 'example/style-1-gherkin → compileSuite()',
+      modules: suiteModules,
+      registry: suite.registry,
     },
     {
-      label: 'Pure JS',
-      source: 'example/flows/shop.flows.ts → defineFlow()/scenario()',
-      scenarios: [checkoutAsAdmin, promoBanner],
+      label: 'Style 2 — pure JS',
+      source: 'example/style-2-js → defineFlow() + feature()/scenario()',
+      modules: [
+        { label: 'style-2-js/features/cart.flows.ts', feature: cartFeature },
+        {
+          label: 'style-2-js/features/checkout.flows.ts',
+          feature: checkoutFeature,
+        },
+        { label: 'style-2-js/features/smoke.flows.ts', feature: smokeFeature },
+      ],
       registry: jsRegistry,
     },
     {
-      label: 'Bound overlay',
+      label: 'Style 3 — sparse overlay',
       source:
-        'example/flows/shop.overlay.ts → bindFeature(shop.feature, overlay)',
-      scenarios: pickScenarios(bound),
-      registry: createFlowRegistry(bound.flows),
+        'example/style-3-overlay/checkout.overlay.ts → bindFeature(style-1 checkout.feature)',
+      modules: [
+        {
+          label: 'style-3-overlay/checkout.overlay.ts',
+          feature: bound,
+        },
+      ],
+      // The bound feature defines no flows of its own; it runs against the
+      // same suite-wide registry as style 1.
+      registry: suite.registry,
     },
   ];
 
@@ -154,14 +183,31 @@ export async function main(argv: string[]): Promise<number> {
     console.log(dim(`    ${mode.source}`));
 
     const scenarios: ScenarioOutcome[] = [];
-    for (const scenario of mode.scenarios) {
-      scenarios.push(await runOne(scenario, mode.registry, agentFactory));
+    for (const module of mode.modules) {
+      console.log('');
+      console.log(`  ${bold(`▣ Module: ${module.label}`)}`);
+      if (module.feature.flows.length > 0) {
+        console.log(
+          `    ${dim(
+            `registers shared flow${module.feature.flows.length === 1 ? '' : 's'}: ${module.feature.flows.map((f) => `"${f.name}"`).join(', ')}`,
+          )}`,
+        );
+      }
+      if (module.feature.scenarios.length === 0) {
+        console.log(`    ${dim('(no runnable scenarios — flows only)')}`);
+        continue;
+      }
+      for (const scenario of module.feature.scenarios) {
+        scenarios.push(
+          await runOne(scenario, module.label, mode.registry, agentFactory),
+        );
+      }
     }
     outcomes.push({ label: mode.label, scenarios });
   }
 
   if (selectedModes.length === modes.length) {
-    printComparison(outcomes, gherkin, live);
+    printComparison(outcomes, suiteModules, live);
   }
 
   const failed = outcomes
@@ -170,7 +216,7 @@ export async function main(argv: string[]): Promise<number> {
   return failed ? 1 : 0;
 }
 
-/** `--mode gherkin|js|bound` runs a single mode (handy for live runs). */
+/** `--mode gherkin|js|bound` runs a single style (handy for live runs). */
 function parseModeFilter(argv: string[]): string | undefined {
   const index = argv.indexOf('--mode');
   if (index === -1) return undefined;
@@ -180,27 +226,18 @@ function parseModeFilter(argv: string[]): string | undefined {
   return value === 'bound' ? 'overlay' : value;
 }
 
-function pickScenarios(compiled: CompiledFeature): ScenarioIR[] {
-  return SCENARIO_NAMES.map((name) => {
-    const found = compiled.scenarios.find((s) => s.name === name);
-    if (!found) {
-      throw new Error(`demo: scenario "${name}" not found in the feature.`);
-    }
-    return found;
-  });
-}
-
 async function runOne(
   scenario: ScenarioIR,
+  module: string,
   registry: FlowRegistry,
   agentFactory: AgentFactory,
 ): Promise<ScenarioOutcome> {
   console.log('');
-  console.log(`  ${bold(`▶ Scenario: ${scenario.name}`)}`);
+  console.log(`    ${bold(`▶ Scenario: ${scenario.name}`)}`);
 
   if (scenario.config?.skip) {
-    console.log(`    ${yellow('↷ skipped')} ${dim('(overlay config.skip)')}`);
-    return { name: scenario.name, skipped: true, trace: [] };
+    console.log(`      ${yellow('↷ skipped')} ${dim('(overlay config.skip)')}`);
+    return { module, name: scenario.name, skipped: true, trace: [] };
   }
 
   const bundle = await agentFactory();
@@ -220,21 +257,21 @@ async function runOne(
     const vars = Object.entries(result.variables);
     if (vars.length > 0) {
       console.log(
-        `    ${dim('final variables:')} ${vars
+        `      ${dim('final variables:')} ${vars
           .map(([k, v]) => `${magenta(`{${k}}`)}=${JSON.stringify(v)}`)
           .join(', ')}`,
       );
     }
     if (bundle.describeState) {
-      console.log(`    ${dim(`simulated shop: ${bundle.describeState()}`)}`);
+      console.log(`      ${dim(`simulated shop: ${bundle.describeState()}`)}`);
     }
     for (const warning of result.warnings) {
-      console.log(`    ${yellow(`⚠ warning: ${warning}`)}`);
+      console.log(`      ${yellow(`⚠ warning: ${warning}`)}`);
     }
     console.log(
-      `    ${result.status === 'passed' ? green('✔ scenario passed') : red('✘ scenario failed')}`,
+      `      ${result.status === 'passed' ? green('✔ scenario passed') : red('✘ scenario failed')}`,
     );
-    return { name: scenario.name, skipped: false, result, trace };
+    return { module, name: scenario.name, skipped: false, result, trace };
   } finally {
     await bundle.cleanup?.();
   }
@@ -243,7 +280,7 @@ async function runOne(
 // —— narration ——
 
 function narrate(event: ScenarioRunEvent): void {
-  const pad = `    ${'  '.repeat('depth' in event ? event.depth : 0)}`;
+  const pad = `      ${'  '.repeat('depth' in event ? event.depth : 0)}`;
   switch (event.type) {
     case 'flowEnter':
       console.log(
@@ -310,7 +347,7 @@ function formatArgs(args: Record<string, string>): string {
   return entries.map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(', ');
 }
 
-/** Mode-independent fingerprint of an event, for cross-mode comparison. */
+/** Style-independent fingerprint of an event, for cross-style comparison. */
 function canonical(event: ScenarioRunEvent): string {
   switch (event.type) {
     case 'stepStart':
@@ -330,16 +367,16 @@ function canonical(event: ScenarioRunEvent): string {
 
 function printComparison(
   outcomes: ModeOutcome[],
-  gherkin: CompiledFeature,
+  suiteModules: DemoModule[],
   live: boolean,
 ): void {
   const [gherkinMode, jsMode] = outcomes;
 
   console.log('');
-  console.log(bold(cyan('━━━ Comparison: three modes, one IR ━━━')));
+  console.log(bold(cyan('━━━ Comparison: three styles, one IR ━━━')));
 
-  // 1. Gherkin vs JS: identical traces prove the two front-ends compile to
-  //    the same IR and drive the engine identically.
+  // 1. Gherkin vs JS: identical traces, module by module, prove the two
+  //    front-ends compile to the same IR and drive the engine identically.
   console.log('');
   if (live) {
     console.log(
@@ -348,7 +385,8 @@ function printComparison(
       ),
     );
   }
-  for (let i = 0; i < SCENARIO_NAMES.length; i++) {
+  const pairs = Math.min(gherkinMode.scenarios.length, jsMode.scenarios.length);
+  for (let i = 0; i < pairs; i++) {
     const a = gherkinMode.scenarios[i];
     const b = jsMode.scenarios[i];
     const identical =
@@ -357,15 +395,26 @@ function printComparison(
     const outcome = identical
       ? green(`identical execution trace ✔ (${a.trace.length} events)`)
       : red('traces DIFFER ✘');
-    console.log(`  Gherkin vs JS — "${SCENARIO_NAMES[i]}": ${outcome}`);
+    console.log(
+      `  ${dim(a.module)} vs ${dim(b.module)} — "${a.name}": ${outcome}`,
+    );
+  }
+  if (gherkinMode.scenarios.length !== jsMode.scenarios.length) {
+    console.log(red('  scenario counts DIFFER between Gherkin and JS ✘'));
   }
 
   // 2. What the overlay changed, derived from the IR itself.
+  const plainCheckout = suiteModules.find((m) =>
+    m.label.endsWith('features/checkout.feature'),
+  )?.feature;
+  const boundCheckout = outcomes[2] ? bound : undefined;
   console.log('');
-  console.log(`  ${bold('Bound overlay vs pure Gherkin:')}`);
-  for (const name of SCENARIO_NAMES) {
-    const plain = gherkin.scenarios.find((s) => s.name === name);
-    const overlaid = bound.scenarios.find((s) => s.name === name);
+  console.log(
+    `  ${bold('Style 3 overlay vs the style-1 checkout.feature it binds:')}`,
+  );
+  for (const name of plainCheckout?.scenarios.map((s) => s.name) ?? []) {
+    const plain = plainCheckout?.scenarios.find((s) => s.name === name);
+    const overlaid = boundCheckout?.scenarios.find((s) => s.name === name);
     if (!plain || !overlaid) continue;
 
     const fingerprint = (s: ScenarioIR) =>
@@ -422,13 +471,8 @@ function printComparison(
           : '';
       return `${s.name}: ${status}${warn}`;
     });
-    console.log(`    ${mode.label.padEnd(14)} ${cells.join(dim('  |  '))}`);
+    console.log(`    ${bold(mode.label)}`);
+    console.log(`      ${cells.join(dim('  |  '))}`);
   }
   console.log('');
-  console.log(
-    dim(
-      '  (The login-matrix Scenario Outline runs too — omitted here for brevity; see example/flows/.)',
-    ),
-  );
-  console.log('');
 }
diff --git a/packages/testing-framework/scripts/demo/scripted-agents.ts b/packages/testing-framework/scripts/demo/scripted-agents.ts
index 6fdac055ac..f42e6bb92e 100644
--- a/packages/testing-framework/scripts/demo/scripted-agents.ts
+++ b/packages/testing-framework/scripts/demo/scripted-agents.ts
@@ -1,9 +1,10 @@
 /**
  * Offline scripted agents for the reference demo. They simulate a plausible
- * shop journey (login → greeting → add to cart → totals → coupon) with a tiny
- * state machine — no browser, no model API. The same shape as the test fakes
- * in tests/unit-test/helpers, but behavior-driven instead of queue-driven so
- * all three authoring modes can run against one simulation.
+ * shop journey (login → greeting → add to cart → quantities → totals →
+ * coupon) with a tiny state machine — no browser, no model API. The same
+ * shape as the test fakes in tests/unit-test/helpers, but behavior-driven
+ * instead of queue-driven so all three authoring styles can run against one
+ * simulation.
  */
 import type {
   GeneralAgentAdapter,
@@ -12,11 +13,17 @@ import type {
 } from '../../src/general-agent/types';
 import type { UiAgentLike } from '../../src/types';
 
-const PRICE = 129.0;
+/** Mirrors the catalog in example/demo-app/index.html. */
+const PRICES: Record<string, number> = {
+  'Trail Backpack': 129.0,
+  'Camp Mug': 24.5,
+};
+
+const money = (n: number) => `$${n.toFixed(2)}`;
 
 class ShopSimulation {
   role: string | null = null;
-  inCart = false;
+  readonly items = new Map<string, number>();
   couponApplied = false;
 
   get greeting(): string {
@@ -24,9 +31,25 @@ class ShopSimulation {
     return `Hello, ${this.role[0].toUpperCase()}${this.role.slice(1)}!`;
   }
 
+  get itemCount(): number {
+    let count = 0;
+    for (const qty of this.items.values()) count += qty;
+    return count;
+  }
+
   get total(): number {
-    if (!this.inCart) return 0;
-    return this.couponApplied ? PRICE * 0.9 : PRICE;
+    let subtotal = 0;
+    for (const [name, qty] of this.items) {
+      subtotal += (PRICES[name] ?? 0) * qty;
+    }
+    return this.couponApplied ? subtotal * 0.9 : subtotal;
+  }
+
+  describe(): string {
+    const cart =
+      [...this.items.entries()].map(([n, q]) => `${n}×${q}`).join(', ') ||
+      'empty';
+    return `role=${this.role ?? 'anonymous'}, cart=${cart}, total=${money(this.total)}`;
   }
 }
 
@@ -39,13 +62,21 @@ export class ScriptedUiAgent implements UiAgentLike {
       this.sim.role = signIn[1];
       return `Signed in as ${signIn[1]}; the dashboard is shown.`;
     }
-    if (/add .*to the cart/i.test(instruction)) {
-      this.sim.inCart = true;
-      return 'Added "Trail Backpack" to the cart and opened the cart view.';
+    const add = /add the "([^"]+)" to the cart/i.exec(instruction);
+    if (add) {
+      this.sim.items.set(add[1], (this.sim.items.get(add[1]) ?? 0) + 1);
+      return `Added "${add[1]}" to the cart and opened the cart view.`;
+    }
+    const setQty = /increase the "([^"]+)" quantity in the cart to (\d+)/i.exec(
+      instruction,
+    );
+    if (setQty) {
+      this.sim.items.set(setQty[1], Number(setQty[2]));
+      return `Increased "${setQty[1]}" to quantity ${setQty[2]}; the cart total is now ${money(this.sim.total)}.`;
     }
     if (/apply the coupon code/i.test(instruction)) {
       this.sim.couponApplied = true;
-      return `Applied the coupon; the total is now $${this.sim.total.toFixed(2)}.`;
+      return `Applied the coupon; the total is now ${money(this.sim.total)}.`;
     }
     if (/login page/i.test(instruction)) {
       return 'The login page is open.';
@@ -62,8 +93,14 @@ export class ScriptedUiAgent implements UiAgentLike {
 
   async aiString(prompt: string): Promise<string> {
     if (/greeting/i.test(prompt)) return this.sim.greeting;
-    if (/price/i.test(prompt)) return `$${PRICE.toFixed(2)}`;
-    if (/badge|count/i.test(prompt)) return this.sim.inCart ? '1' : '0';
+    const price = /price of the "([^"]+)" product/i.exec(prompt);
+    if (price) {
+      const unit = PRICES[price[1]];
+      return unit === undefined
+        ? '(no value found on the simulated page)'
+        : money(unit);
+    }
+    if (/badge|count/i.test(prompt)) return String(this.sim.itemCount);
     return '(no value found on the simulated page)';
   }
 
@@ -72,7 +109,7 @@ export class ScriptedUiAgent implements UiAgentLike {
   };
 
   describeState(): string {
-    return `role=${this.sim.role ?? 'anonymous'}, cart=${this.sim.inCart ? 'Trail Backpack' : 'empty'}, total=$${this.sim.total.toFixed(2)}`;
+    return this.sim.describe();
   }
 }
 
@@ -89,21 +126,55 @@ export class ScriptedGeneralAgent implements GeneralAgentAdapter {
         },
       };
     }
-    if (/coupon discount/i.test(i)) {
+    const twice = /total equals twice \$([\d.]+)/i.exec(i);
+    if (twice) {
+      const unit = Number(twice[1]);
+      return {
+        text: 'The cart total doubled with the quantity.',
+        verdict: {
+          pass: true,
+          reason: `${money(unit * 2)} is exactly twice the unit price ${money(unit)}.`,
+        },
+      };
+    }
+    const coupon = /equals \$([\d.]+) minus .*coupon/i.exec(i);
+    if (coupon) {
+      const base = Number(coupon[1]);
       return {
         text: 'The cart shows the discounted total.',
         verdict: {
           pass: true,
-          reason: `$${(PRICE * 0.9).toFixed(2)} equals $${PRICE.toFixed(2)} minus the 10% coupon.`,
+          reason: `${money(base * 0.9)} equals ${money(base)} minus the 10% coupon.`,
+        },
+      };
+    }
+    const lists = /lists "([^"]+)" with quantity (\d+) at \$([\d.]+)/i.exec(i);
+    if (lists) {
+      return {
+        text: 'The cart line matches.',
+        verdict: {
+          pass: true,
+          reason: `The cart shows "${lists[1]}" with quantity ${lists[2]} priced at $${lists[3]}.`,
+        },
+      };
+    }
+    const badge = /badge .*shows (\d+) item/i.exec(i);
+    if (badge) {
+      return {
+        text: 'The header badge matches the cart contents.',
+        verdict: {
+          pass: true,
+          reason: `The header badge reads "${badge[1]} item${badge[1] === '1' ? '' : 's'}".`,
         },
       };
     }
-    if (/cart total/i.test(i)) {
+    const total = /cart total equals \$([\d.]+)/i.exec(i);
+    if (total) {
       return {
         text: 'The cart total matches the captured price.',
         verdict: {
           pass: true,
-          reason: `The cart shows $${PRICE.toFixed(2)}, matching the remembered price.`,
+          reason: `The cart shows $${total[1]}, matching the remembered price.`,
         },
       };
     }
diff --git a/packages/testing-framework/src/frontends/gherkin/index.ts b/packages/testing-framework/src/frontends/gherkin/index.ts
index 1a37fad2c1..2c479bd37e 100644
--- a/packages/testing-framework/src/frontends/gherkin/index.ts
+++ b/packages/testing-framework/src/frontends/gherkin/index.ts
@@ -108,6 +108,12 @@ export function compileFeatureFile(file: string): CompiledFeature {
   return compileFeature(readFileSync(file, 'utf-8'), file);
 }
 
+// Multi-file suite assembly. Re-exported last: suite.ts imports
+// `compileFeatureFile` from this module, and keeping the cycle edge at the
+// bottom makes the load order explicit (same pattern as frontends/js).
+export { compileSuite } from './suite';
+export type { CompiledSuite, SuiteModule } from './suite';
+
 function compileScenario(
   pickle: Pickle,
   tags: string[],
diff --git a/packages/testing-framework/src/frontends/gherkin/suite.ts b/packages/testing-framework/src/frontends/gherkin/suite.ts
new file mode 100644
index 0000000000..a704388afd
--- /dev/null
+++ b/packages/testing-framework/src/frontends/gherkin/suite.ts
@@ -0,0 +1,84 @@
+/**
+ * POC: multi-file Gherkin suites.
+ *
+ * `compileFeature` returns flows per file, but real suites keep shared flow
+ * definitions (login, add-to-cart, …) in their own `.feature` files and call
+ * them from separate test modules. `compileSuite` is the assembly step for
+ * that layout: compile every file, merge ALL flow definitions into one
+ * {@link FlowRegistry} (duplicate flow names across files fail loudly,
+ * naming both files), and hand back the compiled modules so the caller can
+ * run each module's scenarios against the shared registry.
+ */
+import { statSync } from 'node:fs';
+import { join, resolve } from 'node:path';
+import type { FlowRegistry } from '../../flow-ir';
+import { createFlowRegistry } from '../../flow-ir';
+import { listFiles } from '../../runner/glob';
+import { type CompiledFeature, compileFeatureFile } from './index';
+
+/** One compiled `.feature` file of a suite, as returned by `compileSuite`. */
+export interface SuiteModule {
+  /** Absolute path of the `.feature` file. */
+  file: string;
+  feature: CompiledFeature; // result of `compileFeatureFile(file)`
+}
+
+export interface CompiledSuite {
+  /** Compiled files: sorted by relative path (directory) or in list order. */
+  modules: SuiteModule[];
+  /** All flow definitions from all files, merged into one registry. */
+  registry: FlowRegistry;
+}
+
+/**
+ * Compile a whole suite: a directory (every `.feature` under it, recursively)
+ * or an explicit list of `.feature` files (duplicate paths compile once). Any
+ * scenario may call flows from any other module via the shared registry.
+ */
+export function compileSuite(input: string | string[]): CompiledSuite {
+  const files = Array.isArray(input)
+    ? [...new Set(input.map((f) => resolve(f)))]
+    : discoverFeatureFiles(input);
+  if (files.length === 0) {
+    throw new Error(
+      `[midscene] compileSuite: no .feature files found in ${JSON.stringify(input)}.`,
+    );
+  }
+
+  const modules: SuiteModule[] = files.map((file) => ({
+    file,
+    feature: compileFeatureFile(file),
+  }));
+
+  // Merge flows across files. The registry itself rejects duplicates, but a
+  // cross-file clash deserves an error that names both definition sites.
+  const flowSources = new Map<string, string>();
+  const registry = createFlowRegistry();
+  for (const { file, feature } of modules) {
+    for (const flow of feature.flows) {
+      const existing = flowSources.get(flow.name);
+      if (existing !== undefined) {
+        throw new Error(
+          `[midscene] compileSuite: flow "${flow.name}" is defined in both ${existing} and ${file}. Flow names are suite-global — rename one of them.`,
+        );
+      }
+      flowSources.set(flow.name, file);
+      registry.register(flow);
+    }
+  }
+
+  return { modules, registry };
+}
+
+// Lists the `.feature` files that `listFiles` reports under `dir`, sorted by
+// relative path and made absolute. Throws when `dir` is not a directory.
+function discoverFeatureFiles(dir: string): string[] {
+  const suiteRoot = resolve(dir);
+  if (!statSync(suiteRoot).isDirectory()) {
+    throw new Error(
+      `[midscene] compileSuite: ${suiteRoot} is not a directory. Pass a suite directory or an explicit list of .feature files.`,
+    );
+  }
+  const features = listFiles(suiteRoot).filter((p) => p.endsWith('.feature'));
+  return features.sort().map((relPath) => join(suiteRoot, relPath));
+}
diff --git a/packages/testing-framework/src/index.ts b/packages/testing-framework/src/index.ts
index 78d2796a2c..7f0df67b07 100644
--- a/packages/testing-framework/src/index.ts
+++ b/packages/testing-framework/src/index.ts
@@ -128,5 +128,13 @@ export type {
   StepOverlay,
   StepAnchor,
 } from './frontends/js';
-export { compileFeature, compileFeatureFile } from './frontends/gherkin';
-export type { CompiledFeature } from './frontends/gherkin';
+export {
+  compileFeature,
+  compileFeatureFile,
+  compileSuite,
+} from './frontends/gherkin';
+export type {
+  CompiledFeature,
+  CompiledSuite,
+  SuiteModule,
+} from './frontends/gherkin';
diff --git a/packages/testing-framework/tests/unit-test/example-parity.test.ts b/packages/testing-framework/tests/unit-test/example-parity.test.ts
index a19e8da541..43e62b51dc 100644
--- a/packages/testing-framework/tests/unit-test/example-parity.test.ts
+++ b/packages/testing-framework/tests/unit-test/example-parity.test.ts
@@ -1,91 +1,94 @@
 /**
- * Parity check for the two example authoring surfaces: the Gherkin feature
- * (example/flows/shop.feature) and its JS counterpart
- * (example/flows/shop.flows.ts) must compile to equivalent IR and produce the
- * same execution trace against the same fake agents.
+ * Parity checks for the example's three authoring styles: the multi-file
+ * Gherkin suite (example/style-1-gherkin), its JS twin (example/style-2-js)
+ * and the sparse overlay (example/style-3-overlay) must compile to
+ * equivalent IR and produce the same execution traces against the same fake
+ * agents.
  */
-import { join } from 'node:path';
+import { join, relative } from 'node:path';
 import { describe, expect, it } from 'vitest';
+import { cartFeature } from '../../example/style-2-js/features/cart.flows';
+import { checkoutFeature } from '../../example/style-2-js/features/checkout.flows';
+import { smokeFeature } from '../../example/style-2-js/features/smoke.flows';
 import {
-  checkoutAsAdmin,
+  addToCartFlow,
+  registry as jsRegistry,
   loginFlow,
-  registry,
-  shopFeature,
-} from '../../example/flows/shop.flows';
-import { createFlowRegistry, runScenario } from '../../src/flow-ir';
-import type { ScenarioIR } from '../../src/flow-ir';
-import type { FlowRegistry } from '../../src/flow-ir';
-import { compileFeatureFile } from '../../src/frontends/gherkin';
+} from '../../example/style-2-js/flows';
+import { runScenario } from '../../src/flow-ir';
+import type { FlowRegistry, ScenarioIR } from '../../src/flow-ir';
+import { compileSuite } from '../../src/frontends/gherkin';
 import { FakeGeneralAgent, FakeUiAgent } from './helpers/fake-agents';
 
-const FEATURE_FILE = join(__dirname, '../../example/flows/shop.feature');
-const gherkin = compileFeatureFile(FEATURE_FILE);
+const STYLE1_DIR = join(__dirname, '../../example/style-1-gherkin');
+const suite = compileSuite(STYLE1_DIR);
 
-describe('example overlay: shop.overlay.ts binds without drift', () => {
-  it('applies the sparse overlay on top of the plain compile', async () => {
-    const { bound } = await import('../../example/flows/shop.overlay');
-    const checkout = bound.scenarios.find(
-      (s) => s.name === 'Checkout as admin',
-    );
-    expect(checkout?.vars?.couponCode).toMatch(/^E2E-\d{4}-\d{2}-\d{2}$/);
-    expect(
-      checkout?.steps.some(
-        (s) =>
-          s.kind === 'prompt' &&
-          s.template === 'apply the coupon code {couponCode} in the cart',
-      ),
-    ).toBe(true);
-    expect(checkout?.steps.at(-2)).toMatchObject({ node: 'soft' });
-    expect(
-      bound.scenarios.find((s) => s.name === 'Promo banner is advisory')
-        ?.config,
-    ).toEqual({ skip: true });
-    // Sparse: the outline-expanded scenarios are untouched pure Gherkin.
-    expect(
-      bound.scenarios.filter((s) => s.name === 'Login greets every role'),
-    ).toEqual(
-      gherkin.scenarios.filter((s) => s.name === 'Login greets every role'),
-    );
+const featureByFile = (suffix: string) => {
+  const module = suite.modules.find((m) =>
+    relative(STYLE1_DIR, m.file).endsWith(suffix),
+  );
+  if (!module) throw new Error(`module ${suffix} not found in the suite`);
+  return module.feature;
+};
+
+describe('example suite: style-1 Gherkin assembles as one suite', () => {
+  it('discovers all modules in deterministic order', () => {
+    expect(suite.modules.map((m) => relative(STYLE1_DIR, m.file))).toEqual([
+      'features/cart.feature',
+      'features/checkout.feature',
+      'features/smoke.feature',
+      'flows/add-to-cart.feature',
+      'flows/login.feature',
+    ]);
+  });
+
+  it('merges the shared flows from the flow files into one registry', () => {
+    expect(suite.registry.has('Login')).toBe(true);
+    expect(suite.registry.has('Add product to cart')).toBe(true);
+    // Test modules define no flows of their own.
+    for (const suffix of [
+      'features/cart.feature',
+      'features/checkout.feature',
+      'features/smoke.feature',
+    ]) {
+      expect(featureByFile(suffix).flows).toEqual([]);
+    }
   });
 });
 
 describe('example parity: Gherkin vs JS front-end', () => {
-  it('compiles the same Login flow signature', () => {
-    expect(gherkin.flows).toHaveLength(1);
-    const gherkinLogin = gherkin.flows[0];
-    expect(gherkinLogin.name).toBe(loginFlow.name);
+  it('compiles the same shared flow signatures and bodies', () => {
+    const gherkinLogin = suite.registry.get('Login');
     expect(gherkinLogin.params).toEqual(loginFlow.params);
     expect(gherkinLogin.returns).toEqual(loginFlow.returns);
-    // Background steps are excluded from @flow pickles, so the two surfaces
-    // compile to the exact same flow body.
     expect(gherkinLogin.steps).toEqual(loginFlow.steps);
-  });
 
-  it('compiles the same checkout scenario steps', () => {
-    const gherkinCheckout = gherkin.scenarios.find(
-      (s) => s.name === 'Checkout as admin',
-    );
-    expect(gherkinCheckout).toBeDefined();
-    expect(gherkinCheckout?.steps).toEqual(checkoutAsAdmin.steps);
+    const gherkinCart = suite.registry.get('Add product to cart');
+    expect(gherkinCart.params).toEqual(addToCartFlow.params);
+    expect(gherkinCart.returns).toEqual(addToCartFlow.returns);
+    expect(gherkinCart.steps).toEqual(addToCartFlow.steps);
   });
 
-  it('expands the outline to the same per-role scenarios as the JS map()', () => {
-    const gherkinRoles = gherkin.scenarios
-      .filter((s) => s.name === 'Login greets every role')
-      .map((s) => s.steps);
-    const jsRoles = shopFeature.scenarios
-      .filter((s) => s.name.startsWith('Login greets every role'))
-      .map((s) => s.steps);
-    expect(gherkinRoles).toEqual(jsRoles);
+  it('compiles the same scenario steps in every test module', () => {
+    const twins: Array<[string, typeof cartFeature]> = [
+      ['features/cart.feature', cartFeature],
+      ['features/checkout.feature', checkoutFeature],
+      ['features/smoke.feature', smokeFeature],
+    ];
+    for (const [suffix, jsFeature] of twins) {
+      const gherkin = featureByFile(suffix);
+      expect(gherkin.scenarios.map((s) => s.steps)).toEqual(
+        jsFeature.scenarios.map((s) => s.steps),
+      );
+    }
   });
 
-  it('produces identical execution traces through the shared IR executor', async () => {
-    const gherkinCheckout = gherkin.scenarios.find(
-      (s) => s.name === 'Checkout as admin',
-    ) as ScenarioIR;
+  it('runs a cart scenario via flows defined in OTHER files, with identical traces', async () => {
+    const gherkinScenario = featureByFile('features/cart.feature').scenarios[0];
+    const jsScenario = cartFeature.scenarios[0];
 
     const runWith = async (s: ScenarioIR, reg: FlowRegistry) => {
-      const ui = new FakeUiAgent(['Hello, Admin!', '129.00']);
+      const ui = new FakeUiAgent(['Hello, Guest!', '$24.50']);
       const general = new FakeGeneralAgent();
       const result = await runScenario({
         scenario: s,
@@ -102,19 +105,57 @@ describe('example parity: Gherkin vs JS front-end', () => {
       };
     };
 
-    const fromGherkin = await runWith(
-      gherkinCheckout,
-      createFlowRegistry(gherkin.flows),
-    );
-    const fromJs = await runWith(checkoutAsAdmin, registry);
+    const fromGherkin = await runWith(gherkinScenario, suite.registry);
+    const fromJs = await runWith(jsScenario, jsRegistry);
 
     // Same prompts hit the "models", same variables end up in the table.
-    expect(fromGherkin.actCalls).toEqual(fromJs.actCalls);
-    expect(fromGherkin.stringCalls).toEqual(fromJs.stringCalls);
-    expect(fromGherkin.verifyPrompts).toEqual(fromJs.verifyPrompts);
-    expect(fromGherkin.variables).toEqual(fromJs.variables);
+    expect(fromGherkin).toEqual(fromJs);
     expect(fromGherkin.status).toBe('passed');
-    expect(fromJs.status).toBe('passed');
-    expect(fromJs.verifyPrompts).toContain('the cart total equals 129.00');
+    // The flow's declared return crossed file boundaries into the scenario.
+    expect(fromGherkin.variables.price).toBe('$24.50');
+    expect(fromGherkin.verifyPrompts).toContain(
+      'the cart lists "Camp Mug" with quantity 1 at $24.50',
+    );
+  });
+});
+
+describe('example overlay: style-3 binds style-1 without drift', () => {
+  it('applies the sparse overlay on top of the plain compile', async () => {
+    const { bound } = await import(
+      '../../example/style-3-overlay/checkout.overlay'
+    );
+    const plain = featureByFile('features/checkout.feature');
+
+    const checkout = bound.scenarios.find(
+      (s: ScenarioIR) => s.name === 'Checkout as admin',
+    );
+    expect(checkout?.vars?.couponCode).toMatch(/^E2E-\d{4}-\d{2}-\d{2}$/);
+
+    // The coupon step is inserted directly after the anchored flow call.
+    const flowCallIndex = checkout?.steps.findIndex(
+      (s) => s.kind === 'callFlow' && s.flowName === 'Add product to cart',
+    );
+    expect(flowCallIndex).toBeGreaterThanOrEqual(0);
+    expect(checkout?.steps[(flowCallIndex ?? 0) + 1]).toMatchObject({
+      kind: 'prompt',
+      node: 'ui',
+      template: 'apply the coupon code {couponCode} in the cart',
+    });
+
+    // The exact-total verify is reworded and downgraded to soft.
+    expect(checkout?.steps.at(-2)).toMatchObject({
+      node: 'soft',
+      template:
+        'the cart total equals {price} minus the "{couponCode}" coupon discount',
+    });
+
+    // Per-scenario config and sparseness: the promo scenario is skipped,
+    // and the overlay defines no flows (it reuses the suite registry).
+    expect(
+      bound.scenarios.find(
+        (s: ScenarioIR) => s.name === 'Promo banner is advisory',
+      )?.config,
+    ).toEqual({ skip: true });
+    expect(bound.flows).toEqual(plain.flows);
   });
 });
diff --git a/packages/testing-framework/tests/unit-test/suite.test.ts b/packages/testing-framework/tests/unit-test/suite.test.ts
new file mode 100644
index 0000000000..14c2ee07bc
--- /dev/null
+++ b/packages/testing-framework/tests/unit-test/suite.test.ts
@@ -0,0 +1,104 @@
+import { mkdirSync, mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+import { runScenario } from '../../src/flow-ir';
+import { compileSuite } from '../../src/frontends/gherkin';
+import { FakeGeneralAgent, FakeUiAgent } from './helpers/fake-agents';
+
+const LOGIN_FLOW = `Feature: Shared flows
+  @flow @param:role @returns:greeting
+  Scenario: Login
+    When I sign in as the "{role}" user
+    When I remember the greeting shown in the header as "greeting"
+`;
+
+const GREET_MODULE = `Feature: Greeting
+  Scenario: Greet
+    When I run the "Login" flow with role "admin"
+    Then the header shows {greeting}
+`;
+
+// Writes each `relative path -> contents` entry under a fresh temp dir.
+function writeSuite(files: Record<string, string>): string {
+  const root = mkdtempSync(join(tmpdir(), 'mts-suite-'));
+  Object.entries(files).forEach(([relPath, contents]) => {
+    mkdirSync(join(root, relPath, '..'), { recursive: true });
+    writeFileSync(join(root, relPath), contents);
+  });
+  return root;
+}
+
+// Pins compileSuite's file discovery, cross-file flow registry, and errors.
+describe('compileSuite: multi-file assembly', () => {
+  it('globs .feature files recursively in deterministic (sorted) order', () => {
+    const dir = writeSuite({
+      'features/greet.feature': GREET_MODULE,
+      'flows/login.feature': LOGIN_FLOW,
+      'notes.md': 'not a feature file',
+    });
+    const suite = compileSuite(dir);
+    expect(suite.modules.map((m) => m.file)).toEqual([
+      join(dir, 'features/greet.feature'),
+      join(dir, 'flows/login.feature'),
+    ]);
+  });
+
+  it('resolves a flow defined in another file via the merged registry', async () => {
+    const suite = compileSuite(
+      writeSuite({
+        'features/greet.feature': GREET_MODULE,
+        'flows/login.feature': LOGIN_FLOW,
+      }),
+    );
+    // Sorted discovery puts features/greet.feature first, so modules[0] is it.
+    const greet = suite.modules[0].feature.scenarios[0];
+
+    // The value handed to FakeUiAgent should surface as {greeting} below.
+    const ui = new FakeUiAgent(['Hello, Admin!']);
+    const general = new FakeGeneralAgent();
+    const result = await runScenario({
+      scenario: greet,
+      registry: suite.registry,
+      uiAgent: ui,
+      generalAgent: general,
+    });
+
+    expect(result.status).toBe('passed');
+    expect(ui.actCalls).toEqual(['I sign in as the "admin" user']);
+    expect(general.calls[0].instruction).toBe('the header shows Hello, Admin!');
+  });
+
+  it('accepts an explicit file list instead of a directory', () => {
+    const dir = writeSuite({
+      'flows/login.feature': LOGIN_FLOW,
+      'features/greet.feature': GREET_MODULE,
+    });
+    const suite = compileSuite([
+      join(dir, 'flows/login.feature'),
+      join(dir, 'features/greet.feature'),
+    ]);
+    // Explicit lists keep the caller's order.
+    expect(suite.modules[0].file).toBe(join(dir, 'flows/login.feature'));
+    expect(suite.registry.has('Login')).toBe(true);
+  });
+
+  it('rejects duplicate flow names across files, naming both files', () => {
+    const dir = writeSuite({
+      'flows/login.feature': LOGIN_FLOW,
+      'flows/login-copy.feature': LOGIN_FLOW,
+    });
+    // toThrow fails loudly if nothing is thrown, so no try/catch + cast is
+    // needed; a string argument asserts the message contains that substring.
+    const compile = () => compileSuite(dir);
+    expect(compile).toThrow(/flow "Login" is defined in both/);
+    expect(compile).toThrow(join(dir, 'flows/login-copy.feature'));
+    expect(compile).toThrow(join(dir, 'flows/login.feature'));
+  });
+
+  it('throws when no .feature files are found', () => {
+    const dir = writeSuite({ 'readme.md': 'empty suite' });
+    expect(() => compileSuite(dir)).toThrow(/no \.feature files found/);
+    expect(() => compileSuite([])).toThrow(/no \.feature files found/);
+  });
+});

From 4b1a26186b81192c5ecac1fe9fdb352e67471b4e Mon Sep 17 00:00:00 2001
From: ScriptedAlchemy <zack@module-federation.io>
Date: Wed, 10 Jun 2026 04:15:00 +0200
Subject: [PATCH 9/9] docs(testing-framework): make clear the overlay style is
 an optional escape hatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pure .feature files are fully sufficient on their own; style 2 is for
engineering-owned dynamic suites, and style 3 (bindFeature overlay) only
earns its keep for bind-time computed values, per-environment tweaks
without forking the feature file, and a drift-validated seam between prose
and JS. Adds a blunt "Which style do I need?" decision section to
example/README.md ("you probably only need style 1"), reframes the
read-this-first comments in styles 1 and 3, and aligns POC-GHERKIN.md's
mode-selection table. Docs/comments only — no behavior changes.
---
 packages/testing-framework/POC-GHERKIN.md     | 10 +++++---
 packages/testing-framework/example/README.md  | 24 +++++++++++++++++--
 .../style-1-gherkin/flows/login.feature       |  6 +++++
 .../style-3-overlay/checkout.overlay.ts       |  6 +++++
 4 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/packages/testing-framework/POC-GHERKIN.md b/packages/testing-framework/POC-GHERKIN.md
index 2221ad5632..855eab454d 100644
--- a/packages/testing-framework/POC-GHERKIN.md
+++ b/packages/testing-framework/POC-GHERKIN.md
@@ -291,11 +291,15 @@ scenarios: {
 
 ### Choosing a mode
 
+**You probably only need pure Gherkin.** Plain `.feature` files run
+end-to-end with nothing else; pure JS is an alternative and the bound
+overlay is an optional escape hatch — neither is ever a requirement.
+
 | Mode | Use when |
 | --- | --- |
-| Pure Gherkin (`compileFeature`) | Non-engineers own the suite; no computed values or per-env tweaks needed. |
-| Pure JS (`defineFlow`/`scenario`) | The suite is generated or heavily dynamic (loops, conditionals, computed prompts); no BDD stakeholders. |
-| Bound overlay (`bindFeature`) | Gherkin is the shared source of truth, but a few scenarios need computed variables, env-specific arg tweaks, inserted steps, or skip/only flags — without forking the feature file or restating it in JS. |
+| Pure Gherkin (`compileFeature`) | The default — fully sufficient on its own. Non-engineers own the suite; tests are plain English with no computed data. |
+| Pure JS (`defineFlow`/`scenario`) | The suite is engineering-owned and generated or heavily dynamic (loops, types, computed args); no BDD stakeholders. |
+| Bound overlay (`bindFeature`) | OPTIONAL escape hatch: Gherkin is the contract, but a handful of scenarios need programmatic exceptions — bind-time computed values, env-specific tweaks (skip in CI, verify→soft, inserted steps) without forking the feature file. Sparse + bind-time drift validation keeps the seam between prose and JS from silently rotting. |
 
 ## Example: one suite, three style folders
 
diff --git a/packages/testing-framework/example/README.md b/packages/testing-framework/example/README.md
index b47677ce08..fbaf587345 100644
--- a/packages/testing-framework/example/README.md
+++ b/packages/testing-framework/example/README.md
@@ -7,6 +7,26 @@ Two related examples live here:
 2. A copy-out **YAML runner** demo (`e2e/` + `midscene.config.ts`) — the
    Phase 0 node engine. See [below](#the-phase-0-yaml-runner-example).
 
+## Which style do I need?
+
+**You probably only need style 1.** Plain `.feature` files run end-to-end
+with nothing else — no JS, no step definitions, no overlay. If your tests
+are plain English with no computed data, stop there.
+
+- **Style 1 — pure Gherkin**: the default, complete on its own.
+- **Style 2 — pure JS/TS**: for engineering-owned suites that want loops,
+  types, and computed args as first-class code.
+- **Style 3 — overlay**: **optional — an escape hatch, never a
+  requirement.** It only earns its keep when Gherkin is the contract AND a
+  handful of scenarios need programmatic exceptions:
+  - bind-time computed values (dates, env-derived data Gherkin cannot
+    express),
+  - environment-specific tweaks without forking the feature file (skip a
+    scenario in CI, downgrade a verify to soft, insert a step),
+  - a safe seam between non-engineer-owned prose and engineer-owned JS —
+    the overlay is sparse and drift-validated at bind time, so it cannot
+    silently rot.
+
 ## Three interchangeable styles of the SAME suite
 
 The style folders author the **same multi-file test suite** for the static
@@ -18,9 +38,9 @@ step is natural language executed by AI agents.
 
 | Folder | Style | Read this first | Choose it when |
 | --- | --- | --- | --- |
-| `style-1-gherkin/` | Pure Gherkin `.feature` files | `flows/login.feature` | Non-engineers own the suite; specs are the shared language. |
+| `style-1-gherkin/` | Pure Gherkin `.feature` files | `flows/login.feature` | Non-engineers own the suite; specs are the shared language. Fully sufficient on its own. |
 | `style-2-js/` | Pure JS/TS fluent API | `flows/index.ts` | The suite is generated or heavily dynamic (loops, computed prompts). |
-| `style-3-overlay/` | Gherkin source of truth + sparse JS overlay | `checkout.overlay.ts` | Gherkin stays canonical, but a few scenarios need computed values or env tweaks. Binds **style 1's** feature files — nothing is duplicated. |
+| `style-3-overlay/` | OPTIONAL: Gherkin source of truth + sparse JS overlay | `checkout.overlay.ts` | Escape hatch only: Gherkin stays canonical, but a few scenarios need computed values or env tweaks. Binds **style 1's** feature files — nothing is duplicated. |
 
 Inside each style the layout shows real-world modular reuse:
 
diff --git a/packages/testing-framework/example/style-1-gherkin/flows/login.feature b/packages/testing-framework/example/style-1-gherkin/flows/login.feature
index b8a855afe2..3e7172a114 100644
--- a/packages/testing-framework/example/style-1-gherkin/flows/login.feature
+++ b/packages/testing-framework/example/style-1-gherkin/flows/login.feature
@@ -1,5 +1,11 @@
 # READ THIS FIRST (style 1: pure Gherkin).
 #
+# This style is COMPLETE ON ITS OWN: plain .feature files run end-to-end
+# with no JS anywhere — no step definitions, no overlay, no glue code. The
+# sibling style folders are optional alternatives (style 2) or an optional
+# escape hatch for computed values / per-env tweaks (style 3), not layers
+# you are expected to add.
+#
 # This suite has no step-definition code anywhere: every step is natural
 # language executed by AI agents (Given/When → UI actions performed by the
 # Midscene UI Agent; Then → a fail-closed verify judgment by a general
diff --git a/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts b/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
index 8bb04b01fd..992e6e070c 100644
--- a/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
+++ b/packages/testing-framework/example/style-3-overlay/checkout.overlay.ts
@@ -1,6 +1,12 @@
 /**
  * READ THIS FIRST (style 3: Gherkin + sparse JS overlay).
  *
+ * OVERLAYS ARE OPTIONAL. If you don't need computed values or
+ * per-environment tweaks, you only need the .feature files — style 1 runs
+ * end-to-end on its own, and nothing about it assumes an overlay exists.
+ * This style is an escape hatch for the few scenarios where plain Gherkin
+ * isn't enough, not a required layer.
+ *
  * WHAT AN OVERLAY IS: the .feature file (style 1's checkout.feature) stays
  * the human-readable source of truth — this file is a sparse JS PATCH on
  * top of it. The overlay is keyed by scenario title, and within a scenario