From 45eb151d202caa73761ab468f479d33ea48999de Mon Sep 17 00:00:00 2001 From: Derek Meegan Date: Wed, 29 Apr 2026 11:15:14 -0700 Subject: [PATCH 1/6] =?UTF-8?q?Add=20browser-reverse=20skill=20=E2=80=94?= =?UTF-8?q?=20discover=20OpenAPI=203.1=20from=20browser-trace=20captures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consumes a browser-trace run (.o11y//), pairs CDP request/response events, templatizes paths, infers JSON schemas from samples, and emits an OpenAPI 3.1 document with a coverage report and confidence metadata. Pipeline: load → filter → normalize → infer → emit. Each stage is a discrete script writing to intermediate/ for debuggability. Optional --bodies flag joins a `browse network on` capture by CDP requestId so response bodies feed into schema inference. E2E tested against Hacker News, jsonplaceholder, derekmeegan.com, browserbase.com, browser-use.com, reddit.com. Co-Authored-By: Claude Opus 4.7 (1M context) --- skills/browser-reverse/BODY-CAPTURE-LIFT.md | 118 ++++++ skills/browser-reverse/REFERENCE.md | 240 ++++++++++++ skills/browser-reverse/SKILL.md | 136 +++++++ skills/browser-reverse/package.json | 6 + skills/browser-reverse/scripts/discover.mjs | 93 +++++ skills/browser-reverse/scripts/emit.mjs | 356 ++++++++++++++++++ skills/browser-reverse/scripts/filter.mjs | 66 ++++ skills/browser-reverse/scripts/infer.mjs | 139 +++++++ skills/browser-reverse/scripts/lib/io.mjs | 58 +++ .../scripts/lib/path-template.mjs | 86 +++++ skills/browser-reverse/scripts/lib/redact.mjs | 73 ++++ .../scripts/lib/schema-merge.mjs | 175 +++++++++ skills/browser-reverse/scripts/lib/yaml.mjs | 87 +++++ skills/browser-reverse/scripts/load.mjs | 170 +++++++++ skills/browser-reverse/scripts/normalize.mjs | 128 +++++++ 15 files changed, 1931 insertions(+) create mode 100644 skills/browser-reverse/BODY-CAPTURE-LIFT.md create mode 100644 skills/browser-reverse/REFERENCE.md create mode 100644 skills/browser-reverse/SKILL.md create 
mode 100644 skills/browser-reverse/package.json create mode 100644 skills/browser-reverse/scripts/discover.mjs create mode 100644 skills/browser-reverse/scripts/emit.mjs create mode 100644 skills/browser-reverse/scripts/filter.mjs create mode 100644 skills/browser-reverse/scripts/infer.mjs create mode 100644 skills/browser-reverse/scripts/lib/io.mjs create mode 100644 skills/browser-reverse/scripts/lib/path-template.mjs create mode 100644 skills/browser-reverse/scripts/lib/redact.mjs create mode 100644 skills/browser-reverse/scripts/lib/schema-merge.mjs create mode 100644 skills/browser-reverse/scripts/lib/yaml.mjs create mode 100644 skills/browser-reverse/scripts/load.mjs create mode 100644 skills/browser-reverse/scripts/normalize.mjs diff --git a/skills/browser-reverse/BODY-CAPTURE-LIFT.md b/skills/browser-reverse/BODY-CAPTURE-LIFT.md new file mode 100644 index 00000000..514f686e --- /dev/null +++ b/skills/browser-reverse/BODY-CAPTURE-LIFT.md @@ -0,0 +1,118 @@ +# Adding Response Body Capture to `browser-trace` — Lift Estimate + +> Grounded in the real source as of `browserbase/skills@main`. I read `SKILL.md`, +> `REFERENCE.md`, `lib.mjs`, `start-capture.mjs`, `snapshot-loop.mjs`, `bisect-cdp.mjs`, +> `bb-capture.mjs`, `bb-finalize.mjs`, `stop-capture.mjs`. + +--- + +## 1. Why this is harder than it looks + +`browser-trace` today does the simplest possible thing: it shells out to `browse cdp --domain Network --domain Console ...`, which emits one CDP event per line to stdout, and that stream is captured verbatim into `cdp/raw.ndjson`. **No CDP commands are issued back into the session.** The capture is fully one-way and stateless. + +Response bodies break that model. Bodies aren't pushed by CDP — they have to be **pulled** with a `Network.getResponseBody` request, keyed by `requestId`, **before the renderer evicts the resource**. Eviction is non-deterministic but typically happens within seconds of the response completing on a busy page. 
That means body capture has to be: + +- **Live** — runs concurrently with the trace, can't be done from `raw.ndjson` after the fact. +- **Bidirectional** — issues CDP commands, not just reads events. +- **Fast** — the gap between `Network.loadingFinished` and the `getResponseBody` call must be small. +- **Selective** — fetching every body would 10–100x the disk footprint and add real load on the renderer. + +This is a meaningful expansion of the skill's current architecture, not a tweak. + +--- + +## 2. The lift, by component + +### 2.1 New companion script — `scripts/body-capture.mjs` — **NEW, ~200 lines** + +The `browse cdp` subprocess can't be modified (it's an external binary), so body capture has to be a **second CDP client** running in parallel, attached to the same target. Same model as `snapshot-loop.mjs`, but instead of polling screenshots it subscribes to `Network.responseReceived` + `Network.loadingFinished` and issues `Network.getResponseBody` for matching requests. + +Responsibilities: + +- Open its own WebSocket to the CDP target (or use `browse --ws ...` if it supports request/response, which from the snapshot loop it does for one-shot commands — body capture is a long-lived subscription, so likely a raw `ws://` client). +- Maintain an in-memory map of `requestId → { url, method, contentType, status, type }` keyed off `Network.requestWillBeSent` + `Network.responseReceived`. +- On `Network.loadingFinished`: if the request matches the filter (default: `fetch`/`xhr` resourceType, JSON or form content-type, size cap), call `Network.getResponseBody` and write the result to `<run-dir>/cdp/network/bodies/<requestId>.json`. +- Track failures (eviction races, out-of-process iframes that can't be addressed, sizes over the cap) in a sidecar `bodies/_skipped.jsonl`. +- SIGTERM-clean shutdown so `stop-capture.mjs` doesn't have to know about it specifically (it would just need to also kill `.bodies.pid`). + +**Risk:** `Network.getResponseBody` requires a session-attached target.
For OOPIFs (cross-origin iframes), you have to use `Target.attachToTarget` first and route the command on the resulting session. Non-trivial. Realistic v1 punts on iframes and just records the skip reason. + +**Dependencies:** zero — Node stdlib has `ws` via `undici` /`WebSocket` (Node 22+) or you bundle a tiny WS client. The skill is currently zero-dep, so this constraint matters. + +### 2.2 `start-capture.mjs` — **MODIFIED, ~10 lines** + +Add an optional third detached subprocess: if `O11Y_BODIES=1` (or a `--bodies` flag), spawn `body-capture.mjs` the same way `snapshot-loop.mjs` is spawned, write `.bodies.pid`. Default off so existing users see no change. + +### 2.3 `stop-capture.mjs` — **MODIFIED, ~3 lines** + +Already loops over `['.cdp.pid', '.loop.pid']`. Add `'.bodies.pid'` to the list. Trivial. + +### 2.4 `bisect-cdp.mjs` — **MODIFIED, ~15 lines** + +Currently the only "network" buckets are CDP **events** (`requestWillBeSent`, `responseReceived`, `loadingFinished`, `loadingFailed`, `webSocket`). Bodies are content, not events, so they don't fit the existing `BUCKETS` predicate model. + +Two sensible places to expose them: + +1. **As-is on disk** — `cdp/network/bodies/.json` already exists from body-capture; bisect doesn't have to do anything. Per-page slicing (`cdp/pages//network/bodies/`) is the only real work: walk `network/responses.jsonl` for each page, find the matching body files, hard-link or copy them into the per-page dir. ~10 lines. +2. **Index** — emit `cdp/network/bodies-index.jsonl` mapping `{requestId, url, method, status, contentType, sizeBytes, bodyPath}` so query/grep tools don't have to walk the dir. ~5 lines. + +### 2.5 `lib.mjs` — **MODIFIED, ~5 lines** + +Add a helper `readBody(runDir, requestId) → { contentType, body, base64? }`. Useful for the new skill's `infer.mjs` and for `query.mjs`. 
+ +### 2.6 `query.mjs` — **MODIFIED, ~20 lines** + +Add a `bodies` subcommand: list captured bodies, filter by URL/status/content-type, dump a body to stdout. Optional but cheap. + +### 2.7 `bb-capture.mjs` / `bb-finalize.mjs` — **NO CHANGES** + +They delegate to `start-capture.mjs` / `stop-capture.mjs`. Inherits body capture for free. + +### 2.8 `SKILL.md` / `REFERENCE.md` — **MODIFIED, ~50 lines** + +Document: +- The new flag/env var. +- New on-disk layout (`cdp/network/bodies/`, `bodies-index.jsonl`). +- Caveats: eviction races, OOPIF gaps, size cap, default-off. +- Filter knobs (`O11Y_BODY_TYPES`, `O11Y_BODY_MAX_KB`, `O11Y_BODY_INCLUDE_PATTERN`). +- Privacy implication: bodies can contain user data. Off by default for a reason. + +--- + +## 3. Total lift + +| Component | Type | Lines | Risk | +|---|---|---|---| +| `scripts/body-capture.mjs` | new | ~200 | **medium** — WS client, eviction races, OOPIF | +| `scripts/start-capture.mjs` | modify | ~10 | low | +| `scripts/stop-capture.mjs` | modify | ~3 | low | +| `scripts/bisect-cdp.mjs` | modify | ~15 | low | +| `scripts/lib.mjs` | modify | ~5 | low | +| `scripts/query.mjs` | modify | ~20 | low | +| `SKILL.md` + `REFERENCE.md` | modify | ~50 | low | +| **Total** | | **~300 LOC** | | + +**Calendar estimate for one engineer who knows CDP:** ~2–3 days. +- Day 1: WS client + filter + happy-path body capture against Chromium local. +- Day 2: OOPIF target attachment, size cap, skip-tracking, integration with `start`/`stop`. +- Day 3: bisect integration, query subcommand, docs, end-to-end test against a Browserbase remote session. + +**Calendar estimate without prior CDP fluency:** ~1 week. The eviction race and OOPIF target plumbing are the parts that bite. + +--- + +## 4. Risks worth calling out in the PR + +1. **Privacy.** Bodies can contain bearer tokens, PII, partial PII even when redacted at the header layer. Default-off + an opt-in flag is non-negotiable. 
The redaction story has to live in the consuming skill (e.g. `browser-reverse`), not in the capture layer — capture should write what it sees. +2. **Performance.** `Network.getResponseBody` blocks on the renderer. For a page making 200 XHR requests, naive capture serializes every one of them. Mitigations: hard cap on concurrent in-flight `getResponseBody` calls (e.g. 8), aggressive content-type filter, default size cap (256 KB). +3. **Disk.** A 10-minute Browserbase session with body capture on can easily produce 100–500 MB of bodies. The skill should default to JSON-only + 256 KB cap and let users opt into more. +4. **Eviction races.** Some bodies will fail with `-32000 No data found for resource`. This is normal. `bodies/_skipped.jsonl` should record them so consumers know coverage isn't 100%. +5. **WebSocket frame data.** `Network.webSocketFrameSent` / `Received` already include the payload inline — no `getResponseBody` needed. v1 should explicitly punt on WebSocket bodies (already in the events bucket) to scope down. + +--- + +## 5. Recommendation + +Building this **into** `browser-trace` is the right call **if** the maintainers are willing to add a (default-off) feature with privacy and disk caveats. Putting it in a sibling skill is also viable but less clean — every consumer skill (api-spec, security audits, etc.) would have to reinvent the WS plumbing. + +The cleanest framing: **bodies are part of the trace, off by default, on with a flag.** Same shape as how Chrome DevTools handles "Preserve log" / "Disable cache" — capture options, not a separate tool. diff --git a/skills/browser-reverse/REFERENCE.md b/skills/browser-reverse/REFERENCE.md new file mode 100644 index 00000000..e8a9ba3c --- /dev/null +++ b/skills/browser-reverse/REFERENCE.md @@ -0,0 +1,240 @@ +# Browser Reverse — Reference + +Technical reference for the discovery pipeline, file formats, and configuration.
+ +## Pipeline + +``` +browser-trace run discover.mjs +.o11y//cdp/network/ ┌─────────┐ ┌────────┐ ┌──────────┐ ┌─────────┐ ┌──────┐ + requests.jsonl ──────────▶ │ load │ ─▶ │ filter │ ─▶ │ normalize│ ─▶ │ infer │ ─▶ │ emit │ + responses.jsonl └─────────┘ └────────┘ └──────────┘ └─────────┘ └──────┘ + paired filtered endpoints endpoints openapi + .jsonl .jsonl .jsonl .with- .yaml + schemas report.md + .jsonl +``` + +Each stage is a discrete script that reads a file and writes a file. `discover.mjs` is the dispatcher; pass `--stage ` to run a single stage for debugging. + +## Scripts + +All scripts are Node ESM (`type: module`). They depend only on the Node standard library. + +### `discover.mjs --run [flags]` + +Top-level dispatcher. Runs `load → filter → normalize → infer → emit` in order. With `--stage `, runs only that stage (assumes prior stages already wrote their intermediate file). + +### `load.mjs [bodies-dir]` + +- Reads `cdp/network/requests.jsonl` and `cdp/network/responses.jsonl`. +- Pairs by `requestId`. Drops `OPTIONS` (CORS preflight) and pure redirects (status 3xx with `Location` and no body — recorded as metadata on the *next* request in the chain when the requestId carries forward, otherwise dropped). +- Drops resource types that are not `XHR`, `Fetch`, or `Document` (skips `Image`, `Stylesheet`, `Font`, `Media`, `Manifest`, `Other`, `Script` unless the URL clearly looks like an API endpoint). +- **Body join**: if a `browse network` capture dir is provided (via `--bodies` or auto-detected at `/cdp/network/bodies/`), each subdir's `request.json` + `response.json` are read and joined to paired rows by `requestId`. The browse-network `id` field IS the CDP requestId for XHR/Fetch resource types, so the join is exact (not URL-or-timestamp matching). Bodies that look like JSON are parsed; otherwise the raw string is preserved. 
+- Output: `intermediate/paired.jsonl` — one row per pair with `{ method, url, status, reqHeaders, reqBody, respHeaders, respBody, contentType, type, ts }`. + +### `filter.mjs ` + +- Reads `intermediate/paired.jsonl`. +- Applies `--include` / `--exclude` / `--origins`. +- Applies built-in exclude list (analytics hosts, sourcemaps, service workers, fonts/CSS that snuck through). +- Output: `intermediate/filtered.jsonl`. + +### `normalize.mjs ` + +- Templatizes paths. Detection order per segment: + 1. UUID v1–v5 → `{id}` (`string`, `format: uuid`). + 2. Pure integer → `{id}` (`integer`). + 3. Hex/base62 ≥ 8 chars → `{id}` (`string`). + 4. If the same position varies across multiple samples and is short alpha → `{slug}` (`string`). + 5. Otherwise the segment is left static. +- Groups paired samples by `(origin, method, templatedPath)`. +- Collects query parameters across samples; marks `required: true` only when every sample carries the param. +- If two pre-normalization templates would collapse but yield divergent response status/content-type signatures, they're kept split and flagged. +- Output: `intermediate/endpoints.jsonl` — one row per endpoint with `{ origin, method, path, samples[], queryParams, statusCodes, normalizationFlags }`. + +### `infer.mjs ` + +- For each endpoint, runs JSON-Schema inference across request bodies and (when present) response bodies. +- Merge rules: required = present-in-all, types = union of observed types, arrays infer item schema, enum detected when ≤ 8 distinct values across ≥ 5 samples. +- Format hints: `date-time` (ISO-ish), `uri`, `email`, `uuid`. +- Picks a representative sample (most-recent successful 2xx) and writes redacted request/response example to `samples/`. +- Output: `intermediate/endpoints.with-schemas.jsonl`. + +### `emit.mjs ` + +- Builds the OpenAPI 3.1 document. 
+- Hoists structurally-identical schemas into `components.schemas` keyed by structural hash, with names derived from path tokens (`Item`, `Item_List`, etc.) — falls back to `Schema1`, `Schema2` if no path hint applies. +- Writes `openapi.yaml`, `openapi.json`, `report.md`, `confidence.json`. + +## File formats + +### `intermediate/paired.jsonl` + +```json +{ + "requestId": "12345.678", + "method": "GET", + "url": "https://api.example.com/v1/items/42?page=2", + "origin": "https://api.example.com", + "path": "/v1/items/42", + "query": { "page": "2" }, + "status": 200, + "type": "Fetch", + "contentType": "application/json", + "reqHeaders": { "accept": "application/json" }, + "reqBody": null, + "respHeaders": { "content-type": "application/json" }, + "respBody": null, + "ts": 1714400000000 +} +``` + +`reqBody` is the verbatim `postData` from `Network.requestWillBeSent` (parsed if JSON). `respBody` is `null` unless a `browse network` capture dir was joined in (see below) — `browse cdp` does not embed bodies. + +### Joining `browse network` bodies + +`browse network on` is a separate command from the `browse` CLI that writes per-request `request.json` + `response.json` files (with full bodies) to a temp directory. Discover joins these into the trace by `requestId`. + +Workflow: + +```bash +# during capture, alongside browser-trace +browse network on +# ...drive... +# IMPORTANT: snapshot the dir before it gets reused +cp -r "$(browse network path | jq -r .path)" .o11y//cdp/network/bodies/ +browse network off +``` + +Internals (matched in `lib/io.mjs` + `load.mjs`): + +- The browse-network entry's `request.json.id` field equals the CDP `requestId` for XHR/Fetch resource types. The join is by exact `requestId`, not URL or timestamp. +- For Document loads, the `id` field is a non-CDP UUID and won't match — those bodies are silently skipped (Documents aren't useful for API spec inference anyway). 
+- `response.json` from `browse network` may have empty `status` / `headers` / `mimeType` for some loads — that's fine, those are taken from the CDP firehose. Only `body` is read. +- The capture dir is shared per `browse` daemon session (`/tmp/.../browse-default-network/`). Run `browse network on` then snapshot the dir before another `browse network on` overwrites it. + +### `intermediate/endpoints.jsonl` + +```json +{ + "endpointKey": "GET https://api.example.com/v1/items/{id}", + "origin": "https://api.example.com", + "method": "GET", + "path": "/v1/items/{id}", + "rawPaths": ["/v1/items/42", "/v1/items/97"], + "pathParams": [{ "name": "id", "in": "path", "schema": { "type": "integer" } }], + "queryParams": [{ "name": "page", "in": "query", "required": false, "schema": { "type": "string" } }], + "statusCodes": [200, 200, 404], + "samples": [/* indices into paired.jsonl */], + "normalizationFlags": [] +} +``` + +### `confidence.json` + +```json +{ + "endpoints": [ + { + "key": "GET /v1/items/{id}", + "samples": 7, + "statusCodes": [200, 404], + "responseBodyKnown": false, + "requestBodyKnown": false, + "normalizationFlags": [], + "confidence": "medium" + } + ] +} +``` + +`confidence` is a coarse bucket: `low` (1–2 samples or normalization flags), `medium` (3–9 samples, no flags), `high` (≥ 10 samples, multi-status, no flags). + +## CLI flags (full) + +| Flag | Default | Notes | +|---|---|---| +| `--run ` | required | Resolves `cdp/network/{requests,responses}.jsonl` underneath | +| `--out ` | `/api-spec` | | +| `--bodies ` | auto | `browse network` capture dir to join into the trace. Auto-detected from `/cdp/network/bodies/` when present | +| `--include ` | none | Repeatable. ORed together. Applied after `--origins` | +| `--exclude ` | (defaults) | Repeatable. Combined with built-in defaults | +| `--origins ` | none | Comma-separated. 
If set, anything *not* matching is dropped before include/exclude | +| `--format ` | `both` | Format of the emitted spec | +| `--title ` | derived | `info.title` in the OpenAPI doc | +| `--redact ` | (defaults) | Comma-separated extra header names / JSON keys to scrub. Adds to defaults; never replaces | +| `--min-samples ` | `1` | Drop endpoints below this threshold (still listed in the report) | +| `--stage ` | (all) | One of `load`, `filter`, `normalize`, `infer`, `emit` | + +## Default exclude list + +URLs matching these patterns are dropped before any analysis (regex, applied to the full URL): + +- Analytics: `segment\.(io\|com)`, `mixpanel\.com`, `google-analytics\.com`, `googletagmanager\.com`, `datadog(hq)?\.com`, `sentry\.io`, `amplitude\.com`, `fullstory\.com`, `hotjar\.com`, `intercom\.io`, `clarity\.ms`, `cloudflareinsights\.com`, `doubleclick\.net`, `facebook\.com/tr` +- Static-only file extensions: `\.(png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot|otf|css|map|mp4|webm|mp3)(\?|$)` +- Service worker / metadata: `/sw\.js`, `/service-worker\.js`, `/manifest\.json$`, `/robots\.txt$`, `/favicon\.ico$` + +Override granularly via `--include` (which wins over default `--exclude`). + +## Default redactions + +Headers (case-insensitive): `authorization`, `cookie`, `set-cookie`, `x-csrf-token`, `x-xsrf-token`, `x-api-key`, `proxy-authorization`, plus any header name matching `*token*`, `*secret*`, `*signature*`. + +Body keys: `password`, `token`, `secret`, `api_key`, `apiKey`, `accessToken`, `refreshToken`, `creditCard`, `ssn`. + +Body values (regex): JWTs (`^eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$`), email addresses (`@` + TLD), phone numbers (E.164-ish). + +Redacted values are replaced with `""` so type information is preserved for schema inference. 
+ +## Path templating heuristics + +Per-segment classifier in `scripts/lib/path-template.mjs`: + +| Pattern | Replacement | OpenAPI schema | +|---|---|---| +| 8-4-4-4-12 hex (UUID) | `{id}` | `{ type: string, format: uuid }` | +| `\d+` | `{id}` | `{ type: integer }` | +| `[A-Za-z0-9]{8,}` (no vowels-only / dictionary check) | `{id}` | `{ type: string }` | +| Same-position alpha tokens varying across ≥ 2 samples | `{slug}` | `{ type: string }` | + +When multiple variable segments exist in one path, names are suffixed: `{id}`, `{id2}`, `{id3}`. The `--name-params` flag (future) will use sibling segment hints (`/products/42` → `{productId}`). + +## Confidence flags + +Possible entries in `normalizationFlags`: + +- `divergent-response-shape` — pre-normalization paths collapsed to the same template but had structurally different responses. The skill keeps them split and emits both. +- `single-sample` — endpoint observed exactly once. +- `single-status` — only one status code observed; spec lists only that response. +- `mixed-content-types` — different `content-type` values across samples. +- `request-body-only-on-some-samples` — POST/PUT seen with and without a body. + +## OpenAPI extensions + +The emitter writes a few `x-*` extensions on each operation: + +- `x-confidence`: `{ samples, statusCodes, normalizationFlags }` +- `x-origin`: the origin this operation was observed on (when multiple servers are listed) +- `x-observed-auth`: array of auth-shaped header names seen on this endpoint (e.g. `["authorization", "x-api-key"]`) +- `x-sample-count`: total number of paired samples backing the operation + +These extensions are stripped from `report.md` (which is human-facing) but preserved in the YAML/JSON. + +## Configuration via env + +| Var | Default | Effect | +|---|---|---| +| `O11Y_ROOT` | `.o11y` | Inherited from `browser-trace`. 
Used only when `--run` is bare run id rather than a full path | +| `DISCOVER_ENUM_MAX_DISTINCT` | `8` | Max distinct values to consider a field an enum | +| `DISCOVER_ENUM_MIN_SAMPLES` | `5` | Min samples before enum detection runs | + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| `paired.jsonl` is empty | trace contains no `Network.requestWillBeSent` events for XHR/Fetch | re-run `browser-trace` exercising the dynamic flows; static-only sites won't yield endpoints | +| `openapi.yaml` has only `paths: {}` | every paired request was filtered out | check `--origins` and the default exclude list; pass `--include '.*'` to bypass filtering | +| Path templating collapses too aggressively | numeric IDs being misread as enums, or dictionary words misread as slugs | add `--exclude` for the noisy paths and re-run, or file an issue with the trace | +| Schemas show `type: "string"` for everything | request/response bodies aren't valid JSON or weren't captured | check `paired.jsonl` for `reqBody`/`respBody` content — if `null`, bodies weren't in the trace | +| Spec validator complains about `info.version` | derived version is `0.1.0-discovered` which some tools dislike | pass `--version 0.1.0` (TODO) or post-edit the file | diff --git a/skills/browser-reverse/SKILL.md b/skills/browser-reverse/SKILL.md new file mode 100644 index 00000000..6bb98eac --- /dev/null +++ b/skills/browser-reverse/SKILL.md @@ -0,0 +1,136 @@ +--- +name: browser-reverse +description: Reverse-engineer a website's HTTP API into a best-effort OpenAPI 3.1 spec by analyzing a `browser-trace` capture. Use when the user wants to discover/extract API endpoints from a browser session, build an OpenAPI doc from network traffic, or document a third-party site's XHR/fetch surface for client integration. +compatibility: "Requires Node 18+ and a `browser-trace` run directory (`.o11y//`) produced by the sibling `browser-trace` skill. 
The scripts use only the Node standard library — no `npm install` step. `jq` is referenced in docs for ad-hoc querying but is not required by the scripts." +license: MIT +allowed-tools: Bash, Read, Grep +--- + +# Browser Reverse + +Replay-driven API reverse-engineering. Consume a `browser-trace` capture, pair its CDP request / response events, templatize observed URLs, infer JSON schemas from samples, and emit an **OpenAPI 3.1** document plus a human-readable coverage report. + +This skill **does not capture traffic**. It is purely offline post-processing on top of `browser-trace`'s `cdp/network/*.jsonl` buckets. The two skills compose: + +``` +browser-trace → .o11y/<run>/cdp/network/{requests,responses}.jsonl +browser-reverse → .o11y/<run>/api-spec/openapi.yaml + report.md +``` + +## When to use + +- The user wants an OpenAPI document for a third-party or undocumented website API. +- The user has a `browser-trace` run and wants endpoints + schemas extracted from it. +- The user is building a client/SDK against a site that doesn't publish a spec. +- The user wants a coverage report showing which flows would broaden the spec. + +If the user wants to **capture** traffic, send them to `browser-trace` first. + +## Two-step workflow + +### 1. Capture with `browser-trace` (and optionally bodies via `browse network on`) + +```bash +# Local Chrome example (see browser-trace SKILL.md for Browserbase variant) +"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \ + --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-spec about:blank & + +node ../browser-trace/scripts/start-capture.mjs 9222 my-site +browse env local 9222 +browse network on # capture request/response bodies +browse open https://example.com +# ...drive whatever flows you want covered...
+ +# Snapshot the bodies dir BEFORE turning capture off (the temp dir is shared +# per-session, so subsequent `browse network on` runs would mix your bodies +# with whatever a future capture writes if you skip this step). +cp -r "$(browse network path | jq -r .path)" .o11y/my-site/cdp/network/bodies/ +browse network off + +node ../browser-trace/scripts/stop-capture.mjs my-site +node ../browser-trace/scripts/bisect-cdp.mjs my-site +``` + +`browse network on` is **optional but strongly recommended** — without it, the spec has no response-body schemas (the CDP firehose used by `browse cdp` does not embed bodies). With it, both request bodies (already captured by CDP) *and* response bodies are joined into the trace by CDP `requestId`. + +### 2. Generate the spec + +```bash +node scripts/discover.mjs --run .o11y/my-site +# → .o11y/my-site/api-spec/openapi.yaml +# .o11y/my-site/api-spec/openapi.json +# .o11y/my-site/api-spec/report.md +# .o11y/my-site/api-spec/confidence.json +# .o11y/my-site/api-spec/samples/*.json +# .o11y/my-site/api-spec/intermediate/*.jsonl +``` + +`discover.mjs` auto-detects `/cdp/network/bodies/`. To use a body capture from elsewhere (e.g. didn't snapshot, want the live `browse network` dir), pass `--bodies ` explicitly. + +Then deliver the artifacts to the user (`exec.sendFile()` for `openapi.yaml` and `report.md`). + +## CLI flags + +| Flag | Required | Meaning | +|---|---|---| +| `--run ` | yes | Path to a `browser-trace` run directory | +| `--out ` | no | Output dir; default `/api-spec/` | +| `--bodies ` | no | `browse network` capture dir to join into the trace (auto-detected from `/cdp/network/bodies/` when present) | +| `--include ` | no | Only include URLs matching regex (repeatable) | +| `--exclude ` | no | Exclude URLs matching regex (repeatable; in addition to defaults) | +| `--origins ` | no | Comma-separated origin allow-list (e.g. `api.example.com,example.com`) | +| `--format ` | no | Output format. 
Default `both` | +| `--title ` | no | OpenAPI `info.title`. Default derived from primary origin | +| `--redact ` | no | Extra header names / JSON keys to redact (comma-separated) | +| `--min-samples ` | no | Minimum samples per endpoint to include. Default `1` | +| `--stage ` | no | Run only one stage: `load`, `filter`, `normalize`, `infer`, `emit` | + +## Output layout + +``` +/api-spec/ +├── openapi.yaml primary deliverable +├── openapi.json mirror +├── report.md human-readable summary + coverage caveats +├── confidence.json per-endpoint confidence + normalization flags +├── samples/ redacted request/response examples +│ └── __.json +└── intermediate/ pipeline byproducts (paired/filtered/endpoints jsonl) +``` + +## What you get from `browse cdp` and `browse network` + +Two complementary capture sources: + +| Source | Provides | Limitation | +|---|---|---| +| `browse cdp` (used by `browser-trace`) | request method/URL/headers/`postData`, response status/headers/mimeType, full event timing | **Does not embed response bodies.** Bodies must be pulled with `Network.getResponseBody`, which the firehose doesn't do. | +| `browse network on` (separate command) | request bodies AND response bodies on disk, keyed by CDP `requestId` | Capture dir is shared per `browse` session; snapshot before another `browse network on` overwrites it. | + +`discover.mjs` will pull bodies from a `browse network` dir if you pass `--bodies ` (or stash them under `/cdp/network/bodies/`, which is auto-detected). The matching is by `requestId` — `browse network` writes that into each `request.json` as `id`, and we join directly. + +What changes when bodies are present: + +- ✅ Path templating, query-param schemas, status codes, content-types — same either way. +- ✅ Request-body schemas — `postData` from CDP is enough; bodies dir is a nice-to-have for non-`postData` cases. +- ✅ **Response-body schemas** — fully inferred from real samples. 
Without bodies you get `{ description, content: }` skeletons. + +The report flags every endpoint that has no response-body sample. For a sketch of what it would take to teach `browser-trace` itself to capture response bodies natively (no separate `browse network on` step), see [BODY-CAPTURE-LIFT.md](BODY-CAPTURE-LIFT.md). + +## Limitations + +- **Coverage is bounded by the captured flow.** Endpoints not exercised in the trace will not appear. The skill cannot prove completeness. +- **Schemas are inductive, not contractual.** A field might be optional on the server even if every sample contained it. +- **Auth is observed, not specified.** The skill records auth-shaped headers in an `x-observed-auth` extension but won't claim a security scheme. +- **Path templating is heuristic.** Numeric / UUID / hex / slug patterns are detected per segment. Ambiguous URLs are flagged in `confidence.json`. +- **Redaction is best-effort.** Default redactions cover common credentials, but app-specific secrets may slip through; use `--redact` for known custom headers/keys. + +## Best practices + +1. **Drive the flows you want documented.** The richer the browser-trace, the richer the spec. +2. **Use `--origins` for noisy sites.** A marketing page hits dozens of analytics hosts; restrict to the API origin you care about. +3. **Inspect `report.md` first.** Low-sample endpoints, single-status endpoints, and missing request bodies are listed there with concrete suggestions. +4. **Bump `--min-samples` to 2+** when you want only confidently-shaped endpoints in the final doc — drop the long tail. +5. **Pair with `browse network on`** when response-body schemas matter. The CDP firehose alone has request bodies but not response bodies. + +For pipeline internals and the file format reference, see [REFERENCE.md](REFERENCE.md). 
diff --git a/skills/browser-reverse/package.json b/skills/browser-reverse/package.json new file mode 100644 index 00000000..86360e3f --- /dev/null +++ b/skills/browser-reverse/package.json @@ -0,0 +1,6 @@ +{ + "name": "browser-reverse", + "version": "0.1.0", + "private": true, + "type": "module" +} diff --git a/skills/browser-reverse/scripts/discover.mjs b/skills/browser-reverse/scripts/discover.mjs new file mode 100644 index 00000000..c349fa87 --- /dev/null +++ b/skills/browser-reverse/scripts/discover.mjs @@ -0,0 +1,93 @@ +#!/usr/bin/env node +// Top-level dispatcher: load → filter → normalize → infer → emit. +// +// Usage: +// node scripts/discover.mjs --run .o11y/ [flags] + +import path from 'node:path'; +import fs from 'node:fs'; +import { resolveRun, ensureDir } from './lib/io.mjs'; +import { load } from './load.mjs'; +import { filter } from './filter.mjs'; +import { normalize } from './normalize.mjs'; +import { infer } from './infer.mjs'; +import { emit } from './emit.mjs'; + +function parseArgs(argv) { + const opts = { + run: null, out: null, bodies: null, + include: [], exclude: [], origins: [], + format: 'both', title: null, redact: [], + minSamples: 1, stage: null, + }; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + const next = () => argv[++i]; + switch (a) { + case '--run': opts.run = next(); break; + case '--out': opts.out = next(); break; + case '--bodies': opts.bodies = next(); break; + case '--include': opts.include.push(next()); break; + case '--exclude': opts.exclude.push(next()); break; + case '--origins': opts.origins = next().split(',').map(s => s.trim()).filter(Boolean); break; + case '--format': opts.format = next(); break; + case '--title': opts.title = next(); break; + case '--redact': opts.redact = next().split(',').map(s => s.trim()).filter(Boolean); break; + case '--min-samples': opts.minSamples = parseInt(next(), 10); break; + case '--stage': opts.stage = next(); break; + case '-h': case '--help': + printHelp(); 
process.exit(0); + default: + console.error(`unknown arg: ${a}`); + printHelp(); process.exit(2); + } + } + return opts; +} + +function printHelp() { + console.error(`usage: discover.mjs --run [--out ] [--bodies ] + [--include ]... [--exclude ]... + [--origins ] [--format yaml|json|both] + [--title ] [--redact ] [--min-samples ] + [--stage load|filter|normalize|infer|emit] + + --bodies Directory written by \`browse network on\`. When set, response + bodies (and request bodies for non-postData captures) are + joined into the trace by CDP requestId. Without it, the spec + has no response-body schemas (browse cdp doesn't embed bodies).`); +} + +function main() { + const opts = parseArgs(process.argv.slice(2)); + if (!opts.run) { printHelp(); process.exit(2); } + + const runPath = resolveRun(opts.run); + const outDir = opts.out ? path.resolve(opts.out) : path.join(runPath, 'api-spec'); + ensureDir(outDir); + + const stages = opts.stage ? [opts.stage] : ['load', 'filter', 'normalize', 'infer', 'emit']; + + for (const stage of stages) { + const t0 = Date.now(); + let stats; + switch (stage) { + case 'load': stats = load(runPath, outDir, { bodies: opts.bodies }); break; + case 'filter': stats = filter(outDir, { include: opts.include, exclude: opts.exclude, origins: opts.origins }); break; + case 'normalize': stats = normalize(outDir); break; + case 'infer': stats = infer(outDir, { redact: opts.redact }); break; + case 'emit': stats = emit(outDir, { minSamples: opts.minSamples, format: opts.format, title: opts.title }); break; + default: console.error(`unknown stage: ${stage}`); process.exit(2); + } + const ms = Date.now() - t0; + console.log(`[${stage}] ${ms}ms ${JSON.stringify(stats)}`); + } + + console.log(`\noutput: ${outDir}`); + for (const f of ['openapi.yaml', 'openapi.json', 'report.md', 'confidence.json']) { + const p = path.join(outDir, f); + if (fs.existsSync(p)) console.log(` ${path.relative(process.cwd(), p)}`); + } +} + +main(); diff --git 
#!/usr/bin/env node
// Stage 5 — Emit.
//
// Build the OpenAPI 3.1 document, hoist repeated schemas into components, and
// write openapi.yaml, openapi.json, report.md, confidence.json.

import path from 'node:path';
import { readJsonl, writeJson, writeText, intermediatePath, readJson } from './lib/io.mjs';
import { structuralHash } from './lib/schema-merge.mjs';
import { toYaml } from './lib/yaml.mjs';

// Grade the evidence behind one endpoint:
//   low    — ≤2 samples, or any normalization flag (a heuristic had to guess)
//   high   — ≥10 samples AND at least two distinct status codes observed
//   medium — everything in between
function confidenceBucket(ep) {
  const flagged = ep.normalizationFlags.length !== 0;
  if (flagged || ep.sampleCount <= 2) return 'low';
  const wellSampled = ep.sampleCount >= 10;
  const multiStatus = ep.statusCodes.length >= 2;
  return wellSampled && multiStatus ? 'high' : 'medium';
}
// Hoist structurally-identical inline schemas into components.schemas. We use a
// stable structural hash and bias names off the endpoint path so refs are
// readable (e.g. "Item" instead of "Schema7"). Recurses into nested object/array
// schemas so a Post that appears once at the top level and once as the items of
// a list still hoists as a single component.
function buildComponents(endpoints) {
  const byHash = new Map();   // hash -> { name, schema, hint }
  const refCount = new Map(); // hash -> count of sites referencing it

  // OpenAPI 3.1 allows `type` to be an array of types; accept either form.
  function isObjectSchema(s) {
    if (!s || typeof s !== 'object') return false;
    if (s.type === 'object') return true;
    if (Array.isArray(s.type) && s.type.includes('object')) return true;
    return false;
  }
  function isArraySchema(s) {
    if (!s || typeof s !== 'object') return false;
    if (s.type === 'array') return true;
    if (Array.isArray(s.type) && s.type.includes('array')) return true;
    return false;
  }

  // Walk a schema tree, counting every object-schema occurrence by hash and
  // remembering the first naming hint seen for it.
  function visit(schema, hint) {
    if (!schema || typeof schema !== 'object') return;
    if (isObjectSchema(schema)) {
      const h = structuralHash(schema);
      refCount.set(h, (refCount.get(h) || 0) + 1);
      if (!byHash.has(h)) byHash.set(h, { name: null, schema, hint });
      for (const [k, child] of Object.entries(schema.properties || {})) {
        visit(child, propHint(hint, k));
      }
    } else if (isArraySchema(schema) && schema.items) {
      visit(schema.items, hint);
    }
  }

  for (const ep of endpoints) {
    if (ep.requestSchema) visit(ep.requestSchema, schemaHintFromPath(ep.path) + 'Request');
    for (const [, sch] of Object.entries(ep.responseSchemas || {})) {
      visit(sch, schemaHintFromPath(ep.path));
    }
  }

  // Hoist when (a) referenced by ≥ 2 sites, OR (b) it's an object with ≥ 4 properties.
  const components = {};
  let counter = 0;
  for (const [h, info] of byHash.entries()) {
    const refs = refCount.get(h) || 0;
    const propCount = Object.keys(info.schema.properties || {}).length;
    if (refs < 2 && propCount < 4) continue;
    let name = info.hint || `Schema${++counter}`;
    // Loop (not a single `if`): with many identically-hinted schemas, one
    // rename could itself collide with an earlier `${name}_${n}`.
    while (components[name]) name = `${name}_${++counter}`;
    info.name = name;
    components[name] = info.schema;
  }

  // refOrInline rewrites a schema, replacing any nested object schema that
  // matches a hoisted component with a $ref. Arrays have their items rewritten.
  function refOrInline(schema) {
    if (!schema || typeof schema !== 'object') return schema;
    if (isObjectSchema(schema)) {
      const h = structuralHash(schema);
      const info = byHash.get(h);
      if (info && info.name) return { $ref: `#/components/schemas/${info.name}` };
      if (!schema.properties) return schema;
      const rewritten = { ...schema, properties: {} };
      for (const [k, child] of Object.entries(schema.properties)) {
        rewritten.properties[k] = refOrInline(child);
      }
      return rewritten;
    }
    if (isArraySchema(schema) && schema.items) {
      return { ...schema, items: refOrInline(schema.items) };
    }
    return schema;
  }

  // Inline-rewrite the components themselves so nested objects within
  // components also use $refs.
  for (const [name, sch] of Object.entries(components)) {
    if (sch.properties) {
      components[name] = { ...sch, properties: Object.fromEntries(
        Object.entries(sch.properties).map(([k, c]) => [k, refOrInline(c)]),
      )};
    }
  }

  return { components, refOrInline };
}

// Component-name hint derived from a property key ("created_at" -> "CreatedAt").
function propHint(parentHint, key) {
  const cap = key.replace(/[^A-Za-z0-9]/g, '').replace(/^./, c => c.toUpperCase());
  return cap || (parentHint ? parentHint + 'Inner' : 'Schema');
}

// Component-name hint from the last non-parameter path segment
// ("/api/users/{id}" -> "Users").
function schemaHintFromPath(p) {
  if (!p) return 'Schema';
  const parts = p.split('/').filter(s => s && !s.startsWith('{'));
  if (!parts.length) return 'Root';
  const last = parts[parts.length - 1];
  return last.replace(/[^A-Za-z0-9]/g, '').replace(/^./, c => c.toUpperCase()) || 'Schema';
}

// Build one OpenAPI operation object for an endpoint, including request body
// (mutating methods only), per-status responses, and x-* confidence extensions.
function makeOperation(ep, refOrInline) {
  const params = [];
  for (const p of ep.pathParams || []) params.push(p);
  for (const p of ep.queryParams || []) params.push(p);

  const op = {
    summary: `${ep.method} ${ep.path}`,
    operationId: makeOpId(ep),
  };
  if (params.length) op.parameters = params;

  if (ep.requestSchema && (ep.method === 'POST' || ep.method === 'PUT' || ep.method === 'PATCH' || ep.method === 'DELETE')) {
    op.requestBody = {
      content: {
        [ep.requestContentType || 'application/json']: {
          schema: refOrInline(ep.requestSchema),
          ...(ep.requestExample ? { example: ep.requestExample } : {}),
        },
      },
    };
  }

  const responses = {};
  const statuses = ep.statusCodes.length ? ep.statusCodes : [200];
  // infer.mjs picks `responseExample` from a 2xx sample, so attach it to the
  // first 2xx status — not blindly to statusCodes[0], which may be a 4xx.
  const exampleStatus = statuses.find(s => Number(s) >= 200 && Number(s) < 300) ?? statuses[0];
  for (const status of statuses) {
    const ct = (ep.responseContentTypes && ep.responseContentTypes[status]) || 'application/json';
    const schema = ep.responseSchemas?.[String(status)];
    const entry = { description: defaultDescriptionFor(status) };
    const media = {
      ...(schema ? { schema: refOrInline(schema) } : {}),
      ...(status === exampleStatus && ep.responseExample ? { example: ep.responseExample } : {}),
    };
    // Only attach `content` when there is something to say — previously an
    // empty media object ({}) was emitted for statuses with no schema.
    if (Object.keys(media).length) entry.content = { [ct]: media };
    responses[String(status)] = entry;
  }
  op.responses = responses;

  // Extensions
  op['x-confidence'] = {
    samples: ep.sampleCount,
    statusCodes: ep.statusCodes,
    normalizationFlags: ep.normalizationFlags,
    confidence: confidenceBucket(ep),
  };
  op['x-sample-count'] = ep.sampleCount;
  if (ep.observedAuthHeaders?.length) op['x-observed-auth'] = ep.observedAuthHeaders;
  op['x-origin'] = ep.origin;

  return op;
}

// Human description for a status code entry.
function defaultDescriptionFor(status) {
  const n = Number(status);
  if (n >= 200 && n < 300) return 'Success';
  if (n >= 300 && n < 400) return 'Redirect';
  if (n === 400) return 'Bad request';
  if (n === 401) return 'Unauthorized';
  if (n === 403) return 'Forbidden';
  if (n === 404) return 'Not found';
  if (n >= 400 && n < 500) return 'Client error';
  if (n >= 500) return 'Server error';
  return `Status ${status}`;
}

// Deterministic operationId from method + templated path.
function makeOpId(ep) {
  const parts = ep.path.split('/').filter(Boolean).map(s => s.replace(/[{}]/g, ''));
  const tail = parts.map(p => p.replace(/[^A-Za-z0-9]/g, '_')).join('_');
  return `${ep.method.toLowerCase()}_${tail || 'root'}`;
}

// Stage entry point: read endpoints.with-schemas.jsonl, apply the min-samples
// cut, assemble the OpenAPI document, and write all four deliverables.
// Returns summary stats for the dispatcher's log line.
export function emit(outDir, opts = {}) {
  // Accept only a real finite number for minSamples. (`|| 1` broke an
  // explicit 0, and `?? 1` would still let NaN through — `sampleCount >= NaN`
  // is always false and would drop every endpoint.)
  const minSamples = Number.isFinite(opts.minSamples) ? opts.minSamples : 1;
  const format = opts.format || 'both';
  const titleOverride = opts.title || null;

  const endpoints = readJsonl(intermediatePath(outDir, 'endpoints.with-schemas.jsonl'));
  const kept = endpoints.filter(e => e.sampleCount >= minSamples);
  const dropped = endpoints.filter(e => e.sampleCount < minSamples);

  // Servers: one entry per distinct origin, sorted by frequency.
  const originCounts = new Map();
  for (const e of kept) originCounts.set(e.origin, (originCounts.get(e.origin) || 0) + e.sampleCount);
  const servers = [...originCounts.entries()].sort((a, b) => b[1] - a[1]).map(([url]) => ({ url }));

  const primary = servers[0]?.url || '';
  const title = titleOverride || (primary ? `${new URL(primary).host} (discovered)` : 'Discovered API');

  const { components, refOrInline } = buildComponents(kept);

  // Build paths: one keyed entry per templated path; each method becomes an
  // operation. When the same (path, method) is observed on multiple origins
  // (common for third-party analytics endpoints fanned across vendors), keep
  // the highest-sample-count operation and record the other origins under
  // `x-also-served-from` so no data is silently dropped.
  const paths = {};
  const collisions = {}; // pathKey -> [{origin, samples}]
  for (const ep of kept) {
    const m = ep.method.toLowerCase();
    if (!paths[ep.path]) paths[ep.path] = {};
    const existing = paths[ep.path][m];
    if (!existing) {
      paths[ep.path][m] = makeOperation(ep, refOrInline);
    } else {
      const key = `${m} ${ep.path}`;
      if (!collisions[key]) collisions[key] = [{ origin: existing['x-origin'], samples: existing['x-sample-count'] }];
      collisions[key].push({ origin: ep.origin, samples: ep.sampleCount });
      if (ep.sampleCount > (existing['x-sample-count'] || 0)) {
        paths[ep.path][m] = makeOperation(ep, refOrInline);
      }
    }
  }
  for (const [key, origins] of Object.entries(collisions)) {
    // key is `${method} ${path}`; URL paths cannot contain a raw space.
    const [m, p] = key.split(' ');
    const op = paths[p][m];
    const winner = op['x-origin'];
    op['x-also-served-from'] = origins.filter(o => o.origin !== winner).map(o => o.origin);
  }

  const doc = {
    openapi: '3.1.0',
    info: {
      title,
      version: '0.1.0-discovered',
      description: 'Spec discovered from a browser-trace capture by the browser-reverse skill. Inductive, not contractual — see `report.md` and `x-confidence` extensions for caveats.',
    },
    servers,
    paths,
  };
  if (Object.keys(components).length) doc.components = { schemas: components };

  if (format === 'yaml' || format === 'both') {
    writeText(path.join(outDir, 'openapi.yaml'), toYaml(doc));
  }
  if (format === 'json' || format === 'both') {
    writeJson(path.join(outDir, 'openapi.json'), doc);
  }

  // confidence.json — includes dropped endpoints too, with includedInSpec=false.
  const confidence = {
    endpoints: endpoints.map(ep => ({
      key: ep.endpointKey,
      samples: ep.sampleCount,
      statusCodes: ep.statusCodes,
      requestBodyKnown: ep.requestBodyKnown,
      responseBodyKnown: ep.responseBodyKnown,
      normalizationFlags: ep.normalizationFlags,
      confidence: confidenceBucket(ep),
      includedInSpec: ep.sampleCount >= minSamples,
    })),
  };
  writeJson(path.join(outDir, 'confidence.json'), confidence);

  // report.md
  const redaction = readJson(intermediatePath(outDir, 'redaction-stats.json'), { headers: 0, bodyKeys: 0, bodyValues: 0 });
  writeText(path.join(outDir, 'report.md'), buildReport({ kept, dropped, servers, redaction, minSamples }));

  return {
    endpoints: kept.length,
    droppedLowSample: dropped.length,
    servers: servers.length,
    components: Object.keys(components).length,
  };
}

// Render the human-readable report: servers, endpoint table, dropped list,
// coverage caveats, redaction counts, and suggested follow-up flows.
function buildReport({ kept, dropped, servers, redaction, minSamples }) {
  const lines = [];
  lines.push('# Browser-reverse: discovered API\n');
  lines.push('## Servers\n');
  for (const s of servers) lines.push(`- ${s.url}`);
  if (!servers.length) lines.push('_(none)_');
  lines.push('');

  lines.push('## Endpoints\n');
  lines.push('| Method | Path | Samples | Statuses | Confidence | Flags |');
  lines.push('|---|---|---|---|---|---|');
  const sorted = [...kept].sort((a, b) => a.path.localeCompare(b.path) || a.method.localeCompare(b.method));
  for (const ep of sorted) {
    const flags = ep.normalizationFlags.length ? ep.normalizationFlags.join(', ') : '—';
    lines.push(`| ${ep.method} | \`${ep.path}\` | ${ep.sampleCount} | ${ep.statusCodes.join(', ') || '—'} | ${confidenceBucket(ep)} | ${flags} |`);
  }
  if (!kept.length) lines.push('| — | — | — | — | — | — |');
  lines.push('');

  if (dropped.length) {
    lines.push(`## Dropped (below --min-samples=${minSamples})\n`);
    for (const ep of dropped) lines.push(`- \`${ep.method} ${ep.path}\` (${ep.sampleCount} sample${ep.sampleCount === 1 ? '' : 's'})`);
    lines.push('');
  }

  lines.push('## Coverage caveats\n');
  const noResp = kept.filter(e => !e.responseBodyKnown);
  if (noResp.length) {
    lines.push(`- **${noResp.length}** endpoint${noResp.length === 1 ? '' : 's'} have no response-body schema. \`browse cdp\` does not embed response bodies; pair with \`browse network on\` to capture them.`);
  }
  const singleSample = kept.filter(e => e.sampleCount === 1);
  if (singleSample.length) {
    lines.push(`- **${singleSample.length}** endpoint${singleSample.length === 1 ? '' : 's'} were observed only once. Drive the same flow again to gain confidence.`);
  }
  const noBodyOnPost = kept.filter(e => ['POST', 'PUT', 'PATCH'].includes(e.method) && !e.requestBodyKnown);
  if (noBodyOnPost.length) {
    lines.push(`- **${noBodyOnPost.length}** mutation endpoint${noBodyOnPost.length === 1 ? '' : 's'} have no request body in the trace (form-encoded? non-JSON? not captured?).`);
  }

  lines.push('');
  lines.push('## Redaction\n');
  lines.push(`- Headers redacted: ${redaction.headers}`);
  lines.push(`- Body keys redacted: ${redaction.bodyKeys}`);
  lines.push(`- Body values redacted by pattern: ${redaction.bodyValues}`);
  lines.push('');

  lines.push('## Suggested follow-up flows\n');
  const status404 = kept.filter(e => e.statusCodes.includes(404));
  if (status404.length) {
    lines.push(`- Endpoints that returned 404: ${status404.slice(0, 5).map(e => '`' + e.method + ' ' + e.path + '`').join(', ')}. Re-run with valid IDs to widen the success-path schema.`);
  }
  if (singleSample.length) {
    lines.push('- Re-exercise the single-sample endpoints listed above to promote them out of `low` confidence.');
  }
  if (!status404.length && !singleSample.length) {
    lines.push('- The captured flow looks reasonably balanced. Add an authenticated session if the unauth view is what was captured.');
  }
  return lines.join('\n') + '\n';
}

// CLI entry: `node emit.mjs <outDir>` (normally invoked via discover.mjs).
if (import.meta.url === `file://${process.argv[1]}`) {
  const out = process.argv[2];
  if (!out) { console.error('usage: emit.mjs <outDir>'); process.exit(2); }
  const stats = emit(out);
  console.log(`emit: ${stats.endpoints} endpoints, ${stats.servers} server(s), ${stats.components} components${stats.droppedLowSample ? `, ${stats.droppedLowSample} dropped (low sample)` : ''}`);
}
#!/usr/bin/env node
// Stage 2 — Filter.
//
// Apply --include / --exclude / --origins on top of paired.jsonl. Default
// excludes scrub analytics, sourcemaps, fonts, and other static-asset noise
// that the load stage may have let through (e.g. when looksApiUrl matched).

import { readJsonl, writeJsonl, intermediatePath } from './lib/io.mjs';

const DEFAULT_EXCLUDES = [
  // Analytics / RUM / session replay
  /segment\.(io|com)/i,
  /mixpanel\.com/i,
  /google-analytics\.com/i,
  /googletagmanager\.com/i,
  /datadog(hq)?\.com/i,
  /sentry\.io/i,
  /amplitude\.com/i,
  /fullstory\.com/i,
  /hotjar\.com/i,
  /intercom\.io/i,
  /clarity\.ms/i,
  /cloudflareinsights\.com/i,
  /doubleclick\.net/i,
  /facebook\.com\/tr/i,
  // Static assets
  /\.(png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot|otf|css|map|mp4|webm|mp3|m4a)(\?|$)/i,
  // SW / metadata
  /\/sw\.js(\?|$)/i,
  /\/service-worker\.js(\?|$)/i,
  /\/manifest\.json(\?|$)/i,
  /\/robots\.txt(\?|$)/i,
  /\/favicon\.ico(\?|$)/i,
];

// Host of an origin string, tolerating rows whose `origin` is missing or
// malformed. Previously a single malformed (non-empty) origin made
// `new URL(...)` throw and aborted the whole stage; such rows now simply
// fail the origin match and are counted as droppedOrigin.
function hostOf(origin) {
  if (!origin) return '';
  try { return new URL(origin).host; } catch { return ''; }
}

// Read intermediate/paired.jsonl, apply origin allow-list then exclude then
// include regexes (in that order), and write intermediate/filtered.jsonl.
// Returns per-reason drop counts for the dispatcher's log line.
export function filter(outDir, opts = {}) {
  const { include = [], exclude = [], origins = [] } = opts;
  const includeRes = include.map(s => new RegExp(s));
  const excludeRes = [...DEFAULT_EXCLUDES, ...exclude.map(s => new RegExp(s))];
  // Materialize once — the per-row spread of a Set inside the loop was O(n·m).
  const originList = [...new Set(origins)];

  const paired = readJsonl(intermediatePath(outDir, 'paired.jsonl'));
  const out = [];
  let droppedOrigin = 0, droppedExclude = 0, droppedInclude = 0;

  for (const row of paired) {
    if (originList.length) {
      const host = hostOf(row.origin);
      // Exact host or any subdomain of an allowed origin matches.
      const matched = originList.some(o => host === o || host.endsWith('.' + o));
      if (!matched) { droppedOrigin++; continue; }
    }
    if (excludeRes.some(re => re.test(row.url))) { droppedExclude++; continue; }
    if (includeRes.length && !includeRes.some(re => re.test(row.url))) { droppedInclude++; continue; }
    out.push(row);
  }

  writeJsonl(intermediatePath(outDir, 'filtered.jsonl'), out);
  return { kept: out.length, droppedOrigin, droppedExclude, droppedInclude };
}

// CLI entry: `node filter.mjs <outDir>` (normally invoked via discover.mjs).
if (import.meta.url === `file://${process.argv[1]}`) {
  const out = process.argv[2];
  if (!out) { console.error('usage: filter.mjs <outDir>'); process.exit(2); }
  const stats = filter(out);
  console.log(`filter: kept ${stats.kept}, dropped ${stats.droppedExclude} (exclude) ${stats.droppedOrigin} (origin) ${stats.droppedInclude} (include)`);
}
+ +import path from 'node:path'; +import crypto from 'node:crypto'; +import { readJsonl, writeJsonl, writeJson, intermediatePath, samplePath, ensureDir } from './lib/io.mjs'; +import { newProto, ingest, toSchema } from './lib/schema-merge.mjs'; +import { makeRedactor } from './lib/redact.mjs'; + +function pathHash(method, p) { + return crypto.createHash('sha1').update(`${method} ${p}`).digest('hex').slice(0, 10); +} + +function inferAuthHeaders(samples) { + const seen = new Set(); + for (const s of samples) { + for (const k of Object.keys(s.reqHeaders || {})) { + const lk = k.toLowerCase(); + if (lk === 'authorization' || lk === 'x-api-key' || /token/.test(lk) || /^x-.*-auth/.test(lk)) { + seen.add(lk); + } + } + } + return [...seen].sort(); +} + +export function infer(outDir, opts = {}) { + const redactor = makeRedactor({ extra: opts.redact || [] }); + + const endpoints = readJsonl(intermediatePath(outDir, 'endpoints.jsonl')); + const samplesByKey = new Map(); + for (const row of readJsonl(intermediatePath(outDir, 'endpoint-samples.jsonl'))) { + samplesByKey.set(row.endpointKey, row.samples); + } + + ensureDir(path.join(outDir, 'samples')); + const enriched = []; + + for (const ep of endpoints) { + const samples = samplesByKey.get(ep.endpointKey) || []; + const reqProto = newProto(); + const respProtoByStatus = new Map(); // status -> proto + + let pickedReqExample = null; + let pickedRespExample = null; + let pickedReqStatus = null, pickedRespStatus = null; + + for (const s of samples) { + if (s.reqBody != null && typeof s.reqBody === 'object') { + ingest(reqProto, s.reqBody); + if (!pickedReqExample) { pickedReqExample = s.reqBody; pickedReqStatus = s.status; } + } + if (s.respBody != null && typeof s.respBody === 'object') { + const status = s.status ?? 
0; + let p = respProtoByStatus.get(status); + if (!p) { p = newProto(); respProtoByStatus.set(status, p); } + ingest(p, s.respBody); + if (s.status >= 200 && s.status < 300 && !pickedRespExample) { + pickedRespExample = s.respBody; + pickedRespStatus = s.status; + } + } + } + + const requestBodyKnown = reqProto.samples > 0; + const responseBodyKnown = [...respProtoByStatus.values()].some(p => p.samples > 0); + + const requestSchema = requestBodyKnown ? toSchema(reqProto) : null; + const responseSchemas = {}; + for (const [status, p] of respProtoByStatus.entries()) { + responseSchemas[String(status)] = toSchema(p); + } + + // Determine the canonical content-type per role from sample headers. + const reqCT = inferContentType(samples, 'reqHeaders'); + const respCTByStatus = {}; + for (const s of samples) { + const status = s.status ?? 0; + if (!respCTByStatus[status]) respCTByStatus[status] = inferContentType([s], 'respHeaders'); + } + + // Redact once and reuse for both the persisted sample file and the inline + // OpenAPI example. (Calling redactBody twice double-counts redactions.) + const ph = pathHash(ep.method, ep.path); + const reqExample = pickedReqExample != null ? redactor.redactBody(pickedReqExample) : null; + const respExample = pickedRespExample != null ? 
redactor.redactBody(pickedRespExample) : null; + const reqHeaders = redactor.redactHeaders(samples[0]?.reqHeaders || {}); + const respHeaders = redactor.redactHeaders(samples[0]?.respHeaders || {}); + + const example = { + endpoint: ep.endpointKey, + request: { status: pickedReqStatus, headers: reqHeaders, body: reqExample }, + response: { status: pickedRespStatus, headers: respHeaders, body: respExample }, + }; + writeJson(samplePath(outDir, ep.method, ph), example); + + enriched.push({ + ...ep, + pathHash: ph, + requestBodyKnown, + responseBodyKnown, + requestSchema, + responseSchemas, + requestContentType: reqCT, + responseContentTypes: respCTByStatus, + requestExample: reqExample, + responseExample: respExample, + observedAuthHeaders: inferAuthHeaders(samples), + }); + } + + writeJsonl(intermediatePath(outDir, 'endpoints.with-schemas.jsonl'), enriched); + + // Also persist redaction stats for the report. + writeJson(intermediatePath(outDir, 'redaction-stats.json'), redactor.counts); + + return { endpoints: enriched.length, redactor: redactor.counts }; +} + +function inferContentType(samples, headerField) { + for (const s of samples) { + const headers = s[headerField] || {}; + for (const [k, v] of Object.entries(headers)) { + if (k.toLowerCase() === 'content-type') return String(v).split(';')[0].trim(); + } + } + return null; +} + +if (import.meta.url === `file://${process.argv[1]}`) { + const out = process.argv[2]; + if (!out) { console.error('usage: infer.mjs '); process.exit(2); } + const stats = infer(out); + console.log(`infer: ${stats.endpoints} endpoints (redactions: ${stats.redactor.headers}h ${stats.redactor.bodyKeys}k ${stats.redactor.bodyValues}v)`); +} diff --git a/skills/browser-reverse/scripts/lib/io.mjs b/skills/browser-reverse/scripts/lib/io.mjs new file mode 100644 index 00000000..e6e10a82 --- /dev/null +++ b/skills/browser-reverse/scripts/lib/io.mjs @@ -0,0 +1,58 @@ +// File-IO helpers shared across the pipeline. 
// File-IO helpers shared across the pipeline. Mirrors the conventions of
// browser-trace/scripts/lib.mjs. Node stdlib only.

import fs from 'node:fs';
import path from 'node:path';

/** Create directory `p` (and any missing parents); no-op when it exists. */
export function ensureDir(p) {
  fs.mkdirSync(p, { recursive: true });
}

/** Read a .jsonl file into an array. Missing file -> []; malformed or empty
 *  lines are skipped silently (intermediate files are best-effort). */
export function readJsonl(p) {
  if (!fs.existsSync(p)) return [];
  const rows = [];
  for (const raw of fs.readFileSync(p, 'utf8').split('\n')) {
    if (!raw) continue;
    try {
      rows.push(JSON.parse(raw));
    } catch {
      /* skip malformed */
    }
  }
  return rows;
}

/** Write `items` as .jsonl: one JSON object per line, trailing newline,
 *  empty file for an empty array. Creates parent dirs. */
export function writeJsonl(p, items) {
  ensureDir(path.dirname(p));
  let payload = '';
  if (items.length > 0) {
    payload = items.map((item) => JSON.stringify(item)).join('\n') + '\n';
  }
  fs.writeFileSync(p, payload);
}

/** Read a JSON file; returns `fallback` when missing or unparseable. */
export function readJson(p, fallback = null) {
  if (!fs.existsSync(p)) return fallback;
  try {
    return JSON.parse(fs.readFileSync(p, 'utf8'));
  } catch {
    return fallback;
  }
}

/** Pretty-print `obj` to `p` (2-space indent, trailing newline). */
export function writeJson(p, obj) {
  ensureDir(path.dirname(p));
  fs.writeFileSync(p, JSON.stringify(obj, null, 2) + '\n');
}

/** Write raw text to `p`, creating parent dirs as needed. */
export function writeText(p, s) {
  ensureDir(path.dirname(p));
  fs.writeFileSync(p, s);
}

/** Resolve a run argument. An existing directory path wins; otherwise the
 *  argument is treated as a run-id under $O11Y_ROOT (default `.o11y`).
 *  Throws when neither resolves. */
export function resolveRun(runArg) {
  if (fs.existsSync(runArg) && fs.statSync(runArg).isDirectory()) {
    return path.resolve(runArg);
  }
  const root = process.env.O11Y_ROOT || '.o11y';
  const guess = path.join(root, runArg);
  if (fs.existsSync(guess)) return path.resolve(guess);
  throw new Error(`run path not found: ${runArg} (tried ${guess})`);
}

/** Path of an intermediate pipeline artifact under <outDir>/intermediate/. */
export function intermediatePath(outDir, name) {
  return path.join(outDir, 'intermediate', name);
}

/** Path of a redacted sample file under <outDir>/samples/. */
export function samplePath(outDir, method, pathHash) {
  return path.join(outDir, 'samples', `${method.toLowerCase()}__${pathHash}.json`);
}
// Templatize concrete URL paths into OpenAPI path templates.
//
// Strategy: classify each segment in isolation; collisions across samples are
// handled by the caller (normalize.mjs), which groups samples by the resulting
// templated path and falls back to keeping endpoints split when the response
// shape disagrees.

// UUID with any version/variant nibble. The previous pattern pinned version to
// [1-5] and variant to [89ab], which rejected UUIDv6/v7/v8 (RFC 9562) and the
// nil UUID — those segments then fell through every other classifier too
// (the dashes break HEX_RE and B62_RE) and were left un-templatized.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
const HEX_RE = /^[0-9a-f]{8,}$/i;
const B62_RE = /^[A-Za-z0-9]{8,}$/;
const INT_RE = /^\d+$/;

// Static-looking segments we never template even if they're numeric/hex
// (e.g. version markers like "v1", "v2", short slugs that are real path parts).
const STATIC_HINTS = /^(v\d+|api|graphql|rest|public|private|me|self)$/i;

// Classify one path segment: either { kind: 'static' } or
// { kind: 'param', name, schema }. Checked in specificity order:
// static hints, UUID, integer, long hex, then mixed-case base62.
export function classifySegment(seg) {
  if (!seg) return { kind: 'static' };
  if (STATIC_HINTS.test(seg)) return { kind: 'static' };
  if (UUID_RE.test(seg)) return { kind: 'param', name: 'id', schema: { type: 'string', format: 'uuid' } };
  if (INT_RE.test(seg)) return { kind: 'param', name: 'id', schema: { type: 'integer' } };
  if (HEX_RE.test(seg)) return { kind: 'param', name: 'id', schema: { type: 'string' } };
  if (B62_RE.test(seg) && /[A-Z]/.test(seg) && /[a-z]/.test(seg) && /\d/.test(seg)) {
    return { kind: 'param', name: 'id', schema: { type: 'string' } };
  }
  return { kind: 'static' };
}

// Single-pass templating used during the first sweep — segments are evaluated
// independently. Returns { template, params: [{name, schema, position}] }.
export function templatize(rawPath) {
  const segs = rawPath.split('/');
  const params = [];
  let counter = 0;
  const out = segs.map((seg, i) => {
    if (!seg && i > 0) return seg; // preserve empty segments (e.g. trailing '/')
    const c = classifySegment(seg);
    if (c.kind === 'static') return seg;
    counter++;
    const name = counter === 1 ? c.name : `${c.name}${counter}`;
    params.push({ name, schema: c.schema, position: i });
    return `{${name}}`;
  });
  return { template: out.join('/'), params };
}

// Second pass: given a set of paths that share the same number of segments
// and the same statics in the obvious positions, detect "slug" segments —
// positions that are alpha and *vary* across samples but didn't trip the
// numeric/UUID/hex classifiers in pass 1. Returns the same shape as templatize.
export function templatizeWithSlugs(paths) {
  if (!paths.length) return { template: '', params: [] }; // defensive: empty group
  if (paths.length < 2) return templatize(paths[0]);
  const split = paths.map(p => p.split('/'));
  const len = split[0].length;
  if (!split.every(s => s.length === len)) return templatize(paths[0]);

  const params = [];
  let counter = 0;
  const tpl = [];
  for (let i = 0; i < len; i++) {
    const colSamples = split.map(s => s[i]);
    const first = colSamples[0];
    if (!first && i > 0) { tpl.push(''); continue; }

    const c0 = classifySegment(first);
    if (c0.kind === 'param') {
      counter++;
      const name = counter === 1 ? c0.name : `${c0.name}${counter}`;
      params.push({ name, schema: c0.schema, position: i });
      tpl.push(`{${name}}`);
      continue;
    }

    const distinct = new Set(colSamples);
    // Never slug a column containing a STATIC_HINTS segment: pass 1 refuses
    // to template "v1"/"v2" etc., and pass 2 must agree — otherwise
    // /v1/items and /v2/items would merge into /{slug}/items.
    if (
      distinct.size > 1 &&
      colSamples.every(s => /^[A-Za-z0-9_-]+$/.test(s)) &&
      !colSamples.some(s => STATIC_HINTS.test(s))
    ) {
      counter++;
      const name = counter === 1 ? 'slug' : `slug${counter}`;
      params.push({ name, schema: { type: 'string' }, position: i });
      tpl.push(`{${name}}`);
      continue;
    }

    tpl.push(first);
  }
  return { template: tpl.join('/'), params };
}
// Header names that are always secret, regardless of value.
const HEADER_DENY = new Set([
  'authorization', 'cookie', 'set-cookie', 'x-csrf-token', 'x-xsrf-token',
  'x-api-key', 'proxy-authorization',
]);

// Header-name fragments that also mark a header as secret.
const HEADER_PATTERNS = [/token/i, /secret/i, /signature/i, /session/i];

// Body keys (compared lowercased with _ and - stripped) that are secret.
const KEY_DENY = new Set([
  'password', 'token', 'secret', 'api_key', 'apikey',
  'accesstoken', 'refreshtoken', 'creditcard', 'ssn',
]);

// Value shapes that are redacted wherever they appear.
const JWT_RE = /^eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$/;
const EMAIL_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
const PHONE_RE = /^\+?[0-9][0-9\s().-]{6,}[0-9]$/;

/**
 * Build a redactor with an optional list of extra sensitive names (applied to
 * both header names and body keys, lowercased). Returns:
 *   redactHeaders(headers) — copy of headers with secret values blanked,
 *   redactBody(node)       — deep copy with secret keys/values blanked,
 *   counts                 — running totals { headers, bodyKeys, bodyValues }.
 */
export function makeRedactor({ extra = [] } = {}) {
  // One lowercased set serves both header-name and body-key checks, matching
  // the original behavior of adding each extra entry to both deny lists.
  const extraLower = new Set(extra.map((e) => e.toLowerCase()));
  const counts = { headers: 0, bodyKeys: 0, bodyValues: 0 };

  const headerIsSecret = (name) => {
    const key = name.toLowerCase();
    return HEADER_DENY.has(key) || extraLower.has(key) ||
      HEADER_PATTERNS.some((re) => re.test(key));
  };

  const keyIsSecret = (name) => {
    const key = String(name).toLowerCase().replace(/[_-]/g, '');
    return KEY_DENY.has(key) || extraLower.has(key);
  };

  // Only strings of 6+ chars can be value-secrets (JWT / email / phone shapes).
  const valueIsSecret = (v) =>
    typeof v === 'string' && v.length >= 6 &&
    (JWT_RE.test(v) || EMAIL_RE.test(v) || PHONE_RE.test(v));

  function redactHeaders(headers) {
    if (!headers || typeof headers !== 'object') return headers;
    const clean = {};
    for (const [name, value] of Object.entries(headers)) {
      if (headerIsSecret(name)) {
        clean[name] = '';
        counts.headers++;
      } else {
        clean[name] = value;
      }
    }
    return clean;
  }

  function redactBody(node) {
    if (Array.isArray(node)) return node.map(redactBody);
    if (node && typeof node === 'object') {
      const clean = {};
      for (const [key, value] of Object.entries(node)) {
        if (keyIsSecret(key)) {
          clean[key] = '';
          counts.bodyKeys++;
        } else {
          clean[key] = redactBody(value);
        }
      }
      return clean;
    }
    if (valueIsSecret(node)) {
      counts.bodyValues++;
      return '';
    }
    return node;
  }

  return { redactHeaders, redactBody, counts };
}

// --- skills/browser-to-api/scripts/lib/schema-merge.mjs (next file in this patch) ---
// JSON-Schema (draft 2020-12 / OpenAPI 3.1 compatible) inference from sample values.
//
// The merge is associative and idempotent: mergeSchemas(merge(a,b), c) == merge(a, merge(b,c)).
// Required fields are intersected (must be present in every sample). Types are
// unioned. Arrays infer item schemas across all samples. Enum detection runs as
// a final pass once all samples are merged in.

const ENUM_MAX = parseInt(process.env.DISCOVER_ENUM_MAX_DISTINCT || '8', 10);
const ENUM_MIN = parseInt(process.env.DISCOVER_ENUM_MIN_SAMPLES || '5', 10);

const ISO_RE = /^\d{4}-\d{2}-\d{2}([T ]\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?)?$/;
const URI_RE = /^https?:\/\/\S+$/i;
// Renamed from EMAIL_RE (as in redact.mjs) so the two modules never collide
// when concatenated; same pattern, same behavior.
const EMAIL_FORMAT_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i;

// Map a JS value to its JSON type name; numbers are refined to 'integer'
// later, inside ingest().
function jsonType(v) {
  if (v === null) return 'null';
  return Array.isArray(v) ? 'array' : typeof v; // 'string'|'number'|'boolean'|'object'
}

// Best-effort string format sniffing; order matters (uuid before date-time
// before uri before email).
function inferFormat(v) {
  if (typeof v !== 'string') return null;
  if (UUID_RE.test(v)) return 'uuid';
  if (ISO_RE.test(v)) return 'date-time';
  if (URI_RE.test(v)) return 'uri';
  if (EMAIL_FORMAT_RE.test(v)) return 'email';
  return null;
}

// Build a "pre-schema" — captures every observed sample so we can compute
// required/enum/format with global knowledge, then collapse to JSON Schema.
// Fresh accumulator ("proto") for one value position. All observed samples are
// folded in via ingest(); toSchema() later collapses it to JSON Schema.
export function newProto() {
  return {
    types: new Set(),      // observed JSON types ('integer' split out of 'number')
    samples: 0,            // values folded in at this level
    nullCount: 0,          // how many of those were null
    formats: new Map(),    // format name -> count of samples that matched
    values: new Set(),     // distinct primitive values (enum candidates), capped
    valuesCapped: false,   // true once the distinct-value cap was blown
    properties: new Map(), // object key -> child proto
    presence: new Map(),   // object key -> count of samples containing the key
    items: null,           // proto for array items (created lazily)
  };
}

// Beyond this many distinct primitives we stop tracking values entirely —
// the field is clearly not an enum.
const VALUE_CAP = 64;

// Fold one observed value into the accumulator, recursing into objects/arrays.
// Uses jsonType()/inferFormat() declared earlier in this module.
export function ingest(proto, value) {
  proto.samples += 1;
  const kind = jsonType(value);

  if (kind === 'null') {
    proto.types.add('null');
    proto.nullCount += 1;
    return;
  }

  // Numbers are refined: whole numbers count as 'integer'.
  proto.types.add(kind === 'number' && Number.isInteger(value) ? 'integer' : kind);

  switch (kind) {
    case 'string':
    case 'number':
    case 'boolean': {
      if (!proto.valuesCapped) {
        proto.values.add(value);
        if (proto.values.size > VALUE_CAP) {
          proto.values.clear();
          proto.valuesCapped = true;
        }
      }
      if (kind === 'string') {
        const fmt = inferFormat(value);
        if (fmt) proto.formats.set(fmt, (proto.formats.get(fmt) || 0) + 1);
      }
      break;
    }
    case 'object': {
      for (const [key, child] of Object.entries(value)) {
        proto.presence.set(key, (proto.presence.get(key) || 0) + 1);
        let childProto = proto.properties.get(key);
        if (!childProto) {
          childProto = newProto();
          proto.properties.set(key, childProto);
        }
        ingest(childProto, child);
      }
      break;
    }
    case 'array': {
      // The array itself counts as ONE sample at this level (samples++ above);
      // each element is sampled individually on the items proto.
      if (!proto.items) proto.items = newProto();
      for (const element of value) ingest(proto.items, element);
      break;
    }
  }
}

// Fold a batch of samples; returns the proto for chaining.
export function ingestMany(proto, values) {
  for (const v of values) ingest(proto, v);
  return proto;
}

// Convert a proto into a JSON Schema fragment.
// Collapse an accumulator into a JSON Schema fragment. Returns {} for an
// absent/empty proto — with zero samples nothing can be claimed.
export function toSchema(proto) {
  if (!proto || proto.samples === 0) return {};

  const observed = [...proto.types];
  const concrete = observed.filter((t) => t !== 'null');
  const nullable = proto.types.has('null') && concrete.length > 0;
  const withNull = (t) => (nullable ? [t, 'null'] : t);

  // Primitive (possibly nullable) — may carry a format and/or enum.
  if (concrete.length === 1 && concrete[0] !== 'object' && concrete[0] !== 'array') {
    const type = concrete[0];
    const schema = { type: withNull(type) };

    if (type === 'string') {
      // Adopt a format only when it matched >= 80% of the string samples.
      const stringSamples = proto.samples - proto.nullCount;
      if (stringSamples > 0) {
        for (const [fmt, hits] of proto.formats) {
          if (hits / stringSamples >= 0.8) { schema.format = fmt; break; }
        }
      }
    }

    // Enum detection: low cardinality AND meaningful repetition (otherwise
    // every distinct ID across N samples would look like an N-way enum).
    // Keep the short-circuit order: the cap/size checks must run before the
    // ENUM_* thresholds are consulted.
    const valueSamples = proto.samples - proto.nullCount;
    if (!proto.valuesCapped &&
        proto.values.size > 0 &&
        proto.values.size <= ENUM_MAX &&
        valueSamples >= ENUM_MIN &&
        proto.values.size <= Math.max(2, Math.floor(valueSamples / 2))) {
      schema.enum = [...proto.values].sort((a, b) => String(a).localeCompare(String(b)));
    }
    return schema;
  }

  // Object: a key is required only when present in every non-null sample.
  if (concrete.length === 1 && concrete[0] === 'object') {
    const properties = {};
    const required = [];
    const objectSamples = proto.samples - proto.nullCount;
    for (const [key, child] of proto.properties) {
      properties[key] = toSchema(child);
      const seen = proto.presence.get(key) || 0;
      if (seen > 0 && seen === objectSamples) required.push(key);
    }
    const schema = { type: withNull('object') };
    if (Object.keys(properties).length > 0) schema.properties = properties;
    if (required.length > 0) schema.required = required.sort();
    return schema;
  }

  // Array
  if (concrete.length === 1 && concrete[0] === 'array') {
    const schema = { type: withNull('array') };
    if (proto.items) schema.items = toSchema(proto.items);
    return schema;
  }

  // Mixed types — typed union via a "type" array (OpenAPI 3.1 / draft 2020-12 OK).
  return { type: nullable ? [...concrete, 'null'] : concrete };
}

// Convenience: build a schema directly from an array of sample values.
export function inferSchema(samples) {
  return toSchema(ingestMany(newProto(), samples));
}

// Stable structural hash for schema deduplication when hoisting components:
// JSON-like serialization with object keys sorted, so structurally equal
// schemas hash identically regardless of key insertion order.
export function structuralHash(schema) {
  if (!schema || typeof schema !== 'object') return JSON.stringify(schema);
  if (Array.isArray(schema)) {
    return `[${schema.map(structuralHash).join(',')}]`;
  }
  const parts = Object.keys(schema).sort()
    .map((k) => `${JSON.stringify(k)}:${structuralHash(schema[k])}`);
  return `{${parts.join(',')}}`;
}

// --- skills/browser-to-api/scripts/lib/yaml.mjs (next file in this patch) ---
// Minimal YAML emitter for the OpenAPI document we build. Sufficient for the
// shapes we produce (objects, arrays, strings, numbers, booleans, null) without
// pulling in a dep. Strings are conservatively quoted whenever they contain any
// character that would change YAML parsing.
//
// YAML 1.2 reserves certain characters as indicators that cannot start a plain
// scalar (they must be quoted): , [ ] { } # & * ! | > ' " % @ `
// plus ? and : when followed by whitespace, plus - when followed by whitespace.
// We're conservative: if any of those rules might trip, single-quote the string.
const FIRST_CHAR_DENY = /^[,\[\]{}#&*!|>'"%@`]/;
const FIRST_CHAR_AMBIG = /^[-?:]/;
const SAFE_BARE = /^[A-Za-z0-9_./-][A-Za-z0-9 _./@-]*$/;
const RESERVED = new Set([
  'true', 'false', 'null', 'yes', 'no', 'on', 'off', '~',
  'True', 'False', 'Null', 'TRUE', 'FALSE', 'NULL',
]);

// One indentation step. It is deliberately exactly as wide as the '- '
// sequence indicator: that guarantees the continuation lines of a block item
// line up with the item's first key (see the array branch of emit()).
const INDENT = '  ';

// Quote a string for YAML output when leaving it plain would change parsing:
// empty strings, reserved words, number-like strings, or anything tripping
// the indicator/charset rules. Prefers single quotes; falls back to JSON
// (double-quoted) when the value contains ' or a newline.
function quoteScalar(s) {
  if (s === '') return "''";
  if (RESERVED.has(s)) return `'${s}'`;
  if (/^-?\d+(\.\d+)?$/.test(s)) return `'${s}'`;
  if (FIRST_CHAR_DENY.test(s) || FIRST_CHAR_AMBIG.test(s) || !SAFE_BARE.test(s)) {
    if (!s.includes("'") && !s.includes('\n')) return `'${s}'`;
    return JSON.stringify(s); // double-quoted YAML is JSON-compatible
  }
  return s;
}

// Render one scalar. Non-finite numbers (NaN/Infinity) are emitted as quoted
// strings rather than YAML's .nan/.inf forms.
function emitScalar(v) {
  if (v === null || v === undefined) return 'null';
  if (typeof v === 'boolean') return v ? 'true' : 'false';
  if (typeof v === 'number') {
    if (!Number.isFinite(v)) return JSON.stringify(String(v));
    return String(v);
  }
  return quoteScalar(String(v));
}

function isScalar(v) {
  return v === null || v === undefined ||
    typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean';
}

// Recursively emit a node as YAML block style at the given nesting depth.
// Empty collections are emitted in flow style ([] / {}).
//
// FIX: the previous version glued a multi-line item onto its dash with
// `pad + '- ' + inner.trimStart()` (plus a dead `.slice(0)`), which left the
// item's continuation lines at a column not deeper than the dash — with the
// single-space pad that produced continuation keys LEFT of the first key,
// i.e. invalid YAML for any array of multi-key objects (e.g. `parameters:`).
// Now the indent unit is pinned to two spaces and only the first line is
// folded onto the dash, so every line of the item sits at depth+1 — exactly
// two columns past the dash, aligned with the folded first key.
function emit(node, depth) {
  if (isScalar(node)) return emitScalar(node);
  const pad = INDENT.repeat(depth);

  if (Array.isArray(node)) {
    if (node.length === 0) return '[]';
    return node
      .map((item) => {
        if (isScalar(item)) return `${pad}- ${emitScalar(item)}`;
        // Render the item one level deeper, then fold its first line onto
        // the dash line; the remaining lines already carry the depth+1 pad.
        const lines = emit(item, depth + 1).split('\n');
        lines[0] = `${pad}- ${lines[0].trimStart()}`;
        return lines.join('\n');
      })
      .join('\n');
  }

  // Mapping.
  const keys = Object.keys(node);
  if (keys.length === 0) return '{}';
  const out = [];
  for (const k of keys) {
    const v = node[k];
    const keyStr = quoteScalar(k);
    if (isScalar(v)) {
      out.push(`${pad}${keyStr}: ${emitScalar(v)}`);
    } else if (Array.isArray(v) && v.length === 0) {
      out.push(`${pad}${keyStr}: []`);
    } else if (!Array.isArray(v) && Object.keys(v).length === 0) {
      out.push(`${pad}${keyStr}: {}`);
    } else {
      out.push(`${pad}${keyStr}:`);
      out.push(emit(v, depth + 1));
    }
  }
  return out.join('\n');
}

// Serialize an object tree to a YAML document string (trailing newline).
export function toYaml(obj) {
  return emit(obj, 0) + '\n';
}

// --- skills/browser-to-api/scripts/load.mjs (next file in this patch) ---
// (shebang: #!/usr/bin/env node)
// Stage 1 — Load.
//
// Read browser-trace's cdp/network/{requests,responses}.jsonl, pair them by
// requestId, drop preflight + redirects + obvious non-API resource types, and
// write `intermediate/paired.jsonl`.
//
// Optional: a `browse network on` capture directory can be passed via
// `--bodies <dir>` (or stashed under `<run>/cdp/network/bodies/`). Each
// per-request subdir there has request.json + response.json with the actual
// bodies. The browse-network "id" matches the CDP requestId for XHR/Fetch, so
// we join directly on requestId and inject reqBody / respBody into paired rows.

import fs from 'node:fs';
import path from 'node:path';
import { readJsonl, writeJsonl, intermediatePath, ensureDir } from './lib/io.mjs';

// CDP resource types that are presumed API traffic. Anything else must look
// API-ish by URL (see looksApiUrl) to survive the pairing pass.
const KEEP_TYPES = new Set(['XHR', 'Fetch', 'Document']);

// Parse a JSON string, returning the input unchanged when it isn't a string
// or doesn't parse — callers treat "still a string" as "not JSON".
function tryParseJson(s) {
  if (typeof s !== 'string') return s;
  try { return JSON.parse(s); } catch { return s; }
}

// URL heuristic for API-ish endpoints: an /api, /graphql, /rest or /v<N>
// path segment, or a JSON-family file extension.
function looksApiUrl(url) {
  return /\/(api|graphql|rest|v\d+)\b/i.test(url) ||
    /\.(json|jsonl|ndjson)(\?|$)/i.test(url);
}

// Pathname of a URL, or the raw input when it can't be parsed as a URL.
function urlPath(u) {
  try { return new URL(u).pathname; } catch { return u; }
}

// "scheme://host[:port]" of a URL, or null when unparseable.
function urlOrigin(u) {
  try { const x = new URL(u); return `${x.protocol}//${x.host}`; } catch { return null; }
}

// Query params as a flat { name: value } object; {} when the URL is unparseable.
function urlQuery(u) {
  try {
    const x = new URL(u);
    const out = {};
    for (const [k, v] of x.searchParams.entries()) {
      // First value wins for repeats; we record the existence either way.
      // (The guard below keeps the earliest value — the original comment
      // claiming "last value wins" was wrong.)
      if (out[k] === undefined) out[k] = v;
    }
    return out;
  } catch { return {}; }
}

// Walk a `browse network` capture directory and return a Map keyed by the
// CDP requestId, each value `{ reqBody, respBody }`. Bodies that are valid JSON
// are returned parsed; otherwise the raw string is preserved.
function loadBrowseNetworkBodies(bodiesDir) {
  const out = new Map();
  if (!bodiesDir || !fs.existsSync(bodiesDir)) return out;
  const entries = fs.readdirSync(bodiesDir, { withFileTypes: true });
  for (const e of entries) {
    if (!e.isDirectory()) continue;
    const subdir = path.join(bodiesDir, e.name);
    const reqPath = path.join(subdir, 'request.json');
    const respPath = path.join(subdir, 'response.json');
    // request.json is mandatory for a usable entry; response.json is optional.
    if (!fs.existsSync(reqPath)) continue;
    let req, resp;
    try { req = JSON.parse(fs.readFileSync(reqPath, 'utf8')); } catch { continue; }
    try { resp = fs.existsSync(respPath) ? JSON.parse(fs.readFileSync(respPath, 'utf8')) : null; } catch { resp = null; }
    if (!req?.id) continue; // no id — cannot join back to the CDP stream
    const reqBody = req.body != null ? tryParseJson(req.body) : null;
    const respBody = resp?.body != null ? tryParseJson(resp.body) : null;
    out.set(String(req.id), { reqBody, respBody });
  }
  return out;
}

/**
 * Stage-1 entry point: pair CDP request/response events and write
 * intermediate/paired.jsonl under outDir.
 *
 * @param {string} runPath - browser-trace run directory (contains cdp/network/).
 * @param {string} outDir - output directory; intermediate/ is created inside it.
 * @param {{bodies?: string}} [opts] - optional browse-network capture directory.
 * @returns {{count: number, requests: number, responses: number,
 *            bodiesAttached: number, bodiesDir: string|null}} summary stats.
 */
export function load(runPath, outDir, opts = {}) {
  const cdpDir = path.join(runPath, 'cdp', 'network');
  const requests = readJsonl(path.join(cdpDir, 'requests.jsonl'));
  const responses = readJsonl(path.join(cdpDir, 'responses.jsonl'));

  // Body sources: explicit --bodies path > <run>/cdp/network/bodies/ if present
  let bodiesDir = opts.bodies || null;
  if (!bodiesDir) {
    const stashed = path.join(runPath, 'cdp', 'network', 'bodies');
    if (fs.existsSync(stashed)) bodiesDir = stashed;
  }
  const bodyMap = loadBrowseNetworkBodies(bodiesDir);

  // Index responses by requestId; if the trace has duplicates (redirects), the
  // last one wins so the terminal status code is what we keep.
  const respByReq = new Map();
  for (const ev of responses) {
    const rid = ev?.params?.requestId;
    if (rid) respByReq.set(rid, ev);
  }

  const paired = [];
  for (const ev of requests) {
    const p = ev?.params;
    if (!p?.request) continue;

    const method = p.request.method;
    const url = p.request.url;
    if (!url || !method) continue;
    if (method === 'OPTIONS') continue; // CORS preflight — carries no API shape
    if (url.startsWith('data:') || url.startsWith('blob:')) continue;

    // Resource type: prefer p.type (CDP), fall back to URL heuristic.
    const type = p.type || 'Other';
    if (!KEEP_TYPES.has(type) && !looksApiUrl(url)) continue;

    const respEv = respByReq.get(p.requestId);
    const resp = respEv?.params?.response;
    const status = resp?.status ?? null;
    if (status && status >= 300 && status < 400) {
      // Pure redirect. The browser will issue a follow-up request with the
      // same requestId carrying redirectResponse on it; we already record the
      // post-redirect resource via the next requestWillBeSent. Drop the
      // intermediate.
      continue;
    }

    // Case-insensitive content-type lookup (header casing varies by server).
    const contentType = resp?.headers
      ? Object.entries(resp.headers).find(([k]) => k.toLowerCase() === 'content-type')?.[1] ?? null
      : null;

    let reqBody = p.request.postData ? tryParseJson(p.request.postData) : null;
    let respBody = null;

    // Augment with browse-network bodies when present. Match by requestId
    // (the browse-network entry's `id` IS the CDP requestId for XHR/Fetch).
    const captured = bodyMap.get(String(p.requestId));
    if (captured) {
      if (reqBody == null && captured.reqBody != null) reqBody = captured.reqBody;
      if (captured.respBody != null) respBody = captured.respBody;
    }

    paired.push({
      requestId: p.requestId,
      method,
      url,
      origin: urlOrigin(url),
      path: urlPath(url),
      query: urlQuery(url),
      status,
      type,
      contentType,
      reqHeaders: p.request.headers || {},
      reqBody,
      respHeaders: resp?.headers || {},
      respBody,
      // CDP wallTime is seconds (float); store epoch milliseconds.
      ts: typeof p.wallTime === 'number' ? Math.round(p.wallTime * 1000) : null,
    });
  }

  ensureDir(path.join(outDir, 'intermediate'));
  writeJsonl(intermediatePath(outDir, 'paired.jsonl'), paired);
  return {
    count: paired.length,
    requests: requests.length,
    responses: responses.length,
    bodiesAttached: paired.filter(r => r.respBody != null).length,
    bodiesDir,
  };
}

// CLI entry point when executed directly (not imported).
if (import.meta.url === `file://${process.argv[1]}`) {
  const [run, out, bodies] = process.argv.slice(2);
  // NOTE(review): the usage string appears to have lost "<run> <out>"
  // placeholders during extraction — confirm against the repo.
  if (!run || !out) { console.error('usage: load.mjs [bodies-dir]'); process.exit(2); }
  const stats = load(run, out, { bodies });
  console.log(`load: ${stats.count} paired (from ${stats.requests} req / ${stats.responses} resp)${stats.bodiesAttached ? `, ${stats.bodiesAttached} response bodies attached` : ''}`);
}
diff --git a/skills/browser-reverse/scripts/normalize.mjs b/skills/browser-reverse/scripts/normalize.mjs new file mode 100644 index 00000000..e8a7e3ce --- /dev/null +++ b/skills/browser-reverse/scripts/normalize.mjs @@ -0,0 +1,128 @@ +#!/usr/bin/env node +// Stage 3 — Normalize.
//
// Group paired samples by (origin, method, templated path), collect query-param
// schemas, and detect when normalization is collapsing structurally divergent
// endpoints (flagged for the report).

import { readJsonl, writeJsonl, intermediatePath } from './lib/io.mjs';
import { templatize, templatizeWithSlugs } from './lib/path-template.mjs';

// Lightweight type inference for query-string values (always strings on the
// wire, but we can hint). Checks run narrowest-first: integer, then number,
// then boolean, falling back to string.
function inferQueryType(values) {
  if (values.every(v => /^-?\d+$/.test(v))) return { type: 'integer' };
  if (values.every(v => /^-?\d+(\.\d+)?$/.test(v))) return { type: 'number' };
  if (values.every(v => v === 'true' || v === 'false')) return { type: 'boolean' };
  return { type: 'string' };
}

// A coarse "shape signature" used to detect when two raw paths that
// templatize to the same template actually behave differently: the sorted set
// of base content-types joined with the sorted set of status classes (2xx/4xx/…).
function statusSignature(rows) {
  const ct = new Set(rows.map(r => (r.contentType || '').split(';')[0].trim().toLowerCase()).filter(Boolean));
  const status = new Set(rows.map(r => (r.status != null ? Math.floor(r.status / 100) + 'xx' : 'none')));
  return [...ct].sort().join(',') + '|' + [...status].sort().join(',');
}

/**
 * Stage-3 entry point: read intermediate/filtered.jsonl, group rows into
 * endpoint records, and write intermediate/endpoints.jsonl plus an
 * endpoint-samples.jsonl sidecar with the raw sample rows.
 *
 * @param {string} outDir - pipeline output directory containing intermediate/.
 * @returns {{endpoints: number}} count of endpoint records written.
 */
export function normalize(outDir) {
  const filtered = readJsonl(intermediatePath(outDir, 'filtered.jsonl'));

  // Pass 1: bucket by (origin, method, single-pass template).
  const buckets = new Map();
  for (const row of filtered) {
    const t = templatize(row.path);
    const key = `${row.method} ${row.origin}${t.template}`;
    let b = buckets.get(key);
    if (!b) { b = { origin: row.origin, method: row.method, template: t.template, params: t.params, rows: [], rawPaths: new Set() }; buckets.set(key, b); }
    b.rows.push(row);
    b.rawPaths.add(row.path);
  }

  // Pass 2: re-templatize each bucket using its raw-path set so slugs can be
  // detected. This may further collapse buckets that share the same underlying
  // template once slugs are recognized.
  const refined = new Map();
  for (const [, b] of buckets) {
    const rawPaths = [...b.rawPaths];
    // Slug detection needs at least two distinct raw paths to compare.
    const t = rawPaths.length > 1 ? templatizeWithSlugs(rawPaths) : { template: b.template, params: b.params };
    const key = `${b.method} ${b.origin}${t.template}`;
    let r = refined.get(key);
    if (!r) {
      r = { origin: b.origin, method: b.method, template: t.template, params: t.params, rows: [], rawPaths: new Set(), originalKeys: [] };
      refined.set(key, r);
    }
    r.rows.push(...b.rows);
    for (const p of b.rawPaths) r.rawPaths.add(p);
    // Remember each contributing pass-1 template plus its shape signature so
    // divergent collapses can be flagged below.
    r.originalKeys.push({ template: b.template, sig: statusSignature(b.rows) });
  }

  // Build endpoint records.
  const endpoints = [];
  for (const [, e] of refined) {
    const flags = [];

    // Divergent-shape check: if the bucket was collapsed from multiple pass-1
    // templates that had structurally different responses, flag it.
    const sigs = new Set(e.originalKeys.map(k => k.sig));
    if (sigs.size > 1) flags.push('divergent-response-shape');

    if (e.rows.length === 1) flags.push('single-sample');
    const statuses = new Set(e.rows.map(r => r.status).filter(s => s != null));
    if (statuses.size === 1) flags.push('single-status');
    const cts = new Set(e.rows.map(r => (r.contentType || '').split(';')[0].trim()).filter(Boolean));
    if (cts.size > 1) flags.push('mixed-content-types');
    const withBody = e.rows.filter(r => r.reqBody != null).length;
    if (withBody > 0 && withBody < e.rows.length) flags.push('request-body-only-on-some-samples');

    // Query parameter schema: collect names + sample values.
    const qSamples = new Map();
    for (const r of e.rows) {
      for (const k of Object.keys(r.query || {})) {
        if (!qSamples.has(k)) qSamples.set(k, []);
        qSamples.get(k).push(r.query[k]);
      }
    }
    const queryParams = [];
    for (const [name, values] of qSamples.entries()) {
      // "required" only when every sample of the endpoint carried the param.
      const present = e.rows.filter(r => name in (r.query || {})).length;
      queryParams.push({
        name,
        in: 'query',
        required: present === e.rows.length,
        schema: inferQueryType(values),
      });
    }

    endpoints.push({
      endpointKey: `${e.method} ${e.origin}${e.template}`,
      origin: e.origin,
      method: e.method,
      path: e.template,
      pathParams: e.params.map(p => ({ name: p.name, in: 'path', required: true, schema: p.schema })),
      queryParams,
      statusCodes: [...new Set(e.rows.map(r => r.status).filter(s => s != null))].sort((a, b) => a - b),
      sampleRows: e.rows, // kept on the in-memory record; trimmed before write
      sampleCount: e.rows.length,
      rawPaths: [...e.rawPaths],
      normalizationFlags: flags,
    });
  }

  // Drop the heavy in-memory rows from the persisted form; infer.mjs needs
  // them so we keep a parallel sidecar file.
  const persisted = endpoints.map(({ sampleRows, ...rest }) => rest);
  writeJsonl(intermediatePath(outDir, 'endpoints.jsonl'), persisted);

  const sidecar = endpoints.map(e => ({ endpointKey: e.endpointKey, samples: e.sampleRows }));
  writeJsonl(intermediatePath(outDir, 'endpoint-samples.jsonl'), sidecar);

  return { endpoints: endpoints.length };
}

// CLI entry point when executed directly (not imported).
if (import.meta.url === `file://${process.argv[1]}`) {
  const out = process.argv[2];
  // NOTE(review): the usage string appears to have lost its "<outDir>"
  // placeholder during extraction — confirm against the repo.
  if (!out) { console.error('usage: normalize.mjs '); process.exit(2); }
  const stats = normalize(out);
  console.log(`normalize: ${stats.endpoints} endpoints`);
}
From 9446f9136f9d4dced5dce8e4bbe33e88316d9e41 Mon Sep 17 00:00:00 2001 From: Derek Meegan Date: Wed, 29 Apr 2026 13:35:50 -0700 Subject: [PATCH 2/6] Address PR #88 review: rename to browser-to-api, drop lift doc, fix bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renaming and doc cleanup (per shrey150): - Rename skill from `browser-reverse` to `browser-to-api`. Updates SKILL.md frontmatter + heading, package.json, REFERENCE.md heading, the OpenAPI doc's `info.description`, and the report.md heading. - Fix the stale `discover-api-spec` reference in SKILL.md's composition diagram (left over from an earlier rename). - Drop `BODY-CAPTURE-LIFT.md` from the PR; it's a separate proposal. - Remove the `exec.sendFile()` reference in SKILL.md (browserbase-internal, not a generic skill primitive). - REFERENCE.md restructured to lead with the script/CLI/file-format reference rather than an architecture intro. Pipeline diagram dropped. Bug fixes (per Cursor Bugbot): - `filter.mjs`: rework precedence so `--include` actually rescues URLs that would be hit by a default exclude, matching the documented contract. User `--exclude` still wins. Added a unit-style test path. - `infer.mjs`: skip response-body samples whose CDP status is null.
Previously they were keyed under `"0"` but `emit.mjs` only iterates `ep.statusCodes` (which excludes nulls), silently discarding the body. - `load.mjs`: fix the comment in `urlQuery()` — code is first-value-wins, not last-value-wins. Co-Authored-By: Claude Opus 4.7 (1M context) --- skills/browser-reverse/BODY-CAPTURE-LIFT.md | 118 ------------------ .../REFERENCE.md | 21 +--- .../SKILL.md | 16 +-- .../package.json | 2 +- .../scripts/discover.mjs | 0 .../scripts/emit.mjs | 4 +- .../scripts/filter.mjs | 16 ++- .../scripts/infer.mjs | 9 +- .../scripts/lib/io.mjs | 0 .../scripts/lib/path-template.mjs | 0 .../scripts/lib/redact.mjs | 0 .../scripts/lib/schema-merge.mjs | 0 .../scripts/lib/yaml.mjs | 0 .../scripts/load.mjs | 4 +- .../scripts/normalize.mjs | 0 15 files changed, 36 insertions(+), 154 deletions(-) delete mode 100644 skills/browser-reverse/BODY-CAPTURE-LIFT.md rename skills/{browser-reverse => browser-to-api}/REFERENCE.md (89%) rename skills/{browser-reverse => browser-to-api}/SKILL.md (87%) rename skills/{browser-reverse => browser-to-api}/package.json (68%) rename skills/{browser-reverse => browser-to-api}/scripts/discover.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/emit.mjs (98%) rename skills/{browser-reverse => browser-to-api}/scripts/filter.mjs (73%) rename skills/{browser-reverse => browser-to-api}/scripts/infer.mjs (93%) rename skills/{browser-reverse => browser-to-api}/scripts/lib/io.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/lib/path-template.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/lib/redact.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/lib/schema-merge.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/lib/yaml.mjs (100%) rename skills/{browser-reverse => browser-to-api}/scripts/load.mjs (96%) rename skills/{browser-reverse => browser-to-api}/scripts/normalize.mjs (100%) diff --git a/skills/browser-reverse/BODY-CAPTURE-LIFT.md 
b/skills/browser-reverse/BODY-CAPTURE-LIFT.md deleted file mode 100644 index 514f686e..00000000 --- a/skills/browser-reverse/BODY-CAPTURE-LIFT.md +++ /dev/null @@ -1,118 +0,0 @@ -# Adding Response Body Capture to `browser-trace` — Lift Estimate - -> Grounded in the real source as of `browserbase/skills@main`. I read `SKILL.md`, -> `REFERENCE.md`, `lib.mjs`, `start-capture.mjs`, `snapshot-loop.mjs`, `bisect-cdp.mjs`, -> `bb-capture.mjs`, `bb-finalize.mjs`, `stop-capture.mjs`. - ---- - -## 1. Why this is harder than it looks - -`browser-trace` today does the simplest possible thing: it shells out to `browse cdp --domain Network --domain Console ...`, which emits one CDP event per line to stdout, and that stream is captured verbatim into `cdp/raw.ndjson`. **No CDP commands are issued back into the session.** The capture is fully one-way and stateless. - -Response bodies break that model. Bodies aren't pushed by CDP — they have to be **pulled** with a `Network.getResponseBody` request, keyed by `requestId`, **before the renderer evicts the resource**. Eviction is non-deterministic but typically happens within seconds of the response completing on a busy page. That means body capture has to be: - -- **Live** — runs concurrently with the trace, can't be done from `raw.ndjson` after the fact. -- **Bidirectional** — issues CDP commands, not just reads events. -- **Fast** — the gap between `Network.loadingFinished` and the `getResponseBody` call must be small. -- **Selective** — fetching every body would 10–100x the disk footprint and add real load on the renderer. - -This is a meaningful expansion of the skill's current architecture, not a tweak. - ---- - -## 2. The lift, by component - -### 2.1 New companion script — `scripts/body-capture.mjs` — **NEW, ~200 lines** - -The `browse cdp` subprocess can't be modified (it's an external binary), so body capture has to be a **second CDP client** running in parallel, attached to the same target. 
Same model as `snapshot-loop.mjs`, but instead of polling screenshots it subscribes to `Network.responseReceived` + `Network.loadingFinished` and issues `Network.getResponseBody` for matching requests. - -Responsibilities: - -- Open its own WebSocket to the CDP target (or use `browse --ws ...` if it supports request/response, which from the snapshot loop it does for one-shot commands — body capture is a long-lived subscription, so likely a raw `ws://` client). -- Maintain an in-memory map of `requestId → { url, method, contentType, status, type }` keyed off `Network.requestWillBeSent` + `Network.responseReceived`. -- On `Network.loadingFinished`: if the request matches the filter (default: `fetch`/`xhr` resourceType, JSON or form content-type, size cap), call `Network.getResponseBody` and write the result to `/cdp/network/bodies/.json`. -- Track failures (eviction races, out-of-process iframes that can't be addressed, sizes over the cap) in a sidecar `bodies/_skipped.jsonl`. -- SIGTERM-clean shutdown so `stop-capture.mjs` doesn't have to know about it specifically (it would just need to also kill `.bodies.pid`). - -**Risk:** `Network.getResponseBody` requires a session-attached target. For OOPIFs (cross-origin iframes), you have to use `Target.attachToTarget` first and route the command on the resulting session. Non-trivial. Realistic v1 punts on iframes and just records the skip reason. - -**Dependencies:** zero — Node stdlib has `ws` via `undici` /`WebSocket` (Node 22+) or you bundle a tiny WS client. The skill is currently zero-dep, so this constraint matters. - -### 2.2 `start-capture.mjs` — **MODIFIED, ~10 lines** - -Add an optional third detached subprocess: if `O11Y_BODIES=1` (or a `--bodies` flag), spawn `body-capture.mjs` the same way `snapshot-loop.mjs` is spawned, write `.bodies.pid`. Default off so existing users see no change. - -### 2.3 `stop-capture.mjs` — **MODIFIED, ~3 lines** - -Already loops over `['.cdp.pid', '.loop.pid']`. 
Add `'.bodies.pid'` to the list. Trivial. - -### 2.4 `bisect-cdp.mjs` — **MODIFIED, ~15 lines** - -Currently the only "network" buckets are CDP **events** (`requestWillBeSent`, `responseReceived`, `loadingFinished`, `loadingFailed`, `webSocket`). Bodies are content, not events, so they don't fit the existing `BUCKETS` predicate model. - -Two sensible places to expose them: - -1. **As-is on disk** — `cdp/network/bodies/.json` already exists from body-capture; bisect doesn't have to do anything. Per-page slicing (`cdp/pages//network/bodies/`) is the only real work: walk `network/responses.jsonl` for each page, find the matching body files, hard-link or copy them into the per-page dir. ~10 lines. -2. **Index** — emit `cdp/network/bodies-index.jsonl` mapping `{requestId, url, method, status, contentType, sizeBytes, bodyPath}` so query/grep tools don't have to walk the dir. ~5 lines. - -### 2.5 `lib.mjs` — **MODIFIED, ~5 lines** - -Add a helper `readBody(runDir, requestId) → { contentType, body, base64? }`. Useful for the new skill's `infer.mjs` and for `query.mjs`. - -### 2.6 `query.mjs` — **MODIFIED, ~20 lines** - -Add a `bodies` subcommand: list captured bodies, filter by URL/status/content-type, dump a body to stdout. Optional but cheap. - -### 2.7 `bb-capture.mjs` / `bb-finalize.mjs` — **NO CHANGES** - -They delegate to `start-capture.mjs` / `stop-capture.mjs`. Inherits body capture for free. - -### 2.8 `SKILL.md` / `REFERENCE.md` — **MODIFIED, ~50 lines** - -Document: -- The new flag/env var. -- New on-disk layout (`cdp/network/bodies/`, `bodies-index.jsonl`). -- Caveats: eviction races, OOPIF gaps, size cap, default-off. -- Filter knobs (`O11Y_BODY_TYPES`, `O11Y_BODY_MAX_KB`, `O11Y_BODY_INCLUDE_PATTERN`). -- Privacy implication: bodies can contain user data. Off by default for a reason. - ---- - -## 3. 
Total lift - -| Component | Type | Lines | Risk | -|---|---|---|---| -| `scripts/body-capture.mjs` | new | ~200 | **medium** — WS client, eviction races, OOPIF | -| `scripts/start-capture.mjs` | modify | ~10 | low | -| `scripts/stop-capture.mjs` | modify | ~3 | low | -| `scripts/bisect-cdp.mjs` | modify | ~15 | low | -| `scripts/lib.mjs` | modify | ~5 | low | -| `scripts/query.mjs` | modify | ~20 | low | -| `SKILL.md` + `REFERENCE.md` | modify | ~50 | low | -| **Total** | | **~300 LOC** | | - -**Calendar estimate for one engineer who knows CDP:** ~2–3 days. -- Day 1: WS client + filter + happy-path body capture against Chromium local. -- Day 2: OOPIF target attachment, size cap, skip-tracking, integration with `start`/`stop`. -- Day 3: bisect integration, query subcommand, docs, end-to-end test against a Browserbase remote session. - -**Calendar estimate without prior CDP fluency:** ~1 week. The eviction race and OOPIF target plumbing are the parts that bite. - ---- - -## 4. Risks worth calling out in the PR - -1. **Privacy.** Bodies can contain bearer tokens, PII, partial PII even when redacted at the header layer. Default-off + an opt-in flag is non-negotiable. The redaction story has to live in the consuming skill (e.g. `discover-api-spec`), not in the capture layer — capture should write what it sees. -2. **Performance.** `Network.getResponseBody` blocks on the renderer. For a page making 200 XHR requests, naive capture serializes every one of them. Mitigations: hard cap on concurrent in-flight `getResponseBody` calls (e.g. 8), aggressive content-type filter, default size cap (256 KB). -3. **Disk.** A 10-minute Browserbase session with body capture on can easily produce 100–500 MB of bodies. The skill should default to JSON-only + 256 KB cap and let users opt into more. -4. **Eviction races.** Some bodies will fail with `-32000 No data found for resource`. This is normal. `bodies/_skipped.jsonl` should record them so consumers know coverage isn't 100%. -5. 
**WebSocket frame data.** `Network.webSocketFrameSent` / `Received` already include the payload inline — no `getResponseBody` needed. v1 should explicitly punt on WebSocket bodies (already in the events bucket) to scope down. - ---- - -## 5. Recommendation - -Building this **into** `browser-trace` is the right call **if** the maintainers are willing to add a (default-off) feature with privacy and disk caveats. Putting it in a sibling skill is also viable but less clean — every consumer skill (api-spec, security audits, etc.) would have to reinvent the WS plumbing. - -The cleanest framing: **bodies are part of the trace, off by default, on with a flag.** Same shape as how Chrome DevTools handles "Preserve log" / "Disable cache" — capture options, not a separate tool. diff --git a/skills/browser-reverse/REFERENCE.md b/skills/browser-to-api/REFERENCE.md similarity index 89% rename from skills/browser-reverse/REFERENCE.md rename to skills/browser-to-api/REFERENCE.md index e8a9ba3c..02871d7b 100644 --- a/skills/browser-reverse/REFERENCE.md +++ b/skills/browser-to-api/REFERENCE.md @@ -1,25 +1,10 @@ -# Browser Reverse — Reference +# Browser to API — Reference -Technical reference for the discovery pipeline, file formats, and configuration. - -## Pipeline - -``` -browser-trace run discover.mjs -.o11y//cdp/network/ ┌─────────┐ ┌────────┐ ┌──────────┐ ┌─────────┐ ┌──────┐ - requests.jsonl ──────────▶ │ load │ ─▶ │ filter │ ─▶ │ normalize│ ─▶ │ infer │ ─▶ │ emit │ - responses.jsonl └─────────┘ └────────┘ └──────────┘ └─────────┘ └──────┘ - paired filtered endpoints endpoints openapi - .jsonl .jsonl .jsonl .with- .yaml - schemas report.md - .jsonl -``` - -Each stage is a discrete script that reads a file and writes a file. `discover.mjs` is the dispatcher; pass `--stage ` to run a single stage for debugging. +Exhaustive reference for every script, flag, file format, and configuration knob the skill exposes. ## Scripts -All scripts are Node ESM (`type: module`). 
They depend only on the Node standard library. +All scripts are Node ESM (`type: module`). They depend only on the Node standard library. `discover.mjs` is the top-level dispatcher; the others are stage scripts the dispatcher calls in order. Run an individual stage with `discover.mjs --stage <stage>` for debugging or partial reruns. ### `discover.mjs --run <run> [flags]` diff --git a/skills/browser-reverse/SKILL.md b/skills/browser-to-api/SKILL.md similarity index 87% rename from skills/browser-reverse/SKILL.md rename to skills/browser-to-api/SKILL.md index 6bb98eac..108f1d5f 100644 --- a/skills/browser-reverse/SKILL.md +++ b/skills/browser-to-api/SKILL.md @@ -1,20 +1,20 @@ --- -name: browser-reverse -description: Reverse-engineer a website's HTTP API into a best-effort OpenAPI 3.1 spec by analyzing a `browser-trace` capture. Use when the user wants to discover/extract API endpoints from a browser session, build an OpenAPI doc from network traffic, or document a third-party site's XHR/fetch surface for client integration. +name: browser-to-api +description: Turn a website's observable HTTP traffic into a best-effort OpenAPI 3.1 spec by analyzing a `browser-trace` capture. Use when the user wants to discover/extract API endpoints from a browser session, build an OpenAPI doc from network traffic, or document a third-party site's XHR/fetch surface for client integration. compatibility: "Requires Node 18+ and a `browser-trace` run directory (`.o11y/<run>/`) produced by the sibling `browser-trace` skill. The scripts use only the Node standard library — no `npm install` step. `jq` is referenced in docs for ad-hoc querying but is not required by the scripts." license: MIT allowed-tools: Bash, Read, Grep --- -# Browser Reverse +# Browser to API -Replay-driven API reverse-engineering. Consume a `browser-trace` capture, pair its CDP request / response events, templatize observed URLs, infer JSON schemas from samples, and emit an **OpenAPI 3.1** document plus a human-readable coverage report.
+Replay-driven API discovery. Consume a `browser-trace` capture, pair its CDP request / response events, templatize observed URLs, infer JSON schemas from samples, and emit an **OpenAPI 3.1** document plus a human-readable coverage report. This skill **does not capture traffic**. It is purely offline post-processing on top of `browser-trace`'s `cdp/network/*.jsonl` buckets. The two skills compose: ``` -browser-trace → .o11y/<run>/cdp/network/{requests,responses}.jsonl -discover-api-spec → .o11y/<run>/api-spec/openapi.yaml + report.md +browser-to-api → .o11y/<run>/api-spec/openapi.yaml + report.md ``` ## When to use @@ -67,7 +67,7 @@ node scripts/discover.mjs --run .o11y/my-site `discover.mjs` auto-detects `<run>/cdp/network/bodies/`. To use a body capture from elsewhere (e.g. didn't snapshot, want the live `browse network` dir), pass `--bodies <dir>` explicitly. -Then deliver the artifacts to the user (`exec.sendFile()` for `openapi.yaml` and `report.md`). +The two primary deliverables are `openapi.yaml` (machine-readable spec) and `report.md` (human-readable coverage summary). ## CLI flags @@ -115,7 +115,7 @@ What changes when bodies are present: - ✅ Request-body schemas — `postData` from CDP is enough; bodies dir is a nice-to-have for non-`postData` cases. - ✅ **Response-body schemas** — fully inferred from real samples. Without bodies you get `{ description, content: }` skeletons. -The report flags every endpoint that has no response-body sample. For a sketch of what it would take to teach `browser-trace` itself to capture response bodies natively (no separate `browse network on` step), see [BODY-CAPTURE-LIFT.md](BODY-CAPTURE-LIFT.md). +The report flags every endpoint that has no response-body sample.
## Limitations diff --git a/skills/browser-reverse/package.json b/skills/browser-to-api/package.json similarity index 68% rename from skills/browser-reverse/package.json rename to skills/browser-to-api/package.json index 86360e3f..58577884 100644 --- a/skills/browser-reverse/package.json +++ b/skills/browser-to-api/package.json @@ -1,5 +1,5 @@ { - "name": "browser-reverse", + "name": "browser-to-api", "version": "0.1.0", "private": true, "type": "module" diff --git a/skills/browser-reverse/scripts/discover.mjs b/skills/browser-to-api/scripts/discover.mjs similarity index 100% rename from skills/browser-reverse/scripts/discover.mjs rename to skills/browser-to-api/scripts/discover.mjs diff --git a/skills/browser-reverse/scripts/emit.mjs b/skills/browser-to-api/scripts/emit.mjs similarity index 98% rename from skills/browser-reverse/scripts/emit.mjs rename to skills/browser-to-api/scripts/emit.mjs index 71f1872c..5ad43272 100644 --- a/skills/browser-reverse/scripts/emit.mjs +++ b/skills/browser-to-api/scripts/emit.mjs @@ -247,7 +247,7 @@ export function emit(outDir, opts = {}) { info: { title, version: '0.1.0-discovered', - description: 'Spec discovered from a browser-trace capture by the browser-reverse skill. Inductive, not contractual — see `report.md` and `x-confidence` extensions for caveats.', + description: 'Spec discovered from a browser-trace capture by the browser-to-api skill. 
Inductive, not contractual — see `report.md` and `x-confidence` extensions for caveats.', }, servers, paths, @@ -290,7 +290,7 @@ export function emit(outDir, opts = {}) { function buildReport({ kept, dropped, servers, redaction, minSamples }) { const lines = []; - lines.push('# Browser-reverse: discovered API\n'); + lines.push('# Discovered API\n'); lines.push('## Servers\n'); for (const s of servers) lines.push(`- ${s.url}`); if (!servers.length) lines.push('_(none)_'); diff --git a/skills/browser-reverse/scripts/filter.mjs b/skills/browser-to-api/scripts/filter.mjs similarity index 73% rename from skills/browser-reverse/scripts/filter.mjs rename to skills/browser-to-api/scripts/filter.mjs index f681c455..9c9bab10 100644 --- a/skills/browser-reverse/scripts/filter.mjs +++ b/skills/browser-to-api/scripts/filter.mjs @@ -35,8 +35,13 @@ const DEFAULT_EXCLUDES = [ export function filter(outDir, opts = {}) { const { include = [], exclude = [], origins = [] } = opts; + // Precedence: + // 1. --origins gates everything; non-matching is dropped. + // 2. User --exclude always wins (explicit user intent). + // 3. Default excludes can be rescued by --include (REFERENCE.md contract). + // 4. When --include is set, anything that doesn't match it is dropped. + const userExcludeRes = exclude.map(s => new RegExp(s)); const includeRes = include.map(s => new RegExp(s)); - const excludeRes = [...DEFAULT_EXCLUDES, ...exclude.map(s => new RegExp(s))]; const originSet = new Set(origins); const paired = readJsonl(intermediatePath(outDir, 'paired.jsonl')); @@ -49,8 +54,13 @@ export function filter(outDir, opts = {}) { const matched = [...originSet].some(o => host === o || host.endsWith('.' 
+ o)); if (!matched) { droppedOrigin++; continue; } } - if (excludeRes.some(re => re.test(row.url))) { droppedExclude++; continue; } - if (includeRes.length && !includeRes.some(re => re.test(row.url))) { droppedInclude++; continue; } + if (userExcludeRes.some(re => re.test(row.url))) { droppedExclude++; continue; } + + const matchesInclude = includeRes.length > 0 && includeRes.some(re => re.test(row.url)); + const matchesDefaultExclude = DEFAULT_EXCLUDES.some(re => re.test(row.url)); + if (matchesDefaultExclude && !matchesInclude) { droppedExclude++; continue; } + if (includeRes.length && !matchesInclude) { droppedInclude++; continue; } + out.push(row); } diff --git a/skills/browser-reverse/scripts/infer.mjs b/skills/browser-to-api/scripts/infer.mjs similarity index 93% rename from skills/browser-reverse/scripts/infer.mjs rename to skills/browser-to-api/scripts/infer.mjs index 87dbf408..33dfed58 100644 --- a/skills/browser-reverse/scripts/infer.mjs +++ b/skills/browser-to-api/scripts/infer.mjs @@ -55,9 +55,12 @@ export function infer(outDir, opts = {}) { if (!pickedReqExample) { pickedReqExample = s.reqBody; pickedReqStatus = s.status; } } if (s.respBody != null && typeof s.respBody === 'object') { - const status = s.status ?? 0; - let p = respProtoByStatus.get(status); - if (!p) { p = newProto(); respProtoByStatus.set(status, p); } + // Skip when we have no status: emit.mjs only renders schemas under + // statuses that appear in ep.statusCodes (which excludes nulls), so + // a body keyed under "0" would be silently discarded. 
+ if (s.status == null) continue; + let p = respProtoByStatus.get(s.status); + if (!p) { p = newProto(); respProtoByStatus.set(s.status, p); } ingest(p, s.respBody); if (s.status >= 200 && s.status < 300 && !pickedRespExample) { pickedRespExample = s.respBody; diff --git a/skills/browser-reverse/scripts/lib/io.mjs b/skills/browser-to-api/scripts/lib/io.mjs similarity index 100% rename from skills/browser-reverse/scripts/lib/io.mjs rename to skills/browser-to-api/scripts/lib/io.mjs diff --git a/skills/browser-reverse/scripts/lib/path-template.mjs b/skills/browser-to-api/scripts/lib/path-template.mjs similarity index 100% rename from skills/browser-reverse/scripts/lib/path-template.mjs rename to skills/browser-to-api/scripts/lib/path-template.mjs diff --git a/skills/browser-reverse/scripts/lib/redact.mjs b/skills/browser-to-api/scripts/lib/redact.mjs similarity index 100% rename from skills/browser-reverse/scripts/lib/redact.mjs rename to skills/browser-to-api/scripts/lib/redact.mjs diff --git a/skills/browser-reverse/scripts/lib/schema-merge.mjs b/skills/browser-to-api/scripts/lib/schema-merge.mjs similarity index 100% rename from skills/browser-reverse/scripts/lib/schema-merge.mjs rename to skills/browser-to-api/scripts/lib/schema-merge.mjs diff --git a/skills/browser-reverse/scripts/lib/yaml.mjs b/skills/browser-to-api/scripts/lib/yaml.mjs similarity index 100% rename from skills/browser-reverse/scripts/lib/yaml.mjs rename to skills/browser-to-api/scripts/lib/yaml.mjs diff --git a/skills/browser-reverse/scripts/load.mjs b/skills/browser-to-api/scripts/load.mjs similarity index 96% rename from skills/browser-reverse/scripts/load.mjs rename to skills/browser-to-api/scripts/load.mjs index 6d4ba292..bfab6275 100644 --- a/skills/browser-reverse/scripts/load.mjs +++ b/skills/browser-to-api/scripts/load.mjs @@ -39,8 +39,10 @@ function urlQuery(u) { try { const x = new URL(u); const out = {}; + // First value wins for repeats. 
The downstream consumer (normalize.mjs) + // only uses parameter names + a representative value for type inference, + // so collapsing repeats to the first observation is fine. for (const [k, v] of x.searchParams.entries()) { - // Last value wins for repeats; we record the existence either way. if (out[k] === undefined) out[k] = v; } return out; diff --git a/skills/browser-reverse/scripts/normalize.mjs b/skills/browser-to-api/scripts/normalize.mjs similarity index 100% rename from skills/browser-reverse/scripts/normalize.mjs rename to skills/browser-to-api/scripts/normalize.mjs From b233aeec314ac91fd77a2643adec8aa1cb24706e Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Tue, 12 May 2026 16:53:40 -0400 Subject: [PATCH 3/6] Add Swagger UI preview for browser-to-api --- skills/browser-to-api/REFERENCE.md | 24 ++ skills/browser-to-api/SKILL.md | 22 +- .../scripts/open-swagger-ui.mjs | 205 ++++++++++++++++++ 3 files changed, 246 insertions(+), 5 deletions(-) create mode 100644 skills/browser-to-api/scripts/open-swagger-ui.mjs diff --git a/skills/browser-to-api/REFERENCE.md b/skills/browser-to-api/REFERENCE.md index 02871d7b..77928a86 100644 --- a/skills/browser-to-api/REFERENCE.md +++ b/skills/browser-to-api/REFERENCE.md @@ -10,6 +10,17 @@ All scripts are Node ESM (`type: module`). They depend only on the Node standard Top-level dispatcher. Runs `load → filter → normalize → infer → emit` in order. With `--stage <stage>`, runs only that stage (assumes prior stages already wrote their intermediate file). +### `open-swagger-ui.mjs (--run <run> | --spec <file>) [flags]` + +Preview an emitted OpenAPI spec in a local Swagger UI checkout. The script serves the Swagger UI `dist/` assets and the generated spec from one local HTTP origin, injects a per-run `swagger-initializer.js`, opens the browser by default, and keeps the server alive until interrupted. + +- `--run <run>` loads `<run>/api-spec/openapi.yaml`, falling back to `openapi.json`. +- `--spec <file>` previews an explicit OpenAPI YAML/JSON file.
+- `--swagger-ui ` points at a Swagger UI checkout/package directory. If omitted, the script tries `$SWAGGER_UI_DIR`, `~/Developer/swagger-ui`, and `node_modules/swagger-ui-dist`. +- `--host ` defaults to `127.0.0.1`. +- `--port ` defaults to a random free port. +- `--no-open` prints the URL without opening a browser. + ### `load.mjs [bodies-dir]` - Reads `cdp/network/requests.jsonl` and `cdp/network/responses.jsonl`. @@ -152,6 +163,17 @@ Internals (matched in `lib/io.mjs` + `load.mjs`): | `--min-samples ` | `1` | Drop endpoints below this threshold (still listed in the report) | | `--stage ` | (all) | One of `load`, `filter`, `normalize`, `infer`, `emit` | +## Swagger UI preview flags + +| Flag | Default | Notes | +|---|---|---| +| `--run ` | required unless `--spec` is set | Resolves a browser-trace run and previews `/api-spec/openapi.yaml` or `openapi.json` | +| `--spec ` | required unless `--run` is set | Explicit OpenAPI YAML/JSON path | +| `--swagger-ui ` | auto | Checkout/package dir containing either `dist/index.html` or `index.html` + `swagger-ui-bundle.js` | +| `--host ` | `127.0.0.1` | Preview server bind host | +| `--port ` | random | Preview server bind port | +| `--no-open` | false | Print the URL without launching the browser | + ## Default exclude list URLs matching these patterns are dropped before any analysis (regex, applied to the full URL): @@ -213,6 +235,7 @@ These extensions are stripped from `report.md` (which is human-facing) but prese | `O11Y_ROOT` | `.o11y` | Inherited from `browser-trace`. 
Used only when `--run` is bare run id rather than a full path | | `DISCOVER_ENUM_MAX_DISTINCT` | `8` | Max distinct values to consider a field an enum | | `DISCOVER_ENUM_MIN_SAMPLES` | `5` | Min samples before enum detection runs | +| `SWAGGER_UI_DIR` | auto | Optional Swagger UI checkout/package dir for `open-swagger-ui.mjs` | ## Troubleshooting @@ -223,3 +246,4 @@ These extensions are stripped from `report.md` (which is human-facing) but prese | Path templating collapses too aggressively | numeric IDs being misread as enums, or dictionary words misread as slugs | add `--exclude` for the noisy paths and re-run, or file an issue with the trace | | Schemas show `type: "string"` for everything | request/response bodies aren't valid JSON or weren't captured | check `paired.jsonl` for `reqBody`/`respBody` content — if `null`, bodies weren't in the trace | | Spec validator complains about `info.version` | derived version is `0.1.0-discovered` which some tools dislike | pass `--version 0.1.0` (TODO) or post-edit the file | +| `Swagger UI not found` | no local Swagger UI checkout/package was detected | clone `https://github.com/swagger-api/swagger-ui` to `~/Developer/swagger-ui`, or pass `--swagger-ui ` / set `SWAGGER_UI_DIR` | diff --git a/skills/browser-to-api/SKILL.md b/skills/browser-to-api/SKILL.md index 108f1d5f..e923026d 100644 --- a/skills/browser-to-api/SKILL.md +++ b/skills/browser-to-api/SKILL.md @@ -31,12 +31,12 @@ If the user wants to **capture** traffic, send them to `browser-trace` first. ### 1. 
Capture with `browser-trace` (and optionally bodies via `browse network on`) ```bash -# Local Chrome example (see browser-trace SKILL.md for Browserbase variant) -"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \ - --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-spec about:blank & +# Local example (see browser-trace SKILL.md for Browserbase variant) +browse env local +browse open about:blank +TARGET="$(browse status --json | jq -r .wsUrl)" -node ../browser-trace/scripts/start-capture.mjs 9222 my-site -browse env local 9222 +node ../browser-trace/scripts/start-capture.mjs "$TARGET" my-site browse network on # capture request/response bodies browse open https://example.com # ...drive whatever flows you want covered... @@ -69,6 +69,16 @@ node scripts/discover.mjs --run .o11y/my-site The two primary deliverables are `openapi.yaml` (machine-readable spec) and `report.md` (human-readable coverage summary). +### 3. Preview in Swagger UI when available + +If Swagger UI is installed locally, open the generated spec there: + +```bash +node scripts/open-swagger-ui.mjs --run .o11y/my-site +``` + +The helper auto-detects `$SWAGGER_UI_DIR`, `~/Developer/swagger-ui`, or `node_modules/swagger-ui-dist`. If none exists, deliver `openapi.yaml` and `report.md` directly and tell the user Swagger UI was not found. + ## CLI flags | Flag | Required | Meaning | @@ -85,6 +95,8 @@ The two primary deliverables are `openapi.yaml` (machine-readable spec) and `rep | `--min-samples ` | no | Minimum samples per endpoint to include. Default `1` | | `--stage ` | no | Run only one stage: `load`, `filter`, `normalize`, `infer`, `emit` | +`scripts/open-swagger-ui.mjs` accepts `--run ` or `--spec `, plus optional `--swagger-ui `, `--host`, `--port`, and `--no-open`. 
+ ## Output layout ``` diff --git a/skills/browser-to-api/scripts/open-swagger-ui.mjs b/skills/browser-to-api/scripts/open-swagger-ui.mjs new file mode 100644 index 00000000..e2abc459 --- /dev/null +++ b/skills/browser-to-api/scripts/open-swagger-ui.mjs @@ -0,0 +1,205 @@ +#!/usr/bin/env node +// Preview an emitted OpenAPI spec in a local Swagger UI checkout. + +import fs from 'node:fs'; +import http from 'node:http'; +import os from 'node:os'; +import path from 'node:path'; +import { spawn } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +function parseArgs(argv) { + const opts = { + run: null, + spec: null, + swaggerUi: null, + host: '127.0.0.1', + port: 0, + open: true, + }; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + const next = () => argv[++i]; + switch (a) { + case '--run': opts.run = next(); break; + case '--spec': opts.spec = next(); break; + case '--swagger-ui': opts.swaggerUi = next(); break; + case '--host': opts.host = next(); break; + case '--port': opts.port = Number(next()); break; + case '--no-open': opts.open = false; break; + case '-h': case '--help': + printHelp(); process.exit(0); + default: + console.error(`unknown arg: ${a}`); + printHelp(); process.exit(2); + } + } + return opts; +} + +function printHelp() { + console.error(`usage: open-swagger-ui.mjs (--run | --spec ) [flags] + + --run browser-trace run dir; uses /api-spec/openapi.yaml + --spec OpenAPI YAML/JSON file to preview + --swagger-ui Swagger UI checkout/package dir. Defaults to + $SWAGGER_UI_DIR, ~/Developer/swagger-ui, or node_modules/swagger-ui-dist + --host Bind host. Default: 127.0.0.1 + --port Bind port. 
Default: random free port + --no-open Print the URL without opening a browser`); +} + +function resolveRun(runArg) { + if (fs.existsSync(runArg) && fs.statSync(runArg).isDirectory()) return path.resolve(runArg); + const root = process.env.O11Y_ROOT || '.o11y'; + const guess = path.join(root, runArg); + if (fs.existsSync(guess) && fs.statSync(guess).isDirectory()) return path.resolve(guess); + throw new Error(`run path not found: ${runArg} (tried ${guess})`); +} + +function resolveSpec(opts) { + if (opts.spec) return path.resolve(opts.spec); + if (!opts.run) throw new Error('expected --run or --spec '); + + const runPath = resolveRun(opts.run); + const candidates = [ + path.join(runPath, 'api-spec', 'openapi.yaml'), + path.join(runPath, 'api-spec', 'openapi.json'), + ]; + const found = candidates.find(p => fs.existsSync(p)); + if (!found) throw new Error(`no OpenAPI spec found under ${path.join(runPath, 'api-spec')}`); + return found; +} + +function swaggerUiCandidates(explicit) { + return [ + explicit, + process.env.SWAGGER_UI_DIR, + path.join(os.homedir(), 'Developer', 'swagger-ui'), + path.resolve(process.cwd(), 'node_modules', 'swagger-ui-dist'), + path.resolve(__dirname, '..', 'node_modules', 'swagger-ui-dist'), + ].filter(Boolean); +} + +function distDirFor(candidate) { + const resolved = path.resolve(candidate); + const directDist = path.join(resolved, 'dist'); + if (fs.existsSync(path.join(directDist, 'index.html'))) return directDist; + if (fs.existsSync(path.join(resolved, 'index.html')) && fs.existsSync(path.join(resolved, 'swagger-ui-bundle.js'))) return resolved; + return null; +} + +function resolveSwaggerUi(explicit) { + for (const candidate of swaggerUiCandidates(explicit)) { + const dist = distDirFor(candidate); + if (dist) return dist; + } + + const searched = swaggerUiCandidates(explicit).map(p => ` - ${path.resolve(p)}`).join('\n'); + throw new Error(`Swagger UI not found. 
Searched:\n${searched}\n\nInstall it locally, then rerun:\n git clone https://github.com/swagger-api/swagger-ui.git ~/Developer/swagger-ui\n cd ~/Developer/swagger-ui && npm ci\n\nOr pass --swagger-ui / set SWAGGER_UI_DIR.`); +} + +function mimeFor(filePath) { + const ext = path.extname(filePath).toLowerCase(); + return { + '.css': 'text/css; charset=utf-8', + '.html': 'text/html; charset=utf-8', + '.js': 'application/javascript; charset=utf-8', + '.json': 'application/json; charset=utf-8', + '.map': 'application/json; charset=utf-8', + '.png': 'image/png', + '.svg': 'image/svg+xml', + '.yaml': 'application/yaml; charset=utf-8', + '.yml': 'application/yaml; charset=utf-8', + }[ext] || 'application/octet-stream'; +} + +function swaggerInitializer(specRoute) { + return `window.onload = function() { + window.ui = SwaggerUIBundle({ + url: ${JSON.stringify(specRoute)}, + dom_id: '#swagger-ui', + deepLinking: true, + presets: [ + SwaggerUIBundle.presets.apis, + SwaggerUIStandalonePreset + ], + plugins: [ + SwaggerUIBundle.plugins.DownloadUrl + ], + layout: 'StandaloneLayout' + }); +}; +`; +} + +function safeStaticPath(distDir, urlPath) { + const decoded = decodeURIComponent(urlPath); + const relative = decoded === '/' ? 'index.html' : decoded.replace(/^\/+/, ''); + const fullPath = path.resolve(distDir, relative); + const root = path.resolve(distDir); + if (fullPath !== root && !fullPath.startsWith(root + path.sep)) return null; + return fullPath; +} + +function openUrl(url) { + const opener = process.platform === 'darwin' + ? ['open', [url]] + : process.platform === 'win32' + ? 
['cmd', ['/c', 'start', '', url]] + : ['xdg-open', [url]]; + const child = spawn(opener[0], opener[1], { detached: true, stdio: 'ignore' }); + child.unref(); +} + +async function main() { + const opts = parseArgs(process.argv.slice(2)); + const specPath = resolveSpec(opts); + if (!fs.existsSync(specPath)) throw new Error(`spec not found: ${specPath}`); + + const distDir = resolveSwaggerUi(opts.swaggerUi); + const specRoute = path.extname(specPath).toLowerCase() === '.json' ? '/openapi.json' : '/openapi.yaml'; + + const server = http.createServer((req, res) => { + const requestPath = new URL(req.url, `http://${opts.host}`).pathname; + if (requestPath === specRoute) { + res.writeHead(200, { 'content-type': mimeFor(specPath), 'cache-control': 'no-store' }); + fs.createReadStream(specPath).pipe(res); + return; + } + if (requestPath === '/swagger-initializer.js') { + res.writeHead(200, { 'content-type': 'application/javascript; charset=utf-8', 'cache-control': 'no-store' }); + res.end(swaggerInitializer(specRoute)); + return; + } + + const staticPath = safeStaticPath(distDir, requestPath); + if (!staticPath || !fs.existsSync(staticPath) || fs.statSync(staticPath).isDirectory()) { + res.writeHead(404, { 'content-type': 'text/plain; charset=utf-8' }); + res.end('not found\n'); + return; + } + res.writeHead(200, { 'content-type': mimeFor(staticPath) }); + fs.createReadStream(staticPath).pipe(res); + }); + + await new Promise((resolve, reject) => { + server.once('error', reject); + server.listen(opts.port, opts.host, resolve); + }); + + const address = server.address(); + const url = `http://${opts.host}:${address.port}/`; + console.log(`swagger_ui=${distDir}`); + console.log(`spec=${specPath}`); + console.log(`url=${url}`); + console.log('Press Ctrl-C to stop the preview server.'); + if (opts.open) openUrl(url); +} + +main().catch(err => { + console.error(err.message); + process.exit(1); +}); From dc07d29d02763b56f49162b72c3d30697edf4a93 Mon Sep 17 00:00:00 2001 From: 
Shrey Pandya Date: Wed, 13 May 2026 16:57:45 -0400 Subject: [PATCH 4/6] Noise classification, GraphQL decomposition, and client SDK generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit normalize.mjs: - Auto-classify endpoints as api/noise/page and drop non-API traffic (tracking, analytics, bot defense, session plumbing, HTML page renders) - Detect multiplexed endpoints (GraphQL operationName, JSON-RPC method, query param dispatch) and decompose into separate logical operations - Typically drops 60-80% of captured traffic as noise emit.mjs: - Generate client.mjs — zero-dependency ES module wrapping each discovered operation as an async function with JSDoc param types - For GraphQL/APQ endpoints, embeds persisted query hashes and wires up the full request shape so callers just pass variables - Extract required headers from trace (CSRF tokens, custom headers) and include them in client defaults - Task-oriented report.md with quick-start import, curl examples, variables tables, and response samples per operation On OpenTable trace: 27 raw endpoints → 9 named operations, zero noise. Generated client with autocomplete(), restaurantsAvailability(), etc. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser-to-api/SKILL.md | 22 +- skills/browser-to-api/scripts/discover.mjs | 2 +- skills/browser-to-api/scripts/emit.mjs | 380 +++++++++++++++++--- skills/browser-to-api/scripts/normalize.mjs | 195 ++++++++-- 4 files changed, 518 insertions(+), 81 deletions(-) diff --git a/skills/browser-to-api/SKILL.md b/skills/browser-to-api/SKILL.md index e923026d..d62d4939 100644 --- a/skills/browser-to-api/SKILL.md +++ b/skills/browser-to-api/SKILL.md @@ -129,6 +129,26 @@ What changes when bodies are present: The report flags every endpoint that has no response-body sample. 
+## Automatic noise filtering + +The normalize stage automatically classifies and drops infrastructure noise: + +- **Tracking / analytics** — paths containing `/track`, `/pixel`, `/beacon`, `/impression`, `/pageview`, `/dag/v*` +- **Bot defense** — Akamai (`/akam/`), fingerprint payloads (`sensor_data`), obfuscated multi-segment paths +- **Session plumbing** — `/session`, `/authenticate/start`, cookie consent, A/B experiment endpoints +- **HTML page renders** — `GET` requests returning `text/html` (the rendered page, not the API) + +This typically drops 60-80% of captured traffic. The `--include` flag can rescue a false positive. + +## GraphQL / multiplexed endpoint decomposition + +When a single endpoint (like `/dapi/fe/gql`) is called with different `operationName` values, the skill automatically splits it into separate logical operations. Each gets its own: +- OpenAPI path entry (e.g. `/dapi/fe/gql [Autocomplete]`) +- Request/response schema inferred from only that operation's samples +- Curl example and variables table in the report + +Detection works on body fields (`operationName`, `method`, `action`) and query params (`opname`, `op`). This covers GraphQL (APQ and inline), JSON-RPC, and similar dispatch patterns. + ## Limitations - **Coverage is bounded by the captured flow.** Endpoints not exercised in the trace will not appear. The skill cannot prove completeness. @@ -141,7 +161,7 @@ The report flags every endpoint that has no response-body sample. 1. **Drive the flows you want documented.** The richer the browser-trace, the richer the spec. 2. **Use `--origins` for noisy sites.** A marketing page hits dozens of analytics hosts; restrict to the API origin you care about. -3. **Inspect `report.md` first.** Low-sample endpoints, single-status endpoints, and missing request bodies are listed there with concrete suggestions. +3. **Inspect `report.md` first.** It has curl-ready examples and response samples for every discovered operation. 4. 
**Bump `--min-samples` to 2+** when you want only confidently-shaped endpoints in the final doc — drop the long tail. 5. **Pair with `browse network on`** when response-body schemas matter. The CDP firehose alone has request bodies but not response bodies. diff --git a/skills/browser-to-api/scripts/discover.mjs b/skills/browser-to-api/scripts/discover.mjs index c349fa87..07023685 100644 --- a/skills/browser-to-api/scripts/discover.mjs +++ b/skills/browser-to-api/scripts/discover.mjs @@ -84,7 +84,7 @@ function main() { } console.log(`\noutput: ${outDir}`); - for (const f of ['openapi.yaml', 'openapi.json', 'report.md', 'confidence.json']) { + for (const f of ['client.mjs', 'openapi.yaml', 'openapi.json', 'report.md', 'confidence.json']) { const p = path.join(outDir, f); if (fs.existsSync(p)) console.log(` ${path.relative(process.cwd(), p)}`); } diff --git a/skills/browser-to-api/scripts/emit.mjs b/skills/browser-to-api/scripts/emit.mjs index 5ad43272..f53d640a 100644 --- a/skills/browser-to-api/scripts/emit.mjs +++ b/skills/browser-to-api/scripts/emit.mjs @@ -126,8 +126,11 @@ function makeOperation(ep, refOrInline) { for (const p of ep.pathParams || []) params.push(p); for (const p of ep.queryParams || []) params.push(p); + const summary = ep.operationName + ? 
`${ep.operationName} (${ep.method} ${ep.parentPath || ep.path})` + : `${ep.method} ${ep.path}`; const op = { - summary: `${ep.method} ${ep.path}`, + summary, operationId: makeOpId(ep), }; if (params.length) op.parameters = params; @@ -189,6 +192,9 @@ function defaultDescriptionFor(status) { } function makeOpId(ep) { + if (ep.operationName) { + return `${ep.method.toLowerCase()}_${ep.operationName.replace(/[^A-Za-z0-9]/g, '_')}`; + } const parts = ep.path.split('/').filter(Boolean).map(s => s.replace(/[{}]/g, '')); const tail = parts.map(p => p.replace(/[^A-Za-z0-9]/g, '_')).join('_'); return `${ep.method.toLowerCase()}_${tail || 'root'}`; @@ -203,6 +209,16 @@ export function emit(outDir, opts = {}) { const kept = endpoints.filter(e => e.sampleCount >= minSamples); const dropped = endpoints.filter(e => e.sampleCount < minSamples); + // Load raw samples for header extraction (client generation needs them) + const samplesByKey = new Map(); + for (const row of readJsonl(intermediatePath(outDir, 'endpoint-samples.jsonl'))) { + samplesByKey.set(row.endpointKey, row.samples); + } + // Attach to kept endpoints temporarily for client gen + for (const ep of kept) { + ep.sampleRows = samplesByKey.get(ep.endpointKey) || []; + } + // Servers: one entry per distinct origin, sorted by frequency. const originCounts = new Map(); for (const e of kept) originCounts.set(e.origin, (originCounts.get(e.origin) || 0) + e.sampleCount); @@ -213,30 +229,31 @@ export function emit(outDir, opts = {}) { const { components, refOrInline } = buildComponents(kept); - // Build paths: one keyed entry per templated path; each method becomes an - // operation. When the same (path, method) is observed on multiple origins - // (common for third-party analytics endpoints fanned across vendors), keep - // the highest-sample-count operation and record the other origins under - // `x-also-served-from` so no data is silently dropped. + // Build paths. Decomposed operations (e.g. 
GraphQL) get a synthetic path + // like /dapi/fe/gql#Autocomplete so each operation is a distinct entry. const paths = {}; - const collisions = {}; // pathKey -> [{origin, samples}] + const collisions = {}; for (const ep of kept) { const m = ep.method.toLowerCase(); - if (!paths[ep.path]) paths[ep.path] = {}; - const existing = paths[ep.path][m]; + // Use the path as-is (includes [OpName] for decomposed endpoints) + const pathKey = ep.path; + if (!paths[pathKey]) paths[pathKey] = {}; + const existing = paths[pathKey][m]; if (!existing) { - paths[ep.path][m] = makeOperation(ep, refOrInline); + paths[pathKey][m] = makeOperation(ep, refOrInline); } else { - const key = `${m} ${ep.path}`; + const key = `${m} ${pathKey}`; if (!collisions[key]) collisions[key] = [{ origin: existing['x-origin'], samples: existing['x-sample-count'] }]; collisions[key].push({ origin: ep.origin, samples: ep.sampleCount }); if (ep.sampleCount > (existing['x-sample-count'] || 0)) { - paths[ep.path][m] = makeOperation(ep, refOrInline); + paths[pathKey][m] = makeOperation(ep, refOrInline); } } } for (const [key, origins] of Object.entries(collisions)) { - const [m, p] = key.split(' '); + const [m, ...rest] = key.split(' '); + const p = rest.join(' '); + if (!paths[p]?.[m]) continue; const op = paths[p][m]; const winner = op['x-origin']; op['x-also-served-from'] = origins.filter(o => o.origin !== winner).map(o => o.origin); @@ -278,73 +295,318 @@ export function emit(outDir, opts = {}) { // report.md const redaction = readJson(intermediatePath(outDir, 'redaction-stats.json'), { headers: 0, bodyKeys: 0, bodyValues: 0 }); - writeText(path.join(outDir, 'report.md'), buildReport({ kept, dropped, servers, redaction, minSamples })); + + // client.mjs — generated SDK wrapping each operation as a callable function + const clientCode = buildClient({ kept, servers }); + if (clientCode) { + writeText(path.join(outDir, 'client.mjs'), clientCode); + } + + writeText(path.join(outDir, 'report.md'), 
buildReport({ kept, dropped, servers, redaction, minSamples, hasClient: !!clientCode })); return { endpoints: kept.length, droppedLowSample: dropped.length, servers: servers.length, components: Object.keys(components).length, + client: !!clientCode, }; } -function buildReport({ kept, dropped, servers, redaction, minSamples }) { - const lines = []; - lines.push('# Discovered API\n'); - lines.push('## Servers\n'); - for (const s of servers) lines.push(`- ${s.url}`); - if (!servers.length) lines.push('_(none)_'); - lines.push(''); +// --------------------------------------------------------------------------- +// Client SDK generation +// --------------------------------------------------------------------------- - lines.push('## Endpoints\n'); - lines.push('| Method | Path | Samples | Statuses | Confidence | Flags |'); - lines.push('|---|---|---|---|---|---|'); - const sorted = [...kept].sort((a, b) => a.path.localeCompare(b.path) || a.method.localeCompare(b.method)); - for (const ep of sorted) { - const flags = ep.normalizationFlags.length ? ep.normalizationFlags.join(', ') : '—'; - lines.push(`| ${ep.method} | \`${ep.path}\` | ${ep.sampleCount} | ${ep.statusCodes.join(', ') || '—'} | ${confidenceBucket(ep)} | ${flags} |`); - } - if (!kept.length) lines.push('| — | — | — | — | — | — |'); - lines.push(''); +function toFnName(name) { + // Autocomplete → autocomplete, RestaurantsAvailability → restaurantsAvailability + return name[0].toLowerCase() + name.slice(1); +} - if (dropped.length) { - lines.push(`## Dropped (below --min-samples=${minSamples})\n`); - for (const ep of dropped) lines.push(`- \`${ep.method} ${ep.path}\` (${ep.sampleCount} sample${ep.sampleCount === 1 ? '' : 's'})`); - lines.push(''); +function extractObservedHeaders(kept) { + // Pull non-standard headers that appeared consistently across requests. + // These are often required (CSRF tokens, custom auth, etc.) 
+ const candidates = new Map(); // headerName -> { values: Set, count } + let totalSamples = 0; + const skip = new Set([ + 'content-type', 'user-agent', 'accept', 'accept-encoding', 'accept-language', + 'referer', 'origin', 'host', 'connection', 'content-length', + 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform', + 'sec-fetch-dest', 'sec-fetch-mode', 'sec-fetch-site', + 'cookie', 'authorization', 'x-api-key', + ]); + for (const ep of kept) { + const samples = ep.sampleRows || []; + for (const s of samples) { + totalSamples++; + for (const [k, v] of Object.entries(s.reqHeaders || {})) { + const lk = k.toLowerCase(); + if (skip.has(lk)) continue; + if (!candidates.has(lk)) candidates.set(lk, { name: k, values: new Set(), count: 0 }); + const c = candidates.get(lk); + c.count++; + c.values.add(v); + } + } + } + // Keep headers present in >50% of requests (likely required) + const result = {}; + for (const [, c] of candidates) { + if (c.count <= totalSamples * 0.5) continue; + if (c.values.size <= 5) { + result[c.name] = [...c.values][0]; + } else { + // High cardinality (e.g. CSRF tokens, correlation IDs) — include with a + // representative value. The header is likely required even if the value varies. + result[c.name] = [...c.values][0]; + } } + return result; +} - lines.push('## Coverage caveats\n'); - const noResp = kept.filter(e => !e.responseBodyKnown); - if (noResp.length) { - lines.push(`- **${noResp.length}** endpoint${noResp.length === 1 ? '' : 's'} have no response-body schema. \`browse cdp\` does not embed response bodies; pair with \`browse network on\` to capture them.`); +function buildClient({ kept, servers }) { + const baseUrl = servers[0]?.url || ''; + const operations = kept.filter(e => e.operationName); + const regular = kept.filter(e => !e.operationName); + + if (!operations.length && !regular.length) return null; + + // Detect required headers from the trace (e.g. 
CSRF tokens) + const observedHeaders = extractObservedHeaders(kept); + + const lines = []; + lines.push(`// Auto-generated API client from browser-trace capture.`); + lines.push(`// Usage: import { ${operations.slice(0, 3).map(e => toFnName(e.operationName)).join(', ')}${operations.length > 3 ? ', ...' : ''} } from './client.mjs';\n`); + lines.push(`const BASE = '${baseUrl}';\n`); + + lines.push(`const defaultHeaders = {`); + lines.push(` 'Content-Type': 'application/json',`); + lines.push(` 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',`); + for (const [k, v] of Object.entries(observedHeaders)) { + lines.push(` '${k}': '${v}',`); } - const singleSample = kept.filter(e => e.sampleCount === 1); - if (singleSample.length) { - lines.push(`- **${singleSample.length}** endpoint${singleSample.length === 1 ? '' : 's'} were observed only once. Drive the same flow again to gain confidence.`); + lines.push(`};\n`); + + lines.push(`async function request(path, { method = 'GET', body, query, headers } = {}) {`); + lines.push(` let url = BASE + path;`); + lines.push(` if (query) {`); + lines.push(` const qs = new URLSearchParams(Object.entries(query).filter(([, v]) => v != null));`); + lines.push(` if (qs.toString()) url += '?' + qs;`); + lines.push(` }`); + lines.push(` const res = await fetch(url, {`); + lines.push(` method,`); + lines.push(` headers: { ...defaultHeaders, ...headers },`); + lines.push(` ...(body ? { body: JSON.stringify(body) } : {}),`); + lines.push(` });`); + lines.push(` if (!res.ok) throw new Error(\`\${res.status} \${res.statusText}: \${await res.text()}\`);`); + lines.push(` const ct = res.headers.get('content-type') || '';`); + lines.push(` return ct.includes('json') ? 
res.json() : res.text();`); + lines.push(`}\n`); + + // GraphQL / multiplexed operations + if (operations.length) { + // Group by parent path + discriminator to emit one dispatcher per GQL endpoint + const byParent = new Map(); + for (const op of operations) { + const key = op.parentPath || op.path; + if (!byParent.has(key)) byParent.set(key, []); + byParent.get(key).push(op); + } + + for (const [parentPath, ops] of byParent) { + // Check if it's a persisted-query GraphQL endpoint + const isPersisted = ops.some(op => + op.requestExample?.extensions?.persistedQuery?.sha256Hash); + + if (isPersisted) { + // Build a hash lookup table + lines.push(`// Persisted query hashes for ${parentPath}`); + lines.push(`const HASHES = {`); + for (const op of ops) { + const hash = op.requestExample?.extensions?.persistedQuery?.sha256Hash; + if (hash) lines.push(` ${op.operationName}: '${hash}',`); + } + lines.push(`};\n`); + } + + // Emit a function per operation + for (const op of ops) { + const fnName = toFnName(op.operationName); + const vars = op.requestExample?.variables; + const varKeys = vars && typeof vars === 'object' ? Object.keys(vars) : []; + + // Build JSDoc + lines.push(`/**`); + if (varKeys.length) { + for (const k of varKeys) { + const v = vars[k]; + const t = v === null ? '*' : Array.isArray(v) ? 
'Array' : typeof v; + lines.push(` * @param {${t}} variables.${k}`); + } + } + lines.push(` * @returns {Promise}`); + lines.push(` */`); + + lines.push(`export async function ${fnName}(variables = {}) {`); + if (isPersisted) { + lines.push(` return request('${parentPath}', {`); + lines.push(` method: 'POST',`); + lines.push(` query: { optype: 'query', opname: '${op.operationName}' },`); + lines.push(` body: {`); + lines.push(` operationName: '${op.operationName}',`); + lines.push(` variables,`); + lines.push(` extensions: { persistedQuery: { version: 1, sha256Hash: HASHES.${op.operationName} } },`); + lines.push(` },`); + lines.push(` });`); + } else { + lines.push(` return request('${parentPath}', {`); + lines.push(` method: 'POST',`); + lines.push(` body: { ${op.discriminatorField || 'operationName'}: '${op.operationName}', variables },`); + lines.push(` });`); + } + lines.push(`}\n`); + } + } } - const noBodyOnPost = kept.filter(e => ['POST', 'PUT', 'PATCH'].includes(e.method) && !e.requestBodyKnown); - if (noBodyOnPost.length) { - lines.push(`- **${noBodyOnPost.length}** mutation endpoint${noBodyOnPost.length === 1 ? '' : 's'} have no request body in the trace (form-encoded? non-JSON? not captured?).`); + + // Regular REST endpoints + for (const ep of regular) { + const fnName = makeOpId(ep).replace(/^(get|post|put|patch|delete)_/, (_, m) => m); + const hasBody = ['POST', 'PUT', 'PATCH'].includes(ep.method) && ep.requestBodyKnown; + + lines.push(`export async function ${fnName}(${hasBody ? 
'body, ' : ''}options = {}) {`); + lines.push(` return request('${ep.path}', {`); + lines.push(` method: '${ep.method}',`); + if (hasBody) lines.push(` body,`); + lines.push(` ...options,`); + lines.push(` });`); + lines.push(`}\n`); } - lines.push(''); - lines.push('## Redaction\n'); - lines.push(`- Headers redacted: ${redaction.headers}`); - lines.push(`- Body keys redacted: ${redaction.bodyKeys}`); - lines.push(`- Body values redacted by pattern: ${redaction.bodyValues}`); - lines.push(''); + return lines.join('\n') + '\n'; +} - lines.push('## Suggested follow-up flows\n'); - const status404 = kept.filter(e => e.statusCodes.includes(404)); - if (status404.length) { - lines.push(`- Endpoints that returned 404: ${status404.slice(0, 5).map(e => '`' + e.method + ' ' + e.path + '`').join(', ')}. Re-run with valid IDs to widen the success-path schema.`); +function buildReport({ kept, dropped, servers, redaction, minSamples, hasClient }) { + const lines = []; + const baseUrl = servers[0]?.url || ''; + lines.push('# Discovered API\n'); + lines.push(`**Base URL:** \`${baseUrl || '(unknown)'}\`\n`); + + // Separate decomposed (named operations) from regular endpoints + const operations = kept.filter(e => e.operationName); + const regular = kept.filter(e => !e.operationName); + + // Quick-start with generated client + if (hasClient) { + const allFns = [...operations, ...regular]; + const fnNames = allFns.map(e => e.operationName ? toFnName(e.operationName) : makeOpId(e)); + lines.push('## Quick start\n'); + lines.push('```js'); + lines.push(`import { ${fnNames.join(', ')} } from './client.mjs';`); + lines.push('```\n'); + lines.push(`**${fnNames.length} functions**, zero dependencies. 
See [\`client.mjs\`](./client.mjs) for full signatures.\n`); } - if (singleSample.length) { - lines.push('- Re-exercise the single-sample endpoints listed above to promote them out of `low` confidence.'); + + // --- Named operations (GraphQL / multiplexed) --- + if (operations.length) { + lines.push('## Operations\n'); + lines.push('These are logical operations multiplexed over a single endpoint.\n'); + + const sorted = [...operations].sort((a, b) => b.sampleCount - a.sampleCount); + for (const ep of sorted) { + lines.push(`### ${ep.operationName}\n`); + lines.push(`- **Endpoint:** \`${ep.method} ${ep.parentPath || ep.path}\``); + lines.push(`- **Discriminator:** \`${ep.discriminatorField}: "${ep.operationName}"\``); + lines.push(`- **Samples:** ${ep.sampleCount} | **Statuses:** ${ep.statusCodes.join(', ') || '—'}`); + lines.push(''); + + // Curl example from request body + if (ep.requestExample) { + const body = JSON.stringify(ep.requestExample, null, 2); + const curlPath = ep.parentPath || ep.path; + lines.push('```bash'); + lines.push(`curl -X ${ep.method} '${baseUrl}${curlPath}' \\`); + lines.push(` -H 'Content-Type: application/json' \\`); + lines.push(` -d '${body}'`); + lines.push('```\n'); + } + + // Key variables (for GraphQL, show the variables object shape) + if (ep.requestExample?.variables && typeof ep.requestExample.variables === 'object') { + const vars = ep.requestExample.variables; + const varKeys = Object.keys(vars); + if (varKeys.length) { + lines.push('**Variables:**\n'); + lines.push('| Name | Example | Type |'); + lines.push('|---|---|---|'); + for (const k of varKeys) { + const v = vars[k]; + const t = Array.isArray(v) ? 'array' : typeof v; + const example = JSON.stringify(v); + const truncated = example.length > 60 ? example.slice(0, 57) + '...' 
: example; + lines.push(`| \`${k}\` | \`${truncated}\` | ${t} |`); + } + lines.push(''); + } + } + + // Response shape summary + if (ep.responseExample) { + const respStr = JSON.stringify(ep.responseExample, null, 2); + const truncResp = respStr.length > 1500 ? respStr.slice(0, 1500) + '\n ...\n}' : respStr; + lines.push('
<details><summary>Example response</summary>\n'); + lines.push('```json'); + lines.push(truncResp); + lines.push('```\n</details>
\n'); + } + } } - if (!status404.length && !singleSample.length) { - lines.push('- The captured flow looks reasonably balanced. Add an authenticated session if the unauth view is what was captured.'); + + // --- Regular REST endpoints --- + if (regular.length) { + lines.push('## Endpoints\n'); + lines.push('| Method | Path | Samples | Statuses | Confidence |'); + lines.push('|---|---|---|---|---|'); + const sorted = [...regular].sort((a, b) => b.sampleCount - a.sampleCount); + for (const ep of sorted) { + lines.push(`| ${ep.method} | \`${ep.path}\` | ${ep.sampleCount} | ${ep.statusCodes.join(', ') || '—'} | ${confidenceBucket(ep)} |`); + } + lines.push(''); + + // Curl examples for top regular endpoints + const withExamples = sorted.filter(e => e.requestExample || e.responseExample).slice(0, 5); + for (const ep of withExamples) { + lines.push(`### \`${ep.method} ${ep.path}\`\n`); + if (ep.requestExample) { + const body = JSON.stringify(ep.requestExample, null, 2); + lines.push('```bash'); + lines.push(`curl -X ${ep.method} '${baseUrl}${ep.path}' \\`); + lines.push(` -H 'Content-Type: application/json' \\`); + lines.push(` -d '${body}'`); + lines.push('```\n'); + } + if (ep.responseExample) { + const respStr = JSON.stringify(ep.responseExample, null, 2); + const truncResp = respStr.length > 1000 ? respStr.slice(0, 1000) + '\n ...\n}' : respStr; + lines.push('
<details><summary>Example response</summary>\n'); + lines.push('```json'); + lines.push(truncResp); + lines.push('```\n</details>
\n'); + } + } } + + if (!kept.length) lines.push('No API endpoints discovered.\n'); + + // --- Coverage --- + lines.push('## Coverage\n'); + lines.push(`- **${kept.length}** API endpoints discovered`); + if (dropped.length) lines.push(`- **${dropped.length}** dropped (below --min-samples=${minSamples})`); + const noResp = kept.filter(e => !e.responseBodyKnown); + if (noResp.length) lines.push(`- **${noResp.length}** missing response-body schemas`); + const singleSample = kept.filter(e => e.sampleCount === 1); + if (singleSample.length) lines.push(`- **${singleSample.length}** observed only once`); + lines.push(''); + return lines.join('\n') + '\n'; } diff --git a/skills/browser-to-api/scripts/normalize.mjs b/skills/browser-to-api/scripts/normalize.mjs index e8a7e3ce..4132ec8f 100644 --- a/skills/browser-to-api/scripts/normalize.mjs +++ b/skills/browser-to-api/scripts/normalize.mjs @@ -1,16 +1,14 @@ #!/usr/bin/env node // Stage 3 — Normalize. // -// Group paired samples by (origin, method, templated path), collect query-param -// schemas, and detect when normalization is collapsing structurally divergent -// endpoints (flagged for the report). +// Group paired samples by (origin, method, templated path), classify noise vs +// real API, decompose multiplexed endpoints (GraphQL, JSON-RPC), collect +// query-param schemas, and detect normalization anomalies. import { readJsonl, writeJsonl, intermediatePath } from './lib/io.mjs'; import { templatize, templatizeWithSlugs } from './lib/path-template.mjs'; function inferQueryType(values) { - // Lightweight type inference for query-string values (always strings on the - // wire, but we can hint). 
if (values.every(v => /^-?\d+$/.test(v))) return { type: 'integer' }; if (values.every(v => /^-?\d+(\.\d+)?$/.test(v))) return { type: 'number' }; if (values.every(v => v === 'true' || v === 'false')) return { type: 'boolean' }; @@ -18,13 +16,147 @@ function inferQueryType(values) { } function statusSignature(rows) { - // A coarse "shape signature" used to detect when two raw paths that - // templatize to the same template actually behave differently. const ct = new Set(rows.map(r => (r.contentType || '').split(';')[0].trim().toLowerCase()).filter(Boolean)); const status = new Set(rows.map(r => (r.status != null ? Math.floor(r.status / 100) + 'xx' : 'none'))); return [...ct].sort().join(',') + '|' + [...status].sort().join(','); } +// --------------------------------------------------------------------------- +// Noise classification — tag endpoints that are infrastructure, not user-facing +// --------------------------------------------------------------------------- +const NOISE_PATH_PATTERNS = [ + // Tracking / analytics / telemetry + /\/track(ing)?[\/\b]/i, /\/pixel/i, /\/beacon/i, /\/log[\/\b]/i, + /\/impression/i, /\/pageview/i, /\/click[\/\b]/i, + /\/session[-_]?start/i, /\/batch\/(impression|list)/i, + /\/dag\/v\d+\//i, + /\/trackgoal/i, /\/profileview/i, /\/sessionstart/i, + /\/dinerTrust/i, /\/trackDiner/i, + /\/profile-view$/i, /\/track\/search$/i, + /\/mix$/i, + // Cookie / consent / privacy + /\/cookie[-_]?consent/i, /\/consent\//i, /\/onetrust/i, + // Experimentation + /\/bucket[-_]?experiment/i, /\/experiment[\/\b]/i, /\/feature[-_]?flag/i, + // Bot defense / fingerprinting + /\/akam\//i, /\/akamai\//i, /\/human$/i, + // Session plumbing (not user-facing API) + /\/session$/i, /\/authenticate\/start$/i, +]; + +const NOISE_BODY_SIGNALS = [ + /^sensor_data$/, // Akamai bot fingerprint + /^body$/, // Obfuscated payloads (Akamai, etc.) 
+]; + +function classifyEndpoint(endpoint) { + const p = endpoint.path; + const m = endpoint.method; + + // HTML page renders are not API endpoints + const htmlRows = endpoint.sampleRows.filter(r => + (r.contentType || '').includes('text/html')); + if (htmlRows.length === endpoint.sampleRows.length && m === 'GET') return 'page'; + + // Path-based noise detection + if (NOISE_PATH_PATTERNS.some(re => re.test(p))) return 'noise'; + + // Obfuscated paths (random-looking segments with mixed case, no real structure) + const segs = p.split('/').filter(Boolean); + const obfuscated = segs.filter(s => + /[A-Za-z0-9_-]{8,}/.test(s) && + !/^(v\d+|api|dapi|graphql|rest|fe|gql)$/i.test(s) && + /[A-Z]/.test(s) && /[a-z]/.test(s)); + if (obfuscated.length >= 2) return 'noise'; + + // Body-based: if every sample's request body only has noise-signal keys + if (endpoint.sampleRows.length > 0) { + const allNoise = endpoint.sampleRows.every(r => { + if (!r.reqBody || typeof r.reqBody !== 'object') return false; + const keys = Object.keys(r.reqBody); + return keys.length > 0 && keys.every(k => NOISE_BODY_SIGNALS.some(re => re.test(k))); + }); + if (allNoise) return 'noise'; + } + + return 'api'; +} + +// --------------------------------------------------------------------------- +// GraphQL / multiplexed endpoint decomposition +// --------------------------------------------------------------------------- +function detectDiscriminator(rows) { + // Check if these rows share a URL path but have a body field that acts as + // a discriminator (operationName for GraphQL, method for JSON-RPC, etc.) 
+ const candidates = ['operationName', 'method', 'action', 'type', 'command']; + for (const field of candidates) { + const values = new Set(); + let matchCount = 0; + for (const r of rows) { + if (r.reqBody && typeof r.reqBody === 'object' && typeof r.reqBody[field] === 'string') { + values.add(r.reqBody[field]); + matchCount++; + } + } + if (matchCount >= rows.length * 0.8 && values.size >= 2) { + return { field, values: [...values] }; + } + } + + // Also check query params (OpenTable uses ?opname= for GraphQL) + for (const field of ['opname', 'operationName', 'op', 'action']) { + const values = new Set(); + let matchCount = 0; + for (const r of rows) { + if (r.query && typeof r.query[field] === 'string') { + values.add(r.query[field]); + matchCount++; + } + } + if (matchCount >= rows.length * 0.8 && values.size >= 2) { + return { field, values: [...values], source: 'query' }; + } + } + + return null; +} + +function decomposeMultiplexed(endpoint) { + const disc = detectDiscriminator(endpoint.sampleRows); + if (!disc) return [endpoint]; + + const byOp = new Map(); + for (const row of endpoint.sampleRows) { + let opName; + if (disc.source === 'query') { + opName = row.query?.[disc.field] || '__unknown__'; + } else { + opName = (row.reqBody && typeof row.reqBody === 'object') + ? 
row.reqBody[disc.field] || '__unknown__' + : '__unknown__'; + } + if (!byOp.has(opName)) byOp.set(opName, []); + byOp.get(opName).push(row); + } + + const sub = []; + for (const [opName, rows] of byOp) { + // Build a virtual endpoint per operation + const virtualPath = `${endpoint.path} [${opName}]`; + sub.push({ + ...endpoint, + endpointKey: `${endpoint.method} ${endpoint.origin}${virtualPath}`, + path: virtualPath, + operationName: opName, + discriminatorField: disc.field, + parentPath: endpoint.path, + sampleRows: rows, + sampleCount: rows.length, + }); + } + return sub; +} + export function normalize(outDir) { const filtered = readJsonl(intermediatePath(outDir, 'filtered.jsonl')); @@ -40,8 +172,7 @@ export function normalize(outDir) { } // Pass 2: re-templatize each bucket using its raw-path set so slugs can be - // detected. This may further collapse buckets that share the same underlying - // template once slugs are recognized. + // detected. const refined = new Map(); for (const [, b] of buckets) { const rawPaths = [...b.rawPaths]; @@ -57,16 +188,12 @@ export function normalize(outDir) { r.originalKeys.push({ template: b.template, sig: statusSignature(b.rows) }); } - // Build endpoint records. - const endpoints = []; + // Build endpoint records, classify, and decompose. + const preEndpoints = []; for (const [, e] of refined) { const flags = []; - - // Divergent-shape check: if the bucket was collapsed from multiple pass-1 - // templates that had structurally different responses, flag it. 
const sigs = new Set(e.originalKeys.map(k => k.sig)); if (sigs.size > 1) flags.push('divergent-response-shape'); - if (e.rows.length === 1) flags.push('single-sample'); const statuses = new Set(e.rows.map(r => r.status).filter(s => s != null)); if (statuses.size === 1) flags.push('single-status'); @@ -75,7 +202,6 @@ export function normalize(outDir) { const withBody = e.rows.filter(r => r.reqBody != null).length; if (withBody > 0 && withBody < e.rows.length) flags.push('request-body-only-on-some-samples'); - // Query parameter schema: collect names + sample values. const qSamples = new Map(); for (const r of e.rows) { for (const k of Object.keys(r.query || {})) { @@ -94,7 +220,7 @@ export function normalize(outDir) { }); } - endpoints.push({ + preEndpoints.push({ endpointKey: `${e.method} ${e.origin}${e.template}`, origin: e.origin, method: e.method, @@ -102,13 +228,42 @@ export function normalize(outDir) { pathParams: e.params.map(p => ({ name: p.name, in: 'path', required: true, schema: p.schema })), queryParams, statusCodes: [...new Set(e.rows.map(r => r.status).filter(s => s != null))].sort((a, b) => a - b), - sampleRows: e.rows, // kept on the in-memory record; trimmed before write + sampleRows: e.rows, sampleCount: e.rows.length, rawPaths: [...e.rawPaths], normalizationFlags: flags, }); } + // Pass 3: classify and decompose + const endpoints = []; + let noiseCount = 0, pageCount = 0, decomposedCount = 0; + for (const ep of preEndpoints) { + const category = classifyEndpoint(ep); + if (category === 'noise') { noiseCount++; continue; } + if (category === 'page') { pageCount++; continue; } + + // Try to decompose multiplexed endpoints + const decomposed = decomposeMultiplexed(ep); + if (decomposed.length > 1) { + decomposedCount += decomposed.length; + for (const sub of decomposed) { + sub.normalizationFlags = [...(sub.normalizationFlags || [])]; + const subStatuses = new Set(sub.sampleRows.map(r => r.status).filter(s => s != null)); + sub.statusCodes = 
[...subStatuses].sort((a, b) => a - b); + if (sub.sampleRows.length === 1) { + if (!sub.normalizationFlags.includes('single-sample')) sub.normalizationFlags.push('single-sample'); + } + if (subStatuses.size === 1) { + if (!sub.normalizationFlags.includes('single-status')) sub.normalizationFlags.push('single-status'); + } + endpoints.push(sub); + } + } else { + endpoints.push(ep); + } + } + // Drop the heavy in-memory rows from the persisted form; infer.mjs needs // them so we keep a parallel sidecar file. const persisted = endpoints.map(({ sampleRows, ...rest }) => rest); @@ -117,12 +272,12 @@ export function normalize(outDir) { const sidecar = endpoints.map(e => ({ endpointKey: e.endpointKey, samples: e.sampleRows })); writeJsonl(intermediatePath(outDir, 'endpoint-samples.jsonl'), sidecar); - return { endpoints: endpoints.length }; + return { endpoints: endpoints.length, noise: noiseCount, pages: pageCount, decomposed: decomposedCount }; } if (import.meta.url === `file://${process.argv[1]}`) { const out = process.argv[2]; if (!out) { console.error('usage: normalize.mjs '); process.exit(2); } const stats = normalize(out); - console.log(`normalize: ${stats.endpoints} endpoints`); + console.log(`normalize: ${stats.endpoints} endpoints (${stats.noise} noise, ${stats.pages} pages dropped, ${stats.decomposed} decomposed)`); } From 5eee2ab9d79853cc2dd18df34a89de822356a70a Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 13 May 2026 17:31:44 -0400 Subject: [PATCH 5/6] Add self-contained HTML report replacing Swagger UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generates index.html with: - Summary stats (operations, endpoint, protocol, sample count) - Expandable cards per operation with variables table, client usage, request body, and response example - Full generated client.mjs embedded at the bottom The Swagger UI was a poor fit — 10 identical green POST bars for a single GraphQL endpoint with bracket-syntax paths 
that aren't even valid OpenAPI. The HTML report shows what actually matters. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser-to-api/scripts/discover.mjs | 2 +- skills/browser-to-api/scripts/emit.mjs | 165 +++++++++++++++++++++ 2 files changed, 166 insertions(+), 1 deletion(-) diff --git a/skills/browser-to-api/scripts/discover.mjs b/skills/browser-to-api/scripts/discover.mjs index 07023685..5dce10ed 100644 --- a/skills/browser-to-api/scripts/discover.mjs +++ b/skills/browser-to-api/scripts/discover.mjs @@ -84,7 +84,7 @@ function main() { } console.log(`\noutput: ${outDir}`); - for (const f of ['client.mjs', 'openapi.yaml', 'openapi.json', 'report.md', 'confidence.json']) { + for (const f of ['index.html', 'client.mjs', 'report.md', 'openapi.yaml', 'openapi.json', 'confidence.json']) { const p = path.join(outDir, f); if (fs.existsSync(p)) console.log(` ${path.relative(process.cwd(), p)}`); } diff --git a/skills/browser-to-api/scripts/emit.mjs b/skills/browser-to-api/scripts/emit.mjs index f53d640a..11614ed1 100644 --- a/skills/browser-to-api/scripts/emit.mjs +++ b/skills/browser-to-api/scripts/emit.mjs @@ -304,6 +304,9 @@ export function emit(outDir, opts = {}) { writeText(path.join(outDir, 'report.md'), buildReport({ kept, dropped, servers, redaction, minSamples, hasClient: !!clientCode })); + // index.html — self-contained visual report + writeText(path.join(outDir, 'index.html'), buildHtmlReport({ kept, servers, title, clientCode })); + return { endpoints: kept.length, droppedLowSample: dropped.length, @@ -610,6 +613,168 @@ function buildReport({ kept, dropped, servers, redaction, minSamples, hasClient return lines.join('\n') + '\n'; } +// --------------------------------------------------------------------------- +// HTML report +// --------------------------------------------------------------------------- + +function escHtml(s) { + return String(s).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;'); +} + +function buildHtmlReport({ kept,
servers, title, clientCode }) { + const baseUrl = servers[0]?.url || ''; + const operations = kept.filter(e => e.operationName); + const regular = kept.filter(e => !e.operationName); + const all = [...operations.sort((a, b) => b.sampleCount - a.sampleCount), ...regular]; + + const opCards = all.map((ep, i) => { + const name = ep.operationName || `${ep.method} ${ep.path}`; + const fnName = ep.operationName ? toFnName(ep.operationName) : null; + const vars = ep.requestExample?.variables; + const varRows = vars && typeof vars === 'object' + ? Object.entries(vars).map(([k, v]) => { + const t = v === null ? 'null' : Array.isArray(v) ? 'array' : typeof v; + const ex = JSON.stringify(v); + return `${escHtml(k)}${escHtml(t)}${escHtml(ex.length > 50 ? ex.slice(0, 47) + '...' : ex)}`; + }).join('\n') + : ''; + + const reqBody = ep.requestExample ? JSON.stringify(ep.requestExample, null, 2) : null; + const respBody = ep.responseExample ? JSON.stringify(ep.responseExample, null, 2) : null; + const truncResp = respBody && respBody.length > 2000 ? respBody.slice(0, 2000) + '\n ...' : respBody; + + return ` +
+
+
+ POST + ${escHtml(name)} +
+
+ ${ep.sampleCount} sample${ep.sampleCount !== 1 ? 's' : ''} + ${fnName ? `${escHtml(fnName)}()` : ''} +
+
+
+ ${ep.parentPath ? `

Endpoint: ${escHtml(ep.method)} ${escHtml(baseUrl)}${escHtml(ep.parentPath)}

` : ''} + ${ep.discriminatorField ? `

Discriminator: ${escHtml(ep.discriminatorField)}: "${escHtml(ep.operationName)}"

` : ''} + + ${varRows ? ` +

Variables

+ + + ${varRows} +
NameTypeExample
` : ''} + + ${fnName ? ` +

Client usage

+
import { ${escHtml(fnName)} } from './client.mjs';
+
+const result = await ${escHtml(fnName)}(${vars ? JSON.stringify(Object.fromEntries(Object.entries(vars).filter(([,v]) => v !== '').slice(0, 4).map(([k, v]) => {
+          if (Array.isArray(v) && v.length > 2) return [k, v.slice(0, 2)];
+          return [k, v];
+        })), null, 2) : '{}'});
` : ''} + + ${reqBody ? ` +

Request body

+
${escHtml(reqBody)}
` : ''} + + ${truncResp ? ` +

Response

+
${escHtml(truncResp)}
` : ''} +
+
`; + }).join('\n'); + + return ` + + + + +${escHtml(title)} — API Report + + + +
+
+

${escHtml(title)}

+

${escHtml(baseUrl)} · ${all.length} operation${all.length !== 1 ? 's' : ''} discovered from browser trace

+
+ +
+
Operations
${all.length}
+
Endpoint
${escHtml(operations[0]?.parentPath || regular[0]?.path || '—')}
+
Protocol
${operations.length ? 'GraphQL (APQ)' : 'REST'}
+
Total samples
${all.reduce((s, e) => s + e.sampleCount, 0)}
+
+ + ${opCards} + + ${clientCode ? ` +
+

Generated client

+

Copy client.mjs into your project. Zero dependencies — uses native fetch.

+
${escHtml(clientCode)}
+
` : ''} +
+ + +`; +} + if (import.meta.url === `file://${process.argv[1]}`) { const out = process.argv[2]; if (!out) { console.error('usage: emit.mjs '); process.exit(2); } From cf3e72bc6c0e2416b740b03e84fd1379ad962fc4 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 13 May 2026 17:40:01 -0400 Subject: [PATCH 6/6] Replace Swagger UI with self-contained HTML report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit emit.mjs already generates index.html as the primary visual output — update SKILL.md to match and remove the dead open-swagger-ui.mjs script. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser-to-api/SKILL.md | 23 +- .../scripts/open-swagger-ui.mjs | 205 ------------------ 2 files changed, 12 insertions(+), 216 deletions(-) delete mode 100644 skills/browser-to-api/scripts/open-swagger-ui.mjs diff --git a/skills/browser-to-api/SKILL.md b/skills/browser-to-api/SKILL.md index d62d4939..f1b10f15 100644 --- a/skills/browser-to-api/SKILL.md +++ b/skills/browser-to-api/SKILL.md @@ -14,7 +14,7 @@ This skill **does not capture traffic**. It is purely offline post-processing on ``` browser-trace → .o11y//cdp/network/{requests,responses}.jsonl -browser-to-api → .o11y//api-spec/openapi.yaml + report.md +browser-to-api → .o11y//api-spec/index.html + openapi.yaml + client.mjs ``` ## When to use @@ -57,7 +57,9 @@ node ../browser-trace/scripts/bisect-cdp.mjs my-site ```bash node scripts/discover.mjs --run .o11y/my-site -# → .o11y/my-site/api-spec/openapi.yaml +# → .o11y/my-site/api-spec/index.html ← open this +# .o11y/my-site/api-spec/client.mjs +# .o11y/my-site/api-spec/openapi.yaml # .o11y/my-site/api-spec/openapi.json # .o11y/my-site/api-spec/report.md # .o11y/my-site/api-spec/confidence.json @@ -67,17 +69,15 @@ node scripts/discover.mjs --run .o11y/my-site `discover.mjs` auto-detects `/cdp/network/bodies/`. To use a body capture from elsewhere (e.g. 
didn't snapshot, want the live `browse network` dir), pass `--bodies <dir>` explicitly. -The two primary deliverables are `openapi.yaml` (machine-readable spec) and `report.md` (human-readable coverage summary). +### 3. Open the HTML report -### 3. Preview in Swagger UI when available - -If Swagger UI is installed locally, open the generated spec there: +After `discover.mjs` finishes, **always open the generated HTML report**: ```bash -node scripts/open-swagger-ui.mjs --run .o11y/my-site +open .o11y/my-site/api-spec/index.html ``` -The helper auto-detects `$SWAGGER_UI_DIR`, `~/Developer/swagger-ui`, or `node_modules/swagger-ui-dist`. If none exists, deliver `openapi.yaml` and `report.md` directly and tell the user Swagger UI was not found. +The report is a self-contained HTML file (no server needed) that shows each discovered operation as an expandable card with variables, client usage, request/response examples, and a generated `client.mjs` snippet at the bottom. This is the primary deliverable — always open it for the user. ## CLI flags @@ -95,15 +95,16 @@ The helper auto-detects `$SWAGGER_UI_DIR`, `~/Developer/swagger-ui`, or `node_mo | `--min-samples <n>` | no | Minimum samples per endpoint to include. Default `1` | | `--stage <name>` | no | Run only one stage: `load`, `filter`, `normalize`, `infer`, `emit` | -`scripts/open-swagger-ui.mjs` accepts `--run <run>` or `--spec <path>`, plus optional `--swagger-ui <dir>`, `--host`, `--port`, and `--no-open`.
## Output layout ``` /api-spec/ -├── openapi.yaml primary deliverable +├── index.html visual report — open this (self-contained, no server) +├── client.mjs zero-dep fetch client with typed functions per operation +├── openapi.yaml machine-readable spec ├── openapi.json mirror -├── report.md human-readable summary + coverage caveats +├── report.md markdown summary + curl examples ├── confidence.json per-endpoint confidence + normalization flags ├── samples/ redacted request/response examples │ └── __.json diff --git a/skills/browser-to-api/scripts/open-swagger-ui.mjs b/skills/browser-to-api/scripts/open-swagger-ui.mjs deleted file mode 100644 index e2abc459..00000000 --- a/skills/browser-to-api/scripts/open-swagger-ui.mjs +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env node -// Preview an emitted OpenAPI spec in a local Swagger UI checkout. - -import fs from 'node:fs'; -import http from 'node:http'; -import os from 'node:os'; -import path from 'node:path'; -import { spawn } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; - -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -function parseArgs(argv) { - const opts = { - run: null, - spec: null, - swaggerUi: null, - host: '127.0.0.1', - port: 0, - open: true, - }; - for (let i = 0; i < argv.length; i++) { - const a = argv[i]; - const next = () => argv[++i]; - switch (a) { - case '--run': opts.run = next(); break; - case '--spec': opts.spec = next(); break; - case '--swagger-ui': opts.swaggerUi = next(); break; - case '--host': opts.host = next(); break; - case '--port': opts.port = Number(next()); break; - case '--no-open': opts.open = false; break; - case '-h': case '--help': - printHelp(); process.exit(0); - default: - console.error(`unknown arg: ${a}`); - printHelp(); process.exit(2); - } - } - return opts; -} - -function printHelp() { - console.error(`usage: open-swagger-ui.mjs (--run | --spec ) [flags] - - --run browser-trace run dir; uses /api-spec/openapi.yaml - --spec 
OpenAPI YAML/JSON file to preview - --swagger-ui Swagger UI checkout/package dir. Defaults to - $SWAGGER_UI_DIR, ~/Developer/swagger-ui, or node_modules/swagger-ui-dist - --host Bind host. Default: 127.0.0.1 - --port Bind port. Default: random free port - --no-open Print the URL without opening a browser`); -} - -function resolveRun(runArg) { - if (fs.existsSync(runArg) && fs.statSync(runArg).isDirectory()) return path.resolve(runArg); - const root = process.env.O11Y_ROOT || '.o11y'; - const guess = path.join(root, runArg); - if (fs.existsSync(guess) && fs.statSync(guess).isDirectory()) return path.resolve(guess); - throw new Error(`run path not found: ${runArg} (tried ${guess})`); -} - -function resolveSpec(opts) { - if (opts.spec) return path.resolve(opts.spec); - if (!opts.run) throw new Error('expected --run or --spec '); - - const runPath = resolveRun(opts.run); - const candidates = [ - path.join(runPath, 'api-spec', 'openapi.yaml'), - path.join(runPath, 'api-spec', 'openapi.json'), - ]; - const found = candidates.find(p => fs.existsSync(p)); - if (!found) throw new Error(`no OpenAPI spec found under ${path.join(runPath, 'api-spec')}`); - return found; -} - -function swaggerUiCandidates(explicit) { - return [ - explicit, - process.env.SWAGGER_UI_DIR, - path.join(os.homedir(), 'Developer', 'swagger-ui'), - path.resolve(process.cwd(), 'node_modules', 'swagger-ui-dist'), - path.resolve(__dirname, '..', 'node_modules', 'swagger-ui-dist'), - ].filter(Boolean); -} - -function distDirFor(candidate) { - const resolved = path.resolve(candidate); - const directDist = path.join(resolved, 'dist'); - if (fs.existsSync(path.join(directDist, 'index.html'))) return directDist; - if (fs.existsSync(path.join(resolved, 'index.html')) && fs.existsSync(path.join(resolved, 'swagger-ui-bundle.js'))) return resolved; - return null; -} - -function resolveSwaggerUi(explicit) { - for (const candidate of swaggerUiCandidates(explicit)) { - const dist = distDirFor(candidate); - if (dist) 
return dist; - } - - const searched = swaggerUiCandidates(explicit).map(p => ` - ${path.resolve(p)}`).join('\n'); - throw new Error(`Swagger UI not found. Searched:\n${searched}\n\nInstall it locally, then rerun:\n git clone https://github.com/swagger-api/swagger-ui.git ~/Developer/swagger-ui\n cd ~/Developer/swagger-ui && npm ci\n\nOr pass --swagger-ui / set SWAGGER_UI_DIR.`); -} - -function mimeFor(filePath) { - const ext = path.extname(filePath).toLowerCase(); - return { - '.css': 'text/css; charset=utf-8', - '.html': 'text/html; charset=utf-8', - '.js': 'application/javascript; charset=utf-8', - '.json': 'application/json; charset=utf-8', - '.map': 'application/json; charset=utf-8', - '.png': 'image/png', - '.svg': 'image/svg+xml', - '.yaml': 'application/yaml; charset=utf-8', - '.yml': 'application/yaml; charset=utf-8', - }[ext] || 'application/octet-stream'; -} - -function swaggerInitializer(specRoute) { - return `window.onload = function() { - window.ui = SwaggerUIBundle({ - url: ${JSON.stringify(specRoute)}, - dom_id: '#swagger-ui', - deepLinking: true, - presets: [ - SwaggerUIBundle.presets.apis, - SwaggerUIStandalonePreset - ], - plugins: [ - SwaggerUIBundle.plugins.DownloadUrl - ], - layout: 'StandaloneLayout' - }); -}; -`; -} - -function safeStaticPath(distDir, urlPath) { - const decoded = decodeURIComponent(urlPath); - const relative = decoded === '/' ? 'index.html' : decoded.replace(/^\/+/, ''); - const fullPath = path.resolve(distDir, relative); - const root = path.resolve(distDir); - if (fullPath !== root && !fullPath.startsWith(root + path.sep)) return null; - return fullPath; -} - -function openUrl(url) { - const opener = process.platform === 'darwin' - ? ['open', [url]] - : process.platform === 'win32' - ? 
['cmd', ['/c', 'start', '', url]] - : ['xdg-open', [url]]; - const child = spawn(opener[0], opener[1], { detached: true, stdio: 'ignore' }); - child.unref(); -} - -async function main() { - const opts = parseArgs(process.argv.slice(2)); - const specPath = resolveSpec(opts); - if (!fs.existsSync(specPath)) throw new Error(`spec not found: ${specPath}`); - - const distDir = resolveSwaggerUi(opts.swaggerUi); - const specRoute = path.extname(specPath).toLowerCase() === '.json' ? '/openapi.json' : '/openapi.yaml'; - - const server = http.createServer((req, res) => { - const requestPath = new URL(req.url, `http://${opts.host}`).pathname; - if (requestPath === specRoute) { - res.writeHead(200, { 'content-type': mimeFor(specPath), 'cache-control': 'no-store' }); - fs.createReadStream(specPath).pipe(res); - return; - } - if (requestPath === '/swagger-initializer.js') { - res.writeHead(200, { 'content-type': 'application/javascript; charset=utf-8', 'cache-control': 'no-store' }); - res.end(swaggerInitializer(specRoute)); - return; - } - - const staticPath = safeStaticPath(distDir, requestPath); - if (!staticPath || !fs.existsSync(staticPath) || fs.statSync(staticPath).isDirectory()) { - res.writeHead(404, { 'content-type': 'text/plain; charset=utf-8' }); - res.end('not found\n'); - return; - } - res.writeHead(200, { 'content-type': mimeFor(staticPath) }); - fs.createReadStream(staticPath).pipe(res); - }); - - await new Promise((resolve, reject) => { - server.once('error', reject); - server.listen(opts.port, opts.host, resolve); - }); - - const address = server.address(); - const url = `http://${opts.host}:${address.port}/`; - console.log(`swagger_ui=${distDir}`); - console.log(`spec=${specPath}`); - console.log(`url=${url}`); - console.log('Press Ctrl-C to stop the preview server.'); - if (opts.open) openUrl(url); -} - -main().catch(err => { - console.error(err.message); - process.exit(1); -});