From c687250bc3c67d151a56cf65ed67a8739877a9f7 Mon Sep 17 00:00:00 2001 From: anil-bd Date: Mon, 18 May 2026 22:09:15 +0200 Subject: [PATCH] feat(scraper-create): write {collector_id, ...} envelope to -o MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today the file written by `-o create.json` contains only the final AI-progress payload — no `collector_id`, no name, no view_url. The documented recipe in references/recipes.md depends on jq reading the collector_id out of that file: COLLECTOR_ID=$(jq -r '.collector_id // .id' create.json) bdata scraper run "$COLLECTOR_ID" ... Today that returns the string "null" because the field doesn't exist in the file. Every script that follows the docs to chain create → run is silently broken. This change wraps every termination path (success, AI-trigger failure, status=failed, polling exception) in one envelope: { "collector_id": "c_...", "name": "audit-r4-...", "status": "done" | "failed" | "ai_trigger_failed" | "poll_failed", "completed_steps": [...], "view_url": "https://brightdata.com/cp/scrapers/c_...", "created_at": "2026-05-18T07:28:30Z", "error": "..." // failure paths only } Notable design choices: * Every termination path writes the same shape, including failure paths that previously wrote nothing. So a script using `jq -r '.collector_id'` always recovers an id when one exists — even from a stub collector that hit the AI-Flow parallel-job cap. This makes good on SKILL.md's promise that every failure path surfaces the collector_id. * `view_url` is included on every envelope so the user has a one-click recovery path to inspect / finish / delete the scraper in the dashboard, without needing to know the URL pattern. * `created_at` is taken from the template-creation response when the API provides it (`Create_template_response.created`), omitted otherwise — never invented.
* New `--legacy-output` flag preserves today's bare-progress shape for one minor version so any existing scripts that depended on the old shape have a migration window. Slated for removal in the next major. (On the two failure paths that have no progress payload — AI-trigger failure and polling exception — the envelope is written even with the flag set.) * Stdout (the success summary printed to TTY) is unchanged. Only the machine-readable `-o` / `--json` / `--pretty` payload is reshaped. * Scoped strictly to `src/commands/scraper.ts` and the new envelope type. The shared HTTP client and other commands (scrape, search, discover, pipelines, browser) are untouched. Tests: 4 new `build_create_envelope` unit cases covering success, omitted-created_at, failure-with-error, and view_url-on-every-path. 6 new `handle_create_scraper` integration cases covering success envelope, the documented jq recipe, --legacy-output preserving the bare shape, AI-trigger failure envelope (the stub-collector recovery path), poll-status-failed envelope, and poll-exception envelope. Two existing tests updated from strict payload matches to objectContaining-style (the contract is now the envelope shape, not the bare payload). 55 / 55 scraper tests pass. The 9 pre-existing failures in unrelated suites (daemon, add-mcp, browser, discover, scrape) on main are unchanged by this PR. Spec: brightdata/skills repo, proposal at skills/scraper-studio/proposals/PR-2-create-envelope.md (to be filed alongside this PR).
--- README.md | 38 +++- src/__tests__/commands/scraper.test.ts | 229 ++++++++++++++++++++++++- src/commands/scraper.ts | 105 +++++++++++- src/types/scraper.ts | 18 ++ 4 files changed, 379 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 61333d2..26fb1b9 100644 --- a/README.md +++ b/README.md @@ -319,13 +319,42 @@ brightdata scraper create [options] | `--name ` | Scraper template name (default: `cli-scraper-`) | | `--deliver-webhook ` | Webhook URL for the deliver stub (default: `https://example.com/webhook`) | | `--timeout ` | Polling timeout in seconds (default: `600`) | -| `-o, --output ` | Write output to file | +| `-o, --output ` | Write the JSON envelope to a file (see below) | | `--json` / `--pretty` | JSON output (raw / indented) | +| `--legacy-output` | Write the pre-v0.3 bare AI-progress payload to `-o` instead of the envelope. Migration only. | | `--timing` | Show request timing | | `-k, --api-key ` | Override API key | > **Note:** The scraper is created with a placeholder webhook delivery target (`https://example.com/webhook`). You can reconfigure the actual delivery endpoint in the [Bright Data web UI](https://brightdata.com/cp/scrapers) after creation. +#### Output envelope (`-o create.json`) + +Every termination path — success or failure — writes the same JSON envelope shape: + +```json +{ + "collector_id": "c_mp7x8a9b2c0d1e2f", + "name": "my-product-scraper", + "status": "done", + "completed_steps": ["prepare_intent_analyzer", "planner", "..."], + "view_url": "https://brightdata.com/cp/scrapers/c_mp7x8a9b2c0d1e2f", + "created_at": "2026-05-18T07:28:30Z" +} +``` + +On failure paths the envelope adds an `error` field and the `status` reflects the failure category (`ai_trigger_failed`, `failed`, `poll_failed`). The `collector_id` and `view_url` are still present so you can recover or inspect the half-built scraper. 
+ +This makes the documented chain in [recipes.md](https://github.com/brightdata/skills/blob/main/skills/scraper-studio/references/recipes.md) work as written: + +```bash +brightdata scraper create https://example.com/product/1 "..." \ + --pretty -o create.json +COLLECTOR_ID=$(jq -r '.collector_id' create.json) +brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2 +``` + +Use `--legacy-output` if you have an existing script that depended on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate. + **Examples** ```bash @@ -333,10 +362,13 @@ brightdata scraper create [options] brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" -# Name the scraper and save the full AI output to a file +# Name the scraper and save the envelope to a file brightdata scraper create https://example.com/product/1 \ "Extract title, price, and image URL from this product page" \ - --name my-product-scraper --pretty -o scraper-output.json + --name my-product-scraper --pretty -o create.json + +# Capture the collector_id for chaining +COLLECTOR_ID=$(jq -r '.collector_id' create.json) # Use a custom webhook delivery URL brightdata scraper create https://example.com/product/1 \ diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..868cbdd 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -50,6 +50,7 @@ import { extract_progress_status, format_create_summary, handle_create_scraper, + build_create_envelope, handle_run_scraper, build_run_request, build_run_query, @@ -149,6 +150,218 @@ describe('commands/scraper', ()=>{ }); }); + // PR-2: the envelope contract is the whole point of the PR. + // Lock the shape, the failure-path semantics, and the legacy + // escape hatch in one place. 
+ describe('build_create_envelope (PR-2)', ()=>{ + it('returns the documented success shape', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + progress: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + created_at: '2026-05-18T07:28:30Z', + }); + expect(env).toEqual({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }); + }); + + it('omits created_at when not known', ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'done', + progress: {status: 'done', completed_steps: []}, + }); + expect(env).not.toHaveProperty('created_at'); + }); + + it('records the error message and partial steps on failure', + ()=>{ + const env = build_create_envelope({ + collector_id: 'c_xyz', + name: 'n', + status: 'ai_trigger_failed', + error: 'Cannot run more than 3 jobs in parallel', + }); + expect(env.collector_id).toBe('c_xyz'); + expect(env.status).toBe('ai_trigger_failed'); + expect(env.error).toMatch(/parallel/); + expect(env.completed_steps).toEqual([]); + // view_url remains useful even on failure so the user + // can inspect the stub collector in the dashboard. 
+ expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + }); + + it('still includes view_url on every termination path', ()=>{ + for (const status of ['done', 'failed', 'ai_trigger_failed', + 'poll_failed']) + { + const env = build_create_envelope({ + collector_id: 'c_xyz', name: 'n', status, + }); + expect(env.view_url) + .toBe('https://brightdata.com/cp/scrapers/c_xyz'); + } + }); + }); + + describe('handle_create_scraper envelope output (PR-2)', ()=>{ + const setup_success = ()=>{ + mocks.post + .mockResolvedValueOnce({ + id: 'c_xyz', name: 'product-v1', + created: '2026-05-18T07:28:30Z', + }) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', + completed_steps: ['a', 'b', 'c']}, + attempts: 4, + }); + }; + + it('writes the new envelope to -o on success', async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', pretty: true} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_xyz', + name: 'product-v1', + status: 'done', + completed_steps: ['a', 'b', 'c'], + view_url: 'https://brightdata.com/cp/scrapers/c_xyz', + created_at: '2026-05-18T07:28:30Z', + }), + expect.objectContaining({output: 'create.json'}) + ); + }); + + it('the documented `jq -r .collector_id` recipe works on the ' + +'envelope', async()=>{ + // The bug PR-2 is fixing — yesterday this returned `null`. 
+ setup_success(); + await handle_create_scraper('https://x.com/p', 'd', + {output: 'create.json'}); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: string}; + expect(written.collector_id).toBe('c_xyz'); + }); + + it('--legacy-output preserves the bare progress payload', + async()=>{ + setup_success(); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json', legacyOutput: true} + ); + const written = mocks.print.mock.calls[0][0] as { + collector_id?: unknown; status?: string}; + // Bare progress shape today: status + completed_steps, + // NO collector_id, NO view_url. + expect(written.collector_id).toBeUndefined(); + expect(written).not.toHaveProperty('view_url'); + expect(written.status).toBe('done'); + }); + + it('writes the envelope when AI trigger fails (stub-collector ' + +'recovery path)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_stub', name: 'n'}) + .mockRejectedValueOnce( + new Error('Cannot run more than 3 jobs in parallel')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_stub', + status: 'ai_trigger_failed', + error: expect.stringMatching(/parallel/), + view_url: 'https://brightdata.com/cp/scrapers/c_stub', + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when poll returns status != done', + async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'failed', + completed_steps: ['planner']}, + attempts: 2, + }); + const exit = vi.spyOn(process, 'exit') + 
.mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'failed', + completed_steps: ['planner'], + error: expect.stringMatching(/finished with status/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + + it('writes the envelope when polling itself throws (timeout ' + +'or network)', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'n'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockRejectedValue( + new Error( + 'Timeout after 600 seconds waiting for AI generation')); + const exit = vi.spyOn(process, 'exit') + .mockImplementation(()=>undefined as never); + const error = vi.spyOn(console, 'error') + .mockImplementation(()=>{}); + await handle_create_scraper( + 'https://x.com/p', 'd', + {output: 'create.json'} + ); + expect(mocks.print).toHaveBeenCalledWith( + expect.objectContaining({ + collector_id: 'c_abc', + status: 'poll_failed', + error: expect.stringMatching(/Timeout/), + }), + expect.objectContaining({output: 'create.json'}) + ); + exit.mockRestore(); + error.mockRestore(); + }); + }); + describe('handle_create_scraper', ()=>{ it('chains create → trigger → poll and prints JSON in non-TTY', async()=>{ @@ -194,8 +407,17 @@ describe('commands/scraper', ()=>{ timeout_label: expect.stringContaining('c_abc'), }) ); + // PR-2: -o now writes an envelope with collector_id, + // not the raw progress payload. The documented + // `jq -r '.collector_id'` recipe depends on this. 
expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + name: 'cli-scraper-1', + status: 'done', + completed_steps: ['a', 'b'], + view_url: 'https://brightdata.com/cp/scrapers/c_abc', + }), {json: undefined, pretty: undefined, output: undefined} ); }); @@ -209,7 +431,10 @@ describe('commands/scraper', ()=>{ result: progress, attempts: 1}); await handle_create_scraper('https://x.com', 'd', {json: true}); expect(mocks.print).toHaveBeenCalledWith( - progress, + expect.objectContaining({ + collector_id: 'c_abc', + status: 'done', + }), {json: true, pretty: undefined, output: undefined} ); }); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..73e4963 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -12,6 +12,7 @@ import type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Scraper_run_opts, @@ -83,6 +84,43 @@ const format_create_summary = ( return lines.join('\n'); }; +// PR-2: every termination path of `scraper create` writes this same +// envelope shape to -o. Solves the broken `jq -r '.collector_id'` +// recipe in references/recipes.md (today's -o file contains only the +// final progress payload, with no id field). +const build_create_envelope = (params: { + collector_id: string; + name: string; + status: string; + progress?: Ai_progress_response; + created_at?: string; + error?: string; +}): Create_envelope=>({ + collector_id: params.collector_id, + name: params.name, + status: params.status, + completed_steps: params.progress?.completed_steps ?? [], + view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`, + ...(params.created_at ? {created_at: params.created_at} : {}), + ...(params.error ? {error: params.error} : {}), +}); + +// Write the envelope (or, in --legacy-output mode, the bare progress +// payload) to wherever the user asked. 
Centralised so success and +// every failure path share one I/O code path. +const emit_create_output = ( + envelope: Create_envelope, + progress: Ai_progress_response|null, + opts: Scraper_create_opts +): void=>{ + const print_opts = {json: opts.json, pretty: opts.pretty, + output: opts.output}; + const payload = opts.legacyOutput && progress + ? (progress as unknown) : envelope; + if (opts.json || opts.pretty || opts.output || !is_tty) + print(payload, print_opts); +}; + const handle_create_scraper = async( url: string, description: string, @@ -100,6 +138,7 @@ const handle_create_scraper = async( const create_spinner = start_spinner('Creating scraper template...'); let collector_id = ''; let scraper_name = template_body.name; + let created_at: string|undefined; try { const template = await post( api_key, @@ -110,11 +149,14 @@ const handle_create_scraper = async( create_spinner.stop(); if (!template.id) { + // Template POST didn't return an id — no collector_id to + // envelope, so no -o file to write. Same as today. fail('Failed to create scraper template (missing id).'); return; } collector_id = template.id; scraper_name = template.name ?? template_body.name; + created_at = template.created; console.error(dim(`Template created: ${collector_id}`)); } catch(e) { create_spinner.stop(); @@ -134,9 +176,23 @@ const handle_create_scraper = async( trigger_spinner.stop(); } catch(e) { trigger_spinner.stop(); + const msg = (e as Error).message; console.error( `Failed to start AI generation for collector ` - +`${collector_id}: ${(e as Error).message}` + +`${collector_id}: ${msg}` + ); + // PR-2: write the envelope even on failure so the user's + // automation can read collector_id + status from the file. 
+ emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'ai_trigger_failed', + created_at, + error: msg, + }), + null, + opts ); process.exit(1); return; @@ -171,16 +227,36 @@ const handle_create_scraper = async( `AI generation failed (collector ${collector_id}, ` +`status: ${progress.status}).` ); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + error: `AI generation finished with status ` + +`"${progress.status}".`, + }), + progress, + opts + ); process.exit(1); return; } - const print_opts = {json: opts.json, pretty: opts.pretty, - output: opts.output}; + // Success path. + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: progress.status, + progress, + created_at, + }), + progress, + opts + ); if (opts.json || opts.pretty || opts.output || !is_tty) - { - print(progress, print_opts); return; - } success(format_create_summary( collector_id, scraper_name, progress)); } catch(e) { @@ -189,6 +265,17 @@ const handle_create_scraper = async( const suffix = msg.includes(collector_id) ? '' : ` (collector ${collector_id})`; console.error(`${msg}${suffix}`); + emit_create_output( + build_create_envelope({ + collector_id, + name: scraper_name, + status: 'poll_failed', + created_at, + error: msg, + }), + null, + opts + ); process.exit(1); return; } @@ -547,6 +634,10 @@ const create_subcommand = new Command('create') .option('-o, --output ', 'Write output to file') .option('--json', 'Force JSON output') .option('--pretty', 'Pretty-print JSON output') + .option('--legacy-output', + 'Emit the bare AI-progress payload (pre-v0.3 shape) instead ' + +'of the new {collector_id, name, status, ...} envelope. 
' + +'For one-version migration only.') .option('--timing', 'Show request timing') .option('-k, --api-key ', 'Override API key') .action(handle_create_scraper); @@ -584,6 +675,8 @@ export { build_ai_request, extract_progress_status, format_create_summary, + build_create_envelope, + emit_create_output, handle_run_scraper, build_run_request, build_run_query, diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 74dbbcf..8ddabe9 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -45,6 +45,23 @@ type Scraper_create_opts = { pretty?: boolean; timing?: boolean; apiKey?: string; + // PR-2: when true, write the bare AI-progress payload to -o + // (today's shape) instead of the new envelope. One-version + // migration flag. + legacyOutput?: boolean; +}; + +// PR-2: machine-readable envelope written to -o on every termination +// path of `scraper create`. Replaces the previous bare-progress +// payload so the documented `jq -r '.collector_id'` recipe works. +type Create_envelope = { + collector_id: string; + name: string; + status: string; + completed_steps: string[]; + view_url: string; + created_at?: string; + error?: string; }; type Run_request = { @@ -92,6 +109,7 @@ export type { Trigger_ai_response, Ai_progress_response, Scraper_create_opts, + Create_envelope, Run_request, Trigger_immediate_response, Sync_timeout_response,