From c687250bc3c67d151a56cf65ed67a8739877a9f7 Mon Sep 17 00:00:00 2001
From: anil-bd <anil@brightdata.com>
Date: Mon, 18 May 2026 22:09:15 +0200
Subject: [PATCH] feat(scraper-create): write {collector_id, ...} envelope to
 -o
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Today the file `-o create.json` writes only the final AI-progress
payload — no `collector_id`, no name, no view_url. The documented
recipe in references/recipes.md depends on jq reading the
collector_id out of that file:

    COLLECTOR_ID=$(jq -r '.collector_id // .id' create.json)
    bdata scraper run "$COLLECTOR_ID" ...

Today that returns the string "null" because the field doesn't exist
in the file. Every script that follows the docs to chain create →
run is silently broken.

This change wraps every termination path (success, AI-trigger
failure, status=failed, polling exception) in one envelope:

    {
      "collector_id":    "c_...",
      "name":            "audit-r4-...",
      "status":          "done" | "failed" | "ai_trigger_failed" | "poll_failed",
      "completed_steps": [...],
      "view_url":        "https://brightdata.com/cp/scrapers/c_...",
      "created_at":      "2026-05-18T07:28:30Z",
      "error":           "..."   // failure paths only
    }

Notable design choices:

* Every termination path writes the same shape, including failure
  paths that previously wrote nothing. So a script using
  `jq -r '.collector_id'` always recovers an id when one exists —
  even from a stub collector that hit the AI-Flow parallel-job cap.
  This makes good on SKILL.md's promise that every failure path
  surfaces the collector_id.
* `view_url` is included on every envelope so the user has a one-
  click recovery path to inspect / finish / delete the scraper in
  the dashboard, without needing to know the URL pattern.
* `created_at` is taken from the template-creation response when
  the API provides it (`Create_template_response.created`),
  omitted otherwise — never invented.
* New `--legacy-output` flag preserves today's bare-progress shape
  for one minor version so any existing scripts that depended on
  the old shape have a migration window. Slated for removal in
  the next major.
* Stdout (the success summary printed to TTY) is unchanged. Only
  the machine-readable `-o` / `--json` / `--pretty` payload is
  reshaped.
* Scoped strictly to `src/commands/scraper.ts` and the new
  envelope type. The shared HTTP client and other commands
  (scrape, search, discover, pipelines, browser) are untouched.

Tests: 4 new `build_create_envelope` unit cases covering success,
omitted-created_at, failure-with-error, and view_url-on-every-
path. 6 new `handle_create_scraper` integration cases covering
success envelope, the documented jq recipe, --legacy-output
preserving the bare shape, AI-trigger failure envelope (the
stub-collector recovery path), poll-status-failed envelope, and
poll-exception envelope. Two existing tests updated from strict
payload matches to objectContaining-style (the contract is
now the envelope shape, not the bare payload).

55 / 55 scraper tests pass. The 9 pre-existing failures in
unrelated suites (daemon, add-mcp, browser, discover, scrape) on
main are unchanged by this PR.

Spec: brightdata/skills repo, proposal at
skills/scraper-studio/proposals/PR-2-create-envelope.md (to be
filed alongside this PR).
---
 README.md                              |  38 +++-
 src/__tests__/commands/scraper.test.ts | 229 ++++++++++++++++++++++++-
 src/commands/scraper.ts                | 105 +++++++++++-
 src/types/scraper.ts                   |  18 ++
 4 files changed, 379 insertions(+), 11 deletions(-)
diff --git a/README.md b/README.md
index 61333d2..26fb1b9 100644
--- a/README.md
+++ b/README.md
@@ -319,13 +319,42 @@ brightdata scraper create <url> <description> [options]
 | `--name <name>` | Scraper template name (default: `cli-scraper-<timestamp>`) |
 | `--deliver-webhook <url>` | Webhook URL for the deliver stub (default: `https://example.com/webhook`) |
 | `--timeout <seconds>` | Polling timeout in seconds (default: `600`) |
-| `-o, --output <path>` | Write output to file |
+| `-o, --output <path>` | Write the JSON envelope to a file (see below) |
 | `--json` / `--pretty` | JSON output (raw / indented) |
+| `--legacy-output` | Emit the pre-v0.3 bare AI-progress payload instead of the envelope (applies to `-o`, `--json`, and `--pretty` output). Migration only. |
 | `--timing` | Show request timing |
 | `-k, --api-key <key>` | Override API key |
 
 > **Note:** The scraper is created with a placeholder webhook delivery target (`https://example.com/webhook`). You can reconfigure the actual delivery endpoint in the [Bright Data web UI](https://brightdata.com/cp/scrapers) after creation.
 
+#### Output envelope (`-o create.json`)
+
+Once the scraper template has been created, every termination path — success or failure — writes the same JSON envelope shape:
+
+```json
+{
+  "collector_id":    "c_mp7x8a9b2c0d1e2f",
+  "name":            "my-product-scraper",
+  "status":          "done",
+  "completed_steps": ["prepare_intent_analyzer", "planner", "..."],
+  "view_url":        "https://brightdata.com/cp/scrapers/c_mp7x8a9b2c0d1e2f",
+  "created_at":      "2026-05-18T07:28:30Z"
+}
+```
+
+On failure paths the envelope adds an `error` field and the `status` reflects the failure category (`ai_trigger_failed`, `failed`, `poll_failed`). The `collector_id` and `view_url` are still present so you can recover or inspect the half-built scraper.
+
+This makes the documented chain in [recipes.md](https://github.com/brightdata/skills/blob/main/skills/scraper-studio/references/recipes.md) work as written:
+
+```bash
+brightdata scraper create https://example.com/product/1 "..." \
+    --pretty -o create.json
+COLLECTOR_ID=$(jq -r '.collector_id' create.json)
+brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2
+```
+
+Use `--legacy-output` if you have an existing script that depended on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate.
+
 **Examples**
 
 ```bash
@@ -333,10 +362,13 @@ brightdata scraper create <url> <description> [options]
 brightdata scraper create https://example.com/product/1 \
     "Extract title, price, and image URL from this product page"
 
-# Name the scraper and save the full AI output to a file
+# Name the scraper and save the envelope to a file
 brightdata scraper create https://example.com/product/1 \
     "Extract title, price, and image URL from this product page" \
-    --name my-product-scraper --pretty -o scraper-output.json
+    --name my-product-scraper --pretty -o create.json
+
+# Capture the collector_id for chaining
+COLLECTOR_ID=$(jq -r '.collector_id' create.json)
 
 # Use a custom webhook delivery URL
 brightdata scraper create https://example.com/product/1 \
diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts
index 39e7e0c..868cbdd 100644
--- a/src/__tests__/commands/scraper.test.ts
+++ b/src/__tests__/commands/scraper.test.ts
@@ -50,6 +50,7 @@ import {
     extract_progress_status,
     format_create_summary,
     handle_create_scraper,
+    build_create_envelope,
     handle_run_scraper,
     build_run_request,
     build_run_query,
@@ -149,6 +150,218 @@ describe('commands/scraper', ()=>{
         });
     });
 
+    // PR-2: the envelope contract is the whole point of the PR.
+    // Lock the shape, the failure-path semantics, and the legacy
+    // escape hatch in one place.
+    describe('build_create_envelope (PR-2)', ()=>{
+        it('returns the documented success shape', ()=>{
+            const env = build_create_envelope({
+                collector_id: 'c_xyz',
+                name: 'product-v1',
+                status: 'done',
+                progress: {status: 'done',
+                    completed_steps: ['a', 'b', 'c']},
+                created_at: '2026-05-18T07:28:30Z',
+            });
+            expect(env).toEqual({
+                collector_id: 'c_xyz',
+                name: 'product-v1',
+                status: 'done',
+                completed_steps: ['a', 'b', 'c'],
+                view_url: 'https://brightdata.com/cp/scrapers/c_xyz',
+                created_at: '2026-05-18T07:28:30Z',
+            });
+        });
+
+        it('omits created_at when not known', ()=>{
+            const env = build_create_envelope({
+                collector_id: 'c_xyz',
+                name: 'n',
+                status: 'done',
+                progress: {status: 'done', completed_steps: []},
+            });
+            expect(env).not.toHaveProperty('created_at');
+        });
+
+        it('records the error and defaults completed_steps to []',
+            ()=>{
+            const env = build_create_envelope({
+                collector_id: 'c_xyz',
+                name: 'n',
+                status: 'ai_trigger_failed',
+                error: 'Cannot run more than 3 jobs in parallel',
+            });
+            expect(env.collector_id).toBe('c_xyz');
+            expect(env.status).toBe('ai_trigger_failed');
+            expect(env.error).toMatch(/parallel/);
+            expect(env.completed_steps).toEqual([]); // no progress given
+            // view_url remains useful even on failure so the user
+            // can inspect the stub collector in the dashboard.
+            expect(env.view_url)
+                .toBe('https://brightdata.com/cp/scrapers/c_xyz');
+        });
+
+        it('still includes view_url on every termination path', ()=>{
+            for (const status of ['done', 'failed', 'ai_trigger_failed',
+                'poll_failed'])
+            {
+                const env = build_create_envelope({
+                    collector_id: 'c_xyz', name: 'n', status,
+                });
+                expect(env.view_url)
+                    .toBe('https://brightdata.com/cp/scrapers/c_xyz');
+            }
+        });
+    });
+
+    describe('handle_create_scraper envelope output (PR-2)', ()=>{
+        const setup_success = ()=>{ // mocks: template POST, trigger POST, poll
+            mocks.post
+                .mockResolvedValueOnce({
+                    id: 'c_xyz', name: 'product-v1',
+                    created: '2026-05-18T07:28:30Z',
+                })
+                .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
+            mocks.poll_until.mockResolvedValue({
+                result: {status: 'done',
+                    completed_steps: ['a', 'b', 'c']},
+                attempts: 4,
+            });
+        };
+
+        it('writes the new envelope to -o on success', async()=>{
+            setup_success();
+            await handle_create_scraper(
+                'https://x.com/p', 'd',
+                {output: 'create.json', pretty: true}
+            );
+            expect(mocks.print).toHaveBeenCalledWith(
+                expect.objectContaining({
+                    collector_id: 'c_xyz',
+                    name: 'product-v1',
+                    status: 'done',
+                    completed_steps: ['a', 'b', 'c'],
+                    view_url: 'https://brightdata.com/cp/scrapers/c_xyz',
+                    created_at: '2026-05-18T07:28:30Z',
+                }),
+                expect.objectContaining({output: 'create.json'})
+            );
+        });
+
+        it('the documented `jq -r .collector_id` recipe works on the '
+            +'envelope', async()=>{
+            // Regression guard: the pre-envelope payload had no collector_id.
+            setup_success();
+            await handle_create_scraper('https://x.com/p', 'd',
+                {output: 'create.json'});
+            const written = mocks.print.mock.calls[0][0] as {
+                collector_id?: string};
+            expect(written.collector_id).toBe('c_xyz');
+        });
+
+        it('--legacy-output preserves the bare progress payload',
+            async()=>{
+            setup_success();
+            await handle_create_scraper(
+                'https://x.com/p', 'd',
+                {output: 'create.json', legacyOutput: true}
+            );
+            const written = mocks.print.mock.calls[0][0] as {
+                collector_id?: unknown; status?: string};
+            // Legacy (bare progress) shape: status + completed_steps,
+            // NO collector_id, NO view_url.
+            expect(written.collector_id).toBeUndefined();
+            expect(written).not.toHaveProperty('view_url');
+            expect(written.status).toBe('done');
+        });
+
+        it('writes the envelope when AI trigger fails (stub-collector '
+            +'recovery path)', async()=>{
+            mocks.post
+                .mockResolvedValueOnce({id: 'c_stub', name: 'n'})
+                .mockRejectedValueOnce(
+                    new Error('Cannot run more than 3 jobs in parallel'));
+            const exit = vi.spyOn(process, 'exit') // stubbed so exit(1) returns
+                .mockImplementation(()=>undefined as never);
+            const error = vi.spyOn(console, 'error')
+                .mockImplementation(()=>{});
+            await handle_create_scraper(
+                'https://x.com/p', 'd',
+                {output: 'create.json'}
+            );
+            expect(mocks.print).toHaveBeenCalledWith(
+                expect.objectContaining({
+                    collector_id: 'c_stub',
+                    status: 'ai_trigger_failed',
+                    error: expect.stringMatching(/parallel/),
+                    view_url: 'https://brightdata.com/cp/scrapers/c_stub',
+                }),
+                expect.objectContaining({output: 'create.json'})
+            );
+            exit.mockRestore();
+            error.mockRestore();
+        });
+
+        it('writes the envelope when poll returns status != done',
+            async()=>{
+            mocks.post
+                .mockResolvedValueOnce({id: 'c_abc', name: 'n'})
+                .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
+            mocks.poll_until.mockResolvedValue({
+                result: {status: 'failed',
+                    completed_steps: ['planner']},
+                attempts: 2,
+            });
+            const exit = vi.spyOn(process, 'exit')
+                .mockImplementation(()=>undefined as never);
+            const error = vi.spyOn(console, 'error')
+                .mockImplementation(()=>{});
+            await handle_create_scraper(
+                'https://x.com/p', 'd',
+                {output: 'create.json'}
+            );
+            expect(mocks.print).toHaveBeenCalledWith(
+                expect.objectContaining({
+                    collector_id: 'c_abc',
+                    status: 'failed',
+                    completed_steps: ['planner'],
+                    error: expect.stringMatching(/finished with status/),
+                }),
+                expect.objectContaining({output: 'create.json'})
+            );
+            exit.mockRestore();
+            error.mockRestore();
+        });
+
+        it('writes the envelope when polling itself throws (timeout '
+            +'or network)', async()=>{
+            mocks.post
+                .mockResolvedValueOnce({id: 'c_abc', name: 'n'})
+                .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
+            mocks.poll_until.mockRejectedValue(
+                new Error(
+                    'Timeout after 600 seconds waiting for AI generation'));
+            const exit = vi.spyOn(process, 'exit')
+                .mockImplementation(()=>undefined as never);
+            const error = vi.spyOn(console, 'error')
+                .mockImplementation(()=>{});
+            await handle_create_scraper(
+                'https://x.com/p', 'd',
+                {output: 'create.json'}
+            );
+            expect(mocks.print).toHaveBeenCalledWith(
+                expect.objectContaining({
+                    collector_id: 'c_abc',
+                    status: 'poll_failed',
+                    error: expect.stringMatching(/Timeout/),
+                }),
+                expect.objectContaining({output: 'create.json'})
+            );
+            exit.mockRestore();
+            error.mockRestore();
+        });
+    });
+
     describe('handle_create_scraper', ()=>{
         it('chains create → trigger → poll and prints JSON in non-TTY',
             async()=>{
@@ -194,8 +407,17 @@ describe('commands/scraper', ()=>{
                     timeout_label: expect.stringContaining('c_abc'),
                 })
             );
+            // PR-2: -o now writes an envelope with collector_id,
+            // not the raw progress payload. The documented
+            // `jq -r '.collector_id'` recipe depends on this.
             expect(mocks.print).toHaveBeenCalledWith(
-                progress,
+                expect.objectContaining({
+                    collector_id: 'c_abc',
+                    name: 'cli-scraper-1',
+                    status: 'done',
+                    completed_steps: ['a', 'b'],
+                    view_url: 'https://brightdata.com/cp/scrapers/c_abc',
+                }),
                 {json: undefined, pretty: undefined, output: undefined}
             );
         });
@@ -209,7 +431,10 @@ describe('commands/scraper', ()=>{
                 result: progress, attempts: 1});
             await handle_create_scraper('https://x.com', 'd', {json: true});
             expect(mocks.print).toHaveBeenCalledWith(
-                progress,
+                expect.objectContaining({
+                    collector_id: 'c_abc',
+                    status: 'done',
+                }),
                 {json: true, pretty: undefined, output: undefined}
             );
         });
diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts
index 9ab4400..73e4963 100644
--- a/src/commands/scraper.ts
+++ b/src/commands/scraper.ts
@@ -12,6 +12,7 @@ import type {
     Trigger_ai_response,
     Ai_progress_response,
     Scraper_create_opts,
+    Create_envelope,
     Run_request,
     Trigger_immediate_response,
     Scraper_run_opts,
@@ -83,6 +84,43 @@ const format_create_summary = (
     return lines.join('\n');
 };
 
+// Build the `scraper create` result envelope. `completed_steps` comes
+// from `progress` ([] when absent); `view_url` is derived from the id;
+// `created_at` / `error` are left out when falsy. Carrying the id here
+// is what makes `jq -r '.collector_id'` work on the -o file.
+const build_create_envelope = (params: {
+    collector_id: string;
+    name: string;
+    status: string;
+    progress?: Ai_progress_response;
+    created_at?: string;
+    error?: string;
+}): Create_envelope=>({
+    collector_id: params.collector_id,
+    name: params.name,
+    status: params.status,
+    completed_steps: params.progress?.completed_steps ?? [],
+    view_url: `https://brightdata.com/cp/scrapers/${params.collector_id}`,
+    ...(params.created_at ? {created_at: params.created_at} : {}),
+    ...(params.error ? {error: params.error} : {}),
+});
+
+// Print the envelope, or the bare progress payload when --legacy-output
+// is set and `progress` is non-null (with null progress the envelope is
+// printed even in legacy mode). Skipped when is_tty and no -o/--json/--pretty.
+const emit_create_output = (
+    envelope: Create_envelope,
+    progress: Ai_progress_response|null,
+    opts: Scraper_create_opts
+): void=>{
+    const print_opts = {json: opts.json, pretty: opts.pretty,
+        output: opts.output};
+    const payload = opts.legacyOutput && progress
+        ? progress : envelope;
+    if (opts.json || opts.pretty || opts.output || !is_tty)
+        print(payload, print_opts);
+};
+
 const handle_create_scraper = async(
     url: string,
     description: string,
@@ -100,6 +138,7 @@ const handle_create_scraper = async(
     const create_spinner = start_spinner('Creating scraper template...');
     let collector_id = '';
     let scraper_name = template_body.name;
+    let created_at: string|undefined;
     try {
         const template = await post<Create_template_response>(
             api_key,
@@ -110,11 +149,14 @@ const handle_create_scraper = async(
         create_spinner.stop();
         if (!template.id)
         {
+            // Template POST didn't return an id — no collector_id to
+            // envelope, so no -o file to write. Same as today.
             fail('Failed to create scraper template (missing id).');
             return;
         }
         collector_id = template.id;
         scraper_name = template.name ?? template_body.name;
+        created_at = template.created;
         console.error(dim(`Template created: ${collector_id}`));
     } catch(e) {
         create_spinner.stop();
@@ -134,9 +176,23 @@ const handle_create_scraper = async(
         trigger_spinner.stop();
     } catch(e) {
         trigger_spinner.stop();
+        const msg = (e as Error).message;
         console.error(
             `Failed to start AI generation for collector `
-            +`${collector_id}: ${(e as Error).message}`
+            +`${collector_id}: ${msg}`
+        );
+        // PR-2: write the envelope even on failure so the user's
+        // automation can read collector_id + status from the file.
+        emit_create_output(
+            build_create_envelope({
+                collector_id,
+                name: scraper_name,
+                status: 'ai_trigger_failed',
+                created_at,
+                error: msg,
+            }),
+            null,
+            opts
         );
         process.exit(1);
         return;
@@ -171,16 +227,36 @@ const handle_create_scraper = async(
                 `AI generation failed (collector ${collector_id}, `
                 +`status: ${progress.status}).`
             );
+            emit_create_output(
+                build_create_envelope({
+                    collector_id,
+                    name: scraper_name,
+                    status: progress.status,
+                    progress,
+                    created_at,
+                    error: `AI generation finished with status `
+                        +`"${progress.status}".`,
+                }),
+                progress,
+                opts
+            );
             process.exit(1);
             return;
         }
-        const print_opts = {json: opts.json, pretty: opts.pretty,
-            output: opts.output};
+        // Success path.
+        emit_create_output(
+            build_create_envelope({
+                collector_id,
+                name: scraper_name,
+                status: progress.status,
+                progress,
+                created_at,
+            }),
+            progress,
+            opts
+        );
         if (opts.json || opts.pretty || opts.output || !is_tty)
-        {
-            print(progress, print_opts);
             return;
-        }
         success(format_create_summary(
             collector_id, scraper_name, progress));
     } catch(e) {
@@ -189,6 +265,17 @@ const handle_create_scraper = async(
         const suffix = msg.includes(collector_id)
             ? '' : ` (collector ${collector_id})`;
         console.error(`${msg}${suffix}`);
+        emit_create_output(
+            build_create_envelope({
+                collector_id,
+                name: scraper_name,
+                status: 'poll_failed',
+                created_at,
+                error: msg,
+            }),
+            null,
+            opts
+        );
         process.exit(1);
         return;
     }
@@ -547,6 +634,10 @@ const create_subcommand = new Command('create')
     .option('-o, --output <path>', 'Write output to file')
     .option('--json', 'Force JSON output')
     .option('--pretty', 'Pretty-print JSON output')
+    .option('--legacy-output',
+        'Emit the bare AI-progress payload (pre-v0.3 shape) instead '
+        +'of the new {collector_id, name, status, ...} envelope. '
+        +'For one-version migration only.')
     .option('--timing', 'Show request timing')
     .option('-k, --api-key <key>', 'Override API key')
     .action(handle_create_scraper);
@@ -584,6 +675,8 @@ export {
     build_ai_request,
     extract_progress_status,
     format_create_summary,
+    build_create_envelope,
+    emit_create_output,
     handle_run_scraper,
     build_run_request,
     build_run_query,
diff --git a/src/types/scraper.ts b/src/types/scraper.ts
index 74dbbcf..8ddabe9 100644
--- a/src/types/scraper.ts
+++ b/src/types/scraper.ts
@@ -45,6 +45,23 @@ type Scraper_create_opts = {
     pretty?: boolean;
     timing?: boolean;
     apiKey?: string;
+    // PR-2: when true, write the bare AI-progress payload to -o
+    // (today's shape) instead of the new envelope. One-version
+    // migration flag.
+    legacyOutput?: boolean;
+};
+
+// Machine-readable result of `scraper create`, printed or written to
+// -o in place of the bare AI-progress payload so that the documented
+// `jq -r '.collector_id'` recipe works.
+type Create_envelope = {
+    collector_id: string;
+    name: string;
+    status: string;
+    completed_steps: string[];
+    view_url: string;
+    created_at?: string; // omitted when the caller has no timestamp
+    error?: string; // set on failure paths only
 };
 
 type Run_request = {
@@ -92,6 +109,7 @@ export type {
     Trigger_ai_response,
     Ai_progress_response,
     Scraper_create_opts,
+    Create_envelope,
     Run_request,
     Trigger_immediate_response,
     Sync_timeout_response,