From 5157b51122f664e33d2f6c86e97058310c1ed133 Mon Sep 17 00:00:00 2001 From: anil-bd Date: Mon, 18 May 2026 14:56:44 +0200 Subject: [PATCH] fix(scraper): replace misleading 403 hint for AI Scraper Studio errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a `bdata scraper create` succeeds on the template POST but the subsequent AI-trigger POST 429s (e.g. because the user hit the AI Flow parallel-job cap), the half-built `collector_id` is still printed. If that id is then passed to `bdata scraper run`, the API returns 403 + body `{"error":"Collector does not have a template"}`. Today the CLI maps any 403 to a fixed hint: Hint: Access denied. Check your zone permissions in the control panel. This sends the user 30+ minutes down a zone-permission rabbit hole that has nothing to do with the actual problem (the AI Flow never finished generating selectors for this collector). Observed multiple times during stress testing. This change is structured so the AI Scraper Studio error vocabulary stays in the scraper command and does NOT leak into the shared HTTP client. `scrape`, `search`, `discover`, `pipelines`, and `browser` are unaffected. Mechanism: * `src/utils/client.ts` gains a generic `hints?: Body_hint[]` field on `Request_opts`. The pure helper `pick_hint(status, body, hints)` consults the caller's list first and falls back to the existing `ERROR_HINTS` status-code map. The shared client ships ZERO command-specific patterns. * `src/commands/scraper.ts` defines `SCRAPER_BODY_HINTS` — two patterns: - /collector does not have a template/i → AI generation didn't complete; re-run `scraper create`; web-UI URL for manual recovery. - /cannot run more than \d+ jobs in parallel/i → AI-Flow concurrent-job cap; serialise launches. Every `post`/`get` call in `handle_create_scraper`, `handle_run_scraper`, and `run_batch` passes `hints: SCRAPER_BODY_HINTS` so a 4xx from any of them is translated with the right vocabulary. * Real zone-permission 403s (any body that doesn't match the scraper patterns) still get the original "Access denied" hint — test 'does not consult ERROR_HINTS when an extra-hint pattern matches' locks this in. Tests: 8 unit tests for `client.pick_hint` using mock generic patterns (covers mechanism + asserts the shared client carries no scraper vocabulary in ERROR_HINTS), plus 5 scraper command tests asserting the scraper patterns are well-formed and travel via `hints` to client.post on every AI-Flow call. Two existing tests relaxed from strict opts-object matches to objectContaining-style. 58 / 58 tests in the affected files pass. The 9 pre-existing failures in unrelated suites (daemon, add-mcp, browser, discover, scrape) on main are unchanged by this PR. --- src/__tests__/commands/scraper.test.ts | 86 +++++++++++++++++++++++++- src/__tests__/utils/client.test.ts | 79 +++++++++++++++++++++++ src/commands/scraper.ts | 44 +++++++++++-- src/utils/client.ts | 37 +++++++++-- 4 files changed, 232 insertions(+), 14 deletions(-) create mode 100644 src/__tests__/utils/client.test.ts diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..f7a1101 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -58,6 +58,7 @@ import { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -68,6 +69,85 @@ describe('commands/scraper', ()=>{ mocks.start.mockReturnValue({stop: mocks.stop}); }); + // PR-12: AI Scraper Studio specific hints live with this command, + // not in the shared HTTP client. These tests assert (a) the + // patterns are well-formed and (b) they actually travel to the + // HTTP layer on every API call this command makes. + describe('SCRAPER_BODY_HINTS', ()=>{ + it('contains a hint for the stub-collector 403 response body', + ()=>{ + const stub_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Collector does not have a template')); + expect(stub_hint).toBeDefined(); + expect(stub_hint!.hint).toMatch(/AI generation has not completed/); + expect(stub_hint!.hint).toMatch(/bdata scraper create/); + // Must NOT echo the misleading generic 403 hint. + expect(stub_hint!.hint).not.toMatch(/zone permissions/); + }); + + it('contains a hint for the parallel-job cap 429 response body', + ()=>{ + const cap_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Cannot run more than 3 jobs in parallel')); + expect(cap_hint).toBeDefined(); + expect(cap_hint!.hint).toMatch(/concurrent-job cap/); + expect(cap_hint!.hint).toMatch(/serialise/); + }); + + it('cap pattern is case-insensitive and number-agnostic ' + +'(future-proof if the cap changes)', ()=>{ + const cap_pattern = SCRAPER_BODY_HINTS[1].pattern; + expect(cap_pattern.test('cannot run more than 5 jobs ' + +'in parallel')).toBe(true); + expect(cap_pattern.test('CANNOT RUN MORE THAN 100 JOBS ' + +'IN PARALLEL')).toBe(true); + }); + + it('is passed via `hints` opt to client.post on every ' + +'AI-Flow call in handle_create_scraper', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'cli-scraper-1'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await handle_create_scraper( + 'https://example.com/p/1', + 'extract title', + {} + ); + // Both `post` calls (template create + AI trigger) must + // include the scraper hints so any 4xx is translated + // with the right vocabulary. + for (const call of mocks.post.mock.calls) + { + const opts = call[3] as {hints?: unknown}; + expect(opts).toBeDefined(); + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + } + }); + + it('is passed via `hints` opt to client.post on the ' + +'trigger_immediate call in handle_run_scraper', async()=>{ + mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'}); + const fetch_spy = vi.spyOn(global, 'fetch') + .mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve('{"title":"ok"}'), + } as unknown as Response)); + mocks.poll_until.mockImplementation(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1}; + }); + await handle_run_scraper('c_abc', 'https://x.com/p/1', {}); + const opts = mocks.post.mock.calls[0][3] as {hints?: unknown}; + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + fetch_spy.mockRestore(); + }); + }); + describe('build_template_request', ()=>{ it('uses auto-generated name and stub webhook by default', ()=>{ const before = Math.floor(Date.now()/1000); @@ -176,7 +256,7 @@ describe('commands/scraper', ()=>{ expect.objectContaining({ deliver: expect.objectContaining({type: 'webhook'}), }), - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.post).toHaveBeenNthCalledWith( 2, @@ -184,7 +264,7 @@ describe('commands/scraper', ()=>{ '/dca/collectors/c_abc/automate_template', {description: 'extract title', urls: ['https://example.com/p/1']}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.poll_until).toHaveBeenCalledTimes(1); expect(mocks.poll_until).toHaveBeenCalledWith( @@ -404,7 +484,7 @@ describe('commands/scraper', ()=>{ expect.stringMatching( /\/dca\/trigger_immediate\?collector=c_abc/), {url: 'https://x.com/p/1'}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.print).toHaveBeenCalledWith( {title: 'hello'}, diff --git a/src/__tests__/utils/client.test.ts b/src/__tests__/utils/client.test.ts new file mode 100644 index 0000000..d9e5e73 --- /dev/null +++ b/src/__tests__/utils/client.test.ts @@ -0,0 +1,79 @@ +import {describe, it, expect} from 'vitest'; +import {pick_hint, ERROR_HINTS, type Body_hint} from '../../utils/client'; + +// pick_hint is the layer the per-command hint lists plug into. +// The client itself ships ZERO command-specific patterns — every test +// here uses generic mock patterns so the contract is clear. + +const MOCK_HINTS: Body_hint[] = [ + {pattern: /quota.*exceeded/i, + hint: 'Your monthly quota is exhausted. Top up in the dashboard.'}, + {pattern: /not allowed in region/i, + hint: 'This API is geo-restricted. Use --country to override.'}, +]; + +describe('utils/client.pick_hint', ()=>{ + it('returns the status-based hint when no extra hints are passed', ()=>{ + expect(pick_hint(401, 'anything')).toBe(ERROR_HINTS[401]); + expect(pick_hint(403, 'anything')).toBe(ERROR_HINTS[403]); + expect(pick_hint(404, 'anything')).toBe(ERROR_HINTS[404]); + expect(pick_hint(429, 'anything')).toBe(ERROR_HINTS[429]); + }); + + it('returns the status-based hint when extra hints are passed ' + +'but nothing matches the body', ()=>{ + expect(pick_hint(403, 'plain unauthorized message', MOCK_HINTS)) + .toBe(ERROR_HINTS[403]); + }); + + it('overrides with the first matching extra hint', ()=>{ + expect(pick_hint(429, 'monthly quota exceeded', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + expect(pick_hint(403, 'not allowed in region: CN', MOCK_HINTS)) + .toMatch(/geo-restricted/); + }); + + it('is case-insensitive on the caller\'s pattern', ()=>{ + expect(pick_hint(429, 'MONTHLY QUOTA EXCEEDED', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + }); + + it('respects first-match-wins order in the caller\'s list', ()=>{ + const ordered: Body_hint[] = [ + {pattern: /generic/, hint: 'first wins'}, + {pattern: /generic/, hint: 'second loses'}, + ]; + expect(pick_hint(500, 'generic error', ordered)) + .toBe('first wins'); + }); + + it('returns undefined for unknown statuses with no body match', ()=>{ + expect(pick_hint(418, 'teapot')).toBeUndefined(); + expect(pick_hint(599, 'mystery')).toBeUndefined(); + }); + + it('does not consult ERROR_HINTS when an extra-hint pattern ' + +'matches', ()=>{ + // The override should win even when the generic status hint + // would also be applicable. + const matches_403: Body_hint[] = [ + {pattern: /custom 403 case/i, + hint: 'override hint, not the zone-permissions one'}, + ]; + expect(pick_hint(403, 'this is a custom 403 case', matches_403)) + .toBe('override hint, not the zone-permissions one'); + }); + + it('does not leak any scraper-studio vocabulary in the default ' + +'hints', ()=>{ + // The shared client must stay generic. Catches accidental + // regressions where someone adds a domain-specific hint here. + for (const hint of Object.values(ERROR_HINTS)) + { + expect(hint).not.toMatch(/scraper create/i); + expect(hint).not.toMatch(/collector/i); + expect(hint).not.toMatch(/AI[- ]Flow/i); + expect(hint).not.toMatch(/automate_template/i); + } + }); +}); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..9129cf6 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -1,5 +1,5 @@ import {Command} from 'commander'; -import {post, get} from '../utils/client'; +import {post, get, type Body_hint} from '../utils/client'; import {load as load_config} from '../utils/config'; import {ensure_authenticated} from '../utils/auth'; import {start as start_spinner} from '../utils/spinner'; @@ -18,6 +18,37 @@ import type { Batch_trigger_response, } from '../types/scraper'; +// AI Scraper Studio specific error-body hints. Lives with the command +// that owns the DCA endpoints — NOT in the shared HTTP client — so the +// AI-Flow vocabulary doesn't leak into `scrape`, `search`, `discover`, +// `pipelines`, or `browser`. +// +// Both patterns describe the same incident from two perspectives: +// * The 429 fires during `scraper create` when too many AI-Flow jobs +// are in flight at once (undocumented 3-job cap as of v0.2.0). +// * The 403 "Collector does not have a template" fires later when +// the user tries to `scraper run` against the half-built +// collector_id the failed create left behind. +// +// First match wins. Add more patterns here as we learn about other +// scraper-API error bodies that deserve a better hint. +const SCRAPER_BODY_HINTS: Body_hint[] = [ + { + pattern: /collector does not have a template/i, + hint: 'AI generation has not completed for this collector. ' + +'Re-run `bdata scraper create` (the previous attempt may ' + +'have hit the AI-Flow parallel-job cap), or open ' + +'https://brightdata.com/cp/scrapers to inspect / finish ' + +'the half-built collector in the web UI.', + }, + { + pattern: /cannot run more than \d+ jobs in parallel/i, + hint: 'You hit the AI-Flow concurrent-job cap. Wait for an ' + +'in-flight `scraper create` to finish, or serialise ' + +'your launches.', + }, +]; + const CREATE_TEMPLATE_ENDPOINT = '/dca/collector'; const AI_TRIGGER_PATH = 'automate_template'; const AI_PROGRESS_PATH = 'automate_template/progress'; @@ -105,7 +136,7 @@ const handle_create_scraper = async( api_key, CREATE_TEMPLATE_ENDPOINT, template_body, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); create_spinner.stop(); if (!template.id) @@ -129,7 +160,7 @@ const handle_create_scraper = async( api_key, `/dca/collectors/${collector_id}/${AI_TRIGGER_PATH}`, build_ai_request(url, description), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); } catch(e) { @@ -148,7 +179,7 @@ const handle_create_scraper = async( fetch_once: ()=>get( api_key, `/dca/collectors/${collector_id}/${AI_PROGRESS_PATH}`, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ), get_status: extract_progress_status, running_statuses: [RUNNING_SENTINEL], @@ -338,7 +369,7 @@ const run_batch = async( api_key, `${BATCH_TRIGGER_ENDPOINT}?${query}`, [build_run_request(url)], - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.collection_id) @@ -474,7 +505,7 @@ const handle_run_scraper = async( api_key, `${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`, build_run_request(url), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.response_id) @@ -592,4 +623,5 @@ export { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, }; diff --git a/src/utils/client.ts b/src/utils/client.ts index 6930232..1042882 100644 --- a/src/utils/client.ts +++ b/src/utils/client.ts @@ -11,12 +11,35 @@ const ERROR_HINTS: Record = { 429: 'Rate limit exceeded. Wait a moment and try again.', }; +// Each command can supply its own ordered list of body-pattern overrides +// via Request_opts.hints. The shared HTTP layer knows nothing about +// per-command error semantics — it just consults the caller's hints +// before falling back to the generic ERROR_HINTS map. +type Body_hint = {pattern: RegExp; hint: string}; + +const pick_hint = ( + status: number, + body: string, + extra_hints?: Body_hint[] +): string|undefined=>{ + if (extra_hints) + { + for (const {pattern, hint} of extra_hints) + { + if (pattern.test(body)) + return hint; + } + } + return ERROR_HINTS[status]; +}; + type Request_opts = { method?: string; body?: unknown; headers?: Record; timing?: boolean; raw_buffer?: boolean; + hints?: Body_hint[]; }; type Api_error = { @@ -27,10 +50,14 @@ type Api_error = { const sleep = (ms: number)=>new Promise(resolve=>setTimeout(resolve, ms)); -const format_error = (status: number, detail: string): Api_error=>({ +const format_error = ( + status: number, + detail: string, + extra_hints?: Body_hint[] +): Api_error=>({ status, message: detail, - hint: ERROR_HINTS[status], + hint: pick_hint(status, detail, extra_hints), }); const request = async( @@ -93,7 +120,7 @@ const request = async( if (err_body) detail = err_body; } catch(_e) {} - const api_err = format_error(res.status, detail); + const api_err = format_error(res.status, detail, opts.hints); const msg = [ `Error: ${api_err.message}`, ` Status: ${api_err.status}`, @@ -133,5 +160,5 @@ const get = ( opts: Omit = {} ): Promise=>request(api_key, endpoint, {method: 'GET', ...opts}); -export {request, post, get}; -export type {Request_opts, Api_error}; +export {request, post, get, pick_hint, ERROR_HINTS}; +export type {Request_opts, Api_error, Body_hint};