diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..f7a1101 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -58,6 +58,7 @@ import { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -68,6 +69,85 @@ describe('commands/scraper', ()=>{ mocks.start.mockReturnValue({stop: mocks.stop}); }); + // PR-12: AI Scraper Studio specific hints live with this command, + // not in the shared HTTP client. These tests assert (a) the + // patterns are well-formed and (b) they actually travel to the + // HTTP layer on every API call this command makes. + describe('SCRAPER_BODY_HINTS', ()=>{ + it('contains a hint for the stub-collector 403 response body', + ()=>{ + const stub_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Collector does not have a template')); + expect(stub_hint).toBeDefined(); + expect(stub_hint!.hint).toMatch(/AI generation has not completed/); + expect(stub_hint!.hint).toMatch(/bdata scraper create/); + // Must NOT echo the misleading generic 403 hint. + expect(stub_hint!.hint).not.toMatch(/zone permissions/); + }); + + it('contains a hint for the parallel-job cap 429 response body', + ()=>{ + const cap_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Cannot run more than 3 jobs in parallel')); + expect(cap_hint).toBeDefined(); + expect(cap_hint!.hint).toMatch(/concurrent-job cap/); + expect(cap_hint!.hint).toMatch(/serialise/); + }); + + it('cap pattern is case-insensitive and number-agnostic ' + +'(future-proof if the cap changes)', ()=>{ + const cap_pattern = SCRAPER_BODY_HINTS[1].pattern; + expect(cap_pattern.test('cannot run more than 5 jobs ' + +'in parallel')).toBe(true); + expect(cap_pattern.test('CANNOT RUN MORE THAN 100 JOBS ' + +'IN PARALLEL')).toBe(true); + }); + + it('is passed via `hints` opt to client.post on every ' + +'AI-Flow call in handle_create_scraper', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'cli-scraper-1'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await handle_create_scraper( + 'https://example.com/p/1', + 'extract title', + {} + ); + // Both `post` calls (template create + AI trigger) must + // include the scraper hints so any 4xx is translated + // with the right vocabulary. + for (const call of mocks.post.mock.calls) + { + const opts = call[3] as {hints?: unknown}; + expect(opts).toBeDefined(); + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + } + }); + + it('is passed via `hints` opt to client.post on the ' + +'trigger_immediate call in handle_run_scraper', async()=>{ + mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'}); + const fetch_spy = vi.spyOn(global, 'fetch') + .mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve('{"title":"ok"}'), + } as unknown as Response)); + mocks.poll_until.mockImplementation(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1}; + }); + await handle_run_scraper('c_abc', 'https://x.com/p/1', {}); + const opts = mocks.post.mock.calls[0][3] as {hints?: unknown}; + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + fetch_spy.mockRestore(); + }); + }); + describe('build_template_request', ()=>{ it('uses auto-generated name and stub webhook by default', ()=>{ const before = Math.floor(Date.now()/1000); @@ -176,7 +256,7 @@ describe('commands/scraper', ()=>{ expect.objectContaining({ deliver: expect.objectContaining({type: 'webhook'}), }), - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.post).toHaveBeenNthCalledWith( 2, @@ -184,7 +264,7 @@ describe('commands/scraper', ()=>{ '/dca/collectors/c_abc/automate_template', {description: 'extract title', urls: ['https://example.com/p/1']}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.poll_until).toHaveBeenCalledTimes(1); expect(mocks.poll_until).toHaveBeenCalledWith( @@ -404,7 +484,7 @@ describe('commands/scraper', ()=>{ expect.stringMatching( /\/dca\/trigger_immediate\?collector=c_abc/), {url: 'https://x.com/p/1'}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.print).toHaveBeenCalledWith( {title: 'hello'}, diff --git a/src/__tests__/utils/client.test.ts b/src/__tests__/utils/client.test.ts new file mode 100644 index 0000000..d9e5e73 --- /dev/null +++ b/src/__tests__/utils/client.test.ts @@ -0,0 +1,79 @@ +import {describe, it, expect} from 'vitest'; +import {pick_hint, ERROR_HINTS, type Body_hint} from '../../utils/client'; + +// pick_hint is the layer the per-command hint lists plug into. +// The client itself ships ZERO command-specific patterns — every test +// here uses generic mock patterns so the contract is clear. + +const MOCK_HINTS: Body_hint[] = [ + {pattern: /quota.*exceeded/i, + hint: 'Your monthly quota is exhausted. Top up in the dashboard.'}, + {pattern: /not allowed in region/i, + hint: 'This API is geo-restricted. Use --country to override.'}, +]; + +describe('utils/client.pick_hint', ()=>{ + it('returns the status-based hint when no extra hints are passed', ()=>{ + expect(pick_hint(401, 'anything')).toBe(ERROR_HINTS[401]); + expect(pick_hint(403, 'anything')).toBe(ERROR_HINTS[403]); + expect(pick_hint(404, 'anything')).toBe(ERROR_HINTS[404]); + expect(pick_hint(429, 'anything')).toBe(ERROR_HINTS[429]); + }); + + it('returns the status-based hint when extra hints are passed ' + +'but nothing matches the body', ()=>{ + expect(pick_hint(403, 'plain unauthorized message', MOCK_HINTS)) + .toBe(ERROR_HINTS[403]); + }); + + it('overrides with the first matching extra hint', ()=>{ + expect(pick_hint(429, 'monthly quota exceeded', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + expect(pick_hint(403, 'not allowed in region: CN', MOCK_HINTS)) + .toMatch(/geo-restricted/); + }); + + it('is case-insensitive on the caller\'s pattern', ()=>{ + expect(pick_hint(429, 'MONTHLY QUOTA EXCEEDED', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + }); + + it('respects first-match-wins order in the caller\'s list', ()=>{ + const ordered: Body_hint[] = [ + {pattern: /generic/, hint: 'first wins'}, + {pattern: /generic/, hint: 'second loses'}, + ]; + expect(pick_hint(500, 'generic error', ordered)) + .toBe('first wins'); + }); + + it('returns undefined for unknown statuses with no body match', ()=>{ + expect(pick_hint(418, 'teapot')).toBeUndefined(); + expect(pick_hint(599, 'mystery')).toBeUndefined(); + }); + + it('does not consult ERROR_HINTS when an extra-hint pattern ' + +'matches', ()=>{ + // The override should win even when the generic status hint + // would also be applicable. + const matches_403: Body_hint[] = [ + {pattern: /custom 403 case/i, + hint: 'override hint, not the zone-permissions one'}, + ]; + expect(pick_hint(403, 'this is a custom 403 case', matches_403)) + .toBe('override hint, not the zone-permissions one'); + }); + + it('does not leak any scraper-studio vocabulary in the default ' + +'hints', ()=>{ + // The shared client must stay generic. Catches accidental + // regressions where someone adds a domain-specific hint here. + for (const hint of Object.values(ERROR_HINTS)) + { + expect(hint).not.toMatch(/scraper create/i); + expect(hint).not.toMatch(/collector/i); + expect(hint).not.toMatch(/AI[- ]Flow/i); + expect(hint).not.toMatch(/automate_template/i); + } + }); +}); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..9129cf6 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -1,5 +1,5 @@ import {Command} from 'commander'; -import {post, get} from '../utils/client'; +import {post, get, type Body_hint} from '../utils/client'; import {load as load_config} from '../utils/config'; import {ensure_authenticated} from '../utils/auth'; import {start as start_spinner} from '../utils/spinner'; @@ -18,6 +18,37 @@ import type { Batch_trigger_response, } from '../types/scraper'; +// AI Scraper Studio specific error-body hints. Lives with the command +// that owns the DCA endpoints — NOT in the shared HTTP client — so the +// AI-Flow vocabulary doesn't leak into `scrape`, `search`, `discover`, +// `pipelines`, or `browser`. +// +// Both patterns describe the same incident from two perspectives: +// * The 429 fires during `scraper create` when too many AI-Flow jobs +// are in flight at once (undocumented 3-job cap as of v0.2.0). +// * The 403 "Collector does not have a template" fires later when +// the user tries to `scraper run` against the half-built +// collector_id the failed create left behind. +// +// First match wins. Add more patterns here as we learn about other +// scraper-API error bodies that deserve a better hint. +const SCRAPER_BODY_HINTS: Body_hint[] = [ + { + pattern: /collector does not have a template/i, + hint: 'AI generation has not completed for this collector. ' + +'Re-run `bdata scraper create` (the previous attempt may ' + +'have hit the AI-Flow parallel-job cap), or open ' + +'https://brightdata.com/cp/scrapers to inspect / finish ' + +'the half-built collector in the web UI.', + }, + { + pattern: /cannot run more than \d+ jobs in parallel/i, + hint: 'You hit the AI-Flow concurrent-job cap. Wait for an ' + +'in-flight `scraper create` to finish, or serialise ' + +'your launches.', + }, +]; + const CREATE_TEMPLATE_ENDPOINT = '/dca/collector'; const AI_TRIGGER_PATH = 'automate_template'; const AI_PROGRESS_PATH = 'automate_template/progress'; @@ -105,7 +136,7 @@ const handle_create_scraper = async( api_key, CREATE_TEMPLATE_ENDPOINT, template_body, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); create_spinner.stop(); if (!template.id) @@ -129,7 +160,7 @@ const handle_create_scraper = async( api_key, `/dca/collectors/${collector_id}/${AI_TRIGGER_PATH}`, build_ai_request(url, description), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); } catch(e) { @@ -148,7 +179,7 @@ const handle_create_scraper = async( fetch_once: ()=>get( api_key, `/dca/collectors/${collector_id}/${AI_PROGRESS_PATH}`, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ), get_status: extract_progress_status, running_statuses: [RUNNING_SENTINEL], @@ -338,7 +369,7 @@ const run_batch = async( api_key, `${BATCH_TRIGGER_ENDPOINT}?${query}`, [build_run_request(url)], - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.collection_id) @@ -474,7 +505,7 @@ const handle_run_scraper = async( api_key, `${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`, build_run_request(url), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.response_id) @@ -592,4 +623,5 @@ export { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, }; diff --git a/src/utils/client.ts b/src/utils/client.ts index 6930232..1042882 100644 --- a/src/utils/client.ts +++ b/src/utils/client.ts @@ -11,12 +11,35 @@ const ERROR_HINTS: Record = { 429: 'Rate limit exceeded. Wait a moment and try again.', }; +// Each command can supply its own ordered list of body-pattern overrides +// via Request_opts.hints. The shared HTTP layer knows nothing about +// per-command error semantics — it just consults the caller's hints +// before falling back to the generic ERROR_HINTS map. +type Body_hint = {pattern: RegExp; hint: string}; + +const pick_hint = ( + status: number, + body: string, + extra_hints?: Body_hint[] +): string|undefined=>{ + if (extra_hints) + { + for (const {pattern, hint} of extra_hints) + { + if (pattern.test(body)) + return hint; + } + } + return ERROR_HINTS[status]; +}; + type Request_opts = { method?: string; body?: unknown; headers?: Record; timing?: boolean; raw_buffer?: boolean; + hints?: Body_hint[]; }; type Api_error = { @@ -27,10 +50,14 @@ type Api_error = { const sleep = (ms: number)=>new Promise(resolve=>setTimeout(resolve, ms)); -const format_error = (status: number, detail: string): Api_error=>({ +const format_error = ( + status: number, + detail: string, + extra_hints?: Body_hint[] +): Api_error=>({ status, message: detail, - hint: ERROR_HINTS[status], + hint: pick_hint(status, detail, extra_hints), }); const request = async( @@ -93,7 +120,7 @@ const request = async( if (err_body) detail = err_body; } catch(_e) {} - const api_err = format_error(res.status, detail); + const api_err = format_error(res.status, detail, opts.hints); const msg = [ `Error: ${api_err.message}`, ` Status: ${api_err.status}`, @@ -133,5 +160,5 @@ const get = ( opts: Omit = {} ): Promise=>request(api_key, endpoint, {method: 'GET', ...opts}); -export {request, post, get}; -export type {Request_opts, Api_error}; +export {request, post, get, pick_hint, ERROR_HINTS}; +export type {Request_opts, Api_error, Body_hint};