Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 83 additions & 3 deletions src/__tests__/commands/scraper.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ import {
parse_sync_timeout,
is_realtime_page_limit_error,
classify_dataset,
SCRAPER_BODY_HINTS,
} from '../../commands/scraper';

describe('commands/scraper', ()=>{
Expand All @@ -68,6 +69,85 @@ describe('commands/scraper', ()=>{
mocks.start.mockReturnValue({stop: mocks.stop});
});

// PR-12: AI Scraper Studio specific hints live with this command,
// not in the shared HTTP client. These tests assert (a) the
// patterns are well-formed and (b) they actually travel to the
// HTTP layer on every API call this command makes.
describe('SCRAPER_BODY_HINTS', ()=>{
it('contains a hint for the stub-collector 403 response body',
()=>{
const stub_hint = SCRAPER_BODY_HINTS.find(h=>
h.pattern.test('Collector does not have a template'));
expect(stub_hint).toBeDefined();
expect(stub_hint!.hint).toMatch(/AI generation has not completed/);
expect(stub_hint!.hint).toMatch(/bdata scraper create/);
// Must NOT echo the misleading generic 403 hint.
expect(stub_hint!.hint).not.toMatch(/zone permissions/);
});

it('contains a hint for the parallel-job cap 429 response body',
()=>{
const cap_hint = SCRAPER_BODY_HINTS.find(h=>
h.pattern.test('Cannot run more than 3 jobs in parallel'));
expect(cap_hint).toBeDefined();
expect(cap_hint!.hint).toMatch(/concurrent-job cap/);
expect(cap_hint!.hint).toMatch(/serialise/);
});

it('cap pattern is case-insensitive and number-agnostic '
+'(future-proof if the cap changes)', ()=>{
const cap_pattern = SCRAPER_BODY_HINTS[1].pattern;
expect(cap_pattern.test('cannot run more than 5 jobs '
+'in parallel')).toBe(true);
expect(cap_pattern.test('CANNOT RUN MORE THAN 100 JOBS '
+'IN PARALLEL')).toBe(true);
});

it('is passed via `hints` opt to client.post on every '
+'AI-Flow call in handle_create_scraper', async()=>{
mocks.post
.mockResolvedValueOnce({id: 'c_abc', name: 'cli-scraper-1'})
.mockResolvedValueOnce({id: 'ia_xyz', queued: false});
mocks.poll_until.mockResolvedValue({
result: {status: 'done', completed_steps: []},
attempts: 1,
});
await handle_create_scraper(
'https://example.com/p/1',
'extract title',
{}
);
// Both `post` calls (template create + AI trigger) must
// include the scraper hints so any 4xx is translated
// with the right vocabulary.
for (const call of mocks.post.mock.calls)
{
const opts = call[3] as {hints?: unknown};
expect(opts).toBeDefined();
expect(opts.hints).toBe(SCRAPER_BODY_HINTS);
}
});

it('is passed via `hints` opt to client.post on the '
+'trigger_immediate call in handle_run_scraper', async()=>{
mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'});
const fetch_spy = vi.spyOn(global, 'fetch')
.mockImplementation(()=>Promise.resolve({
status: 200,
text: ()=>Promise.resolve('{"title":"ok"}'),
} as unknown as Response));
mocks.poll_until.mockImplementation(async(o: never)=>{
const cfg = o as {fetch_once: ()=>Promise<unknown>};
const r = await cfg.fetch_once();
return {result: r, attempts: 1};
});
await handle_run_scraper('c_abc', 'https://x.com/p/1', {});
const opts = mocks.post.mock.calls[0][3] as {hints?: unknown};
expect(opts.hints).toBe(SCRAPER_BODY_HINTS);
fetch_spy.mockRestore();
});
});

describe('build_template_request', ()=>{
it('uses auto-generated name and stub webhook by default', ()=>{
const before = Math.floor(Date.now()/1000);
Expand Down Expand Up @@ -176,15 +256,15 @@ describe('commands/scraper', ()=>{
expect.objectContaining({
deliver: expect.objectContaining({type: 'webhook'}),
}),
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.post).toHaveBeenNthCalledWith(
2,
'api_key',
'/dca/collectors/c_abc/automate_template',
{description: 'extract title',
urls: ['https://example.com/p/1']},
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.poll_until).toHaveBeenCalledTimes(1);
expect(mocks.poll_until).toHaveBeenCalledWith(
Expand Down Expand Up @@ -404,7 +484,7 @@ describe('commands/scraper', ()=>{
expect.stringMatching(
/\/dca\/trigger_immediate\?collector=c_abc/),
{url: 'https://x.com/p/1'},
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.print).toHaveBeenCalledWith(
{title: 'hello'},
Expand Down
79 changes: 79 additions & 0 deletions src/__tests__/utils/client.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import {describe, it, expect} from 'vitest';
import {pick_hint, ERROR_HINTS, type Body_hint} from '../../utils/client';

// pick_hint is the layer the per-command hint lists plug into.
// The client itself ships ZERO command-specific patterns — every test
// here uses generic mock patterns so the contract is clear.

const MOCK_HINTS: Body_hint[] = [
{pattern: /quota.*exceeded/i,
hint: 'Your monthly quota is exhausted. Top up in the dashboard.'},
{pattern: /not allowed in region/i,
hint: 'This API is geo-restricted. Use --country to override.'},
];

describe('utils/client.pick_hint', ()=>{
it('returns the status-based hint when no extra hints are passed', ()=>{
expect(pick_hint(401, 'anything')).toBe(ERROR_HINTS[401]);
expect(pick_hint(403, 'anything')).toBe(ERROR_HINTS[403]);
expect(pick_hint(404, 'anything')).toBe(ERROR_HINTS[404]);
expect(pick_hint(429, 'anything')).toBe(ERROR_HINTS[429]);
});

it('returns the status-based hint when extra hints are passed '
+'but nothing matches the body', ()=>{
expect(pick_hint(403, 'plain unauthorized message', MOCK_HINTS))
.toBe(ERROR_HINTS[403]);
});

it('overrides with the first matching extra hint', ()=>{
expect(pick_hint(429, 'monthly quota exceeded', MOCK_HINTS))
.toMatch(/Top up in the dashboard/);
expect(pick_hint(403, 'not allowed in region: CN', MOCK_HINTS))
.toMatch(/geo-restricted/);
});

it('is case-insensitive on the caller\'s pattern', ()=>{
expect(pick_hint(429, 'MONTHLY QUOTA EXCEEDED', MOCK_HINTS))
.toMatch(/Top up in the dashboard/);
});

it('respects first-match-wins order in the caller\'s list', ()=>{
const ordered: Body_hint[] = [
{pattern: /generic/, hint: 'first wins'},
{pattern: /generic/, hint: 'second loses'},
];
expect(pick_hint(500, 'generic error', ordered))
.toBe('first wins');
});

it('returns undefined for unknown statuses with no body match', ()=>{
expect(pick_hint(418, 'teapot')).toBeUndefined();
expect(pick_hint(599, 'mystery')).toBeUndefined();
});

it('does not consult ERROR_HINTS when an extra-hint pattern '
+'matches', ()=>{
// The override should win even when the generic status hint
// would also be applicable.
const matches_403: Body_hint[] = [
{pattern: /custom 403 case/i,
hint: 'override hint, not the zone-permissions one'},
];
expect(pick_hint(403, 'this is a custom 403 case', matches_403))
.toBe('override hint, not the zone-permissions one');
});

it('does not leak any scraper-studio vocabulary in the default '
+'hints', ()=>{
// The shared client must stay generic. Catches accidental
// regressions where someone adds a domain-specific hint here.
for (const hint of Object.values(ERROR_HINTS))
{
expect(hint).not.toMatch(/scraper create/i);
expect(hint).not.toMatch(/collector/i);
expect(hint).not.toMatch(/AI[- ]Flow/i);
expect(hint).not.toMatch(/automate_template/i);
}
});
});
44 changes: 38 additions & 6 deletions src/commands/scraper.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Command} from 'commander';
import {post, get} from '../utils/client';
import {post, get, type Body_hint} from '../utils/client';
import {load as load_config} from '../utils/config';
import {ensure_authenticated} from '../utils/auth';
import {start as start_spinner} from '../utils/spinner';
Expand All @@ -18,6 +18,37 @@ import type {
Batch_trigger_response,
} from '../types/scraper';

// AI Scraper Studio specific error-body hints. Lives with the command
// that owns the DCA endpoints — NOT in the shared HTTP client — so the
// AI-Flow vocabulary doesn't leak into `scrape`, `search`, `discover`,
// `pipelines`, or `browser`.
//
// Both patterns describe the same incident from two perspectives:
// * The 429 fires during `scraper create` when too many AI-Flow jobs
// are in flight at once (undocumented 3-job cap as of v0.2.0).
// * The 403 "Collector does not have a template" fires later when
// the user tries to `scraper run` against the half-built
// collector_id the failed create left behind.
//
// First match wins. Add more patterns here as we learn about other
// scraper-API error bodies that deserve a better hint.
const SCRAPER_BODY_HINTS: Body_hint[] = [
{
pattern: /collector does not have a template/i,
hint: 'AI generation has not completed for this collector. '
+'Re-run `bdata scraper create` (the previous attempt may '
+'have hit the AI-Flow parallel-job cap), or open '
+'https://brightdata.com/cp/scrapers to inspect / finish '
+'the half-built collector in the web UI.',
},
{
pattern: /cannot run more than \d+ jobs in parallel/i,
hint: 'You hit the AI-Flow concurrent-job cap. Wait for an '
+'in-flight `scraper create` to finish, or serialise '
+'your launches.',
},
];

const CREATE_TEMPLATE_ENDPOINT = '/dca/collector';
const AI_TRIGGER_PATH = 'automate_template';
const AI_PROGRESS_PATH = 'automate_template/progress';
Expand Down Expand Up @@ -105,7 +136,7 @@ const handle_create_scraper = async(
api_key,
CREATE_TEMPLATE_ENDPOINT,
template_body,
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
create_spinner.stop();
if (!template.id)
Expand All @@ -129,7 +160,7 @@ const handle_create_scraper = async(
api_key,
`/dca/collectors/${collector_id}/${AI_TRIGGER_PATH}`,
build_ai_request(url, description),
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
} catch(e) {
Expand All @@ -148,7 +179,7 @@ const handle_create_scraper = async(
fetch_once: ()=>get<Ai_progress_response>(
api_key,
`/dca/collectors/${collector_id}/${AI_PROGRESS_PATH}`,
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
),
get_status: extract_progress_status,
running_statuses: [RUNNING_SENTINEL],
Expand Down Expand Up @@ -338,7 +369,7 @@ const run_batch = async(
api_key,
`${BATCH_TRIGGER_ENDPOINT}?${query}`,
[build_run_request(url)],
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
if (!trigger.collection_id)
Expand Down Expand Up @@ -474,7 +505,7 @@ const handle_run_scraper = async(
api_key,
`${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`,
build_run_request(url),
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
if (!trigger.response_id)
Expand Down Expand Up @@ -592,4 +623,5 @@ export {
parse_sync_timeout,
is_realtime_page_limit_error,
classify_dataset,
SCRAPER_BODY_HINTS,
};
37 changes: 32 additions & 5 deletions src/utils/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,35 @@ const ERROR_HINTS: Record<number, string> = {
429: 'Rate limit exceeded. Wait a moment and try again.',
};

// Each command can supply its own ordered list of body-pattern overrides
// via Request_opts.hints. The shared HTTP layer knows nothing about
// per-command error semantics — it just consults the caller's hints
// before falling back to the generic ERROR_HINTS map.
type Body_hint = {pattern: RegExp; hint: string};

const pick_hint = (
status: number,
body: string,
extra_hints?: Body_hint[]
): string|undefined=>{
if (extra_hints)
{
for (const {pattern, hint} of extra_hints)
{
if (pattern.test(body))
return hint;
}
}
return ERROR_HINTS[status];
};

type Request_opts = {
method?: string;
body?: unknown;
headers?: Record<string, string>;
timing?: boolean;
raw_buffer?: boolean;
hints?: Body_hint[];
};

type Api_error = {
Expand All @@ -27,10 +50,14 @@ type Api_error = {

const sleep = (ms: number)=>new Promise(resolve=>setTimeout(resolve, ms));

const format_error = (status: number, detail: string): Api_error=>({
const format_error = (
status: number,
detail: string,
extra_hints?: Body_hint[]
): Api_error=>({
status,
message: detail,
hint: ERROR_HINTS[status],
hint: pick_hint(status, detail, extra_hints),
});

const request = async<T = unknown>(
Expand Down Expand Up @@ -93,7 +120,7 @@ const request = async<T = unknown>(
if (err_body)
detail = err_body;
} catch(_e) {}
const api_err = format_error(res.status, detail);
const api_err = format_error(res.status, detail, opts.hints);
const msg = [
`Error: ${api_err.message}`,
` Status: ${api_err.status}`,
Expand Down Expand Up @@ -133,5 +160,5 @@ const get = <T = unknown>(
opts: Omit<Request_opts, 'method'> = {}
): Promise<T>=>request<T>(api_key, endpoint, {method: 'GET', ...opts});

export {request, post, get};
export type {Request_opts, Api_error};
export {request, post, get, pick_hint, ERROR_HINTS};
export type {Request_opts, Api_error, Body_hint};