From 6a9927df5d1874b1c153c200dcdb3fe3d47ede82 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 29 Jun 2026 05:05:39 +0000 Subject: [PATCH] feat: Drop raw schema from get-dataset, stop nudging get-dataset-schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calibration outcome for #882 (probe: 10 top store Actors; Mixpanel: the get-dataset-schema tool is rarely called): - get-dataset no longer returns the raw Apify dataset.schema (93–95% of response bytes on top Actors, 23–39% phantom fields). The flat `fields` list it already returns is the complete, projection-ready inventory. - The terminal get-dataset-items nextStep no longer nudges toward the context-heavy get-dataset-schema; it points at get-dataset for the field list instead (keeping the #1007 loaded-tool gating). - get-dataset-schema stays as an on-demand tool, unchanged. Note for apify-mcp-server-internal: get-dataset structuredContent no longer carries the schema key. https://claude.ai/code/session_01Sf9wACoa9h9y2m2WZ2Sde5 --- src/tools/storage/get_dataset.ts | 10 ++++++++-- src/tools/storage/storage_helpers.ts | 8 ++++---- src/tools/structured_output_schemas.ts | 3 ++- tests/unit/tools.get_dataset_items.test.ts | 4 ++-- tests/unit/tools.storage_helpers.test.ts | 12 ++++++------ 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/tools/storage/get_dataset.ts b/src/tools/storage/get_dataset.ts index b78038af..d2622e0a 100644 --- a/src/tools/storage/get_dataset.ts +++ b/src/tools/storage/get_dataset.ts @@ -25,7 +25,7 @@ export const getDataset: ToolEntry = Object.freeze({ title: 'Get dataset', description: dedent` Get metadata for a dataset (collection of structured data created by an Actor run). - The results will include dataset details such as itemCount, schema, fields, and stats. + The results will include dataset details such as itemCount, fields, and stats. Use fields to understand structure for filtering with ${HelperTools.DATASET_GET_ITEMS}. stats.inflatedBytes (when present) is the approximate uncompressed byte size — use it with itemCount to pick a safe limit and fields before fetching. Note: itemCount updates may be delayed by up to ~5 seconds. @@ -56,6 +56,10 @@ export const getDataset: ToolEntry = Object.freeze({ return buildStorageNotFound(`Dataset '${datasetId}' not found.`); } const linkContext = await getConsoleLinkContext(apifyToken, client); + // The API also returns a raw `schema` (untyped in apify-client). It is 93–95% of the + // response bytes on top store Actors and declares fields that may be absent from the + // data, so drop it — get-dataset-schema infers a compact schema from real items (#882). + const { schema, ...metadata } = dataset as typeof dataset & { schema?: unknown }; // Apify returns `fields` slash-separated AND with array indices expanded // (e.g. `latestComments/0/owner/username`). For a real Instagram-scraper // dataset this inflates ~78 schema fields into 528 paths (~85% bloat) and @@ -63,7 +67,9 @@ export const getDataset: ToolEntry = Object.freeze({ // hints for `get-dataset-items` (which expects dot-notation). Run the same // normalization `buildRunDataset` applies so this tool's `fields` matches // the structured `storages.datasets.default.fields` shape. - const normalized = dataset.fields ? { ...dataset, fields: normalizeDatasetFields(dataset.fields) } : dataset; + const normalized = metadata.fields + ? { ...metadata, fields: normalizeDatasetFields(metadata.fields) } + : metadata; const fieldCount = Array.isArray(normalized.fields) ? normalized.fields.length : undefined; // `inflatedBytes` is undeclared on the apify-client `DatasetStats` type and absent from the GET // response today (only the dataset-list endpoint returns it), so read it defensively. diff --git a/src/tools/storage/storage_helpers.ts b/src/tools/storage/storage_helpers.ts index 769dcd7e..d487d97a 100644 --- a/src/tools/storage/storage_helpers.ts +++ b/src/tools/storage/storage_helpers.ts @@ -95,14 +95,14 @@ export function buildStorageListSummaryNextStep(params: { /** * Pagination-aware {summary, nextStep}: when more items remain, point at the next page; - * otherwise point at get-dataset-schema for structure inspection. + * otherwise point at get-dataset for the field list (structure lives there, not in a heavy schema dump). */ export function buildDatasetItemsSummaryNextStep(params: { datasetId: string; itemCount: number; totalItemCount: number; offset: number; - /** Active loaded tool set; gates the terminal get-dataset-schema reference (see #1007). */ + /** Active loaded tool set; gates the terminal get-dataset reference (see #1007). */ loadedToolNames: string[]; }): { summary: string; nextStep: string } { const { datasetId, itemCount, totalItemCount, offset, loadedToolNames } = params; @@ -118,8 +118,8 @@ export function buildDatasetItemsSummaryNextStep(params: { : `Fetched ${itemCount} of ${totalItemCount} items (offset=${offset}); no more pages.`; return { summary, - nextStep: suggestTool(HelperTools.DATASET_SCHEMA_GET, loadedToolNames) - ? `Use ${HelperTools.DATASET_SCHEMA_GET} with datasetId=${datasetId} to inspect structure if needed.` + nextStep: suggestTool(HelperTools.DATASET_GET, loadedToolNames) + ? `Use ${HelperTools.DATASET_GET} with datasetId=${datasetId} to see the field list if you need the data structure.` : `No more pages. Inspect the returned items directly.`, }; } diff --git a/src/tools/structured_output_schemas.ts b/src/tools/structured_output_schemas.ts index 968e4125..e8dd0305 100644 --- a/src/tools/structured_output_schemas.ts +++ b/src/tools/structured_output_schemas.ts @@ -558,7 +558,8 @@ export const datasetItemsOutputSchema = { /** * Schema for dataset metadata (get-dataset). Documents the fields the LLM acts on; the raw API - * response carries more keys (stats, schema, access settings), allowed as additional properties. + * response carries more keys (stats, access settings), allowed as additional properties. + * The raw `schema` key is stripped by the tool — get-dataset-schema owns schema output (#882). */ export const datasetMetadataOutputSchema = { type: 'object' as const, diff --git a/tests/unit/tools.get_dataset_items.test.ts b/tests/unit/tools.get_dataset_items.test.ts index 8544e3ba..6277e6c1 100644 --- a/tests/unit/tools.get_dataset_items.test.ts +++ b/tests/unit/tools.get_dataset_items.test.ts @@ -208,7 +208,7 @@ describe('get-dataset-items', () => { ); }); - it('emits a last-page summary and a schema nextStep when all items are returned', async () => { + it('emits a last-page summary and a get-dataset nextStep when all items are returned', async () => { const result = await (getDatasetItems as HelperTool).call( stubToolCallContext({ datasetId: 'ds-1' }, stubApifyClient()), ); @@ -217,7 +217,7 @@ describe('get-dataset-items', () => { }; expect(structuredContent.summary).toBe('Fetched all 1 items.'); - expect(structuredContent.nextStep).toContain(HelperTools.DATASET_SCHEMA_GET); + expect(structuredContent.nextStep).toContain(HelperTools.DATASET_GET); expect(structuredContent.nextStep).toContain('datasetId=ds-1'); // summary + nextStep ship as a separate text block after the fenced data. expect(content[1].text).toBe(`${structuredContent.summary}\n${structuredContent.nextStep}`); diff --git a/tests/unit/tools.storage_helpers.test.ts b/tests/unit/tools.storage_helpers.test.ts index a7c1a2c1..d893fd86 100644 --- a/tests/unit/tools.storage_helpers.test.ts +++ b/tests/unit/tools.storage_helpers.test.ts @@ -21,19 +21,19 @@ describe('buildStorageNotFound()', () => { }); describe('buildDatasetItemsSummaryNextStep()', () => { - it('suggests get-dataset-schema on the terminal page when loaded', () => { + it('suggests get-dataset on the terminal page when loaded', () => { const t = buildDatasetItemsSummaryNextStep({ datasetId: 'ds-1', itemCount: 5, totalItemCount: 5, offset: 0, - loadedToolNames: [HelperTools.DATASET_SCHEMA_GET], + loadedToolNames: [HelperTools.DATASET_GET], }); - expect(t.nextStep).toContain(HelperTools.DATASET_SCHEMA_GET); + expect(t.nextStep).toContain(HelperTools.DATASET_GET); expect(t.nextStep).toContain('datasetId=ds-1'); }); - it('omits get-dataset-schema when not loaded', () => { + it('omits get-dataset when not loaded', () => { const t = buildDatasetItemsSummaryNextStep({ datasetId: 'ds-1', itemCount: 5, @@ -41,7 +41,7 @@ describe('buildDatasetItemsSummaryNextStep()', () => { offset: 0, loadedToolNames: [], }); - expect(t.nextStep).not.toContain(HelperTools.DATASET_SCHEMA_GET); + expect(t.nextStep).not.toContain(HelperTools.DATASET_GET); expect(t.nextStep).toContain('No more pages'); }); @@ -51,7 +51,7 @@ describe('buildDatasetItemsSummaryNextStep()', () => { itemCount: 20, totalItemCount: 100, offset: 0, - loadedToolNames: [HelperTools.DATASET_SCHEMA_GET], + loadedToolNames: [HelperTools.DATASET_GET], }); const unloaded = buildDatasetItemsSummaryNextStep({ datasetId: 'ds-1',