feat: Cap schema inference depth, drop raw schema from get-dataset

claude · claude · commit 7efe27770cf2 · 2026-06-11T22:59:23.000Z
Calibration decision from #882 (probe: 10 top store Actors): - generateSchemaFromItems collapses objects/arrays at nesting depth 3 or deeper (item root = depth 0) to a bare type; deep social-media items (Facebook posts: 15.3 KB) now fit the ~2K-token budget. - get-dataset stops returning the raw Apify dataset.schema (93–95% of response bytes on top Actors, 23–39% phantom fields); nextStep now also points to get-dataset-schema for item structure. Note for apify-mcp-server-internal: get-dataset structuredContent no longer carries the schema key. https://claude.ai/code/session_01Sf9wACoa9h9y2m2WZ2Sde5
diff --git a/src/tools/common/get_dataset.ts b/src/tools/common/get_dataset.ts
@@ -22,8 +22,9 @@ export const getDataset: ToolEntry = Object.freeze({
     name: HelperTools.DATASET_GET,
     description: dedent`
         Get metadata for a dataset (collection of structured data created by an Actor run).
-        The results will include dataset details such as itemCount, schema, fields, and stats.
+        The results will include dataset details such as itemCount, fields, and stats.
         Use fields to understand structure for filtering with ${HelperTools.DATASET_GET_ITEMS}.
+        For a JSON schema of the item structure, use ${HelperTools.DATASET_SCHEMA_GET}.
         Note: itemCount updates may be delayed by up to ~5 seconds.
 
         USAGE:
@@ -51,17 +52,23 @@ export const getDataset: ToolEntry = Object.freeze({
         if (!dataset) {
             return buildStorageNotFound(`Dataset '${datasetId}' not found.`);
         }
+        // The API also returns a raw `schema` (untyped in apify-client). It is 93–95% of the
+        // response bytes on top store Actors and declares fields that may be absent from the
+        // data, so drop it — get-dataset-schema infers a compact schema from real items (#882).
+        const { schema, ...metadata } = dataset as typeof dataset & { schema?: unknown };
         // Apify returns `fields` slash-separated AND with array indices expanded
         // (e.g. `latestComments/0/owner/username`). For a real Instagram-scraper
         // dataset this inflates ~78 schema fields into 528 paths (~85% bloat) and
         // produces slash-notation paths that aren't directly usable as projection
         // hints for `get-dataset-items` (which expects dot-notation). Run the same
         // normalization `buildRunDataset` applies so this tool's `fields` matches
         // the structured `storages.datasets.default.fields` shape.
-        const normalized = dataset.fields ? { ...dataset, fields: normalizeDatasetFields(dataset.fields) } : dataset;
+        const normalized = metadata.fields
+            ? { ...metadata, fields: normalizeDatasetFields(metadata.fields) }
+            : metadata;
         const fieldCount = Array.isArray(normalized.fields) ? normalized.fields.length : undefined;
         const summary = `Dataset '${normalized.name ?? datasetId}' has ${normalized.itemCount ?? 0} items${fieldCount !== undefined ? `, ${fieldCount} fields` : ''}.`;
-        const nextStep = `Use ${HelperTools.DATASET_GET_ITEMS} with datasetId=${datasetId} and limit (for example 20) to fetch items.`;
+        const nextStep = `Use ${HelperTools.DATASET_GET_ITEMS} with datasetId=${datasetId} and limit (for example 20) to fetch items, or ${HelperTools.DATASET_SCHEMA_GET} to infer item structure.`;
         return buildStorageResponse({
             structuredContent: normalized as unknown as Record<string, unknown>,
             summary,
diff --git a/src/tools/common/get_dataset_schema.ts b/src/tools/common/get_dataset_schema.ts
@@ -8,7 +8,7 @@ import { compileSchema } from '../../utils/ajv.js';
 import { stripQuoteWrappers } from '../../utils/generic.js';
 import { getHttpStatusCode } from '../../utils/logging.js';
 import { buildMCPResponse } from '../../utils/mcp.js';
-import { generateSchemaFromItems } from '../../utils/schema_generation.js';
+import { DEFAULT_MAX_SCHEMA_DEPTH, generateSchemaFromItems } from '../../utils/schema_generation.js';
 import { datasetSchemaOutputSchema } from '../structured_output_schemas.js';
 import { buildStorageNotFound, buildStorageResponse } from './storage_helpers.js';
 
@@ -36,6 +36,7 @@ export const getDatasetSchema: ToolEntry = Object.freeze({
         Generate a JSON schema from a sample of dataset items.
         The schema describes the structure of the data and can be used for validation, documentation, or processing.
         Use this to understand the dataset before fetching many items.
+        Nesting is described up to ${DEFAULT_MAX_SCHEMA_DEPTH} levels deep; deeper objects/arrays appear as a bare type.
 
         USAGE:
         - Use when you need to infer the structure of dataset items for downstream processing or validation.
diff --git a/src/tools/structured_output_schemas.ts b/src/tools/structured_output_schemas.ts
@@ -541,7 +541,8 @@ export const datasetItemsOutputSchema = {
 
 /**
  * Schema for dataset metadata (get-dataset). Documents the fields the LLM acts on; the raw API
- * response carries more keys (stats, schema, access settings), allowed as additional properties.
+ * response carries more keys (stats, access settings), allowed as additional properties.
+ * The raw `schema` key is stripped by the tool — get-dataset-schema owns schema output (#882).
  */
 export const datasetMetadataOutputSchema = {
     type: 'object' as const,
diff --git a/src/utils/schema_generation.ts b/src/utils/schema_generation.ts
@@ -19,8 +19,15 @@ export type SchemaGenerationOptions = {
     limit?: number;
     /** If true, strips empty arrays from items before inference. Default is true. */
     clean?: boolean;
+    /**
+     * Depth cap for schema inference, counting the item root as depth 0: objects/arrays at this
+     * depth or deeper collapse to a bare `{ type }`. Caps token cost on deep items (#882). Default is 3.
+     */
+    maxDepth?: number;
 };
 
+export const DEFAULT_MAX_SCHEMA_DEPTH = 3;
+
 /**
  * Local counterpart to the dataset API's `clean=true` — empty arrays carry no schema info.
  * Strips only empty arrays; keeps null / '' / empty objects so schema inference still sees those fields.
@@ -101,23 +108,25 @@ function inferType(value: unknown): JsonSchemaPrimitiveType {
     return 'object';
 }
 
-function inferSchema(value: unknown): JsonSchemaProperty {
+function inferSchema(value: unknown, depth: number, maxDepth: number): JsonSchemaProperty {
     const type = inferType(value);
 
     if (type === 'object') {
+        if (depth >= maxDepth) return { type: 'object' };
         const entries = Object.entries(value as Record<string, unknown>);
         if (entries.length === 0) return { type: 'object' };
         const properties: Record<string, JsonSchemaProperty> = {};
         for (const [k, v] of entries) {
-            properties[k] = inferSchema(v);
+            properties[k] = inferSchema(v, depth + 1, maxDepth);
         }
         return { type: 'object', properties };
     }
 
     if (type === 'array') {
+        if (depth >= maxDepth) return { type: 'array' };
         const arr = value as unknown[];
         if (arr.length === 0) return { type: 'array' };
-        const merged = arr.map(inferSchema).reduce(mergeSchemas);
+        const merged = arr.map((v) => inferSchema(v, depth + 1, maxDepth)).reduce(mergeSchemas);
         return { type: 'array', items: merged };
     }
 
@@ -182,14 +191,14 @@ export function generateSchemaFromItems(
     datasetItems: unknown[],
     options: SchemaGenerationOptions = {},
 ): JsonSchemaArray | null {
-    const { limit = 5, clean = true } = options;
+    const { limit = 5, clean = true, maxDepth = DEFAULT_MAX_SCHEMA_DEPTH } = options;
 
     const itemsToUse = datasetItems.slice(0, limit);
     if (itemsToUse.length === 0) return null;
 
     const processed = clean ? itemsToUse.map(cleanEmptyArrays) : itemsToUse;
 
-    const itemSchemas = processed.map(inferSchema);
+    const itemSchemas = processed.map((item) => inferSchema(item, 0, maxDepth));
     const merged = itemSchemas.reduce(mergeSchemas);
 
     return { type: 'array', items: merged };
diff --git a/tests/unit/schema_generation.test.ts b/tests/unit/schema_generation.test.ts
@@ -241,6 +241,52 @@ describe('generateSchemaFromItems — options', () => {
     });
 });
 
+describe('generateSchemaFromItems — depth cap', () => {
+    // Calibration probe (#882): unbounded recursion blew Facebook-posts schemas to ~15 KB
+    // via deep subtrees (`sharedPost`, `media`). Objects/arrays at depth >= maxDepth (item root = 0) collapse to a bare type.
+    it('collapses objects deeper than the default maxDepth to a bare object type', () => {
+        const result = generateSchemaFromItems([{ a: { b: { c: { d: 1 } } } }]);
+        const c = props(result)!.a?.properties?.b?.properties?.c;
+        expect(c?.type).toBe('object');
+        expect(c?.properties).toBeUndefined();
+    });
+
+    it('collapses arrays deeper than the default maxDepth to a bare array type', () => {
+        const result = generateSchemaFromItems([{ a: { b: { c: [1, 2] } } }]);
+        const c = props(result)!.a?.properties?.b?.properties?.c;
+        expect(c?.type).toBe('array');
+        expect(c?.items).toBeUndefined();
+    });
+
+    it('keeps everything above the cap fully described', () => {
+        const result = generateSchemaFromItems([{ a: { b: { s: 'x', n: 1 } } }]);
+        const b = props(result)!.a?.properties?.b;
+        expect(b?.properties?.s?.type).toBe('string');
+        expect(b?.properties?.n?.type).toBe('integer');
+    });
+
+    it('counts array nesting toward the depth', () => {
+        const result = generateSchemaFromItems([{ a: [{ b: { c: 1 } }] }]);
+        const b = props(result)!.a?.items?.properties?.b;
+        expect(b?.type).toBe('object');
+        expect(b?.properties).toBeUndefined();
+    });
+
+    it('respects a custom maxDepth', () => {
+        const result = generateSchemaFromItems([{ a: { b: 1 } }], { maxDepth: 1 });
+        const { a } = props(result)!;
+        expect(a?.type).toBe('object');
+        expect(a?.properties).toBeUndefined();
+    });
+
+    it('merges capped and uncapped schemas across items without resurrecting depth', () => {
+        const result = generateSchemaFromItems([{ a: { b: { c: { d: 1 } } } }, { a: { b: { c: { e: 'x' } } } }]);
+        const c = props(result)!.a?.properties?.b?.properties?.c;
+        expect(c?.type).toBe('object');
+        expect(c?.properties).toBeUndefined();
+    });
+});
+
 describe('generateSchemaFromItems — user-reported regression', () => {
     it('emits all four top-level keys from the NYC sushi dataset sample', () => {
         const items = [
diff --git a/tests/unit/tools.get_dataset.test.ts b/tests/unit/tools.get_dataset.test.ts
@@ -54,6 +54,29 @@ describe('get-dataset', () => {
         expect(content[0].text).toContain("Dataset 'missing' not found");
     });
 
+    it('strips the raw schema field from the response', async () => {
+        // Calibration probe (#882): raw `dataset.schema` was 93–95% of the response bytes on
+        // top store Actors and declares fields absent from the data. get-dataset-schema is
+        // the schema source; this tool returns metadata only.
+        const result = await (getDataset as HelperTool).call(
+            stubToolCallContext(
+                { datasetId: 'ds-1' },
+                stubApifyClient({ ...MOCK_DATASET, schema: { fields: {}, views: {} } }),
+            ),
+        );
+        const { structuredContent } = result as { structuredContent: Record<string, unknown> };
+        expect(structuredContent).not.toHaveProperty('schema');
+        expect(structuredContent).toMatchObject(MOCK_DATASET);
+    });
+
+    it('points nextStep at get-dataset-schema for structure inference', async () => {
+        const result = await (getDataset as HelperTool).call(
+            stubToolCallContext({ datasetId: 'ds-1' }, stubApifyClient(MOCK_DATASET)),
+        );
+        const { structuredContent } = result as { structuredContent: { nextStep: string } };
+        expect(structuredContent.nextStep).toContain(HelperTools.DATASET_SCHEMA_GET);
+    });
+
     it('rejects empty datasetId via ajv validation', () => {
         const tool = getDataset as HelperTool;
         expect(tool.ajvValidate({ datasetId: '' })).toBe(false);