apify · jmikitova · Jun 25, 2026
diff --git a/src/const.ts b/src/const.ts
@@ -22,6 +22,35 @@ export const KV_RECORD_MAX_INLINE_BYTES = 256 * 1024;
  */
 export const DATASET_SIZE_HINT_BYTES = 50000;
 
+/**
+ * Hard ceiling on the effective row count for `get-dataset-items`, applied to the requested `limit`
+ * and to the default alike. The model freely requests far more (real traces show up to 1226 items,
+ * blowing the context budget); we clamp before fetching so an over-asking call is served a bounded
+ * first page and then steered to paginate via `offset`. This is the cheap, pre-fetch first layer;
+ * {@link DATASET_ITEMS_MAX_BYTES} is the post-fetch backstop for pages whose items are individually huge.
+ */
+export const MAX_DATASET_ITEMS_LIMIT = 100;
+
+/**
+ * Byte cap on a single `get-dataset-items` response, measured on the TOON text in `content[0]` — which
+ * is exactly what the model receives. The chat wraps MCP tools with `schemas: "automatic"`, so the AI
+ * SDK serializes the result via `mcpToModelOutput`, forwarding only `content` (text/image) and dropping
+ * `structuredContent`. When exceeded, trailing items are dropped until the payload fits and the caller
+ * is told to page via `offset`; at least one item is always kept, so a single oversized item can still
+ * exceed the cap. Mirrors {@link DATASET_SIZE_HINT_BYTES} (~25k-token budget): soft hint, now a real cap.
+ */
+export const DATASET_ITEMS_MAX_BYTES = 50000;
+
+/**
+ * Inline cap (50 KiB) for text/JSON key-value-store records. Larger values are returned as a preview
+ * truncated to this many UTF-8 bytes, with a link to the full record in the summary. Parallels
+ * {@link KV_RECORD_MAX_INLINE_BYTES} (the 256 KB binary cap); text/JSON was previously inlined uncapped.
+ */
+export const KV_RECORD_MAX_INLINE_TEXT_BYTES = 50 * 1024;
+
+/** Per-snippet `content` char cap (truncation note excluded) for `search-apify-docs`; full page via `fetch-apify-docs`. */
+export const DOCS_SNIPPET_MAX_LENGTH = 1000;
+
 /** Shared steer appended to large-output hints so the model narrows instead of refetching everything. */
 export const NARROW_OUTPUT_HINT = 'narrow with fields= or page with offset';
 

diff --git a/src/tools/common/get_dataset_items.ts b/src/tools/common/get_dataset_items.ts
@@ -1,11 +1,18 @@
 import dedent from 'dedent';
 import { z } from 'zod';
 
-import { HelperTools, HTTP_NOT_FOUND } from '../../const.js';
+import {
+    DATASET_ITEMS_MAX_BYTES,
+    HelperTools,
+    HTTP_NOT_FOUND,
+    MAX_DATASET_ITEMS_LIMIT,
+    NARROW_OUTPUT_HINT,
+} from '../../const.js';
 import type { InternalToolArgs, ToolEntry, ToolInputSchema } from '../../types.js';
 import { TOOL_TYPE } from '../../types.js';
 import { compileSchema } from '../../utils/ajv.js';
 import { buildConsoleDatasetUrl, getConsoleLinkContext } from '../../utils/console_link.js';
+import { encodeToon } from '../../utils/encode_text.js';
 import { parseCommaSeparatedList, stripQuoteWrappers } from '../../utils/generic.js';
 import { getHttpStatusCode } from '../../utils/logging.js';
 import { datasetItemsOutputSchema } from '../structured_output_schemas.js';
@@ -25,6 +32,30 @@ export function extractDotPrefixes(fields: string[]): string[] {
     return [...prefixes];
 }
 
+/**
+ * Largest prefix length of `items` whose encoded `structuredContent` stays within
+ * {@link DATASET_ITEMS_MAX_BYTES}. Size is the UTF-8 byte length of the TOON text (`content[0]`), which
+ * is what the model receives: the chat wraps MCP tools with `schemas: "automatic"`, so the AI SDK
+ * serializes the result via `mcpToModelOutput`, forwarding only `content` and dropping
+ * `structuredContent`. Binary-searches the prefix (assuming size grows with the prefix), so encoding
+ * runs O(log n) times. Returns at least 1 when any item exists, so an oversized single item is still
+ * surfaced (with the narrow-output hint) rather than returning an empty page.
+ */
+function maxItemsWithinByteCap<T>(items: T[], buildStructuredContent: (items: T[]) => Record<string, unknown>): number {
+    const encodedBytes = (n: number): number =>
+        Buffer.byteLength(encodeToon(buildStructuredContent(items.slice(0, n))));
+    if (items.length === 0 || encodedBytes(items.length) <= DATASET_ITEMS_MAX_BYTES) return items.length;
+
+    let lo = 1; // keep at least one item even if it alone exceeds the cap
+    let hi = items.length - 1; // the full page is already known to exceed the cap — never re-encode it
+    while (lo < hi) {
+        const mid = Math.ceil((lo + hi) / 2); // round up so `lo = mid` always makes progress
+        if (encodedBytes(mid) <= DATASET_ITEMS_MAX_BYTES) lo = mid;
+        else hi = mid - 1;
+    }
+    return lo;
+}
+
 const getDatasetItemsArgs = z.object({
     datasetId: z.string().min(1).describe('Dataset ID or username~dataset-name.'),
     clean: z
@@ -99,7 +130,9 @@ export const getDatasetItems: ToolEntry = Object.freeze({
         const flatten =
             parsed.flatten !== undefined ? parseCommaSeparatedList(parsed.flatten) : extractDotPrefixes(fields);
 
-        const effectiveLimit = parsed.limit ?? DEFAULT_DATASET_ITEMS_LIMIT;
+        // Layer 1: clamp the requested count before fetching — the model freely asks for far more
+        // (real traces: up to 1226), so bound what we transfer and let pagination serve the rest.
+        const effectiveLimit = Math.min(parsed.limit ?? DEFAULT_DATASET_ITEMS_LIMIT, MAX_DATASET_ITEMS_LIMIT);
         const datasetId = stripQuoteWrappers(parsed.datasetId);
         // `dataset(id).listItems()` throws ApifyApiError on a missing dataset
         // instead of returning undefined (only `.get()` and `.getStatistics()`
@@ -127,23 +160,44 @@ export const getDatasetItems: ToolEntry = Object.freeze({
 
         const offset = parsed.offset ?? 0;
         const apifyConsoleUrl = buildConsoleDatasetUrl(await getConsoleLinkContext(apifyToken, client), datasetId);
-        const structuredContent = {
+        const buildStructuredContent = (items: typeof v.items): Record<string, unknown> => ({
             datasetId,
             apifyConsoleUrl,
-            items: v.items,
-            itemCount: v.items.length,
+            items,
+            itemCount: items.length,
             totalItemCount: v.total,
             offset,
             limit: effectiveLimit,
-        };
+        });
+
+        // Layer 2: byte-cap the encoded response — catches the case where even the clamped page is huge
+        // (a few large items). Drop trailing items until the on-the-wire payload fits; pagination serves
+        // the rest. `totalItemCount` stays the dataset total so the next-step offset math is exact.
+        const keep = maxItemsWithinByteCap(v.items, buildStructuredContent);
+        const items = keep < v.items.length ? v.items.slice(0, keep) : v.items;
+        const truncatedByBytes = keep < v.items.length;
+        const structuredContent = buildStructuredContent(items);
 
         const { summary, nextStep } = buildDatasetItemsSummaryNextStep({
             datasetId,
-            itemCount: v.items.length,
+            // Use the actually-returned count, not the requested limit, so the next page resumes
+            // exactly where this one ended and no items are skipped.
+            itemCount: items.length,
             totalItemCount: v.total,
             offset,
             loadedToolNames: apifyMcpServer.listToolNames(),
         });
-        return buildStorageResponse({ structuredContent, summary, nextStep, toon: true, apifyConsoleUrl });
+        // When the byte cap (not just paging) forced the cut, steer the model to shrink per-item size so
+        // the next page can carry more rows instead of getting capped to the same small count again.
+        const cappedNextStep = truncatedByBytes
+            ? `Response capped at ${DATASET_ITEMS_MAX_BYTES} bytes (returned ${items.length} items). ${nextStep} To fit more rows per page, ${NARROW_OUTPUT_HINT}.`
+            : nextStep;
+        return buildStorageResponse({
+            structuredContent,
+            summary,
+            nextStep: cappedNextStep,
+            toon: true,
+            apifyConsoleUrl,
+        });
     },
 } as const);
diff --git a/src/tools/common/get_key_value_store_record.ts b/src/tools/common/get_key_value_store_record.ts
@@ -2,12 +2,12 @@ import type { AudioContent, EmbeddedResource, ImageContent, ResourceLink } from
 import dedent from 'dedent';
 import { z } from 'zod';
 
-import { HelperTools, KV_RECORD_MAX_INLINE_BYTES } from '../../const.js';
+import { HelperTools, KV_RECORD_MAX_INLINE_BYTES, KV_RECORD_MAX_INLINE_TEXT_BYTES } from '../../const.js';
 import type { InternalToolArgs, ToolEntry, ToolInputSchema } from '../../types.js';
 import { TOOL_TYPE } from '../../types.js';
 import { compileSchema } from '../../utils/ajv.js';
 import { buildConsoleKeyValueStoreUrl, getConsoleLinkContext } from '../../utils/console_link.js';
-import { computeValueBytes, stripQuoteWrappers } from '../../utils/generic.js';
+import { computeValueBytes, stripQuoteWrappers, truncateToBytes } from '../../utils/generic.js';
 import { keyValueStoreRecordOutputSchema } from '../structured_output_schemas.js';
 import {
     buildConsoleLinkContent,
@@ -138,6 +138,23 @@ export const getKeyValueStoreRecord: ToolEntry = Object.freeze({
                 ],
             };
         }
+        // Large text/JSON values were previously inlined uncapped — a multi-MB record would blow the
+        // context window. Mirror the binary cap: truncate to a byte budget and point at the full record.
+        if (bytes !== undefined && bytes > KV_RECORD_MAX_INLINE_TEXT_BYTES) {
+            const uri = await store.getRecordPublicUrl(recordKey);
+            const asText = typeof value === 'string' ? value : JSON.stringify(value);
+            const preview = truncateToBytes(asText, KV_RECORD_MAX_INLINE_TEXT_BYTES);
+            const truncatedSummary =
+                `${summary} Value truncated to ${KV_RECORD_MAX_INLINE_TEXT_BYTES} bytes ` +
+                `(full size ${bytes} bytes); fetch the full record at ${uri}.`;
+            const structuredContent = {
+                keyValueStoreId,
+                key: record.key,
+                value: preview,
+                ...(contentType && { contentType }),
+            };
+            return buildStorageResponse({ structuredContent, summary: truncatedSummary, apifyConsoleUrl });
+        }
         // Text/JSON values serialize cleanly — return them as structuredContent per the storage-tool contract.
         return buildStorageResponse({ structuredContent: { keyValueStoreId, ...record }, summary, apifyConsoleUrl });
     },

diff --git a/src/tools/common/search_apify_docs.ts b/src/tools/common/search_apify_docs.ts
@@ -1,6 +1,6 @@
 import { z } from 'zod';
 
-import { DOCS_SOURCES, HelperTools } from '../../const.js';
+import { DOCS_SNIPPET_MAX_LENGTH, DOCS_SOURCES, HelperTools } from '../../const.js';
 import type { InternalToolArgs, ToolEntry, ToolInputSchema } from '../../types.js';
 import { TOOL_TYPE } from '../../types.js';
 import { compileSchema } from '../../utils/ajv.js';
@@ -41,6 +41,12 @@ Fetch the full content of the document using the ${HelperTools.DOCS_FETCH} tool
 ${PLATFORM_DOCS_PREFERENCE}`;
 }
 
+/** Clip an Algolia snippet to {@link DOCS_SNIPPET_MAX_LENGTH} UTF-16 units, never ending on half a surrogate pair. */
+function clipSnippet(content: string): string {
+    if (content.length <= DOCS_SNIPPET_MAX_LENGTH) return content;
+    return `${content.slice(0, DOCS_SNIPPET_MAX_LENGTH).replace(/[\uD800-\uDBFF]$/, '')}… (truncated; fetch the full doc with ${HelperTools.DOCS_FETCH})`;
+}
+
 const searchApifyDocsToolArgsSchema = z.object({
     docSource: z
         .enum(DOCS_SOURCES.map((source) => source.id) as [string, ...string[]])
@@ -115,7 +121,7 @@ ${results
     .map((result) => {
         let line = `- Document URL: ${result.url}`;
         if (result.content) {
-            line += `\n  Content: ${result.content}`;
+            line += `\n  Content: ${clipSnippet(result.content)}`;
         }
         return line;
     })
@@ -124,7 +130,7 @@ ${results
         const structuredContent = {
             results: results.map((result) => ({
                 url: result.url,
-                ...(result.content ? { content: result.content } : {}),
+                ...(result.content ? { content: clipSnippet(result.content) } : {}),
             })),
             query,
             count: results.length,

diff --git a/src/tools/core/call_actor_common.ts b/src/tools/core/call_actor_common.ts
@@ -479,8 +479,9 @@ export async function resolveAndValidateActor(params: {
             error: buildMCPResponse({
                 texts: [
                     `Input is required for Actor '${actorName}'. Please provide the input parameter based on the Actor's input schema.`,
-                    `The input schema for this Actor was retrieved and is shown below:`,
-                    `\`\`\`json\n${JSON.stringify(actor.inputSchema)}\n\`\`\``,
+                    // Point at fetch-actor-details rather than inlining the full schema, which can be
+                    // very large; fetch-actor-details returns the per-field capped schema.
+                    `Call ${HelperTools.ACTOR_GET_DETAILS} with actor='${actorName}' and output={ inputSchema: true } to retrieve the input schema.`,
                 ],
                 isError: true,
                 telemetry: {
@@ -507,13 +508,18 @@ export async function resolveAndValidateActor(params: {
             validationMissingProperty: ajvDetails.validation_missing_property,
         });
 
+        // Don't inline the full input schema — for complex Actors it can be very large. The AJV
+        // errors already identify what's wrong; point at fetch-actor-details (which returns the
+        // per-field capped schema) for the full schema instead.
         const content = [
             `Input validation failed for Actor '${actorName}'. Please ensure your input matches the Actor's input schema.`,
-            `Input schema:\n\`\`\`json\n${JSON.stringify(actor.inputSchema)}\n\`\`\``,
         ];
         if (validationSummary) {
             content.push(`Validation errors: ${validationSummary}`);
         }
+        content.push(
+            `For the full input schema, call ${HelperTools.ACTOR_GET_DETAILS} with actor='${actorName}' and output={ inputSchema: true }.`,
+        );
         return {
             error: buildMCPResponse({
                 texts: content,

diff --git a/src/utils/generic.ts b/src/utils/generic.ts
@@ -112,6 +112,17 @@ export function stripQuoteWrappers(s: string): string {
     return s.trim().replace(STRIP_QUOTE_WRAPPERS_REGEX, '').trim();
 }
 
+/**
+ * Truncate `s` to at most `maxBytes` UTF-8 bytes, cutting on a code-point boundary; unchanged if within budget.
+ */
+export function truncateToBytes(s: string, maxBytes: number): string {
+    if (Buffer.byteLength(s) <= maxBytes) return s;
+    const buf = Buffer.from(s, 'utf8');
+    let end = Math.max(0, Math.floor(maxBytes)); // a negative end would make subarray count from the tail
+    while (end > 0 && (buf.readUInt8(end) & 0xc0) === 0x80) end -= 1; // 10xxxxxx = continuation byte: back up to its lead
+    return buf.subarray(0, end).toString('utf8'); // no U+FFFD stripping, so a genuine U+FFFD in `s` survives
+}
+
 /** Best-effort byte size of a value for summaries. */
 export function computeValueBytes(value: unknown): number | undefined {
     if (Buffer.isBuffer(value)) return value.length;

diff --git a/tests/unit/tools.call_actor_common.test.ts b/tests/unit/tools.call_actor_common.test.ts
@@ -1,15 +1,24 @@
 import { ApifyApiError } from 'apify-client';
 import type { AxiosResponse } from 'axios';
-import { describe, expect, it } from 'vitest';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 import { APIFY_ERROR_TYPE_MEMORY_LIMIT_EXCEEDED, FAILURE_CATEGORY, HelperTools, TOOL_STATUS } from '../../src/const.js';
+import { getActorsAsTools } from '../../src/tools/core/actor_tools_factory.js';
 import {
     buildCallActorAppsDescription,
     buildCallActorDescription,
     buildCallActorErrorResponse,
     buildPermissionApprovalResponse,
     callActorArgs,
+    resolveAndValidateActor,
 } from '../../src/tools/core/call_actor_common.js';
+import type { InternalToolArgs, ToolEntry } from '../../src/types.js';
+import { TOOL_TYPE } from '../../src/types.js';
+
+vi.mock('../../src/tools/core/actor_tools_factory.js', () => ({
+    getActorsAsTools: vi.fn(),
+    fixActorNameInputAndLog: vi.fn((actor: string) => actor),
+}));
 
 describe('call_actor_common', () => {
     describe('buildCallActorDescription', () => {
@@ -250,4 +259,62 @@ describe('call_actor_common', () => {
             expect(response.content[0]?.text).toContain('This Actor requires full access to your account');
         });
     });
+
+    describe('resolveAndValidateActor — input schema is not inlined in error responses', () => {
+        // A property description large enough that a full-schema dump would dominate the response.
+        const HUGE_DESC = 'x'.repeat(30_000);
+        const bigSchema = {
+            type: 'object',
+            properties: { startUrls: { type: 'array', description: HUGE_DESC } },
+            required: ['startUrls'],
+        };
+
+        function stubActor(valid: boolean): ToolEntry {
+            const ajvValidate = Object.assign(() => valid, {
+                errors: valid ? null : [{ message: "must have required property 'startUrls'" }],
+            });
+            return {
+                type: TOOL_TYPE.ACTOR,
+                actorId: 'abc123',
+                inputSchema: bigSchema,
+                ajvValidate,
+            } as unknown as ToolEntry;
+        }
+
+        const toolArgs = { apifyClient: {}, mcpSessionId: 's1' } as unknown as InternalToolArgs;
+
+        function errorText(res: object): string {
+            const content = (res as { error?: { content?: { text?: string }[] } }).error?.content ?? [];
+            return content.map((c) => c.text ?? '').join('\n');
+        }
+
+        beforeEach(() => vi.mocked(getActorsAsTools).mockReset());
+
+        it('validation failure returns the AJV errors and a fetch-actor-details pointer, not the full schema', async () => {
+            vi.mocked(getActorsAsTools).mockResolvedValue({ tools: [stubActor(false)], errors: [] } as never);
+
+            const res = await resolveAndValidateActor({ actorName: 'apify/x', input: { foo: 1 }, toolArgs });
+            const text = errorText(res);
+
+            expect(text).not.toContain(HUGE_DESC);
+            expect(text.length).toBeLessThan(2_000);
+            expect(text).toContain(HelperTools.ACTOR_GET_DETAILS);
+            expect(text).toContain("must have required property 'startUrls'");
+        });
+
+        it('missing input returns a fetch-actor-details pointer, not the full schema', async () => {
+            vi.mocked(getActorsAsTools).mockResolvedValue({ tools: [stubActor(true)], errors: [] } as never);
+
+            const res = await resolveAndValidateActor({
+                actorName: 'apify/x',
+                input: undefined as never,
+                toolArgs,
+            });
+            const text = errorText(res);
+
+            expect(text).not.toContain(HUGE_DESC);
+            expect(text.length).toBeLessThan(2_000);
+            expect(text).toContain(HelperTools.ACTOR_GET_DETAILS);
+        });
+    });
 });