Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions src/tools/storage/get_dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export const getDataset: ToolEntry = Object.freeze({
title: 'Get dataset',
description: dedent`
Get metadata for a dataset (collection of structured data created by an Actor run).
The results will include dataset details such as itemCount, schema, fields, and stats.
The results will include dataset details such as itemCount, fields, and stats.
Use fields to understand structure for filtering with ${HelperTools.DATASET_GET_ITEMS}.
stats.inflatedBytes (when present) is the approximate uncompressed byte size — use it with itemCount to pick a safe limit and fields before fetching.
Note: itemCount updates may be delayed by up to ~5 seconds.
Expand Down Expand Up @@ -56,14 +56,20 @@ export const getDataset: ToolEntry = Object.freeze({
return buildStorageNotFound(`Dataset '${datasetId}' not found.`);
}
const linkContext = await getConsoleLinkContext(apifyToken, client);
// The API also returns a raw `schema` (untyped in apify-client). It is 93–95% of the
// response bytes on top store Actors and declares fields that may be absent from the
// data, so drop it — get-dataset-schema infers a compact schema from real items (#882).
const { schema, ...metadata } = dataset as typeof dataset & { schema?: unknown };
// Apify returns `fields` slash-separated AND with array indices expanded
// (e.g. `latestComments/0/owner/username`). For a real Instagram-scraper
// dataset this inflates ~78 schema fields into 528 paths (~85% bloat) and
// produces slash-notation paths that aren't directly usable as projection
// hints for `get-dataset-items` (which expects dot-notation). Run the same
// normalization `buildRunDataset` applies so this tool's `fields` matches
// the structured `storages.datasets.default.fields` shape.
const normalized = dataset.fields ? { ...dataset, fields: normalizeDatasetFields(dataset.fields) } : dataset;
const normalized = metadata.fields
? { ...metadata, fields: normalizeDatasetFields(metadata.fields) }
: metadata;
const fieldCount = Array.isArray(normalized.fields) ? normalized.fields.length : undefined;
// `inflatedBytes` is undeclared on the apify-client `DatasetStats` type and absent from the GET
// response today (only the dataset-list endpoint returns it), so read it defensively.
Expand Down
8 changes: 4 additions & 4 deletions src/tools/storage/storage_helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,14 @@ export function buildStorageListSummaryNextStep(params: {

/**
* Pagination-aware {summary, nextStep}: when more items remain, point at the next page;
* otherwise point at get-dataset-schema for structure inspection.
* otherwise point at get-dataset for the field list (structure lives there, not in a heavy schema dump).
*/
export function buildDatasetItemsSummaryNextStep(params: {
datasetId: string;
itemCount: number;
totalItemCount: number;
offset: number;
/** Active loaded tool set; gates the terminal get-dataset-schema reference (see #1007). */
/** Active loaded tool set; gates the terminal get-dataset reference (see #1007). */
loadedToolNames: string[];
}): { summary: string; nextStep: string } {
const { datasetId, itemCount, totalItemCount, offset, loadedToolNames } = params;
Expand All @@ -118,8 +118,8 @@ export function buildDatasetItemsSummaryNextStep(params: {
: `Fetched ${itemCount} of ${totalItemCount} items (offset=${offset}); no more pages.`;
return {
summary,
nextStep: suggestTool(HelperTools.DATASET_SCHEMA_GET, loadedToolNames)
? `Use ${HelperTools.DATASET_SCHEMA_GET} with datasetId=${datasetId} to inspect structure if needed.`
nextStep: suggestTool(HelperTools.DATASET_GET, loadedToolNames)
? `Use ${HelperTools.DATASET_GET} with datasetId=${datasetId} to see the field list if you need the data structure.`
: `No more pages. Inspect the returned items directly.`,
};
}
Expand Down
3 changes: 2 additions & 1 deletion src/tools/structured_output_schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,8 @@ export const datasetItemsOutputSchema = {

/**
* Schema for dataset metadata (get-dataset). Documents the fields the LLM acts on; the raw API
* response carries more keys (stats, schema, access settings), allowed as additional properties.
* response carries more keys (stats, access settings), allowed as additional properties.
* The raw `schema` key is stripped by the tool — get-dataset-schema owns schema output (#882).
*/
export const datasetMetadataOutputSchema = {
type: 'object' as const,
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/tools.get_dataset_items.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ describe('get-dataset-items', () => {
);
});

it('emits a last-page summary and a schema nextStep when all items are returned', async () => {
it('emits a last-page summary and a get-dataset nextStep when all items are returned', async () => {
const result = await (getDatasetItems as HelperTool).call(
stubToolCallContext({ datasetId: 'ds-1' }, stubApifyClient()),
);
Expand All @@ -217,7 +217,7 @@ describe('get-dataset-items', () => {
};

expect(structuredContent.summary).toBe('Fetched all 1 items.');
expect(structuredContent.nextStep).toContain(HelperTools.DATASET_SCHEMA_GET);
expect(structuredContent.nextStep).toContain(HelperTools.DATASET_GET);
expect(structuredContent.nextStep).toContain('datasetId=ds-1');
// summary + nextStep ship as a separate text block after the fenced data.
expect(content[1].text).toBe(`${structuredContent.summary}\n${structuredContent.nextStep}`);
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/tools.storage_helpers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,27 @@ describe('buildStorageNotFound()', () => {
});

describe('buildDatasetItemsSummaryNextStep()', () => {
it('suggests get-dataset-schema on the terminal page when loaded', () => {
it('suggests get-dataset on the terminal page when loaded', () => {
const t = buildDatasetItemsSummaryNextStep({
datasetId: 'ds-1',
itemCount: 5,
totalItemCount: 5,
offset: 0,
loadedToolNames: [HelperTools.DATASET_SCHEMA_GET],
loadedToolNames: [HelperTools.DATASET_GET],
});
expect(t.nextStep).toContain(HelperTools.DATASET_SCHEMA_GET);
expect(t.nextStep).toContain(HelperTools.DATASET_GET);
expect(t.nextStep).toContain('datasetId=ds-1');
});

it('omits get-dataset-schema when not loaded', () => {
it('omits get-dataset when not loaded', () => {
const t = buildDatasetItemsSummaryNextStep({
datasetId: 'ds-1',
itemCount: 5,
totalItemCount: 5,
offset: 0,
loadedToolNames: [],
});
expect(t.nextStep).not.toContain(HelperTools.DATASET_SCHEMA_GET);
expect(t.nextStep).not.toContain(HelperTools.DATASET_GET);
expect(t.nextStep).toContain('No more pages');
});

Expand All @@ -51,7 +51,7 @@ describe('buildDatasetItemsSummaryNextStep()', () => {
itemCount: 20,
totalItemCount: 100,
offset: 0,
loadedToolNames: [HelperTools.DATASET_SCHEMA_GET],
loadedToolNames: [HelperTools.DATASET_GET],
});
const unloaded = buildDatasetItemsSummaryNextStep({
datasetId: 'ds-1',
Expand Down
Loading