Skip to content

Commit 92a5cf9

Browse files
committed
feat(file-upload): add chunkr_create_task_req_payload to upload file request
1 parent 98effb6 commit 92a5cf9

12 files changed

Lines changed: 849 additions & 144 deletions

File tree

clients/ts-sdk/openapi.json

Lines changed: 417 additions & 0 deletions
Large diffs are not rendered by default.

clients/ts-sdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"files": [
1818
"dist"
1919
],
20-
"version": "0.0.73",
20+
"version": "0.0.74",
2121
"license": "MIT",
2222
"scripts": {
2323
"lint": "eslint 'src/**/*.ts'",
Binary file not shown.

clients/ts-sdk/src/functions/file/file.test.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,14 @@ import { EXAMPLE_FILE_ID, TRIEVE } from "../../__tests__/constants";
1111
import fs from "fs";
1212
import { test } from "../../__tests__/utils";
1313

14-
const file = fs.readFileSync("./src/__tests__/uploadme.pdf");
14+
const uploadMeFile = fs.readFileSync("./src/__tests__/uploadme.pdf");
15+
const uploadMeFileEncoded = uploadMeFile.toString("base64");
1516

16-
const fileEncoded = file.toString("base64");
17+
const villageOfCatskillZoningRegulationsFile = fs.readFileSync(
18+
"./src/__tests__/Village_of_Catskill_Zoning_Regulations.pdf",
19+
);
20+
const villageOfCatskillZoningRegulationsFileEncoded =
21+
villageOfCatskillZoningRegulationsFile.toString("base64");
1722

1823
describe("File Tests", async () => {
1924
let trieve: TrieveSDK;
@@ -22,13 +27,25 @@ describe("File Tests", async () => {
2227
});
2328
test("uploadFile", async () => {
2429
const data = await trieve.uploadFile({
25-
base64_file: fileEncoded,
30+
base64_file: uploadMeFileEncoded,
2631
file_name: "uploadme.pdf",
2732
group_tracking_id: "file-upload-group",
2833
});
2934
expectTypeOf(data).toEqualTypeOf<UploadFileResponseBody>();
3035
});
3136

37+
test("uploadFileWithChunkr", async () => {
38+
const data = await trieve.uploadFile({
39+
base64_file: villageOfCatskillZoningRegulationsFileEncoded,
40+
file_name: "Village_of_Catskill_Zoning_Regulations.pdf",
41+
group_tracking_id: "village-of-catskill-file-upload-group",
42+
chunkr_create_task_req_payload: {
43+
pipeline: "Chunkr",
44+
},
45+
});
46+
expectTypeOf(data).toEqualTypeOf<UploadFileResponseBody>();
47+
});
48+
3249
test("createPresignedUrlForJsonl", async () => {
3350
const data = await trieve.createPresignedUrlForCsvJsonl({
3451
file_name: "flipkart.jsonl",

clients/ts-sdk/src/types.gen.ts

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,29 @@ export type AuthQuery = {
7373
redirect_uri?: (string) | null;
7474
};
7575

76+
/**
77+
* Controls the processing and generation for the segment.
78+
* - `crop_image` controls whether to crop the file's images to the segment's bounding box.
79+
* The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
80+
* or `Auto` to only crop when needed for post-processing.
81+
* - `html` is the HTML output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
82+
* - `llm` is the LLM-generated output for the segment, this uses off-the-shelf models to generate a custom output for the segment
83+
* - `markdown` is the Markdown output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
84+
* - `embed_sources` defines which content sources will be included in the chunk's embed field and counted towards the chunk length.
85+
* The array's order determines the sequence in which content appears in the embed field (e.g., [Markdown, LLM] means Markdown content
86+
* is followed by LLM content). This directly affects what content is available for embedding and retrieval.
87+
*/
88+
export type AutoGenerationConfig = {
89+
crop_image?: (CroppingStrategy);
90+
embed_sources?: Array<EmbedSource>;
91+
html?: (GenerationStrategy);
92+
/**
93+
* Prompt for the LLM model
94+
*/
95+
llm?: (string) | null;
96+
markdown?: (GenerationStrategy);
97+
};
98+
7699
export type AutocompleteReqPayload = {
77100
/**
78101
* Set content_only to true to return only the chunk_html of the chunks. This is useful when you want to reduce the amount of data sent over the wire to improve latency (typically 10-50ms). Default is false.
@@ -395,6 +418,22 @@ export type ChunkMetadataWithScore = {
395418
weight: number;
396419
};
397420

421+
/**
422+
* Controls the setting for the chunking and post-processing of each chunk.
423+
*/
424+
export type ChunkProcessing = {
425+
/**
426+
* Whether to ignore headers and footers in the chunking process.
427+
* This is recommended as headers and footers break reading order across pages.
428+
*/
429+
ignore_headers_and_footers?: boolean;
430+
/**
431+
* The target number of words in each chunk. If 0, each chunk will contain a single segment.
432+
*/
433+
target_length?: number;
434+
tokenizer?: (TokenizerType);
435+
};
436+
398437
/**
399438
* Request payload for creating a new chunk
400439
*/
@@ -834,6 +873,26 @@ export type CreateDatasetReqPayload = {
834873
tracking_id?: (string) | null;
835874
};
836875

876+
/**
877+
* Will use [chunkr.ai](https://chunkr.ai) to process the file when this object is defined. See [docs.chunkr.ai/api-references/task/create-task](https://docs.chunkr.ai/api-references/task/create-task) for detailed information about what each field on this request payload does.
878+
*/
879+
export type CreateFormWithoutFile = {
880+
chunk_processing?: ((ChunkProcessing) | null);
881+
/**
882+
* The number of seconds until task is deleted.
883+
* Expired tasks can **not** be updated, polled or accessed via web interface.
884+
*/
885+
expires_in?: (number) | null;
886+
/**
887+
* Whether to use high-resolution images for cropping and post-processing. (Latency penalty: ~7 seconds per page)
888+
*/
889+
high_resolution?: (boolean) | null;
890+
ocr_strategy?: ((OcrStrategy) | null);
891+
pipeline?: ((PipelineType) | null);
892+
segment_processing?: ((SegmentProcessing) | null);
893+
segmentation_strategy?: ((SegmentationStrategy) | null);
894+
};
895+
837896
export type CreateMessageReqPayload = {
838897
/**
839898
* The base64 encoded audio input of the user message to attach to the topic and then generate an assistant message in response to.
@@ -1022,6 +1081,13 @@ export type CreateTopicReqPayload = {
10221081
owner_id: string;
10231082
};
10241083

1084+
/**
1085+
* Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
1086+
* - `All` crops all images in the item
1087+
* - `Auto` crops images only if required for post-processing
1088+
*/
1089+
export type CroppingStrategy = 'All' | 'Auto';
1090+
10251091
export type Dataset = {
10261092
/**
10271093
* Timestamp of the creation of the dataset
@@ -1353,6 +1419,8 @@ export type EditMessageReqPayload = {
13531419
user_id?: (string) | null;
13541420
};
13551421

1422+
export type EmbedSource = 'HTML' | 'Markdown' | 'LLM' | 'Content';
1423+
13561424
export type ErrorResponseBody = {
13571425
message: string;
13581426
};
@@ -1890,6 +1958,8 @@ export type GenerateOffChunksReqPayload = {
18901958
user_id?: (string) | null;
18911959
};
18921960

1961+
export type GenerationStrategy = 'LLM' | 'Auto';
1962+
18931963
/**
18941964
* Location that you want to use as the center of the search.
18951965
*/
@@ -2294,6 +2364,29 @@ export type LatencyGraphResponse = {
22942364
points: Array<FloatTimePoint>;
22952365
};
22962366

2367+
/**
2368+
* Controls the processing and generation for the segment.
2369+
* - `crop_image` controls whether to crop the file's images to the segment's bounding box.
2370+
* The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
2371+
* or `Auto` to only crop when needed for post-processing.
2372+
* - `html` is the HTML output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
2373+
* - `llm` is the LLM-generated output for the segment, this uses off-the-shelf models to generate a custom output for the segment
2374+
* - `markdown` is the Markdown output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
2375+
* - `embed_sources` defines which content sources will be included in the chunk's embed field and counted towards the chunk length.
2376+
* The array's order determines the sequence in which content appears in the embed field (e.g., [Markdown, LLM] means Markdown content
2377+
* is followed by LLM content). This directly affects what content is available for embedding and retrieval.
2378+
*/
2379+
export type LlmGenerationConfig = {
2380+
crop_image?: (CroppingStrategy);
2381+
embed_sources?: Array<EmbedSource>;
2382+
html?: (GenerationStrategy);
2383+
/**
2384+
* Prompt for the LLM model
2385+
*/
2386+
llm?: (string) | null;
2387+
markdown?: (GenerationStrategy);
2388+
};
2389+
22972390
export type LocationBoundingBox = {
22982391
bottom_right: GeoInfo;
22992392
top_left: GeoInfo;
@@ -2393,6 +2486,13 @@ export type MultiQuery = {
23932486

23942487
export type NewChunkMetadataTypes = SlimChunkMetadataWithArrayTagSet | ChunkMetadata | ContentChunkMetadata;
23952488

2489+
/**
2490+
* Controls the Optical Character Recognition (OCR) strategy.
2491+
* - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
2492+
* - `Auto`: Selectively applies OCR only to pages with missing or low-quality text. When text layer is present the bounding boxes from the text layer are used.
2493+
*/
2494+
export type OcrStrategy = 'All' | 'Auto';
2495+
23962496
export type OpenGraphMetadata = {
23972497
description?: (string) | null;
23982498
image?: (string) | null;
@@ -2458,6 +2558,9 @@ export type PartnerConfiguration = {
24582558
SLACK_LINK: string;
24592559
};
24602560

2561+
/**
2562+
* We plan to deprecate pdf2md in favor of chunkr.ai. This is a legacy option for using a vision LLM to convert a given file into markdown and then ingest it.
2563+
*/
24612564
export type Pdf2MdOptions = {
24622565
/**
24632566
* Split headings is an optional field which allows you to specify whether or not to split headings into separate chunks. Default is false.
@@ -2473,6 +2576,38 @@ export type Pdf2MdOptions = {
24732576
use_pdf2md_ocr: boolean;
24742577
};
24752578

2579+
/**
2580+
* Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
2581+
* - `All` crops all images in the item
2582+
* - `Auto` crops images only if required for post-processing
2583+
*/
2584+
export type PictureCroppingStrategy = 'All' | 'Auto';
2585+
2586+
/**
2587+
* Controls the processing and generation for the segment.
2588+
* - `crop_image` controls whether to crop the file's images to the segment's bounding box.
2589+
* The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
2590+
* or `Auto` to only crop when needed for post-processing.
2591+
* - `html` is the HTML output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
2592+
* - `llm` is the LLM-generated output for the segment, this uses off-the-shelf models to generate a custom output for the segment
2593+
* - `markdown` is the Markdown output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
2594+
* - `embed_sources` defines which content sources will be included in the chunk's embed field and counted towards the chunk length.
2595+
* The array's order determines the sequence in which content appears in the embed field (e.g., [Markdown, LLM] means Markdown content
2596+
* is followed by LLM content). This directly affects what content is available for embedding and retrieval.
2597+
*/
2598+
export type PictureGenerationConfig = {
2599+
crop_image?: (PictureCroppingStrategy);
2600+
embed_sources?: Array<EmbedSource>;
2601+
html?: (GenerationStrategy);
2602+
/**
2603+
* Prompt for the LLM model
2604+
*/
2605+
llm?: (string) | null;
2606+
markdown?: (GenerationStrategy);
2607+
};
2608+
2609+
export type PipelineType = 'Azure' | 'Chunkr';
2610+
24762611
export type PopularChat = {
24772612
count: number;
24782613
name: string;
@@ -3545,6 +3680,38 @@ export type SearchesPerUserResponse = {
35453680
points: Array<FloatTimePoint>;
35463681
};
35473682

3683+
/**
3684+
* Controls the post-processing of each segment type.
3685+
*
3686+
* Allows you to generate HTML and Markdown from chunkr models for each segment type.
3687+
* By default, the HTML and Markdown are generated manually using the segmentation information except for `Table`, `Formula` and `Picture`.
3688+
* You can optionally configure custom LLM prompts and models to generate an additional `llm` field with LLM-processed content for each segment type.
3689+
*
3690+
* The configuration of which content sources (HTML, Markdown, LLM, Content) of the segment
3691+
* should be included in the chunk's `embed` field and counted towards the chunk length can be configured through the `embed_sources` setting.
3692+
*/
3693+
export type SegmentProcessing = {
3694+
Caption?: ((AutoGenerationConfig) | null);
3695+
Footnote?: ((AutoGenerationConfig) | null);
3696+
Formula?: ((LlmGenerationConfig) | null);
3697+
ListItem?: ((AutoGenerationConfig) | null);
3698+
Page?: ((LlmGenerationConfig) | null);
3699+
PageFooter?: ((AutoGenerationConfig) | null);
3700+
PageHeader?: ((AutoGenerationConfig) | null);
3701+
Picture?: ((PictureGenerationConfig) | null);
3702+
SectionHeader?: ((AutoGenerationConfig) | null);
3703+
Table?: ((LlmGenerationConfig) | null);
3704+
Text?: ((AutoGenerationConfig) | null);
3705+
Title?: ((AutoGenerationConfig) | null);
3706+
};
3707+
3708+
/**
3709+
* Controls the segmentation strategy:
3710+
* - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`, `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
3711+
* - `Page`: Treats each page as a single segment. Faster processing, but without layout element detection and only simple chunking.
3712+
*/
3713+
export type SegmentationStrategy = 'LayoutAnalysis' | 'Page';
3714+
35483715
/**
35493716
* Semantic boosting moves the dense vector of the chunk in the direction of the distance phrase for semantic search. I.e. you can force a cluster by moving every chunk for a PDF closer to its title or push a chunk with a chunk_html of "iphone" 25% closer to the term "flagship" by using the distance phrase "flagship" and a distance factor of 0.25. Conceptually it's drawing a line (euclidean/L2 distance) between the vector for the innerText of the chunk_html and distance_phrase then moving the vector of the chunk_html distance_factor*L2Distance closer to or away from the distance_phrase point along the line between the two points.
35503717
*/
@@ -3790,6 +3957,35 @@ export type TagsWithCount = {
37903957
tag: string;
37913958
};
37923959

3960+
/**
3961+
* Common tokenizers used for text processing.
3962+
*
3963+
* These values represent standard tokenization approaches and popular pre-trained
3964+
* tokenizers from the Hugging Face ecosystem.
3965+
*/
3966+
export type Tokenizer = 'Word' | 'Cl100kBase' | 'XlmRobertaBase' | 'BertBaseUncased';
3967+
3968+
/**
3969+
* Specifies which tokenizer to use for the chunking process.
3970+
*
3971+
* This type supports two ways of specifying a tokenizer:
3972+
* 1. Using a predefined tokenizer from the `Tokenizer` enum
3973+
* 2. Using any Hugging Face tokenizer by providing its model ID as a string
3974+
* (e.g. "facebook/bart-large", "Qwen/Qwen-tokenizer", etc.)
3975+
*
3976+
* When using a string, any valid Hugging Face tokenizer ID can be specified,
3977+
* which will be loaded using the Hugging Face tokenizers library.
3978+
*/
3979+
export type TokenizerType = {
3980+
Enum: Tokenizer;
3981+
} | {
3982+
/**
3983+
* Use any Hugging Face tokenizer by specifying its model ID
3984+
* Examples: "Qwen/Qwen-tokenizer", "facebook/bart-large"
3985+
*/
3986+
String: string;
3987+
};
3988+
37933989
/**
37943990
* Function for a LLM tool call
37953991
*/
@@ -4193,6 +4389,7 @@ export type UploadFileReqPayload = {
41934389
* Base64 encoded file.
41944390
*/
41954391
base64_file: string;
4392+
chunkr_create_task_req_payload?: ((CreateFormWithoutFile) | null);
41964393
/**
41974394
* Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
41984395
*/

0 commit comments

Comments
 (0)