You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: clients/ts-sdk/src/types.gen.ts
+197Lines changed: 197 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -73,6 +73,29 @@ export type AuthQuery = {
73
73
redirect_uri?: (string)|null;
74
74
};
75
75
76
+
/**
 * Controls the processing and generation for the segment.
 * - `crop_image` controls whether to crop the file's images to the segment's bounding box.
 * The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
 * or `Auto` to only crop when needed for post-processing.
 * - `html` is the HTML output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
 * - `llm` is the LLM-generated output for the segment, this uses off-the-shelf models to generate a custom output for the segment
 * - `markdown` is the Markdown output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
 * - `embed_sources` defines which content sources will be included in the chunk's embed field and counted towards the chunk length.
 * The array's order determines the sequence in which content appears in the embed field (e.g., [Markdown, LLM] means Markdown content
 * is followed by LLM content). This directly affects what content is available for embedding and retrieval.
 */
export type AutoGenerationConfig = {
    crop_image?: (CroppingStrategy);
    embed_sources?: Array<EmbedSource>;
    html?: (GenerationStrategy);
    /**
     * Prompt for the LLM model
     */
    llm?: (string) | null;
    markdown?: (GenerationStrategy);
};
98
+
76
99
exporttypeAutocompleteReqPayload={
77
100
/**
78
101
* Set content_only to true to only return the chunk_html of the chunks. This is useful when you want to reduce the amount of data sent over the wire for latency improvement (typically 10-50ms). Default is false.
@@ -395,6 +418,22 @@ export type ChunkMetadataWithScore = {
395
418
weight: number;
396
419
};
397
420
421
+
/**
 * Controls the setting for the chunking and post-processing of each chunk.
 */
export type ChunkProcessing = {
    /**
     * Whether to ignore headers and footers in the chunking process.
     * This is recommended as headers and footers break reading order across pages.
     */
    ignore_headers_and_footers?: boolean;
    /**
     * The target number of words in each chunk. If 0, each chunk will contain a single segment.
     */
    target_length?: number;
    tokenizer?: (TokenizerType);
};
436
+
398
437
/**
399
438
* Request payload for creating a new chunk
400
439
*/
@@ -834,6 +873,26 @@ export type CreateDatasetReqPayload = {
834
873
tracking_id?: (string)|null;
835
874
};
836
875
876
+
/**
877
+
* Will use [chunkr.ai](https://chunkr.ai) to process the file when this object is defined. See [docs.chunkr.ai/api-references/task/create-task](https://docs.chunkr.ai/api-references/task/create-task) for detailed information about what each field on this request payload does.
878
+
*/
879
+
exporttypeCreateFormWithoutFile={
880
+
chunk_processing?: ((ChunkProcessing)|null);
881
+
/**
882
+
* The number of seconds until the task is deleted.
883
+
* Expired tasks can **not** be updated, polled or accessed via the web interface.
884
+
*/
885
+
expires_in?: (number)|null;
886
+
/**
887
+
* Whether to use high-resolution images for cropping and post-processing. (Latency penalty: ~7 seconds per page)
* Controls the Optical Character Recognition (OCR) strategy.
2491
+
* - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
2492
+
* - `Auto`: Selectively applies OCR only to pages with missing or low-quality text. When a text layer is present, the bounding boxes from the text layer are used.
2493
+
*/
2494
+
export type OcrStrategy = 'All' | 'Auto';
2495
+
2396
2496
exporttypeOpenGraphMetadata={
2397
2497
description?: (string)|null;
2398
2498
image?: (string)|null;
@@ -2458,6 +2558,9 @@ export type PartnerConfiguration = {
2458
2558
SLACK_LINK: string;
2459
2559
};
2460
2560
2561
+
/**
2562
+
* We plan to deprecate pdf2md in favor of chunkr.ai. This is a legacy option for using a vision LLM to convert a given file into markdown and then ingest it.
2563
+
*/
2461
2564
exporttypePdf2MdOptions={
2462
2565
/**
2463
2566
* Split headings is an optional field which allows you to specify whether or not to split headings into separate chunks. Default is false.
@@ -2473,6 +2576,38 @@ export type Pdf2MdOptions = {
2473
2576
use_pdf2md_ocr: boolean;
2474
2577
};
2475
2578
2579
+
/**
 * Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
 * - `All` crops all images in the item
 * - `Auto` crops images only if required for post-processing
 */
export type PictureCroppingStrategy = 'All' | 'Auto';
2585
+
2586
+
/**
 * Controls the processing and generation for the segment.
 * - `crop_image` controls whether to crop the file's images to the segment's bounding box.
 * The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
 * or `Auto` to only crop when needed for post-processing.
 * - `html` is the HTML output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
 * - `llm` is the LLM-generated output for the segment, this uses off-the-shelf models to generate a custom output for the segment
 * - `markdown` is the Markdown output for the segment, generated either through heuristics (`Auto`) or using Chunkr fine-tuned models (`LLM`)
 * - `embed_sources` defines which content sources will be included in the chunk's embed field and counted towards the chunk length.
 * The array's order determines the sequence in which content appears in the embed field (e.g., [Markdown, LLM] means Markdown content
 * is followed by LLM content). This directly affects what content is available for embedding and retrieval.
 */
export type PictureGenerationConfig = {
    crop_image?: (PictureCroppingStrategy);
    embed_sources?: Array<EmbedSource>;
    html?: (GenerationStrategy);
    /**
     * Prompt for the LLM model
     */
    llm?: (string) | null;
    markdown?: (GenerationStrategy);
};
2608
+
2609
+
export type PipelineType = 'Azure' | 'Chunkr';
2610
+
2476
2611
exporttypePopularChat={
2477
2612
count: number;
2478
2613
name: string;
@@ -3545,6 +3680,38 @@ export type SearchesPerUserResponse = {
3545
3680
points: Array<FloatTimePoint>;
3546
3681
};
3547
3682
3683
+
/**
 * Controls the post-processing of each segment type.
 *
 * Allows you to generate HTML and Markdown from chunkr models for each segment type.
 * By default, the HTML and Markdown are generated manually using the segmentation information except for `Table`, `Formula` and `Picture`.
 * You can optionally configure custom LLM prompts and models to generate an additional `llm` field with LLM-processed content for each segment type.
 *
 * The configuration of which content sources (HTML, Markdown, LLM, Content) of the segment
 * should be included in the chunk's `embed` field and counted towards the chunk length can be configured through the `embed_sources` setting.
 */
export type SegmentProcessing = {
    Caption?: ((AutoGenerationConfig) | null);
    Footnote?: ((AutoGenerationConfig) | null);
    Formula?: ((LlmGenerationConfig) | null);
    ListItem?: ((AutoGenerationConfig) | null);
    Page?: ((LlmGenerationConfig) | null);
    PageFooter?: ((AutoGenerationConfig) | null);
    PageHeader?: ((AutoGenerationConfig) | null);
    Picture?: ((PictureGenerationConfig) | null);
    SectionHeader?: ((AutoGenerationConfig) | null);
    Table?: ((LlmGenerationConfig) | null);
    Text?: ((AutoGenerationConfig) | null);
    Title?: ((AutoGenerationConfig) | null);
};
3707
+
3708
+
/**
3709
+
* Controls the segmentation strategy:
3710
+
* - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`, `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
3711
+
* - `Page`: Treats each page as a single segment. Faster processing, but without layout element detection and only simple chunking.
* Semantic boosting moves the dense vector of the chunk in the direction of the distance phrase for semantic search. I.e. you can force a cluster by moving every chunk for a PDF closer to its title or push a chunk with a chunk_html of "iphone" 25% closer to the term "flagship" by using the distance phrase "flagship" and a distance factor of 0.25. Conceptually it's drawing a line (euclidean/L2 distance) between the vector for the innerText of the chunk_html and distance_phrase then moving the vector of the chunk_html distance_factor*L2Distance closer to or away from the distance_phrase point along the line between the two points.
3550
3717
*/
@@ -3790,6 +3957,35 @@ export type TagsWithCount = {
3790
3957
tag: string;
3791
3958
};
3792
3959
3960
+
/**
3961
+
* Common tokenizers used for text processing.
3962
+
*
3963
+
* These values represent standard tokenization approaches and popular pre-trained
* Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
0 commit comments