Skip to content

Commit e21e5a8

Browse files
densumesh and cdxker
authored and committed
feature: add the ability to set custom tags on crawled content
1 parent 2f0a493 commit e21e5a8

5 files changed

Lines changed: 84 additions & 2 deletions

File tree

clients/ts-sdk/openapi.json

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10996,6 +10996,14 @@
1099610996
"description": "The URL to crawl",
1099710997
"nullable": true
1099810998
},
10999+
"tags": {
11000+
"type": "array",
11001+
"items": {
11002+
"type": "string"
11003+
},
11004+
"description": "Tags to add to the crawl",
11005+
"nullable": true
11006+
},
1099911007
"webhook_metadata": {
1100011008
"description": "Metadata to send back with the webhook call for each successful page scrape",
1100111009
"nullable": true
@@ -11670,6 +11678,13 @@
1167011678
"description": "No result message for when there are no chunks found above the score threshold.",
1167111679
"nullable": true
1167211680
},
11681+
"number_of_messages_to_include": {
11682+
"type": "integer",
11683+
"format": "int64",
11684+
"description": "Number of messages to include in the context window. If not specified, this defaults to 10.",
11685+
"nullable": true,
11686+
"minimum": 0
11687+
},
1167311688
"only_include_docs_used": {
1167411689
"type": "boolean",
1167511690
"description": "Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.",
@@ -12779,6 +12794,13 @@
1277912794
"description": "No result message for when there are no chunks found above the score threshold.",
1278012795
"nullable": true
1278112796
},
12797+
"number_of_messages_to_include": {
12798+
"type": "integer",
12799+
"format": "int64",
12800+
"description": "Number of messages to include in the context window. If not specified, this defaults to 10.",
12801+
"nullable": true,
12802+
"minimum": 0
12803+
},
1278212804
"only_include_docs_used": {
1278312805
"type": "boolean",
1278412806
"description": "Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.",
@@ -19371,6 +19393,13 @@
1937119393
"description": "No result message for when there are no chunks found above the score threshold.",
1937219394
"nullable": true
1937319395
},
19396+
"number_of_messages_to_include": {
19397+
"type": "integer",
19398+
"format": "int64",
19399+
"description": "Number of messages to include in the context window. If not specified, this defaults to 10.",
19400+
"nullable": true,
19401+
"minimum": 0
19402+
},
1937419403
"only_include_docs_used": {
1937519404
"type": "boolean",
1937619405
"description": "Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.",

clients/ts-sdk/src/types.gen.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,10 @@ export type CrawlOptions = {
887887
* The URL to crawl
888888
*/
889889
site_url?: (string) | null;
890+
/**
891+
* Tags to add to the crawl
892+
*/
893+
tags?: Array<(string)> | null;
890894
/**
891895
* Metadata to send back with the webhook call for each successful page scrape
892896
*/
@@ -1086,6 +1090,10 @@ export type CreateMessageReqPayload = {
10861090
* No result message for when there are no chunks found above the score threshold.
10871091
*/
10881092
no_result_message?: (string) | null;
1093+
/**
1094+
* Number of messages to include in the context window. If not specified, this defaults to 10.
1095+
*/
1096+
number_of_messages_to_include?: (number) | null;
10891097
/**
10901098
* Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.
10911099
*/
@@ -1599,6 +1607,10 @@ export type EditMessageReqPayload = {
15991607
* No result message for when there are no chunks found above the score threshold.
16001608
*/
16011609
no_result_message?: (string) | null;
1610+
/**
1611+
* Number of messages to include in the context window. If not specified, this defaults to 10.
1612+
*/
1613+
number_of_messages_to_include?: (number) | null;
16021614
/**
16031615
* Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.
16041616
*/
@@ -3716,6 +3728,10 @@ export type RegenerateMessageReqPayload = {
37163728
* No result message for when there are no chunks found above the score threshold.
37173729
*/
37183730
no_result_message?: (string) | null;
3731+
/**
3732+
* Number of messages to include in the context window. If not specified, this defaults to 10.
3733+
*/
3734+
number_of_messages_to_include?: (number) | null;
37193735
/**
37203736
* Only include docs used is a boolean that indicates whether or not to only include the docs that were used in the completion. If true, the completion will only include the docs that were used in the completion. If false, the completion will include all of the docs.
37213737
*/

frontends/dashboard/src/components/CrawlingSettings.tsx

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export const defaultCrawlOptions: CrawlOptions = {
3434
} as ScrapeOptions,
3535
webhook_urls: [],
3636
add_chunks_to_dataset: true,
37+
tags: [],
3738
};
3839

3940
export type FlatCrawlOptions = Omit<CrawlOptions, "scrape_options"> & {
@@ -71,6 +72,7 @@ export const unflattenCrawlOptions = (
7172
},
7273
webhook_urls: options.webhook_urls,
7374
add_chunks_to_dataset: options.add_chunks_to_dataset,
75+
tags: options.tags,
7476
};
7577
} else if (options && options.type == "shopify") {
7678
return {
@@ -90,6 +92,7 @@ export const unflattenCrawlOptions = (
9092
},
9193
webhook_urls: options.webhook_urls,
9294
add_chunks_to_dataset: options.add_chunks_to_dataset,
95+
tags: options.tags,
9396
};
9497
} else if (options && options.type == "youtube") {
9598
return {
@@ -107,6 +110,7 @@ export const unflattenCrawlOptions = (
107110
},
108111
webhook_urls: options.webhook_urls,
109112
add_chunks_to_dataset: options.add_chunks_to_dataset,
113+
tags: options.tags,
110114
};
111115
}
112116
return {
@@ -122,6 +126,7 @@ export const unflattenCrawlOptions = (
122126
scrape_options: null,
123127
webhook_urls: options.webhook_urls,
124128
add_chunks_to_dataset: options.add_chunks_to_dataset,
129+
tags: options.tags,
125130
};
126131
};
127132

@@ -752,6 +757,25 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
752757
/>
753758
<Error error={errors.body_remove_strings} />
754759
</div>
760+
<div>
761+
<div class="flex items-center gap-2">
762+
<div>Crawl Tags</div>
763+
<Tooltip
764+
tooltipText="Tags to associate with all chunks created from this crawl. Useful for filtering and organizing crawled content."
765+
body={<FaRegularCircleQuestion class="h-3 w-3 text-black" />}
766+
/>
767+
</div>
768+
<MultiStringInput
769+
placeholder="documentation"
770+
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
771+
addLabel="Add Tag"
772+
onChange={(value) => {
773+
setOptions("tags", value);
774+
}}
775+
value={options.tags || []}
776+
/>
777+
<Error error={errors.tags} />
778+
</div>
755779
</div>
756780
<Spacer h={18} />
757781
<div class="mt-5 flex justify-start">

server/src/bin/crawl-worker.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,10 +177,19 @@ fn create_shopify_chunk_req_payload(
177177
product_title.clone()
178178
};
179179

180+
let mut tag_set = product.tags.clone();
181+
tag_set.extend(
182+
scrape_request
183+
.crawl_options
184+
.tags
185+
.clone()
186+
.unwrap_or_default(),
187+
);
188+
180189
Ok(ChunkReqPayload {
181190
chunk_html: Some(chunk_html),
182191
link: Some(link),
183-
tag_set: Some(product.tags.clone()),
192+
tag_set: Some(tag_set),
184193
num_value: variant.price.clone().unwrap_or_default().parse().ok(),
185194
metadata: serde_json::to_value(product.clone()).ok(),
186195
tracking_id: if group_variants {
@@ -282,6 +291,7 @@ async fn parse_chunks_with_firecrawl(
282291
let page_html = crawl_doc.html.clone().unwrap_or_default();
283292
let mut page_tags = get_tags(page_link.clone());
284293
page_tags.push(crawl_req.url.clone());
294+
page_tags.extend(crawl_req.crawl_options.tags.clone().unwrap_or_default());
285295

286296
if let Some(spec) = &spec {
287297
if let Some(ScrapeOptions::OpenApi(ref openapi_options)) =
@@ -815,7 +825,7 @@ async fn parse_youtube_chunks(
815825
video.id.video_id,
816826
transcript.start.as_secs()
817827
)),
818-
tag_set: None,
828+
tag_set: crawl_request.crawl_options.tags.clone(),
819829
metadata: Some(json!({
820830
"heading": video.snippet.title.clone(),
821831
"title": video.snippet.title.clone(),

server/src/data/models.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10030,6 +10030,8 @@ pub struct CrawlOptions {
1003010030
pub webhook_metadata: Option<serde_json::Value>,
1003110031
/// Add chunks to the dataset that the crawl is created for, defaults to true
1003210032
pub add_chunks_to_dataset: Option<bool>,
10033+
/// Tags to add to the crawl
10034+
pub tags: Option<Vec<String>>,
1003310035
}
1003410036

1003510037
#[derive(Serialize, Deserialize, Debug, ToSchema, Clone)]
@@ -10088,6 +10090,7 @@ impl CrawlOptions {
1008810090
.clone()
1008910091
.or(other.webhook_metadata.clone()),
1009010092
add_chunks_to_dataset: self.add_chunks_to_dataset.or(other.add_chunks_to_dataset),
10093+
tags: self.tags.clone().or(other.tags.clone()),
1009110094
}
1009210095
}
1009310096
}

0 commit comments

Comments
 (0)