Skip to content

Commit 2e23bbb

Browse files
skeptrunedev authored and cdxker committed
feat: add fulltext_content field to ChunkReqPayload and related structures
1 parent 8ffbc2f commit 2e23bbb

8 files changed

Lines changed: 24 additions & 9 deletions

File tree

server/src/bin/backfill-qdrant-from-pg.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ async fn main() -> Result<(), ServiceError> {
248248
let upload_message = ChunkReqPayload {
249249
chunk_html: chunk.chunk_html.clone(),
250250
semantic_content: None,
251+
fulltext_content: None,
251252
link: chunk.link.clone(),
252253
tag_set: chunk.tag_set.clone().map(|tag_set| {
253254
tag_set.split(',').map(|tag| tag.to_string()).collect()

server/src/bin/crawl-worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,7 @@ async fn parse_youtube_chunks(
809809
let create_chunk_data = ChunkReqPayload {
810810
chunk_html: Some(transcript.text),
811811
semantic_content: None,
812+
fulltext_content: None,
812813
link: Some(format!(
813814
"https://www.youtube.com/watch?v={}&t={}",
814815
video.id.video_id,

server/src/bin/csv-jsonl-worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ fn convert_value_to_chunkreqpayload(
625625
let mut chunk_req_payload = ChunkReqPayload {
626626
chunk_html,
627627
semantic_content: None,
628+
fulltext_content: None,
628629
link: None,
629630
tag_set: None,
630631
num_value: None,

server/src/bin/file-worker.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ async fn upload_file(
543543
let create_chunk_data = ChunkReqPayload {
544544
chunk_html: Some(page.content.clone()),
545545
semantic_content: None,
546+
fulltext_content: None,
546547
link: file_worker_message.upload_file_data.link.clone(),
547548
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
548549
metadata,
@@ -658,6 +659,7 @@ async fn upload_file(
658659
let chunk = ChunkReqPayload {
659660
chunk_html: Some(html_content.clone()),
660661
semantic_content: None,
662+
fulltext_content: None,
661663
link: file_worker_message.upload_file_data.link.clone(),
662664
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
663665
metadata: file_worker_message.upload_file_data.metadata.clone(),
@@ -706,6 +708,7 @@ async fn upload_file(
706708
.map(|(i, chunk_html)| ChunkReqPayload {
707709
chunk_html: Some(chunk_html),
708710
semantic_content: None,
711+
fulltext_content: None,
709712
link: file_worker_message.upload_file_data.link.clone(),
710713
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
711714
metadata: file_worker_message.upload_file_data.metadata.clone(),

server/src/bin/ingestion-worker.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,12 @@ pub async fn bulk_upload_chunks(
480480
ChunkData {
481481
chunk_metadata,
482482
content: content.clone(),
483-
embedding_content: message.chunk.semantic_content.clone().unwrap_or(content),
483+
embedding_content: message
484+
.chunk
485+
.semantic_content
486+
.clone()
487+
.unwrap_or(content.clone()),
488+
fulltext_content: message.chunk.fulltext_content.clone().unwrap_or(content),
484489
group_ids: Some(deduped_group_ids),
485490
upsert_by_tracking_id: message.upsert_by_tracking_id,
486491
fulltext_boost: message
@@ -600,12 +605,12 @@ pub async fn bulk_upload_chunks(
600605
false => vec![None; embedding_content_and_boosts.len()],
601606
};
602607

603-
let content_and_boosts: Vec<(String, Option<FullTextBoost>, Option<SemanticBoost>)> =
608+
let fulltext_content_and_boosts: Vec<(String, Option<FullTextBoost>, Option<SemanticBoost>)> =
604609
ingestion_data
605610
.iter()
606611
.map(|data| {
607612
(
608-
data.content.clone(),
613+
data.fulltext_content.clone(),
609614
data.fulltext_boost.clone(),
610615
data.semantic_boost.clone(),
611616
)
@@ -615,10 +620,10 @@ pub async fn bulk_upload_chunks(
615620
let splade_vectors = if dataset_config.FULLTEXT_ENABLED {
616621
log::info!(
617622
"Creating sparse vectors for {} chunks",
618-
content_and_boosts.len()
623+
fulltext_content_and_boosts.len()
619624
);
620625
match get_sparse_vectors(
621-
content_and_boosts
626+
fulltext_content_and_boosts
622627
.iter()
623628
.map(|(content, boost, _)| (content.clone(), boost.clone()))
624629
.collect(),
@@ -641,7 +646,7 @@ pub async fn bulk_upload_chunks(
641646
}
642647
}
643648
} else {
644-
let content_size = content_and_boosts.len();
649+
let content_size = fulltext_content_and_boosts.len();
645650

646651
Ok(std::iter::repeat(vec![(0, 0.0)])
647652
.take(content_size)
@@ -652,7 +657,7 @@ pub async fn bulk_upload_chunks(
652657
&& std::env::var("BM25_ACTIVE").unwrap_or("false".to_string()) == "true"
653658
{
654659
get_bm25_embeddings(
655-
content_and_boosts
660+
fulltext_content_and_boosts
656661
.iter()
657662
.map(|(content, boost, _)| (content.clone(), boost.clone()))
658663
.collect(),
@@ -664,7 +669,7 @@ pub async fn bulk_upload_chunks(
664669
.map(Some)
665670
.collect()
666671
} else {
667-
vec![None; content_and_boosts.len()]
672+
vec![None; fulltext_content_and_boosts.len()]
668673
};
669674

670675
let qdrant_points = tokio_stream::iter(izip!(

server/src/data/models.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6789,6 +6789,7 @@ pub struct ChunkData {
67896789
pub chunk_metadata: ChunkMetadata,
67906790
pub content: String,
67916791
pub embedding_content: String,
6792+
pub fulltext_content: String,
67926793
pub group_ids: Option<Vec<uuid::Uuid>>,
67936794
pub upsert_by_tracking_id: bool,
67946795
pub fulltext_boost: Option<FullTextBoost>,

server/src/handlers/chunk_handler.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,10 @@ pub struct ScoringOptions {
106106
pub struct ChunkReqPayload {
107107
/// HTML content of the chunk. This can also be plaintext. The innerText of the HTML will be used to create the embedding vector. The point of using HTML is for convienience, as some users have applications where users submit HTML content.
108108
pub chunk_html: Option<String>,
109-
/// If semantic_content is present, it will be used for creating semantic embeddings instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and always used for fulltext functionality. `chunk_html` must still be present for the chunk to be created properly.
109+
/// If semantic_content is present, it will be used for creating semantic embeddings instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and used for fulltext functionality unless the corresponding `fulltext_content` field is defined. `chunk_html` must still be present for the chunk to be created properly.
110110
pub semantic_content: Option<String>,
111+
/// If fulltext_content is present, it will be used for creating the fulltext and bm25 sparse vectors instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and used for semantic functionality unless the corresponding `semantic_content` field is defined. `chunk_html` must still be present for the chunk to be created properly.
112+
pub fulltext_content: Option<String>,
111113
/// Link to the chunk. This can also be any string. Frequently, this is a link to the source of the chunk. The link value will not affect the embedding creation.
112114
pub link: Option<String>,
113115
/// Tag set is a list of tags. This can be used to filter chunks by tag. Unlike with metadata filtering, HNSW indices will exist for each tag such that there is not a performance hit for filtering on them.

server/src/operators/chunk_operator.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,7 @@ pub async fn bulk_insert_chunk_metadata_query(
758758
chunk_metadata,
759759
content: chunk_data.content,
760760
embedding_content: chunk_data.embedding_content,
761+
fulltext_content: chunk_data.fulltext_content,
761762
group_ids: chunk_data.group_ids,
762763
upsert_by_tracking_id: chunk_data.upsert_by_tracking_id,
763764
fulltext_boost: chunk_data.fulltext_boost,

0 commit comments

Comments
 (0)