Skip to content

Commit 2e23bbb

Browse files
skeptrunedev authored and cdxker committed
feat: add fulltext_content field to ChunkReqPayload and related structures
1 parent 8ffbc2f commit 2e23bbb

8 files changed

Lines changed: 24 additions & 9 deletions

File tree

server/src/bin/backfill-qdrant-from-pg.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ async fn main() -> Result<(), ServiceError> {
248248
let upload_message = ChunkReqPayload {
249249
chunk_html: chunk.chunk_html.clone(),
250250
semantic_content: None,
251+
fulltext_content: None,
251252
link: chunk.link.clone(),
252253
tag_set: chunk.tag_set.clone().map(|tag_set| {
253254
tag_set.split(',').map(|tag| tag.to_string()).collect()

server/src/bin/crawl-worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,7 @@ async fn parse_youtube_chunks(
809809
let create_chunk_data = ChunkReqPayload {
810810
chunk_html: Some(transcript.text),
811811
semantic_content: None,
812+
fulltext_content: None,
812813
link: Some(format!(
813814
"https://www.youtube.com/watch?v={}&t={}",
814815
video.id.video_id,

server/src/bin/csv-jsonl-worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ fn convert_value_to_chunkreqpayload(
625625
let mut chunk_req_payload = ChunkReqPayload {
626626
chunk_html,
627627
semantic_content: None,
628+
fulltext_content: None,
628629
link: None,
629630
tag_set: None,
630631
num_value: None,

server/src/bin/file-worker.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ async fn upload_file(
543543
let create_chunk_data = ChunkReqPayload {
544544
chunk_html: Some(page.content.clone()),
545545
semantic_content: None,
546+
fulltext_content: None,
546547
link: file_worker_message.upload_file_data.link.clone(),
547548
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
548549
metadata,
@@ -658,6 +659,7 @@ async fn upload_file(
658659
let chunk = ChunkReqPayload {
659660
chunk_html: Some(html_content.clone()),
660661
semantic_content: None,
662+
fulltext_content: None,
661663
link: file_worker_message.upload_file_data.link.clone(),
662664
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
663665
metadata: file_worker_message.upload_file_data.metadata.clone(),
@@ -706,6 +708,7 @@ async fn upload_file(
706708
.map(|(i, chunk_html)| ChunkReqPayload {
707709
chunk_html: Some(chunk_html),
708710
semantic_content: None,
711+
fulltext_content: None,
709712
link: file_worker_message.upload_file_data.link.clone(),
710713
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
711714
metadata: file_worker_message.upload_file_data.metadata.clone(),

server/src/bin/ingestion-worker.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,12 @@ pub async fn bulk_upload_chunks(
480480
ChunkData {
481481
chunk_metadata,
482482
content: content.clone(),
483-
embedding_content: message.chunk.semantic_content.clone().unwrap_or(content),
483+
embedding_content: message
484+
.chunk
485+
.semantic_content
486+
.clone()
487+
.unwrap_or(content.clone()),
488+
fulltext_content: message.chunk.fulltext_content.clone().unwrap_or(content),
484489
group_ids: Some(deduped_group_ids),
485490
upsert_by_tracking_id: message.upsert_by_tracking_id,
486491
fulltext_boost: message
@@ -600,12 +605,12 @@ pub async fn bulk_upload_chunks(
600605
false => vec![None; embedding_content_and_boosts.len()],
601606
};
602607

603-
let content_and_boosts: Vec<(String, Option<FullTextBoost>, Option<SemanticBoost>)> =
608+
let fulltext_content_and_boosts: Vec<(String, Option<FullTextBoost>, Option<SemanticBoost>)> =
604609
ingestion_data
605610
.iter()
606611
.map(|data| {
607612
(
608-
data.content.clone(),
613+
data.fulltext_content.clone(),
609614
data.fulltext_boost.clone(),
610615
data.semantic_boost.clone(),
611616
)
@@ -615,10 +620,10 @@ pub async fn bulk_upload_chunks(
615620
let splade_vectors = if dataset_config.FULLTEXT_ENABLED {
616621
log::info!(
617622
"Creating sparse vectors for {} chunks",
618-
content_and_boosts.len()
623+
fulltext_content_and_boosts.len()
619624
);
620625
match get_sparse_vectors(
621-
content_and_boosts
626+
fulltext_content_and_boosts
622627
.iter()
623628
.map(|(content, boost, _)| (content.clone(), boost.clone()))
624629
.collect(),
@@ -641,7 +646,7 @@ pub async fn bulk_upload_chunks(
641646
}
642647
}
643648
} else {
644-
let content_size = content_and_boosts.len();
649+
let content_size = fulltext_content_and_boosts.len();
645650

646651
Ok(std::iter::repeat(vec![(0, 0.0)])
647652
.take(content_size)
@@ -652,7 +657,7 @@ pub async fn bulk_upload_chunks(
652657
&& std::env::var("BM25_ACTIVE").unwrap_or("false".to_string()) == "true"
653658
{
654659
get_bm25_embeddings(
655-
content_and_boosts
660+
fulltext_content_and_boosts
656661
.iter()
657662
.map(|(content, boost, _)| (content.clone(), boost.clone()))
658663
.collect(),
@@ -664,7 +669,7 @@ pub async fn bulk_upload_chunks(
664669
.map(Some)
665670
.collect()
666671
} else {
667-
vec![None; content_and_boosts.len()]
672+
vec![None; fulltext_content_and_boosts.len()]
668673
};
669674

670675
let qdrant_points = tokio_stream::iter(izip!(

server/src/data/models.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6789,6 +6789,7 @@ pub struct ChunkData {
67896789
pub chunk_metadata: ChunkMetadata,
67906790
pub content: String,
67916791
pub embedding_content: String,
6792+
pub fulltext_content: String,
67926793
pub group_ids: Option<Vec<uuid::Uuid>>,
67936794
pub upsert_by_tracking_id: bool,
67946795
pub fulltext_boost: Option<FullTextBoost>,

server/src/handlers/chunk_handler.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,10 @@ pub struct ScoringOptions {
106106
pub struct ChunkReqPayload {
107107
/// HTML content of the chunk. This can also be plaintext. The innerText of the HTML will be used to create the embedding vector. The point of using HTML is for convienience, as some users have applications where users submit HTML content.
108108
pub chunk_html: Option<String>,
109-
/// If semantic_content is present, it will be used for creating semantic embeddings instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and always used for fulltext functionality. `chunk_html` must still be present for the chunk to be created properly.
109+
/// If semantic_content is present, it will be used for creating semantic embeddings instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and used for fulltext functionality unless the corresponding `fulltext_content` field is defined. `chunk_html` must still be present for the chunk to be created properly.
110110
pub semantic_content: Option<String>,
111+
/// If fulltext_content is present, it will be used for creating the fulltext and bm25 sparse vectors instead of the innerText `chunk_html`. `chunk_html` will still be the only thing stored and used for semantic functionality unless the corresponding `semantic_content` field is defined. `chunk_html` must still be present for the chunk to be created properly.
112+
pub fulltext_content: Option<String>,
111113
/// Link to the chunk. This can also be any string. Frequently, this is a link to the source of the chunk. The link value will not affect the embedding creation.
112114
pub link: Option<String>,
113115
/// Tag set is a list of tags. This can be used to filter chunks by tag. Unlike with metadata filtering, HNSW indices will exist for each tag such that there is not a performance hit for filtering on them.

server/src/operators/chunk_operator.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,7 @@ pub async fn bulk_insert_chunk_metadata_query(
758758
chunk_metadata,
759759
content: chunk_data.content,
760760
embedding_content: chunk_data.embedding_content,
761+
fulltext_content: chunk_data.fulltext_content,
761762
group_ids: chunk_data.group_ids,
762763
upsert_by_tracking_id: chunk_data.upsert_by_tracking_id,
763764
fulltext_boost: chunk_data.fulltext_boost,

0 commit comments

Comments
 (0)