feat: support to create FTS index on list of strings (#3622)

BubbleCal · web-flow · commit 9b0c274e6284 · 2025-04-06T12:59:01.000+08:00
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -1752,11 +1752,16 @@ def create_scalar_index(
             if not pa.types.is_string(field_type):
                 raise TypeError(f"NGRAM index column {column} must be a string")
         elif index_type in ["INVERTED", "FTS"]:
-            if not pa.types.is_string(field_type) and not pa.types.is_large_string(
-                field_type
+            # For list columns, validate the element type rather than the list type.
+            value_type = field_type
+            if pa.types.is_list(field_type) or pa.types.is_large_list(field_type):
+                value_type = field_type.value_type
+            if not pa.types.is_string(value_type) and not pa.types.is_large_string(
+                value_type
             ):
                 raise TypeError(
-                    f"INVERTED index column {column} must be string or large string"
+                    f"INVERTED index column {column} must be string, large string"
+                    f" or list of strings, but got {value_type}"
                 )
 
         if pa.types.is_duration(field_type):
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
@@ -371,6 +371,28 @@ def test_indexed_filter_with_fts_index(tmp_path):
     assert results["_rowid"].to_pylist() == [2, 3]
 
 
+def test_fts_on_list(tmp_path):
+    """FTS index on a list-of-strings column answers term and phrase queries."""
+    rows = [
+        ["lance database", "the", "search"],
+        ["lance database"],
+        ["lance", "search"],
+        ["database", "search"],
+        ["unrelated", "doc"],
+    ]
+    dataset = lance.write_dataset(pa.table({"text": rows}), tmp_path)
+    dataset.create_scalar_index("text", "INVERTED", with_position=True)
+
+    # The first three rows each hold an element containing the token "lance".
+    term_hits = dataset.to_table(full_text_query="lance")
+    assert term_hits.num_rows == 3
+
+    # Only the first two rows contain the phrase "lance database".
+    phrase = PhraseQuery("lance database", "text")
+    phrase_hits = dataset.to_table(full_text_query=phrase)
+    assert phrase_hits.num_rows == 2
+
+
 def test_fts_fuzzy_query(tmp_path):
     data = pa.table(
         {
diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs
@@ -12,8 +12,8 @@ use crate::scalar::{IndexReader, IndexStore, IndexWriter, InvertedIndexParams};
 use crate::vector::graph::OrderedFloat;
 use arrow::array::{ArrayBuilder, AsArray, Int32Builder, StringBuilder};
 use arrow::datatypes;
-use arrow_array::{Int32Array, RecordBatch, StringArray};
-use arrow_schema::SchemaRef;
+use arrow_array::{Array, Int32Array, RecordBatch, StringArray, UInt64Array};
+use arrow_schema::{Field, Schema, SchemaRef};
 use crossbeam_queue::ArrayQueue;
 use datafusion::execution::SendableRecordBatchStream;
 use deepsize::DeepSizeOf;
@@ -22,10 +22,11 @@ use itertools::Itertools;
 use lance_arrow::iter_str_array;
 use lance_core::cache::FileMetadataCache;
 use lance_core::utils::tokio::{get_num_compute_intensive_cpus, CPU_RUNTIME};
-use lance_core::{Result, ROW_ID};
+use lance_core::{Error, Result, ROW_ID, ROW_ID_FIELD};
 use lance_io::object_store::ObjectStore;
 use lazy_static::lazy_static;
 use object_store::path::Path;
+use snafu::location;
 use tempfile::{tempdir, TempDir};
 use tracing::instrument;
 
@@ -108,6 +109,23 @@ impl InvertedIndexBuilder {
 
     #[instrument(level = "debug", skip_all)]
     async fn update_index(&mut self, stream: SendableRecordBatchStream) -> Result<()> {
+        let flatten_stream = stream.map(|batch| {
+            let batch = batch?;
+            let doc_col = batch.column(0);
+            match doc_col.data_type() {
+                datatypes::DataType::Utf8 | datatypes::DataType::LargeUtf8 => Ok(batch),
+                datatypes::DataType::List(_)   => {
+                    flatten_string_list::<i32>(&batch, doc_col)
+                }
+                datatypes::DataType::LargeList(_) => {
+                    flatten_string_list::<i64>(&batch, doc_col)
+                }
+                _ => {
+                   Err(Error::Index { message: format!("expect data type String, LargeString or List of String/LargeString, but got {}", doc_col.data_type()), location: location!() })
+                }
+            }
+        });
+
         let num_shards = *LANCE_FTS_NUM_SHARDS;
 
         // init the token maps
@@ -159,13 +177,15 @@ impl InvertedIndexBuilder {
         for _ in 0..num_shards {
             let _ = tokenizer_pool.push(tokenizer.clone());
         }
-        let mut stream = stream
+        let mut stream = flatten_stream
             .map(move |batch| {
                 let senders = senders.clone();
                 let tokenizer_pool = tokenizer_pool.clone();
                 CPU_RUNTIME.spawn_blocking(move || {
                     let batch = batch?;
-                    let doc_iter = iter_str_array(batch.column(0));
+
+                    let doc_col = batch.column(0);
+                    let doc_iter = iter_str_array(doc_col);
                     let row_id_col = batch[ROW_ID].as_primitive::<datatypes::UInt64Type>();
                     let docs = doc_iter
                         .zip(row_id_col.values().iter())
@@ -721,3 +741,83 @@ pub fn inverted_list_schema(with_position: bool) -> SchemaRef {
     }
     Arc::new(arrow_schema::Schema::new(fields))
 }
+
+/// Flattens a batch whose first column is a list of strings into one row per string.
+///
+/// Every string element becomes its own document row, paired with the row id of the
+/// list that contained it. Null and empty lists contribute no rows.
+///
+/// The child values are gathered through the list offsets instead of taking
+/// `values()` wholesale: the child array is not trimmed when the list array is
+/// sliced, and a null list slot may still span child values, so the raw child
+/// array can hold more strings than there are repeated row ids.
+///
+/// # Errors
+///
+/// Returns an index error if the list element type is neither `Utf8` nor
+/// `LargeUtf8`, and propagates any Arrow error raised while building the output.
+fn flatten_string_list<Offset: arrow::array::OffsetSizeTrait>(
+    batch: &RecordBatch,
+    doc_col: &Arc<dyn Array>,
+) -> Result<RecordBatch> {
+    let lists = doc_col.as_list::<Offset>();
+    let value_type = lists.value_type();
+    if !matches!(
+        value_type,
+        datatypes::DataType::Utf8 | datatypes::DataType::LargeUtf8
+    ) {
+        return Err(Error::Index {
+            message: format!(
+                "expect data type String or LargeString but got {}",
+                value_type
+            ),
+            location: location!(),
+        });
+    }
+
+    let row_ids = batch[ROW_ID].as_primitive::<datatypes::UInt64Type>();
+    let values = lists.values();
+
+    // Repeat each row id once per string it owns and record which child ranges to
+    // keep. Adjacent ranges are merged, so the usual compact layout collapses to a
+    // single zero-copy slice.
+    let mut flat_row_ids = Vec::new();
+    let mut ranges: Vec<(usize, usize)> = Vec::new();
+    let spans = lists.value_offsets().windows(2);
+    for (idx, (row_id, span)) in row_ids.values().iter().zip(spans).enumerate() {
+        let (start, end) = (span[0].as_usize(), span[1].as_usize());
+        if lists.is_null(idx) || start == end {
+            continue;
+        }
+        flat_row_ids.extend(std::iter::repeat_n(*row_id, end - start));
+        match ranges.last_mut() {
+            Some(last) if last.1 == start => last.1 = end,
+            _ => ranges.push((start, end)),
+        }
+    }
+
+    let docs = match ranges.as_slice() {
+        [] => values.slice(0, 0),
+        [(start, end)] => values.slice(*start, end - start),
+        _ => {
+            let slices: Vec<_> = ranges
+                .iter()
+                .map(|(start, end)| values.slice(*start, end - start))
+                .collect();
+            let slice_refs: Vec<&dyn Array> = slices.iter().map(|s| s.as_ref()).collect();
+            arrow::compute::concat(&slice_refs)?
+        }
+    };
+    let row_ids: Arc<dyn Array> = Arc::new(UInt64Array::from(flat_row_ids));
+
+    let schema = Schema::new(vec![
+        Field::new(
+            batch.schema().field(0).name(),
+            docs.data_type().clone(),
+            true,
+        ),
+        ROW_ID_FIELD.clone(),
+    ]);
+    let batch = RecordBatch::try_new(Arc::new(schema), vec![docs, row_ids])?;
+    Ok(batch)
+}
diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs
@@ -1742,7 +1742,7 @@ mod tests {
     use crate::index::vector::VectorIndexParams;
     use crate::utils::test::TestDatasetGenerator;
 
-    use arrow::array::{as_struct_array, AsArray};
+    use arrow::array::{as_struct_array, AsArray, GenericListBuilder, GenericStringBuilder};
     use arrow::compute::concat_batches;
     use arrow::datatypes::UInt64Type;
     use arrow_array::{
@@ -5045,7 +5045,11 @@ mod tests {
         assert_eq!(row_ids, &[0]);
     }
 
-    async fn create_fts_dataset<Offset: arrow::array::OffsetSizeTrait>(
+    async fn create_fts_dataset<
+        Offset: arrow::array::OffsetSizeTrait,
+        ListOffset: arrow::array::OffsetSizeTrait,
+    >(
+        is_list: bool,
         with_position: bool,
         tokenizer: TokenizerConfig,
     ) -> Dataset {
@@ -5055,19 +5059,46 @@ mod tests {
 
         let mut params = InvertedIndexParams::default().with_position(with_position);
         params.tokenizer_config = tokenizer;
-        let doc_col = GenericStringArray::<Offset>::from(vec![
-            "lance database the search",
-            "lance database",
-            "lance search",
-            "database search",
-            "unrelated doc",
-            "unrelated",
-            "mots accentués",
-        ]);
+        let doc_col: Arc<dyn Array> = if is_list {
+            let string_builder = GenericStringBuilder::<Offset>::new();
+            let mut list_col = GenericListBuilder::<ListOffset, _>::new(string_builder);
+            // Create a list of strings
+            list_col.values().append_value("lance database"); // for testing phrase query
+            list_col.values().append_value("the");
+            list_col.values().append_value("search");
+            list_col.append(true);
+            list_col.values().append_value("lance database"); // for testing phrase query
+            list_col.append(true);
+            list_col.values().append_value("lance");
+            list_col.values().append_value("search");
+            list_col.append(true);
+            list_col.values().append_value("database");
+            list_col.values().append_value("search");
+            list_col.append(true);
+            list_col.values().append_value("unrelated doc");
+            list_col.append(true);
+            list_col.values().append_value("unrelated");
+            list_col.append(true);
+            list_col.values().append_value("mots");
+            list_col.values().append_value("accentués");
+            list_col.append(true);
+            list_col.append(false);
+            Arc::new(list_col.finish())
+        } else {
+            Arc::new(GenericStringArray::<Offset>::from(vec![
+                "lance database the search",
+                "lance database",
+                "lance search",
+                "database search",
+                "unrelated doc",
+                "unrelated",
+                "mots accentués",
+            ]))
+        };
         let ids = UInt64Array::from_iter_values(0..doc_col.len() as u64);
         let batch = RecordBatch::try_new(
             arrow_schema::Schema::new(vec![
-                arrow_schema::Field::new("doc", doc_col.data_type().to_owned(), false),
+                arrow_schema::Field::new("doc", doc_col.data_type().to_owned(), true),
                 arrow_schema::Field::new("id", DataType::UInt64, false),
             ])
             .into(),
@@ -5086,8 +5117,15 @@ mod tests {
         dataset
     }
 
-    async fn test_fts_index<Offset: arrow::array::OffsetSizeTrait>() {
-        let ds = create_fts_dataset::<Offset>(false, TokenizerConfig::default()).await;
+    async fn test_fts_index<
+        Offset: arrow::array::OffsetSizeTrait,
+        ListOffset: arrow::array::OffsetSizeTrait,
+    >(
+        is_list: bool,
+    ) {
+        let ds =
+            create_fts_dataset::<Offset, ListOffset>(is_list, false, TokenizerConfig::default())
+                .await;
         let result = ds
             .scan()
             .project(&["id"])
@@ -5152,7 +5190,9 @@ mod tests {
         assert!(err.contains("position is not found but required for phrase queries, try recreating the index with position"),"{}",err);
 
         // recreate the index with position
-        let ds = create_fts_dataset::<Offset>(true, TokenizerConfig::default()).await;
+        let ds =
+            create_fts_dataset::<Offset, ListOffset>(is_list, true, TokenizerConfig::default())
+                .await;
         let result = ds
             .scan()
             .project(&["id"])
@@ -5235,17 +5275,21 @@ mod tests {
 
     #[tokio::test]
     async fn test_fts_index_with_string() {
-        test_fts_index::<i32>().await;
+        test_fts_index::<i32, i32>(false).await;
+        test_fts_index::<i32, i32>(true).await;
+        test_fts_index::<i32, i64>(true).await;
     }
 
     #[tokio::test]
     async fn test_fts_index_with_large_string() {
-        test_fts_index::<i64>().await;
+        test_fts_index::<i64, i32>(false).await;
+        test_fts_index::<i64, i32>(true).await;
+        test_fts_index::<i64, i64>(true).await;
     }
 
     #[tokio::test]
     async fn test_fts_accented_chars() {
-        let ds = create_fts_dataset::<i32>(false, TokenizerConfig::default()).await;
+        let ds = create_fts_dataset::<i32, i32>(false, false, TokenizerConfig::default()).await;
         let result = ds
             .scan()
             .project(&["id"])
@@ -5269,8 +5313,12 @@ mod tests {
         assert_eq!(result.num_rows(), 0);
 
         // with ascii folding enabled, the search should be accent-insensitive
-        let ds =
-            create_fts_dataset::<i32>(false, TokenizerConfig::default().ascii_folding(true)).await;
+        let ds = create_fts_dataset::<i32, i32>(
+            false,
+            false,
+            TokenizerConfig::default().ascii_folding(true),
+        )
+        .await;
         let result = ds
             .scan()
             .project(&["id"])
diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs
@@ -1577,13 +1577,22 @@ impl Scanner {
         let query = if columns.is_empty() {
             // the field is not specified,
             // try to search over all indexed fields
-            let string_columns = self.dataset.schema().fields.iter().filter_map(|f| {
-                if f.data_type() == DataType::Utf8 || f.data_type() == DataType::LargeUtf8 {
-                    Some(&f.name)
-                } else {
-                    None
-                }
-            });
+            let string_columns =
+                self.dataset
+                    .schema()
+                    .fields
+                    .iter()
+                    .filter_map(|f| match f.data_type() {
+                        DataType::Utf8 | DataType::LargeUtf8 => Some(&f.name),
+                        DataType::List(field) | DataType::LargeList(field) => {
+                            if matches!(field.data_type(), DataType::Utf8 | DataType::LargeUtf8) {
+                                Some(&f.name)
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    });
 
             let mut indexed_columns = Vec::new();
             for column in string_columns {