@@ -1742,7 +1742,7 @@ mod tests {
17421742 use crate :: index:: vector:: VectorIndexParams ;
17431743 use crate :: utils:: test:: TestDatasetGenerator ;
17441744
1745- use arrow:: array:: { as_struct_array, AsArray } ;
1745+ use arrow:: array:: { as_struct_array, AsArray , GenericListBuilder , GenericStringBuilder } ;
17461746 use arrow:: compute:: concat_batches;
17471747 use arrow:: datatypes:: UInt64Type ;
17481748 use arrow_array:: {
@@ -5045,7 +5045,11 @@ mod tests {
50455045 assert_eq ! ( row_ids, & [ 0 ] ) ;
50465046 }
50475047
5048- async fn create_fts_dataset < Offset : arrow:: array:: OffsetSizeTrait > (
5048+ async fn create_fts_dataset <
5049+ Offset : arrow:: array:: OffsetSizeTrait ,
5050+ ListOffset : arrow:: array:: OffsetSizeTrait ,
5051+ > (
5052+ is_list : bool ,
50495053 with_position : bool ,
50505054 tokenizer : TokenizerConfig ,
50515055 ) -> Dataset {
@@ -5055,19 +5059,46 @@ mod tests {
50555059
50565060 let mut params = InvertedIndexParams :: default ( ) . with_position ( with_position) ;
50575061 params. tokenizer_config = tokenizer;
5058- let doc_col = GenericStringArray :: < Offset > :: from ( vec ! [
5059- "lance database the search" ,
5060- "lance database" ,
5061- "lance search" ,
5062- "database search" ,
5063- "unrelated doc" ,
5064- "unrelated" ,
5065- "mots accentués" ,
5066- ] ) ;
5062+ let doc_col: Arc < dyn Array > = if is_list {
5063+ let string_builder = GenericStringBuilder :: < Offset > :: new ( ) ;
5064+ let mut list_col = GenericListBuilder :: < ListOffset , _ > :: new ( string_builder) ;
5065+ // Build a list-of-strings column mirroring the plain-string documents, followed by one null row
5066+ list_col. values ( ) . append_value ( "lance database" ) ; // for testing phrase query
5067+ list_col. values ( ) . append_value ( "the" ) ;
5068+ list_col. values ( ) . append_value ( "search" ) ;
5069+ list_col. append ( true ) ;
5070+ list_col. values ( ) . append_value ( "lance database" ) ; // for testing phrase query
5071+ list_col. append ( true ) ;
5072+ list_col. values ( ) . append_value ( "lance" ) ;
5073+ list_col. values ( ) . append_value ( "search" ) ;
5074+ list_col. append ( true ) ;
5075+ list_col. values ( ) . append_value ( "database" ) ;
5076+ list_col. values ( ) . append_value ( "search" ) ;
5077+ list_col. append ( true ) ;
5078+ list_col. values ( ) . append_value ( "unrelated doc" ) ;
5079+ list_col. append ( true ) ;
5080+ list_col. values ( ) . append_value ( "unrelated" ) ;
5081+ list_col. append ( true ) ;
5082+ list_col. values ( ) . append_value ( "mots" ) ;
5083+ list_col. values ( ) . append_value ( "accentués" ) ;
5084+ list_col. append ( true ) ;
5085+ list_col. append ( false ) ;
5086+ Arc :: new ( list_col. finish ( ) )
5087+ } else {
5088+ Arc :: new ( GenericStringArray :: < Offset > :: from ( vec ! [
5089+ "lance database the search" ,
5090+ "lance database" ,
5091+ "lance search" ,
5092+ "database search" ,
5093+ "unrelated doc" ,
5094+ "unrelated" ,
5095+ "mots accentués" ,
5096+ ] ) )
5097+ } ;
50675098 let ids = UInt64Array :: from_iter_values ( 0 ..doc_col. len ( ) as u64 ) ;
50685099 let batch = RecordBatch :: try_new (
50695100 arrow_schema:: Schema :: new ( vec ! [
5070- arrow_schema:: Field :: new( "doc" , doc_col. data_type( ) . to_owned( ) , false ) ,
5101+ arrow_schema:: Field :: new( "doc" , doc_col. data_type( ) . to_owned( ) , true ) ,
50715102 arrow_schema:: Field :: new( "id" , DataType :: UInt64 , false ) ,
50725103 ] )
50735104 . into ( ) ,
@@ -5086,8 +5117,15 @@ mod tests {
50865117 dataset
50875118 }
50885119
5089- async fn test_fts_index < Offset : arrow:: array:: OffsetSizeTrait > ( ) {
5090- let ds = create_fts_dataset :: < Offset > ( false , TokenizerConfig :: default ( ) ) . await ;
5120+ async fn test_fts_index <
5121+ Offset : arrow:: array:: OffsetSizeTrait ,
5122+ ListOffset : arrow:: array:: OffsetSizeTrait ,
5123+ > (
5124+ is_list : bool ,
5125+ ) {
5126+ let ds =
5127+ create_fts_dataset :: < Offset , ListOffset > ( is_list, false , TokenizerConfig :: default ( ) )
5128+ . await ;
50915129 let result = ds
50925130 . scan ( )
50935131 . project ( & [ "id" ] )
@@ -5152,7 +5190,9 @@ mod tests {
51525190 assert ! ( err. contains( "position is not found but required for phrase queries, try recreating the index with position" ) , "{}" , err) ;
51535191
51545192 // recreate the index with position
5155- let ds = create_fts_dataset :: < Offset > ( true , TokenizerConfig :: default ( ) ) . await ;
5193+ let ds =
5194+ create_fts_dataset :: < Offset , ListOffset > ( is_list, true , TokenizerConfig :: default ( ) )
5195+ . await ;
51565196 let result = ds
51575197 . scan ( )
51585198 . project ( & [ "id" ] )
@@ -5235,17 +5275,21 @@ mod tests {
52355275
52365276 #[ tokio:: test]
52375277 async fn test_fts_index_with_string ( ) { // runs the shared FTS checks with i32 string offsets
5238- test_fts_index :: < i32 > ( ) . await ;
5278+ test_fts_index :: < i32 , i32 > ( false ) . await ; // plain string column (is_list = false; ListOffset is not used on this path)
5279+ test_fts_index :: < i32 , i32 > ( true ) . await ; // list-of-strings column built with i32 list offsets
5280+ test_fts_index :: < i32 , i64 > ( true ) . await ; // list-of-strings column built with i64 list offsets
52395281 }
52405282
52415283 #[ tokio:: test]
52425284 async fn test_fts_index_with_large_string ( ) { // runs the shared FTS checks with i64 string offsets
5243- test_fts_index :: < i64 > ( ) . await ;
5285+ test_fts_index :: < i64 , i32 > ( false ) . await ; // plain string column (is_list = false; ListOffset is not used on this path)
5286+ test_fts_index :: < i64 , i32 > ( true ) . await ; // list-of-strings column built with i32 list offsets
5287+ test_fts_index :: < i64 , i64 > ( true ) . await ; // list-of-strings column built with i64 list offsets
52445288 }
52455289
52465290 #[ tokio:: test]
52475291 async fn test_fts_accented_chars ( ) {
5248- let ds = create_fts_dataset :: < i32 > ( false , TokenizerConfig :: default ( ) ) . await ;
5292+ let ds = create_fts_dataset :: < i32 , i32 > ( false , false , TokenizerConfig :: default ( ) ) . await ;
52495293 let result = ds
52505294 . scan ( )
52515295 . project ( & [ "id" ] )
@@ -5269,8 +5313,12 @@ mod tests {
52695313 assert_eq ! ( result. num_rows( ) , 0 ) ;
52705314
52715315 // with ascii folding enabled, the search should be accent-insensitive
5272- let ds =
5273- create_fts_dataset :: < i32 > ( false , TokenizerConfig :: default ( ) . ascii_folding ( true ) ) . await ;
5316+ let ds = create_fts_dataset :: < i32 , i32 > (
5317+ false ,
5318+ false ,
5319+ TokenizerConfig :: default ( ) . ascii_folding ( true ) ,
5320+ )
5321+ . await ;
52745322 let result = ds
52755323 . scan ( )
52765324 . project ( & [ "id" ] )
0 commit comments