@@ -14,12 +14,15 @@ use std::error::Error;
1414use std:: sync:: Arc ;
1515
1616use arrow_array:: builder:: {
17- BooleanBuilder , Float64Builder , Int32Builder , StringBuilder , TimestampMillisecondBuilder ,
17+ BooleanBuilder , Float64Builder , Int32Builder , StringDictionaryBuilder ,
18+ TimestampMillisecondBuilder ,
1819} ;
20+ use arrow_array:: cast:: AsArray ;
21+ use arrow_array:: types:: Int32Type ;
1922use arrow_array:: {
20- Array , ArrayRef , BooleanArray , Date32Array , Date64Array , Decimal128Array , Float32Array ,
21- Float64Array , Int8Array , Int16Array , Int32Array , Int64Array , LargeStringArray , RecordBatch ,
22- StringArray , Time32MillisecondArray , Time32SecondArray , Time64MicrosecondArray ,
23+ Array , ArrayAccessor , ArrayRef , BooleanArray , Date32Array , Date64Array , Decimal128Array ,
24+ Float32Array , Float64Array , Int8Array , Int16Array , Int32Array , Int64Array , LargeStringArray ,
25+ RecordBatch , StringArray , Time32MillisecondArray , Time32SecondArray , Time64MicrosecondArray ,
2326 Time64NanosecondArray , TimestampMicrosecondArray , TimestampMillisecondArray ,
2427 TimestampNanosecondArray , TimestampSecondArray , UInt8Array , UInt16Array , UInt32Array ,
2528 UInt64Array ,
@@ -36,12 +39,16 @@ use crate::config::{GroupRollupMode, Scalar, ViewConfig};
3639/// [`VirtualDataSlice`].
3740pub enum ColumnBuilder {
3841 Boolean ( BooleanBuilder ) ,
39- String ( StringBuilder ) ,
42+ String ( StringDictionaryBuilder < Int32Type > ) ,
4043 Float ( Float64Builder ) ,
4144 Integer ( Int32Builder ) ,
4245 Datetime ( TimestampMillisecondBuilder ) ,
4346}
4447
48+ fn dict_data_type ( ) -> DataType {
49+ DataType :: Dictionary ( Box :: new ( DataType :: Int32 ) , Box :: new ( DataType :: Utf8 ) )
50+ }
51+
4552/// A single cell value in a row-oriented data representation.
4653///
4754/// Used when converting [`VirtualDataSlice`] to row format for JSON
@@ -90,7 +97,7 @@ impl SetVirtualDataColumn for Option<String> {
9097 }
9198
9299 fn new_builder ( ) -> ColumnBuilder {
93- ColumnBuilder :: String ( StringBuilder :: new ( ) )
100+ ColumnBuilder :: String ( StringDictionaryBuilder :: new ( ) )
94101 }
95102
96103 fn to_scalar ( self ) -> Scalar {
@@ -277,6 +284,11 @@ fn extract_scalar(array: &ArrayRef, row_idx: usize) -> Scalar {
277284 let arr = array. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
278285 Scalar :: String ( arr. value ( row_idx) . to_string ( ) )
279286 } ,
287+ DataType :: Dictionary ( ..) => {
288+ let dict = array. as_dictionary :: < Int32Type > ( ) ;
289+ let values = dict. downcast_dict :: < StringArray > ( ) . unwrap ( ) ;
290+ Scalar :: String ( values. value ( row_idx) . to_string ( ) )
291+ } ,
280292 DataType :: Float64 => {
281293 let arr = array. as_any ( ) . downcast_ref :: < Float64Array > ( ) . unwrap ( ) ;
282294 Scalar :: Float ( arr. value ( row_idx) )
@@ -350,14 +362,26 @@ fn coerce_column(
350362 array : & ArrayRef ,
351363) -> Result < ( Field , ArrayRef ) , Box < dyn Error > > {
352364 match field. data_type ( ) {
353- DataType :: Boolean
354- | DataType :: Utf8
355- | DataType :: Float64
356- | DataType :: Int32
357- | DataType :: Date32 => Ok ( (
365+ DataType :: Boolean | DataType :: Float64 | DataType :: Int32 | DataType :: Date32 => Ok ( (
358366 Field :: new ( name, field. data_type ( ) . clone ( ) , true ) ,
359367 array. clone ( ) ,
360368 ) ) ,
369+ DataType :: Dictionary ( ..) => Ok ( ( Field :: new ( name, dict_data_type ( ) , true ) , array. clone ( ) ) ) ,
370+ DataType :: Utf8 => {
371+ let arr = array. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
372+ let mut builder = StringDictionaryBuilder :: < Int32Type > :: new ( ) ;
373+ for i in 0 ..arr. len ( ) {
374+ if arr. is_null ( i) {
375+ builder. append_null ( ) ;
376+ } else {
377+ builder. append_value ( arr. value ( i) ) ;
378+ }
379+ }
380+ Ok ( (
381+ Field :: new ( name, dict_data_type ( ) , true ) ,
382+ Arc :: new ( builder. finish ( ) ) as ArrayRef ,
383+ ) )
384+ } ,
361385 DataType :: Timestamp ( TimeUnit :: Millisecond , _) => Ok ( (
362386 Field :: new ( name, DataType :: Timestamp ( TimeUnit :: Millisecond , None ) , true ) ,
363387 array. clone ( ) ,
@@ -502,20 +526,27 @@ fn coerce_column(
502526 } ,
503527 DataType :: LargeUtf8 => {
504528 let arr = array. as_any ( ) . downcast_ref :: < LargeStringArray > ( ) . unwrap ( ) ;
505- let result: StringArray = arr. iter ( ) . map ( |v| v. map ( |v| v. to_string ( ) ) ) . collect ( ) ;
529+ let mut builder = StringDictionaryBuilder :: < Int32Type > :: new ( ) ;
530+ for i in 0 ..arr. len ( ) {
531+ if arr. is_null ( i) {
532+ builder. append_null ( ) ;
533+ } else {
534+ builder. append_value ( arr. value ( i) ) ;
535+ }
536+ }
506537 Ok ( (
507- Field :: new ( name, DataType :: Utf8 , true ) ,
508- Arc :: new ( result ) as ArrayRef ,
538+ Field :: new ( name, dict_data_type ( ) , true ) ,
539+ Arc :: new ( builder . finish ( ) ) as ArrayRef ,
509540 ) )
510541 } ,
511542 dt => {
512543 tracing:: warn!(
513- "Coercing unknown Arrow type {} to Utf8 for column '{}'" ,
544+ "Coercing unknown Arrow type {} to Dictionary for column '{}'" ,
514545 dt,
515546 name
516547 ) ;
517548 let num_rows = array. len ( ) ;
518- let mut builder = StringBuilder :: new ( ) ;
549+ let mut builder = StringDictionaryBuilder :: < Int32Type > :: new ( ) ;
519550 for i in 0 ..num_rows {
520551 if array. is_null ( i) {
521552 builder. append_null ( ) ;
@@ -525,7 +556,7 @@ fn coerce_column(
525556 }
526557 }
527558 Ok ( (
528- Field :: new ( name, DataType :: Utf8 , true ) ,
559+ Field :: new ( name, dict_data_type ( ) , true ) ,
529560 Arc :: new ( builder. finish ( ) ) as ArrayRef ,
530561 ) )
531562 } ,
@@ -667,9 +698,10 @@ impl VirtualDataSlice {
667698 Field :: new ( name, DataType :: Boolean , true ) ,
668699 Arc :: new ( b. finish ( ) ) ,
669700 ) ,
670- ColumnBuilder :: String ( b) => {
671- ( Field :: new ( name, DataType :: Utf8 , true ) , Arc :: new ( b. finish ( ) ) )
672- } ,
701+ ColumnBuilder :: String ( b) => (
702+ Field :: new ( name, dict_data_type ( ) , true ) ,
703+ Arc :: new ( b. finish ( ) ) ,
704+ ) ,
673705 ColumnBuilder :: Float ( b) => (
674706 Field :: new ( name, DataType :: Float64 , true ) ,
675707 Arc :: new ( b. finish ( ) ) ,
@@ -736,7 +768,9 @@ impl VirtualDataSlice {
736768 let cell = if col. is_null ( row_idx) {
737769 match field. data_type ( ) {
738770 DataType :: Boolean => VirtualDataCell :: Boolean ( None ) ,
739- DataType :: Utf8 => VirtualDataCell :: String ( None ) ,
771+ DataType :: Utf8 | DataType :: Dictionary ( ..) => {
772+ VirtualDataCell :: String ( None )
773+ } ,
740774 DataType :: Float64 => VirtualDataCell :: Float ( None ) ,
741775 DataType :: Int32 => VirtualDataCell :: Integer ( None ) ,
742776 DataType :: Timestamp ( TimeUnit :: Millisecond , _) => {
@@ -754,6 +788,11 @@ impl VirtualDataSlice {
754788 let arr = col. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
755789 VirtualDataCell :: String ( Some ( arr. value ( row_idx) . to_string ( ) ) )
756790 } ,
791+ DataType :: Dictionary ( ..) => {
792+ let dict = col. as_dictionary :: < Int32Type > ( ) ;
793+ let values = dict. downcast_dict :: < StringArray > ( ) . unwrap ( ) ;
794+ VirtualDataCell :: String ( Some ( values. value ( row_idx) . to_string ( ) ) )
795+ } ,
757796 DataType :: Float64 => {
758797 let arr = col. as_any ( ) . downcast_ref :: < Float64Array > ( ) . unwrap ( ) ;
759798 VirtualDataCell :: Float ( Some ( arr. value ( row_idx) ) )
@@ -832,6 +871,21 @@ impl VirtualDataSlice {
832871 . collect :: < Vec < _ > > ( ) ,
833872 ) ?
834873 } ,
874+ DataType :: Dictionary ( ..) => {
875+ let dict = col. as_dictionary :: < Int32Type > ( ) ;
876+ let values = dict. downcast_dict :: < StringArray > ( ) . unwrap ( ) ;
877+ serde_json:: to_value (
878+ ( 0 ..num_rows)
879+ . map ( |i| {
880+ if col. is_null ( i) {
881+ None
882+ } else {
883+ Some ( values. value ( i) )
884+ }
885+ } )
886+ . collect :: < Vec < _ > > ( ) ,
887+ ) ?
888+ } ,
835889 DataType :: Float64 => {
836890 let arr = col. as_any ( ) . downcast_ref :: < Float64Array > ( ) . unwrap ( ) ;
837891 serde_json:: to_value (
0 commit comments