@@ -9,12 +9,15 @@ use crate::EqMode;
99use crate :: IntoArray ;
1010use crate :: array:: ArrayView ;
1111use crate :: array:: VTable ;
12+ use crate :: arrays:: Chunked ;
13+ use crate :: arrays:: ChunkedArray ;
1214use crate :: arrays:: Constant ;
1315use crate :: arrays:: ConstantArray ;
1416use crate :: arrays:: Dict ;
1517use crate :: arrays:: DictArray ;
1618use crate :: arrays:: ScalarFn ;
1719use crate :: arrays:: ScalarFnArray ;
20+ use crate :: arrays:: chunked:: ChunkedArrayExt ;
1821use crate :: arrays:: dict:: DictArrayExt ;
1922use crate :: arrays:: dict:: DictArraySlotsExt ;
2023use crate :: arrays:: filter:: FilterReduceAdaptor ;
@@ -37,11 +40,59 @@ pub(crate) const PARENT_RULES: ParentRuleSet<Dict> = ParentRuleSet::new(&[
3740 ParentRuleSet :: lift ( & CastReduceAdaptor ( Dict ) ) ,
3841 ParentRuleSet :: lift ( & MaskReduceAdaptor ( Dict ) ) ,
3942 ParentRuleSet :: lift ( & LikeReduceAdaptor ( Dict ) ) ,
43+ ParentRuleSet :: lift ( & DictionaryChunkedValuesPullUpRule ) ,
4044 ParentRuleSet :: lift ( & DictionaryScalarFnValuesPushDownRule ) ,
4145 ParentRuleSet :: lift ( & DictionaryScalarFnCodesPullUpRule ) ,
4246 ParentRuleSet :: lift ( & SliceReduceAdaptor ( Dict ) ) ,
4347] ) ;
4448
49+ /// Pull a common dictionary values array above chunked dictionary codes.
50+ ///
51+ /// Rewrites `Chunked<Dict<codes_i, values>>` into `Dict<Chunked<codes_i>, values>` only when
52+ /// every child dictionary shares the exact same values array allocation.
53+ #[ derive( Debug ) ]
54+ struct DictionaryChunkedValuesPullUpRule ;
55+
56+ impl ArrayParentReduceRule < Dict > for DictionaryChunkedValuesPullUpRule {
57+ type Parent = Chunked ;
58+
59+ fn reduce_parent (
60+ & self ,
61+ array : ArrayView < ' _ , Dict > ,
62+ parent : ArrayView < ' _ , Chunked > ,
63+ _child_idx : usize ,
64+ ) -> VortexResult < Option < ArrayRef > > {
65+ let values = array. values ( ) ;
66+ let codes_dtype = array. codes ( ) . dtype ( ) . clone ( ) ;
67+ let mut code_chunks = Vec :: with_capacity ( parent. nchunks ( ) ) ;
68+ let mut all_values_referenced = array. has_all_values_referenced ( ) ;
69+
70+ for chunk in parent. iter_chunks ( ) {
71+ let Some ( dict) = chunk. as_opt :: < Dict > ( ) else {
72+ return Ok ( None ) ;
73+ } ;
74+ if dict. codes ( ) . dtype ( ) != & codes_dtype {
75+ return Ok ( None ) ;
76+ }
77+ if !ArrayRef :: ptr_eq ( dict. values ( ) , values) {
78+ return Ok ( None ) ;
79+ }
80+ all_values_referenced |= dict. has_all_values_referenced ( ) ;
81+ code_chunks. push ( dict. codes ( ) . clone ( ) ) ;
82+ }
83+
84+ let codes = ChunkedArray :: try_new ( code_chunks, codes_dtype) ?. into_array ( ) ;
85+ let dict = DictArray :: try_new ( codes, values. clone ( ) ) ?;
86+ let dict = if all_values_referenced {
87+ unsafe { dict. set_all_values_referenced ( true ) }
88+ } else {
89+ dict
90+ } ;
91+
92+ Ok ( Some ( dict. into_array ( ) ) )
93+ }
94+ }
95+
4596/// Push down a scalar function to run only over the values of a dictionary array.
4697#[ derive( Debug ) ]
4798struct DictionaryScalarFnValuesPushDownRule ;
@@ -214,16 +265,72 @@ mod tests {
214265 use vortex_buffer:: buffer;
215266 use vortex_error:: VortexResult ;
216267
268+ use crate :: ArrayRef ;
217269 use crate :: IntoArray ;
218270 use crate :: arrays:: BoolArray ;
271+ use crate :: arrays:: Chunked ;
272+ use crate :: arrays:: ChunkedArray ;
219273 use crate :: arrays:: Dict ;
220274 use crate :: arrays:: DictArray ;
275+ use crate :: arrays:: PrimitiveArray ;
276+ use crate :: arrays:: chunked:: ChunkedArrayExt ;
221277 use crate :: arrays:: dict:: DictArrayExt ;
278+ use crate :: arrays:: dict:: DictArraySlotsExt ;
222279 use crate :: arrays:: scalar_fn:: ScalarFnFactoryExt ;
280+ use crate :: assert_arrays_eq;
281+ use crate :: executor:: VortexSessionExecute ;
223282 use crate :: optimizer:: ArrayOptimizer ;
224283 use crate :: scalar_fn:: EmptyOptions ;
225284 use crate :: scalar_fn:: fns:: not:: Not ;
226285
286+ #[ test]
287+ fn chunked_dict_with_shared_values_pulls_values_up ( ) -> VortexResult < ( ) > {
288+ let values = buffer ! [ 10u32 , 20 , 30 ] . into_array ( ) ;
289+ let chunk0 = DictArray :: try_new ( buffer ! [ 0u8 , 1 ] . into_array ( ) , values. clone ( ) ) ?. into_array ( ) ;
290+ let chunk1 =
291+ DictArray :: try_new ( buffer ! [ 2u8 , 0 , 1 ] . into_array ( ) , values. clone ( ) ) ?. into_array ( ) ;
292+ let array =
293+ ChunkedArray :: try_new ( vec ! [ chunk0, chunk1] , values. dtype ( ) . clone ( ) ) ?. into_array ( ) ;
294+
295+ let optimized = array. optimize ( ) ?;
296+ let dict = optimized. as_ :: < Dict > ( ) ;
297+ let codes = dict. codes ( ) . as_ :: < Chunked > ( ) ;
298+
299+ assert ! ( ArrayRef :: ptr_eq( dict. values( ) , & values) ) ;
300+ assert_eq ! ( codes. nchunks( ) , 2 ) ;
301+ let mut ctx = crate :: LEGACY_SESSION . create_execution_ctx ( ) ;
302+ assert_arrays_eq ! (
303+ optimized,
304+ PrimitiveArray :: from_iter( [ 10u32 , 20 , 30 , 10 , 20 ] ) ,
305+ & mut ctx
306+ ) ;
307+
308+ Ok ( ( ) )
309+ }
310+
311+ #[ test]
312+ fn chunked_dict_with_distinct_values_stays_chunked ( ) -> VortexResult < ( ) > {
313+ let values0 = buffer ! [ 10u32 , 20 , 30 ] . into_array ( ) ;
314+ let values1 = buffer ! [ 10u32 , 20 , 30 ] . into_array ( ) ;
315+ let chunk0 =
316+ DictArray :: try_new ( buffer ! [ 0u8 , 1 ] . into_array ( ) , values0. clone ( ) ) ?. into_array ( ) ;
317+ let chunk1 = DictArray :: try_new ( buffer ! [ 2u8 , 0 , 1 ] . into_array ( ) , values1) ?. into_array ( ) ;
318+ let array =
319+ ChunkedArray :: try_new ( vec ! [ chunk0, chunk1] , values0. dtype ( ) . clone ( ) ) ?. into_array ( ) ;
320+
321+ let optimized = array. optimize ( ) ?;
322+
323+ assert ! ( optimized. is:: <Chunked >( ) ) ;
324+ let mut ctx = crate :: LEGACY_SESSION . create_execution_ctx ( ) ;
325+ assert_arrays_eq ! (
326+ optimized,
327+ PrimitiveArray :: from_iter( [ 10u32 , 20 , 30 , 10 , 20 ] ) ,
328+ & mut ctx
329+ ) ;
330+
331+ Ok ( ( ) )
332+ }
333+
227334 #[ test]
228335 fn scalar_fn_values_pushdown_preserves_all_values_referenced ( ) -> VortexResult < ( ) > {
229336 let dict = unsafe {
0 commit comments