Skip to content

Commit 3493ecb

Browse files
authored
Chunked dict values pullup (#8577)
1 parent 937c9d7 commit 3493ecb

2 files changed

Lines changed: 109 additions & 0 deletions

File tree

vortex-array/src/arrays/dict/compute/rules.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@ use crate::EqMode;
99
use crate::IntoArray;
1010
use crate::array::ArrayView;
1111
use crate::array::VTable;
12+
use crate::arrays::Chunked;
13+
use crate::arrays::ChunkedArray;
1214
use crate::arrays::Constant;
1315
use crate::arrays::ConstantArray;
1416
use crate::arrays::Dict;
1517
use crate::arrays::DictArray;
1618
use crate::arrays::ScalarFn;
1719
use crate::arrays::ScalarFnArray;
20+
use crate::arrays::chunked::ChunkedArrayExt;
1821
use crate::arrays::dict::DictArrayExt;
1922
use crate::arrays::dict::DictArraySlotsExt;
2023
use crate::arrays::filter::FilterReduceAdaptor;
@@ -37,11 +40,59 @@ pub(crate) const PARENT_RULES: ParentRuleSet<Dict> = ParentRuleSet::new(&[
3740
ParentRuleSet::lift(&CastReduceAdaptor(Dict)),
3841
ParentRuleSet::lift(&MaskReduceAdaptor(Dict)),
3942
ParentRuleSet::lift(&LikeReduceAdaptor(Dict)),
43+
ParentRuleSet::lift(&DictionaryChunkedValuesPullUpRule),
4044
ParentRuleSet::lift(&DictionaryScalarFnValuesPushDownRule),
4145
ParentRuleSet::lift(&DictionaryScalarFnCodesPullUpRule),
4246
ParentRuleSet::lift(&SliceReduceAdaptor(Dict)),
4347
]);
4448

49+
/// Pull a common dictionary values array above chunked dictionary codes.
50+
///
51+
/// Rewrites `Chunked<Dict<codes_i, values>>` into `Dict<Chunked<codes_i>, values>` only when
52+
/// every child dictionary shares the exact same values array allocation.
53+
#[derive(Debug)]
54+
struct DictionaryChunkedValuesPullUpRule;
55+
56+
impl ArrayParentReduceRule<Dict> for DictionaryChunkedValuesPullUpRule {
57+
type Parent = Chunked;
58+
59+
fn reduce_parent(
60+
&self,
61+
array: ArrayView<'_, Dict>,
62+
parent: ArrayView<'_, Chunked>,
63+
_child_idx: usize,
64+
) -> VortexResult<Option<ArrayRef>> {
65+
let values = array.values();
66+
let codes_dtype = array.codes().dtype().clone();
67+
let mut code_chunks = Vec::with_capacity(parent.nchunks());
68+
let mut all_values_referenced = array.has_all_values_referenced();
69+
70+
for chunk in parent.iter_chunks() {
71+
let Some(dict) = chunk.as_opt::<Dict>() else {
72+
return Ok(None);
73+
};
74+
if dict.codes().dtype() != &codes_dtype {
75+
return Ok(None);
76+
}
77+
if !ArrayRef::ptr_eq(dict.values(), values) {
78+
return Ok(None);
79+
}
80+
all_values_referenced |= dict.has_all_values_referenced();
81+
code_chunks.push(dict.codes().clone());
82+
}
83+
84+
let codes = ChunkedArray::try_new(code_chunks, codes_dtype)?.into_array();
85+
let dict = DictArray::try_new(codes, values.clone())?;
86+
let dict = if all_values_referenced {
87+
unsafe { dict.set_all_values_referenced(true) }
88+
} else {
89+
dict
90+
};
91+
92+
Ok(Some(dict.into_array()))
93+
}
94+
}
95+
4596
/// Push down a scalar function to run only over the values of a dictionary array.
4697
#[derive(Debug)]
4798
struct DictionaryScalarFnValuesPushDownRule;
@@ -214,16 +265,72 @@ mod tests {
214265
use vortex_buffer::buffer;
215266
use vortex_error::VortexResult;
216267

268+
use crate::ArrayRef;
217269
use crate::IntoArray;
218270
use crate::arrays::BoolArray;
271+
use crate::arrays::Chunked;
272+
use crate::arrays::ChunkedArray;
219273
use crate::arrays::Dict;
220274
use crate::arrays::DictArray;
275+
use crate::arrays::PrimitiveArray;
276+
use crate::arrays::chunked::ChunkedArrayExt;
221277
use crate::arrays::dict::DictArrayExt;
278+
use crate::arrays::dict::DictArraySlotsExt;
222279
use crate::arrays::scalar_fn::ScalarFnFactoryExt;
280+
use crate::assert_arrays_eq;
281+
use crate::executor::VortexSessionExecute;
223282
use crate::optimizer::ArrayOptimizer;
224283
use crate::scalar_fn::EmptyOptions;
225284
use crate::scalar_fn::fns::not::Not;
226285

286+
#[test]
287+
fn chunked_dict_with_shared_values_pulls_values_up() -> VortexResult<()> {
288+
let values = buffer![10u32, 20, 30].into_array();
289+
let chunk0 = DictArray::try_new(buffer![0u8, 1].into_array(), values.clone())?.into_array();
290+
let chunk1 =
291+
DictArray::try_new(buffer![2u8, 0, 1].into_array(), values.clone())?.into_array();
292+
let array =
293+
ChunkedArray::try_new(vec![chunk0, chunk1], values.dtype().clone())?.into_array();
294+
295+
let optimized = array.optimize()?;
296+
let dict = optimized.as_::<Dict>();
297+
let codes = dict.codes().as_::<Chunked>();
298+
299+
assert!(ArrayRef::ptr_eq(dict.values(), &values));
300+
assert_eq!(codes.nchunks(), 2);
301+
let mut ctx = crate::LEGACY_SESSION.create_execution_ctx();
302+
assert_arrays_eq!(
303+
optimized,
304+
PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]),
305+
&mut ctx
306+
);
307+
308+
Ok(())
309+
}
310+
311+
#[test]
312+
fn chunked_dict_with_distinct_values_stays_chunked() -> VortexResult<()> {
313+
let values0 = buffer![10u32, 20, 30].into_array();
314+
let values1 = buffer![10u32, 20, 30].into_array();
315+
let chunk0 =
316+
DictArray::try_new(buffer![0u8, 1].into_array(), values0.clone())?.into_array();
317+
let chunk1 = DictArray::try_new(buffer![2u8, 0, 1].into_array(), values1)?.into_array();
318+
let array =
319+
ChunkedArray::try_new(vec![chunk0, chunk1], values0.dtype().clone())?.into_array();
320+
321+
let optimized = array.optimize()?;
322+
323+
assert!(optimized.is::<Chunked>());
324+
let mut ctx = crate::LEGACY_SESSION.create_execution_ctx();
325+
assert_arrays_eq!(
326+
optimized,
327+
PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]),
328+
&mut ctx
329+
);
330+
331+
Ok(())
332+
}
333+
227334
#[test]
228335
fn scalar_fn_values_pushdown_preserves_all_values_referenced() -> VortexResult<()> {
229336
let dict = unsafe {

vortex-array/src/arrays/dict/vtable/kernel.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use vortex_session::VortexSession;
55

66
use crate::ArrayVTable;
7+
use crate::arrays::Chunked;
78
use crate::arrays::Dict;
89
use crate::arrays::dict::TakeExecuteAdaptor;
910
use crate::optimizer::kernels::ArrayKernelsExt;
@@ -16,6 +17,7 @@ use crate::scalar_fn::fns::fill_null::FillNullExecuteAdaptor;
1617
pub(crate) fn initialize(session: &VortexSession) {
1718
let kernels = session.kernels();
1819
kernels.register_execute_parent_kernel(Binary.id(), Dict, CompareExecuteAdaptor(Dict));
20+
kernels.register_execute_parent_kernel(Dict.id(), Chunked, TakeExecuteAdaptor(Chunked));
1921
kernels.register_execute_parent_kernel(Dict.id(), Dict, TakeExecuteAdaptor(Dict));
2022
kernels.register_execute_parent_kernel(FillNull.id(), Dict, FillNullExecuteAdaptor(Dict));
2123
}

0 commit comments

Comments
 (0)