|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +use vortex::array::ExecutionCtx; |
| 5 | +use vortex::array::IntoArray; |
| 6 | +use vortex::array::arrays::ChunkedArray; |
| 7 | +use vortex::array::arrays::chunked::ChunkedArrayExt; |
| 8 | +use vortex::error::VortexResult; |
| 9 | +use vortex::error::vortex_ensure; |
| 10 | + |
| 11 | +use crate::duckdb::VectorRef; |
| 12 | +use crate::exporter::ColumnExporter; |
| 13 | +use crate::exporter::ConversionCache; |
| 14 | +use crate::exporter::canonical; |
| 15 | +use crate::exporter::new_array_exporter; |
| 16 | + |
| 17 | +struct ChunkedExporter { |
| 18 | + chunk_offsets: Vec<usize>, |
| 19 | + chunks: Vec<Box<dyn ColumnExporter>>, |
| 20 | +} |
| 21 | + |
| 22 | +pub(crate) fn new_exporter_with_flatten( |
| 23 | + array: ChunkedArray, |
| 24 | + cache: &ConversionCache, |
| 25 | + ctx: &mut ExecutionCtx, |
| 26 | + flatten: bool, |
| 27 | +) -> VortexResult<Box<dyn ColumnExporter>> { |
| 28 | + if flatten { |
| 29 | + return canonical::new_exporter(array.into_array(), cache, ctx); |
| 30 | + } |
| 31 | + |
| 32 | + let chunk_offsets = array.chunk_offsets().to_vec(); |
| 33 | + let chunks = array |
| 34 | + .chunks() |
| 35 | + .iter() |
| 36 | + .map(|chunk| new_array_exporter(chunk.clone(), cache, ctx)) |
| 37 | + .collect::<VortexResult<Vec<_>>>()?; |
| 38 | + |
| 39 | + Ok(Box::new(ChunkedExporter { |
| 40 | + chunk_offsets, |
| 41 | + chunks, |
| 42 | + })) |
| 43 | +} |
| 44 | + |
| 45 | +impl ChunkedExporter { |
| 46 | + fn chunk_index(&self, offset: usize) -> usize { |
| 47 | + self.chunk_offsets |
| 48 | + .partition_point(|&chunk_offset| chunk_offset <= offset) |
| 49 | + .saturating_sub(1) |
| 50 | + } |
| 51 | +} |
| 52 | + |
| 53 | +impl ColumnExporter for ChunkedExporter { |
| 54 | + fn preferred_batch_len(&self, offset: usize, max_len: usize) -> usize { |
| 55 | + if max_len == 0 || self.chunks.is_empty() { |
| 56 | + return 0; |
| 57 | + } |
| 58 | + |
| 59 | + let chunk_idx = self.chunk_index(offset); |
| 60 | + let chunk_start = self.chunk_offsets[chunk_idx]; |
| 61 | + let chunk_end = self.chunk_offsets[chunk_idx + 1]; |
| 62 | + let len = (chunk_end - offset).min(max_len); |
| 63 | + self.chunks[chunk_idx].preferred_batch_len(offset - chunk_start, len) |
| 64 | + } |
| 65 | + |
| 66 | + fn export( |
| 67 | + &self, |
| 68 | + offset: usize, |
| 69 | + len: usize, |
| 70 | + vector: &mut VectorRef, |
| 71 | + ctx: &mut ExecutionCtx, |
| 72 | + ) -> VortexResult<()> { |
| 73 | + if len == 0 { |
| 74 | + return Ok(()); |
| 75 | + } |
| 76 | + |
| 77 | + let chunk_idx = self.chunk_index(offset); |
| 78 | + let chunk_start = self.chunk_offsets[chunk_idx]; |
| 79 | + let chunk_end = self.chunk_offsets[chunk_idx + 1]; |
| 80 | + let offset_in_chunk = offset - chunk_start; |
| 81 | + vortex_ensure!( |
| 82 | + offset + len <= chunk_end, |
| 83 | + "chunked DuckDB export range {offset}..{} crosses chunk boundary at {chunk_end}", |
| 84 | + offset + len |
| 85 | + ); |
| 86 | + |
| 87 | + self.chunks[chunk_idx].export(offset_in_chunk, len, vector, ctx) |
| 88 | + } |
| 89 | +} |
| 90 | + |
#[cfg(test)]
mod tests {
    use vortex::array::IntoArray;
    use vortex::array::VortexSessionExecute;
    use vortex::array::arrays::ChunkedArray;
    use vortex::array::arrays::DictArray;
    use vortex::array::arrays::StructArray;
    use vortex::array::arrays::VarBinViewArray;
    use vortex::buffer::buffer;
    use vortex::error::VortexResult;

    use crate::SESSION;
    use crate::duckdb::DataChunk;
    use crate::duckdb::LogicalType;
    use crate::exporter::ArrayExporter;
    use crate::exporter::ConversionCache;

    /// Exports a struct with one chunked, dictionary-encoded string field and
    /// checks that each export call yields exactly one source chunk (2 rows,
    /// then 3 rows) before the exporter reports that it is exhausted.
    #[test]
    fn chunked_exporter_emits_chunk_aligned_vectors() -> VortexResult<()> {
        let values0 = VarBinViewArray::from_iter_str(["a", "b"]).into_array();
        let chunk0 = DictArray::try_new(buffer![0u8, 1].into_array(), values0)?.into_array();
        let dtype = chunk0.dtype().clone();

        let values1 = VarBinViewArray::from_iter_str(["c", "d", "e"]).into_array();
        let chunk1 = DictArray::try_new(buffer![0u8, 1, 2].into_array(), values1)?.into_array();

        let field = ChunkedArray::try_new(vec![chunk0, chunk1], dtype)?.into_array();
        let array = StructArray::from_fields(&[("field", field)])?;
        let mut exporter = ArrayExporter::try_new(
            &array,
            &ConversionCache::default(),
            SESSION.create_execution_ctx(),
        )?;
        let mut chunk = DataChunk::new([LogicalType::varchar()]);

        // Expected textual dump of the data chunk after each successful export, in order.
        let expected_batches = [
            "Chunk - [1 Columns]\n- DICTIONARY VARCHAR: 2 = [ a, b]\n",
            "Chunk - [1 Columns]\n- DICTIONARY VARCHAR: 3 = [ c, d, e]\n",
        ];
        for expected in expected_batches {
            assert!(exporter.export(&mut chunk, None, None)?);
            // `String::try_from` already yields a `String`; no `format!` round-trip needed.
            assert_eq!(String::try_from(&*chunk)?, expected);
        }

        // Both source chunks are consumed, so the exporter reports no more data.
        assert!(!exporter.export(&mut chunk, None, None)?);
        Ok(())
    }
}
0 commit comments