@@ -28,7 +28,6 @@ use vortex_array::arrays::VarBinView;
2828use vortex_array:: dtype:: FieldPath ;
2929use vortex_array:: session:: ArrayRegistry ;
3030use vortex_array:: session:: ArraySession ;
31- use vortex_btrblocks:: BtrBlocksCompressor ;
3231use vortex_btrblocks:: BtrBlocksCompressorBuilder ;
3332use vortex_btrblocks:: SchemeExt ;
3433use vortex_btrblocks:: schemes:: integer:: IntDictScheme ;
@@ -59,14 +58,6 @@ use vortex_sequence::Sequence;
5958use vortex_sparse:: Sparse ;
6059use vortex_utils:: aliases:: hash_map:: HashMap ;
6160use vortex_zigzag:: ZigZag ;
62-
63- #[ rustfmt:: skip]
64- #[ cfg( feature = "zstd" ) ]
65- use vortex_btrblocks:: {
66- schemes:: float,
67- schemes:: integer,
68- schemes:: string,
69- } ;
7061#[ cfg( feature = "zstd" ) ]
7162use vortex_zstd:: Zstd ;
7263#[ cfg( all( feature = "zstd" , feature = "unstable_encodings" ) ) ]
@@ -123,13 +114,24 @@ pub static ALLOWED_ENCODINGS: LazyLock<ArrayRegistry> = LazyLock::new(|| {
123114 session. registry ( ) . clone ( )
124115} ) ;
125116
126- /// Build a new [writer strategy][LayoutStrategy] to compress and reorganize chunks of a Vortex file.
117+ /// Compressor choice recorded on [`WriteStrategyBuilder`] and resolved when it is built.
118+ enum CompressorConfig {
119+ /// A [`BtrBlocksCompressorBuilder`] that [`WriteStrategyBuilder::build`] finalizes twice:
120+ /// as-is for the stats compressor, and with `IntDictScheme` excluded for the data compressor,
121+ /// because dictionary encoding has already been applied to the data by an earlier step.
122+ BtrBlocks ( BtrBlocksCompressorBuilder ) ,
123+ /// An opaque compressor shared unchanged by the data and stats compression steps.
124+ Opaque ( Arc < dyn CompressorPlugin > ) ,
125+ }
126+
127+ /// Build a new [writer strategy](LayoutStrategy) to compress and reorganize chunks of a Vortex
128+ /// file.
127129///
128130/// Vortex provides an out-of-the-box file writer that optimizes the layout of chunks on-disk,
129131/// repartitioning and compressing them to strike a balance between size on-disk,
130132/// bulk decoding performance, and IOPS required to perform an indexed read.
131133pub struct WriteStrategyBuilder {
132- compressor_override : Option < Arc < dyn CompressorPlugin > > ,
134+ compressor : CompressorConfig ,
133135 row_block_size : usize ,
134136 field_writers : HashMap < FieldPath , Arc < dyn LayoutStrategy > > ,
135137 allow_encodings : Option < ArrayRegistry > ,
@@ -141,7 +143,7 @@ impl Default for WriteStrategyBuilder {
141143 /// and then finally built yielding the [`LayoutStrategy`].
142144 fn default ( ) -> Self {
143145 Self {
144- compressor_override : None ,
146+ compressor : CompressorConfig :: BtrBlocks ( BtrBlocksCompressorBuilder :: default ( ) ) ,
145147 row_block_size : 8192 ,
146148 field_writers : HashMap :: new ( ) ,
147149 allow_encodings : Some ( ALLOWED_ENCODINGS . clone ( ) ) ,
@@ -183,97 +185,20 @@ impl WriteStrategyBuilder {
183185 self
184186 }
185187
186- /// Override the [compressor](CompressorPlugin) used for compressing chunks in the file.
187- ///
188- /// If not provided, this will use a BtrBlocks-style cascading compressor that tries to balance
189- /// total size with decoding performance.
190- ///
191- /// # Panics
192- ///
193- /// Panics if a compressor has already been set via
194- /// [`with_compressor`](Self::with_compressor),
195- /// [`with_cuda_compatible_encodings`](Self::with_cuda_compatible_encodings), or
196- /// [`with_compact_encodings`](Self::with_compact_encodings).
197- ///
198- /// These methods are mutually exclusive.
199- pub fn with_compressor < C : CompressorPlugin > ( mut self , compressor : C ) -> Self {
200- assert ! (
201- self . compressor_override. is_none( ) ,
202- "A compressor has already been configured. `with_compressor`, \
203- `with_cuda_compatible_encodings`, and `with_compact_encodings` are mutually exclusive."
204- ) ;
205- self . compressor_override = Some ( Arc :: new ( compressor) ) ;
206- self
207- }
208-
209- /// Configure a write strategy that emits only CUDA-compatible encodings.
210- ///
211- /// This method simply exists as a wrapper around [`with_compressor`].
212- ///
213- /// This configures BtrBlocks to exclude schemes without CUDA kernel support.
214- /// With the `unstable_encodings` feature, strings use buffer-level Zstd compression
215- /// (`ZstdBuffersArray`) which preserves the array buffer layout for zero-conversion
216- /// GPU decompression. Without it, strings use interleaved Zstd compression.
217- ///
218- /// # Panics
219- ///
220- /// Panics if a compressor has already been set. See [`with_compressor`]
188+ /// Override the default [`BtrBlocksCompressorBuilder`] used for compression.
221189 ///
222- /// [`with_compressor`]: Self::with_compressor.
223- #[ cfg( feature = "zstd" ) ]
224- pub fn with_cuda_compatible_encodings ( mut self ) -> Self {
225- assert ! (
226- self . compressor_override. is_none( ) ,
227- "A compressor has already been configured. `with_compressor`, \
228- `with_cuda_compatible_encodings`, and `with_compact_encodings` are mutually exclusive."
229- ) ;
230-
231- let mut builder = BtrBlocksCompressorBuilder :: default ( ) . exclude ( [
232- integer:: SparseScheme . id ( ) ,
233- integer:: RLE_INTEGER_SCHEME . id ( ) ,
234- float:: RLE_FLOAT_SCHEME . id ( ) ,
235- float:: NullDominatedSparseScheme . id ( ) ,
236- string:: StringDictScheme . id ( ) ,
237- string:: FSSTScheme . id ( ) ,
238- ] ) ;
239-
240- #[ cfg( feature = "unstable_encodings" ) ]
241- {
242- builder = builder. with_new_scheme ( & string:: ZstdBuffersScheme ) ;
243- }
244- #[ cfg( not( feature = "unstable_encodings" ) ) ]
245- {
246- builder = builder. with_new_scheme ( & string:: ZstdScheme ) ;
247- }
248-
249- self . compressor_override = Some ( Arc :: new ( builder. build ( ) ) ) ;
190+ /// Replaces any previously configured compressor. [`build`](Self::build) finalizes the builder
191+ /// twice: once for data (with `IntDictScheme` excluded) and once, unmodified, for stats.
192+ pub fn with_btrblocks_builder ( mut self , builder : BtrBlocksCompressorBuilder ) -> Self {
193+ self . compressor = CompressorConfig :: BtrBlocks ( builder) ;
250194 self
251195 }
252196
253- /// Configure a write strategy that uses compact encodings (Pco for numerics, Zstd for
254- /// strings/binary).
255- ///
256- /// This method simply exists as a wrapper around [`with_compressor`].
257- ///
258- /// This provides better compression ratios than the default BtrBlocks strategy,
259- /// especially for floating-point heavy datasets.
197+ /// Set the compressor to an opaque [`CompressorPlugin`].
260198 ///
261- /// # Panics
262- ///
263- /// Panics if a compressor has already been set. See [`with_compressor`]
264- ///
265- /// [`with_compressor`]: Self::with_compressor.
266- #[ cfg( feature = "zstd" ) ]
267- pub fn with_compact_encodings ( mut self ) -> Self {
268- assert ! (
269- self . compressor_override. is_none( ) ,
270- "A compressor has already been configured. `with_compressor`, \
271- `with_cuda_compatible_encodings`, and `with_compact_encodings` are mutually exclusive."
272- ) ;
273-
274- self . compressor_override = Some ( Arc :: new (
275- BtrBlocksCompressorBuilder :: default ( ) . with_compact ( ) . build ( ) ,
276- ) ) ;
199+ /// It replaces any previously configured compressor and is used as-is for data and stats.
200+ pub fn with_compressor < C : CompressorPlugin > ( mut self , compressor : C ) -> Self {
201+ self . compressor = CompressorConfig :: Opaque ( Arc :: new ( compressor) ) ;
277202 self
278203 }
279204
@@ -294,19 +219,18 @@ impl WriteStrategyBuilder {
294219 let buffered = BufferedStrategy :: new ( chunked, 2 * ONE_MEG ) ; // 2MB
295220
296221 // 5. compress each chunk.
297- // Exclude IntDictScheme from the default compressor because DictStrategy (step 3) already
222+ // Exclude IntDictScheme from the data compressor because DictStrategy (step 3) already
298223 // dictionary-encodes columns. Allowing IntDictScheme here would redundantly
299224 // dictionary-encode the integer codes produced by that earlier step.
300- let data_compressor: Arc < dyn CompressorPlugin > =
301- if let Some ( ref compressor) = self . compressor_override {
302- compressor. clone ( )
303- } else {
304- Arc :: new (
305- BtrBlocksCompressorBuilder :: default ( )
306- . exclude ( [ IntDictScheme . id ( ) ] )
307- . build ( ) ,
308- )
309- } ;
225+ let data_compressor: Arc < dyn CompressorPlugin > = match & self . compressor {
226+ CompressorConfig :: BtrBlocks ( builder) => Arc :: new (
227+ builder
228+ . clone ( )
229+ . exclude_schemes ( [ IntDictScheme . id ( ) ] )
230+ . build ( ) ,
231+ ) ,
232+ CompressorConfig :: Opaque ( compressor) => compressor. clone ( ) ,
233+ } ;
310234 let compressing = CompressingStrategy :: new ( buffered, data_compressor) ;
311235
312236 // 4. prior to compression, coalesce up to a minimum size
@@ -327,12 +251,10 @@ impl WriteStrategyBuilder {
327251 ) ;
328252
329253 // 2.1. | 3.1. compress stats tables and dict values.
330- let stats_compressor: Arc < dyn CompressorPlugin > =
331- if let Some ( ref compressor) = self . compressor_override {
332- compressor. clone ( )
333- } else {
334- Arc :: new ( BtrBlocksCompressor :: default ( ) )
335- } ;
254+ let stats_compressor: Arc < dyn CompressorPlugin > = match self . compressor {
255+ CompressorConfig :: BtrBlocks ( builder) => Arc :: new ( builder. build ( ) ) ,
256+ CompressorConfig :: Opaque ( compressor) => compressor,
257+ } ;
336258 let compress_then_flat = CompressingStrategy :: new ( flat, stats_compressor) ;
337259
338260 // 3. apply dict encoding or fallback
0 commit comments