Skip to content

Commit 0ad3689

Browse files
committed
compress the array tree segments
1 parent a63ae1b commit 0ad3689

1 file changed

Lines changed: 24 additions & 20 deletions

File tree

vortex-file/src/strategy.rs

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,27 @@ impl WriteStrategyBuilder {
243243
Arc::new(FlatLayoutStrategy::default())
244244
};
245245

246+
// Data compressor: excludes IntDictScheme because DictStrategy (step 3 below) already
247+
// dictionary-encodes columns; allowing it here would redundantly dictionary-encode the
248+
// integer codes that DictStrategy has already produced.
249+
let data_compressor: Arc<dyn CompressorPlugin> = match &self.compressor {
250+
CompressorConfig::BtrBlocks(builder) => Arc::new(
251+
builder
252+
.clone()
253+
.exclude_schemes([IntDictScheme.id()])
254+
.build(),
255+
),
256+
CompressorConfig::Opaque(compressor) => Arc::clone(compressor),
257+
};
258+
// Stats compressor: used for zone-map tables, dict values, and array-tree consolidated
259+
// segments.
260+
let stats_compressor: Arc<dyn CompressorPlugin> = match &self.compressor {
261+
CompressorConfig::BtrBlocks(builder) => Arc::new(builder.clone().build()),
262+
CompressorConfig::Opaque(compressor) => Arc::clone(compressor),
263+
};
264+
let compress_then_flat = CompressingStrategy::new(Arc::clone(&flat), stats_compressor);
265+
let compress_then_flat_arc: Arc<dyn LayoutStrategy> = Arc::new(compress_then_flat.clone());
266+
246267
// Build the data pipeline leaf. Array-tree outlining requires both opt-in via
247268
// `with_array_tree(true)` AND no custom flat strategy (the user's strategy owns the
248269
// leaf format in that case).
@@ -256,7 +277,9 @@ impl WriteStrategyBuilder {
256277
} else {
257278
FlatLayoutStrategy::default()
258279
};
259-
let (collector, leaf) = writer::writer(data_flat, Arc::clone(&flat));
280+
// Use the compressed flat strategy for the consolidated array-trees segment — the
281+
// struct of (segment_id, compact_tree) dict-encodes and compresses well.
282+
let (collector, leaf) = writer::writer(data_flat, Arc::clone(&compress_then_flat_arc));
260283
(Arc::new(leaf), Some(collector))
261284
};
262285

@@ -266,18 +289,6 @@ impl WriteStrategyBuilder {
266289
let buffered = BufferedStrategy::new(chunked, 2 * ONE_MEG); // 2MB
267290

268291
// 5. compress each chunk.
269-
// Exclude IntDictScheme from the data compressor because DictStrategy (step 3) already
270-
// dictionary-encodes columns. Allowing IntDictScheme here would redundantly
271-
// dictionary-encode the integer codes produced by that earlier step.
272-
let data_compressor: Arc<dyn CompressorPlugin> = match &self.compressor {
273-
CompressorConfig::BtrBlocks(builder) => Arc::new(
274-
builder
275-
.clone()
276-
.exclude_schemes([IntDictScheme.id()])
277-
.build(),
278-
),
279-
CompressorConfig::Opaque(compressor) => Arc::clone(compressor),
280-
};
281292
let compressing = CompressingStrategy::new(buffered, data_compressor);
282293

283294
// 4. prior to compression, coalesce up to a minimum size
@@ -297,13 +308,6 @@ impl WriteStrategyBuilder {
297308
},
298309
);
299310

300-
// 2.1. | 3.1. compress stats tables and dict values.
301-
let stats_compressor: Arc<dyn CompressorPlugin> = match self.compressor {
302-
CompressorConfig::BtrBlocks(builder) => Arc::new(builder.build()),
303-
CompressorConfig::Opaque(compressor) => compressor,
304-
};
305-
let compress_then_flat = CompressingStrategy::new(flat, stats_compressor);
306-
307311
// 3. apply dict encoding or fallback
308312
let dict = DictStrategy::new(
309313
coalescing.clone(),

0 commit comments

Comments
 (0)