@@ -273,7 +273,7 @@ pub fn estimate_compression_ratio_with_sampling<S: Scheme + ?Sized>(
273273 sample ( array, SAMPLE_SIZE , sample_count)
274274 } ;
275275
276- let mut sample_data = ArrayAndStats :: new ( sample_array, ctx . stats_options ( ) ) ;
276+ let mut sample_data = ArrayAndStats :: new ( sample_array, scheme . stats_options ( ) ) ;
277277 let sample_ctx = ctx. as_sample ( ) ;
278278
279279 let after = scheme
@@ -286,3 +286,47 @@ pub fn estimate_compression_ratio_with_sampling<S: Scheme + ?Sized>(
286286
287287 Ok ( ratio)
288288}
289+
290+ #[ cfg( test) ]
291+ mod tests {
292+ use vortex_array:: IntoArray ;
293+ use vortex_array:: arrays:: PrimitiveArray ;
294+ use vortex_array:: validity:: Validity ;
295+ use vortex_buffer:: buffer;
296+ use vortex_error:: VortexResult ;
297+
298+ use super :: estimate_compression_ratio_with_sampling;
299+ use crate :: CascadingCompressor ;
300+ use crate :: builtins:: FloatDictScheme ;
301+ use crate :: ctx:: CompressorContext ;
302+
303+ /// Regression test for <https://github.com/spiraldb/vortex/issues/7227>.
304+ ///
305+ /// `estimate_compression_ratio_with_sampling` must use the *scheme's* stats options
306+ /// (which request distinct-value counting) rather than the context's stats options
307+ /// (which may not). With the old code this panicked inside `dictionary_encode` because
308+ /// distinct values were never computed for the sample.
309+ #[ test]
310+ fn sampling_uses_scheme_stats_options ( ) -> VortexResult < ( ) > {
311+ // Low-cardinality float array so FloatDictScheme considers it compressible.
312+ let array = PrimitiveArray :: new (
313+ buffer ! [ 1.0f32 , 2.0 , 1.0 , 2.0 , 1.0 , 2.0 , 1.0 , 2.0 ] ,
314+ Validity :: NonNullable ,
315+ )
316+ . into_array ( ) ;
317+
318+ let compressor = CascadingCompressor :: new ( vec ! [ & FloatDictScheme ] ) ;
319+
320+ // A context with default stats_options (count_distinct_values = false) and
321+ // marked as a sample so the function skips the sampling step and compresses
322+ // the array directly.
323+ let ctx = CompressorContext :: default ( ) . as_sample ( ) ;
324+
325+ // Before the fix this panicked with:
326+ // "this must be present since `DictScheme` declared that we need distinct values"
327+ let ratio =
328+ estimate_compression_ratio_with_sampling ( & FloatDictScheme , & compressor, & array, ctx) ?;
329+ assert ! ( ratio. is_finite( ) ) ;
330+ Ok ( ( ) )
331+ }
332+ }
0 commit comments