@@ -17,6 +17,7 @@ use vortex_compressor::builtins::FloatDictScheme;
1717use vortex_compressor:: builtins:: StringDictScheme ;
1818use vortex_compressor:: estimate:: CompressionEstimate ;
1919use vortex_compressor:: estimate:: DeferredEstimate ;
20+ use vortex_compressor:: estimate:: EstimateScore ;
2021use vortex_compressor:: estimate:: EstimateVerdict ;
2122use vortex_compressor:: scheme:: AncestorExclusion ;
2223use vortex_compressor:: scheme:: ChildSelection ;
@@ -737,20 +738,29 @@ impl Scheme for SequenceScheme {
737738 return CompressionEstimate :: Verdict ( EstimateVerdict :: Skip ) ;
738739 }
739740
740- // TODO(connor): Why do we sequence encode the whole thing and then throw it away? And then
741- // why do we divide the ratio by 2???
742-
741+ // TODO(connor): `sequence_encode` allocates the encoded array just to confirm feasibility.
742+ // A cheaper `is_sequence` probe would let us skip the allocation entirely.
743743 CompressionEstimate :: Deferred ( DeferredEstimate :: Callback ( Box :: new (
744- |_compressor, data, _ctx, exec_ctx| {
745- let Some ( encoded) = sequence_encode ( data. array_as_primitive ( ) , exec_ctx) ? else {
746- // If we are unable to sequence encode this array, make sure we skip.
744+ |_compressor, data, best_so_far, _ctx, exec_ctx| {
745+ // `SequenceArray` stores exactly two scalars (base and multiplier), so the best
746+ // achievable compression ratio is `array_len / 2`.
747+ let compressed_size = 2usize ;
748+ let max_ratio = data. array_len ( ) as f64 / compressed_size as f64 ;
749+
750+ // If we cannot beat the best so far, then we do not want to even try sequence
751+ // encoding the data.
752+ let threshold = best_so_far. and_then ( EstimateScore :: finite_ratio) ;
753+ if threshold. is_some_and ( |t| max_ratio <= t) {
747754 return Ok ( EstimateVerdict :: Skip ) ;
748- } ;
755+ }
749756
750- // TODO(connor): This doesn't really make sense?
751- // Since two values are required to store base and multiplier the compression ratio is
752- // divided by 2.
753- Ok ( EstimateVerdict :: Ratio ( encoded. len ( ) as f64 / 2.0 ) )
757+ // TODO(connor): We should pass this array back to the compressor in the case that
758+ // we do want to sequence encode this so that we do not need to recompress.
759+ if sequence_encode ( data. array_as_primitive ( ) , exec_ctx) ?. is_none ( ) {
760+ return Ok ( EstimateVerdict :: Skip ) ;
761+ }
762+ // TODO(connor): Should we get the actual ratio here?
763+ Ok ( EstimateVerdict :: Ratio ( max_ratio) )
754764 } ,
755765 ) ) )
756766 }
0 commit comments