Skip to content

Commit e52fef5

Browse files
committed
clean up scoring logic
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent de7fc45 commit e52fef5

2 files changed

Lines changed: 52 additions & 22 deletions

File tree

vortex-compressor/src/compressor.rs

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ impl CascadingCompressor {
400400
EstimateVerdict::Skip => {}
401401
EstimateVerdict::AlwaysUse => return Ok(Some((scheme, WinnerEstimate::AlwaysUse))),
402402
EstimateVerdict::Ratio(ratio) => {
403-
let score = EstimateScore::Finite(ratio);
403+
let score = EstimateScore::FiniteCompression(ratio);
404404
if is_better_score(score, &best) {
405405
best = Some((scheme, score));
406406
}
@@ -1069,7 +1069,7 @@ mod tests {
10691069

10701070
assert!(matches!(
10711071
winner,
1072-
Some((scheme, WinnerEstimate::Score(EstimateScore::Finite(2.0))))
1072+
Some((scheme, WinnerEstimate::Score(EstimateScore::FiniteCompression(2.0))))
10731073
if scheme.id() == DirectRatioScheme.id()
10741074
));
10751075
Ok(())
@@ -1086,7 +1086,7 @@ mod tests {
10861086

10871087
assert!(matches!(
10881088
winner,
1089-
Some((scheme, WinnerEstimate::Score(EstimateScore::Finite(3.0))))
1089+
Some((scheme, WinnerEstimate::Score(EstimateScore::FiniteCompression(3.0))))
10901090
if scheme.id() == CallbackRatioScheme.id()
10911091
));
10921092
Ok(())
@@ -1109,6 +1109,23 @@ mod tests {
11091109
Ok(())
11101110
}
11111111

1112+
#[test]
1113+
fn finite_ratio_does_not_displace_zero_byte_sample() -> VortexResult<()> {
1114+
let compressor = CascadingCompressor::new(vec![&ZeroBytesSamplingScheme, &HugeRatioScheme]);
1115+
let schemes: [&'static dyn Scheme; 2] = [&ZeroBytesSamplingScheme, &HugeRatioScheme];
1116+
let mut data = estimate_test_data();
1117+
1118+
let winner =
1119+
compressor.choose_best_scheme(&schemes, &mut data, CompressorContext::new())?;
1120+
1121+
assert!(matches!(
1122+
winner,
1123+
Some((scheme, WinnerEstimate::Score(EstimateScore::ZeroBytes)))
1124+
if scheme.id() == ZeroBytesSamplingScheme.id()
1125+
));
1126+
Ok(())
1127+
}
1128+
11121129
#[test]
11131130
fn all_null_array_compresses_to_constant() -> VortexResult<()> {
11141131
let array = PrimitiveArray::new(
@@ -1151,7 +1168,7 @@ mod tests {
11511168
// "this must be present since `DictScheme` declared that we need distinct values"
11521169
let score =
11531170
estimate_compression_ratio_with_sampling(&FloatDictScheme, &compressor, &array, ctx)?;
1154-
assert!(matches!(score, EstimateScore::Finite(ratio) if ratio.is_finite()));
1171+
assert!(matches!(score, EstimateScore::FiniteCompression(ratio) if ratio.is_finite()));
11551172
Ok(())
11561173
}
11571174

vortex-compressor/src/estimate.rs

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,13 @@ pub enum DeferredEstimate {
8989
/// Ranked estimate used for comparing non-terminal compression candidates.
9090
#[derive(Debug, Clone, Copy, PartialEq)]
9191
pub(super) enum EstimateScore {
92-
/// A finite compression ratio.
93-
Finite(f64),
92+
/// A finite compression ratio. Higher means a smaller amount of data, so it is better.
93+
FiniteCompression(f64),
9494
/// Trial compression produced a 0-byte output.
95+
///
96+
/// This is treated as an unbounded score for scheme selection, but has no finite trace ratio.
97+
/// Final full-array compression still has to pass the normal before/after acceptance check, so
98+
/// a zero-byte sample can choose a scheme but cannot force a larger output to be returned.
9599
ZeroBytes,
96100
}
97101

@@ -101,17 +105,39 @@ impl EstimateScore {
101105
if after_nbytes == 0 {
102106
Self::ZeroBytes
103107
} else {
104-
Self::Finite(before_nbytes as f64 / after_nbytes as f64)
108+
Self::FiniteCompression(before_nbytes as f64 / after_nbytes as f64)
105109
}
106110
}
107111

108112
/// Returns the traceable numeric ratio, omitting the zero-byte special case.
109113
pub(super) fn trace_ratio(self) -> Option<f64> {
110114
match self {
111-
Self::Finite(ratio) => Some(ratio),
115+
Self::FiniteCompression(ratio) => Some(ratio),
112116
Self::ZeroBytes => None,
113117
}
114118
}
119+
120+
/// Returns whether this estimate is eligible to compete.
121+
fn is_valid(self) -> bool {
122+
match self {
123+
Self::FiniteCompression(ratio) => {
124+
ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0
125+
}
126+
Self::ZeroBytes => true,
127+
}
128+
}
129+
130+
/// Returns whether this estimate beats another valid estimate.
131+
fn beats(self, other: Self) -> bool {
132+
match (self, other) {
133+
(Self::ZeroBytes, Self::FiniteCompression(_)) => true,
134+
(Self::ZeroBytes, Self::ZeroBytes) => false,
135+
(Self::FiniteCompression(_), Self::ZeroBytes) => false,
136+
(Self::FiniteCompression(ratio), Self::FiniteCompression(best_ratio)) => {
137+
ratio > best_ratio
138+
}
139+
}
140+
}
115141
}
116142

117143
/// Winner estimate carried from scheme selection into result tracing.
@@ -138,20 +164,7 @@ pub(super) fn is_better_score(
138164
score: EstimateScore,
139165
best: &Option<(&'static dyn Scheme, EstimateScore)>,
140166
) -> bool {
141-
match score {
142-
EstimateScore::ZeroBytes => {
143-
best.is_none_or(|(_, best_score)| !matches!(best_score, EstimateScore::ZeroBytes))
144-
}
145-
EstimateScore::Finite(ratio) => {
146-
ratio.is_finite()
147-
&& !ratio.is_subnormal()
148-
&& ratio > 1.0
149-
&& best.is_none_or(|(_, best_score)| match best_score {
150-
EstimateScore::Finite(best_ratio) => ratio > best_ratio,
151-
EstimateScore::ZeroBytes => false,
152-
})
153-
}
154-
}
167+
score.is_valid() && best.is_none_or(|(_, best_score)| score.beats(best_score))
155168
}
156169

157170
/// Estimates compression ratio by compressing a ~1% sample of the data.

0 commit comments

Comments
 (0)