@@ -195,19 +195,43 @@ pub struct VectorSearchRunRecord {
195195 pub env_triple : Option < String > ,
196196}
197197
198+ /// Canonicalize a TPC scale factor string for v3 dim emission.
199+ ///
200+ /// Bench-orchestrator passes raw strings like `"1.0"` and `"100.0"`, but the
201+ /// v2 → v3 migrate path canonicalizes integer-valued scale factors to `"1"` and
202+ /// `"100"` (because v2 chart names carried integer-looking values). Live
203+ /// records must use the same canonical form so they merge with migrated
204+ /// history into a single chart group instead of forking off a sibling group
205+ /// keyed on `SF=1.0` vs `SF=1`.
206+ ///
207+ /// Falls back to the trimmed input on parse failure or non-finite values, so a
208+ /// scale factor we cannot interpret as a number passes through unchanged
209+ /// rather than being silently rewritten.
210+ fn canonical_tpc_scale_factor ( scale_factor : & str ) -> String {
211+ let trimmed = scale_factor. trim ( ) ;
212+ match trimmed. parse :: < f64 > ( ) {
213+ Ok ( value) if value. is_finite ( ) => format ! ( "{value}" ) ,
214+ _ => scale_factor. to_string ( ) ,
215+ }
216+ }
217+
198218/// Map a [`BenchmarkDataset`] to the `(dataset, dataset_variant, scale_factor)`
199219/// triple emitted in `query_measurement` records.
200220///
201221/// Mirrors the `Per-suite dim values` table in
202222/// `benchmarks-website/planning/benchmark-mapping.md`.
203223pub fn benchmark_dataset_dims ( d : & BenchmarkDataset ) -> ( String , Option < String > , Option < String > ) {
204224 match d {
205- BenchmarkDataset :: TpcH { scale_factor } => {
206- ( "tpch" . to_string ( ) , None , Some ( scale_factor. clone ( ) ) )
207- }
208- BenchmarkDataset :: TpcDS { scale_factor } => {
209- ( "tpcds" . to_string ( ) , None , Some ( scale_factor. clone ( ) ) )
210- }
225+ BenchmarkDataset :: TpcH { scale_factor } => (
226+ "tpch" . to_string ( ) ,
227+ None ,
228+ Some ( canonical_tpc_scale_factor ( scale_factor) ) ,
229+ ) ,
230+ BenchmarkDataset :: TpcDS { scale_factor } => (
231+ "tpcds" . to_string ( ) ,
232+ None ,
233+ Some ( canonical_tpc_scale_factor ( scale_factor) ) ,
234+ ) ,
211235 // ClickBench: the migrate path leaves `dataset_variant` NULL because
212236 // v2 record names did not encode flavor, so the live emitter does the
213237 // same to keep historical and live records in one `clickbench` group.
@@ -270,6 +294,11 @@ pub fn query_measurement_record(
270294///
271295/// Caller passes `dataset` (the compress-bench dataset name) and the
272296/// `op`. `dataset_variant` is reserved and unused at alpha.
297+ ///
298+ /// `dataset` is lowercased here to match the v2 → v3 migrate classifier,
299+ /// which stores `dataset = series.to_lowercase()`. Callers like
300+ /// `Dataset::v3_dataset_dims` may therefore return mixed-case names without
301+ /// having to duplicate the case-folding rule.
273302pub fn compression_time_record (
274303 timing : & CompressionTimingMeasurement ,
275304 dataset : & str ,
@@ -279,7 +308,7 @@ pub fn compression_time_record(
279308) -> V3Record {
280309 V3Record :: CompressionTime ( CompressionTimeRecord {
281310 commit_sha : GIT_COMMIT_ID . clone ( ) ,
282- dataset : dataset. to_string ( ) ,
311+ dataset : dataset. to_lowercase ( ) ,
283312 dataset_variant : dataset_variant. map ( str:: to_string) ,
284313 format : timing. format . name ( ) . to_string ( ) ,
285314 op : compress_op_label ( op) . to_string ( ) ,
@@ -290,6 +319,9 @@ pub fn compression_time_record(
290319}
291320
292321/// Build a `compression_size` record.
322+ ///
323+ /// `dataset` is lowercased here for the same reason as
324+ /// [`compression_time_record`].
293325pub fn compression_size_record (
294326 dataset : & str ,
295327 dataset_variant : Option < & str > ,
@@ -298,7 +330,7 @@ pub fn compression_size_record(
298330) -> V3Record {
299331 V3Record :: CompressionSize ( CompressionSizeRecord {
300332 commit_sha : GIT_COMMIT_ID . clone ( ) ,
301- dataset : dataset. to_string ( ) ,
333+ dataset : dataset. to_lowercase ( ) ,
302334 dataset_variant : dataset_variant. map ( str:: to_string) ,
303335 format : format. name ( ) . to_string ( ) ,
304336 value_bytes,
@@ -505,16 +537,21 @@ mod tests {
505537 }
506538
507539 #[ test]
508- fn snapshot_compression_time_public_bi_variant ( ) -> anyhow:: Result < ( ) > {
540+ fn snapshot_compression_time_public_bi ( ) -> anyhow:: Result < ( ) > {
541+ // PBI compression records flatten the `public-bi` parent into the
542+ // dataset axis: a `PBIBenchmark` named `CMSprovider` emits
543+ // `dataset = "cmsprovider", dataset_variant = NULL`. Mixed-case input
544+ // here exercises both `PBIBenchmark::v3_dataset_dims` (no variant) and
545+ // `compression_time_record` (lowercase-folded dataset).
509546 let timing = CompressionTimingMeasurement {
510- name : "compress time/cms-provider " . to_string ( ) ,
547+ name : "compress time/CMSprovider " . to_string ( ) ,
511548 format : Format :: OnDiskVortex ,
512549 time : Duration :: from_nanos ( 5_000_000 ) ,
513550 } ;
514551 let record = compression_time_record (
515552 & timing,
516- "public-bi " ,
517- Some ( "cms-provider" ) ,
553+ "CMSprovider " ,
554+ None ,
518555 CompressOp :: Compress ,
519556 vec ! [ 5_500_000 , 5_000_000 , 5_200_000 ] ,
520557 ) ;
@@ -598,6 +635,74 @@ mod tests {
598635 }
599636 }
600637
638+ #[ test]
639+ fn compression_records_lowercase_dataset_for_v2_history_match ( ) {
640+ // The v2 → v3 migrate classifier stores `dataset = series.to_lowercase()`
641+ // for compress-bench records (see `benchmarks-website/migrate/src/classifier.rs`).
642+ // Datasets whose `Dataset::name()` returns mixed case
643+ // (`TPC-H l_comment chunked`, every PBI name like `Arade`/`CMSprovider`)
644+ // would otherwise emit live records that do not merge with their
645+ // migrated history. Lowercasing inside the v3 helpers keeps the trait
646+ // API simple for non-v3 callers while still matching migrate's shape.
647+ let timing = CompressionTimingMeasurement {
648+ name : "compress time/TPC-H l_comment chunked" . to_string ( ) ,
649+ format : Format :: OnDiskVortex ,
650+ time : Duration :: from_nanos ( 1_000_000 ) ,
651+ } ;
652+ let record = compression_time_record (
653+ & timing,
654+ "TPC-H l_comment chunked" ,
655+ None ,
656+ CompressOp :: Compress ,
657+ vec ! [ 1_000_000 ] ,
658+ ) ;
659+ let V3Record :: CompressionTime ( time) = & record else {
660+ panic ! ( "expected CompressionTime variant, got {record:?}" ) ;
661+ } ;
662+ assert_eq ! ( time. dataset, "tpc-h l_comment chunked" ) ;
663+
664+ let record = compression_size_record ( "CMSprovider" , None , Format :: OnDiskVortex , 42 ) ;
665+ let V3Record :: CompressionSize ( size) = & record else {
666+ panic ! ( "expected CompressionSize variant, got {record:?}" ) ;
667+ } ;
668+ assert_eq ! ( size. dataset, "cmsprovider" ) ;
669+ }
670+
671+ #[ test]
672+ fn tpc_scale_factors_are_canonicalized_for_query_dims ( ) {
673+ // Bench-orchestrator passes raw TPC scale factors like `"1.0"` and `"100.0"`,
674+ // but the v2 → v3 migrate path canonicalizes integer-valued scale factors
675+ // to `"1"` and `"100"` (because v2 chart names carried integer-looking
676+ // values). The live emitter must do the same so live ingests merge with
677+ // migrated history into a single chart group instead of forking off a
678+ // sibling group keyed on `SF=1.0` vs `SF=1`.
679+ let cases = [
680+ ( "1.0" , "1" ) ,
681+ ( "100.0" , "100" ) ,
682+ ( "1" , "1" ) ,
683+ ( "100" , "100" ) ,
684+ ( "0.01" , "0.01" ) ,
685+ ] ;
686+ for ( input, expected) in cases {
687+ let ( _, _, sf) = benchmark_dataset_dims ( & BenchmarkDataset :: TpcH {
688+ scale_factor : input. to_string ( ) ,
689+ } ) ;
690+ assert_eq ! (
691+ sf. as_deref( ) ,
692+ Some ( expected) ,
693+ "TpcH scale factor {input:?} should canonicalize to {expected:?}" ,
694+ ) ;
695+ let ( _, _, sf) = benchmark_dataset_dims ( & BenchmarkDataset :: TpcDS {
696+ scale_factor : input. to_string ( ) ,
697+ } ) ;
698+ assert_eq ! (
699+ sf. as_deref( ) ,
700+ Some ( expected) ,
701+ "TpcDS scale factor {input:?} should canonicalize to {expected:?}" ,
702+ ) ;
703+ }
704+ }
705+
601706 #[ test]
602707 fn jsonl_round_trips_one_record_per_line ( ) -> anyhow:: Result < ( ) > {
603708 let record = compression_size_record ( "taxi" , None , Format :: Parquet , 100 ) ;
0 commit comments