Skip to content

Commit 89ebbd7

Browse files
authored
perf: Implement more microbenchmarks for cast expressions (#3031)
1 parent 1e525a4 commit 89ebbd7

10 files changed

Lines changed: 790 additions & 102 deletions

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala

Lines changed: 0 additions & 96 deletions
This file was deleted.
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.spark.sql.benchmark
21+
22+
/**
 * Describes one Boolean cast benchmark case.
 *
 * @param name
 *   label of the benchmark case; handed to `runExpressionBenchmark` as its first argument
 * @param query
 *   SQL text of the case; handed to `runExpressionBenchmark` as its third argument
 * @param extraCometConfigs
 *   key/value settings forwarded unchanged to `runExpressionBenchmark`; empty by default.
 *   NOTE(review): presumably extra Comet session configs applied for this case only — confirm
 *   against `CometBenchmarkBase`.
 */
case class CastBooleanConfig(
    name: String,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty[String, String])
26+
27+
/**
28+
* Benchmark to measure performance of Comet cast operations involving Boolean type. To run this
29+
* benchmark:
30+
* {{{
31+
* SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark
32+
* }}}
33+
* Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt".
34+
*/
35+
object CometCastBooleanBenchmark extends CometBenchmarkBase {
36+
37+
private val castFunctions = Seq("CAST", "TRY_CAST")
38+
39+
// Boolean to String
40+
private val boolToStringConfigs = for {
41+
castFunc <- castFunctions
42+
} yield CastBooleanConfig(
43+
s"$castFunc Boolean to String",
44+
s"SELECT $castFunc(c_bool AS STRING) FROM parquetV1Table")
45+
46+
// Boolean to numeric types
47+
private val boolToNumericTypes =
48+
Seq("BYTE", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "DECIMAL(10,2)")
49+
private val boolToNumericConfigs = for {
50+
castFunc <- castFunctions
51+
targetType <- boolToNumericTypes
52+
} yield CastBooleanConfig(
53+
s"$castFunc Boolean to $targetType",
54+
s"SELECT $castFunc(c_bool AS $targetType) FROM parquetV1Table")
55+
56+
// Numeric to Boolean
57+
private val numericTypes = Seq(
58+
("BYTE", "c_byte"),
59+
("SHORT", "c_short"),
60+
("INT", "c_int"),
61+
("LONG", "c_long"),
62+
("FLOAT", "c_float"),
63+
("DOUBLE", "c_double"),
64+
("DECIMAL(10,2)", "c_decimal"))
65+
66+
private val numericToBoolConfigs = for {
67+
castFunc <- castFunctions
68+
(sourceType, colName) <- numericTypes
69+
} yield CastBooleanConfig(
70+
s"$castFunc $sourceType to Boolean",
71+
s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table")
72+
73+
override def runCometBenchmark(mainArgs: Array[String]): Unit = {
74+
val values = 1024 * 1024 // 1M rows
75+
76+
// Generate boolean data for boolean-to-other casts
77+
runBenchmarkWithTable("Boolean to other types casts", values) { v =>
78+
withTempPath { dir =>
79+
withTempTable("parquetV1Table") {
80+
// Data distribution: 1% NULL, 50/50 true/false
81+
prepareTable(
82+
dir,
83+
spark.sql(s"""
84+
SELECT CASE
85+
WHEN value % 100 = 0 THEN NULL
86+
ELSE (value % 2 = 0)
87+
END AS c_bool
88+
FROM $tbl
89+
"""))
90+
91+
(boolToStringConfigs ++ boolToNumericConfigs).foreach { config =>
92+
runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
93+
}
94+
}
95+
}
96+
}
97+
98+
// Generate numeric data for numeric-to-boolean casts
99+
runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
100+
withTempPath { dir =>
101+
withTempTable("parquetV1Table") {
102+
// Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
103+
prepareTable(
104+
dir,
105+
spark.sql(s"""
106+
SELECT
107+
CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 3) - 1 AS BYTE) END AS c_byte,
108+
CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 3) - 1 AS SHORT) END AS c_short,
109+
CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 3) - 1 AS INT) END AS c_int,
110+
CASE WHEN value % 100 = 3 THEN NULL ELSE CAST((value % 3) - 1 AS LONG) END AS c_long,
111+
CASE WHEN value % 100 = 4 THEN NULL ELSE CAST((value % 3) - 1 AS FLOAT) END AS c_float,
112+
CASE WHEN value % 100 = 5 THEN NULL ELSE CAST((value % 3) - 1 AS DOUBLE) END AS c_double,
113+
CASE WHEN value % 100 = 6 THEN NULL ELSE CAST((value % 3) - 1 AS DECIMAL(10,2)) END AS c_decimal
114+
FROM $tbl
115+
"""))
116+
117+
numericToBoolConfigs.foreach { config =>
118+
runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
119+
}
120+
}
121+
}
122+
}
123+
}
124+
}
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.spark.sql.benchmark
21+
22+
/**
 * Describes one numeric-to-numeric cast benchmark case.
 *
 * @param name
 *   label of the benchmark case; handed to `runExpressionBenchmark` as its first argument
 * @param query
 *   SQL text of the case; handed to `runExpressionBenchmark` as its third argument
 * @param extraCometConfigs
 *   key/value settings forwarded unchanged to `runExpressionBenchmark`; empty by default.
 *   NOTE(review): presumably extra Comet session configs applied for this case only — confirm
 *   against `CometBenchmarkBase`.
 */
case class CastNumericToNumericConfig(
    name: String,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty[String, String])
26+
27+
/**
28+
* Benchmark to measure performance of Comet cast between numeric types. To run this benchmark:
29+
* {{{
30+
* SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark
31+
* }}}
32+
* Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt".
33+
*/
34+
object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
35+
36+
private val castFunctions = Seq("CAST", "TRY_CAST")
37+
38+
// Integer widening conversions
39+
private val integerWideningPairs = Seq(
40+
("BYTE", "c_byte", "SHORT"),
41+
("BYTE", "c_byte", "INT"),
42+
("BYTE", "c_byte", "LONG"),
43+
("SHORT", "c_short", "INT"),
44+
("SHORT", "c_short", "LONG"),
45+
("INT", "c_int", "LONG"))
46+
47+
// Integer narrowing conversions
48+
private val integerNarrowingPairs = Seq(
49+
("LONG", "c_long", "INT"),
50+
("LONG", "c_long", "SHORT"),
51+
("LONG", "c_long", "BYTE"),
52+
("INT", "c_int", "SHORT"),
53+
("INT", "c_int", "BYTE"),
54+
("SHORT", "c_short", "BYTE"))
55+
56+
// Floating point conversions
57+
private val floatPairs = Seq(("FLOAT", "c_float", "DOUBLE"), ("DOUBLE", "c_double", "FLOAT"))
58+
59+
// Integer to floating point conversions
60+
private val intToFloatPairs = Seq(
61+
("BYTE", "c_byte", "FLOAT"),
62+
("SHORT", "c_short", "FLOAT"),
63+
("INT", "c_int", "FLOAT"),
64+
("LONG", "c_long", "FLOAT"),
65+
("INT", "c_int", "DOUBLE"),
66+
("LONG", "c_long", "DOUBLE"))
67+
68+
// Floating point to integer conversions
69+
private val floatToIntPairs = Seq(
70+
("FLOAT", "c_float", "INT"),
71+
("FLOAT", "c_float", "LONG"),
72+
("DOUBLE", "c_double", "INT"),
73+
("DOUBLE", "c_double", "LONG"))
74+
75+
// Decimal conversions
76+
private val decimalPairs = Seq(
77+
("INT", "c_int", "DECIMAL(10,2)"),
78+
("LONG", "c_long", "DECIMAL(20,4)"),
79+
("DOUBLE", "c_double", "DECIMAL(15,5)"),
80+
("DECIMAL(10,2)", "c_decimal", "INT"),
81+
("DECIMAL(10,2)", "c_decimal", "LONG"),
82+
("DECIMAL(10,2)", "c_decimal", "DOUBLE"))
83+
84+
private def generateConfigs(
85+
pairs: Seq[(String, String, String)]): Seq[CastNumericToNumericConfig] = {
86+
for {
87+
castFunc <- castFunctions
88+
(sourceType, colName, targetType) <- pairs
89+
} yield CastNumericToNumericConfig(
90+
s"$castFunc $sourceType to $targetType",
91+
s"SELECT $castFunc($colName AS $targetType) FROM parquetV1Table")
92+
}
93+
94+
override def runCometBenchmark(mainArgs: Array[String]): Unit = {
95+
val values = 1024 * 1024 // 1M rows
96+
97+
// Generate input data once with all numeric types
98+
runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
99+
withTempPath { dir =>
100+
withTempTable("parquetV1Table") {
101+
// Data distribution: 1% NULL per column
102+
// - c_byte: full range -64 to 63
103+
// - c_short: full range -16384 to 16383
104+
// - c_int: centered around 0 (-2.5M to +2.5M)
105+
// - c_long: large positive values (0 to ~5 billion)
106+
// - c_float/c_double: 4% special values (NaN/Infinity), rest centered around 0
107+
// - c_decimal: values from -25000.00 to +25000.00
108+
prepareTable(
109+
dir,
110+
spark.sql(s"""
111+
SELECT
112+
CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte,
113+
CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short,
114+
CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int,
115+
CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long,
116+
CASE
117+
WHEN value % 100 = 4 THEN NULL
118+
WHEN value % 100 = 5 THEN CAST('NaN' AS FLOAT)
119+
WHEN value % 100 = 6 THEN CAST('Infinity' AS FLOAT)
120+
WHEN value % 100 = 7 THEN CAST('-Infinity' AS FLOAT)
121+
ELSE CAST((value - 2500000) / 100.0 AS FLOAT)
122+
END AS c_float,
123+
CASE
124+
WHEN value % 100 = 8 THEN NULL
125+
WHEN value % 100 = 9 THEN CAST('NaN' AS DOUBLE)
126+
WHEN value % 100 = 10 THEN CAST('Infinity' AS DOUBLE)
127+
WHEN value % 100 = 11 THEN CAST('-Infinity' AS DOUBLE)
128+
ELSE CAST((value - 2500000) / 100.0 AS DOUBLE)
129+
END AS c_double,
130+
CASE WHEN value % 100 = 12 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal
131+
FROM $tbl
132+
"""))
133+
134+
// Run all benchmark categories
135+
(generateConfigs(integerWideningPairs) ++
136+
generateConfigs(integerNarrowingPairs) ++
137+
generateConfigs(floatPairs) ++
138+
generateConfigs(intToFloatPairs) ++
139+
generateConfigs(floatToIntPairs) ++
140+
generateConfigs(decimalPairs)).foreach { config =>
141+
runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
142+
}
143+
}
144+
}
145+
}
146+
}
147+
}

0 commit comments

Comments
 (0)