Skip to content

Commit fc8b30d

Browse files
committed
address feedback
1 parent ef6249f commit fc8b30d

9 files changed

Lines changed: 31 additions & 8 deletions

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
7777
runBenchmarkWithTable("Boolean to other types casts", values) { v =>
7878
withTempPath { dir =>
7979
withTempTable("parquetV1Table") {
80+
// Data distribution: 1% NULL, 50/50 true/false
8081
prepareTable(
8182
dir,
8283
spark.sql(s"""
@@ -98,6 +99,7 @@ object CometCastBooleanBenchmark extends CometBenchmarkBase {
9899
runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
99100
withTempPath { dir =>
100101
withTempTable("parquetV1Table") {
102+
// Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
101103
prepareTable(
102104
dir,
103105
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,13 @@ object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
9898
runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
9999
withTempPath { dir =>
100100
withTempTable("parquetV1Table") {
101-
// Generate varied numeric data including edge cases
101+
// Data distribution: 1% NULL per column
102+
// - c_byte: values from -64 to 63
103+
// - c_short: values from -16384 to 16383
104+
// - c_int: centered around 0 (-2.5M to +2.5M)
105+
// - c_long: large positive values (0 to ~5 billion)
106+
// - c_float/c_double: 4% special values (NaN/Infinity), rest centered around 0
107+
// - c_decimal: values from -25000.00 to +25000.00
102108
prepareTable(
103109
dir,
104110
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,13 @@ object CometCastNumericToStringBenchmark extends CometBenchmarkBase {
6060
runBenchmarkWithTable("Numeric to String casts", values) { v =>
6161
withTempPath { dir =>
6262
withTempTable("parquetV1Table") {
63-
// Generate varied numeric data including edge cases
63+
// Data distribution: 1% NULL per column
64+
// - c_bool: 50/50 true/false
65+
// - c_byte: values from -64 to 63
66+
// - c_short: values from -16384 to 16383
67+
// - c_int/c_long: large values centered around 0
68+
// - c_float/c_double: 3% special values (NaN/Infinity), rest centered around 0
69+
// - c_decimal: values from -25000.00 to +25000.00
6470
prepareTable(
6571
dir,
6672
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
5858
runBenchmarkWithTable("Int to Date casts", values) { v =>
5959
withTempPath { dir =>
6060
withTempTable("parquetV1Table") {
61-
// Generate INT values representing days since epoch (1970-01-01)
62-
// Range: ~-18000 to +18000 days (roughly 1920 to 2020)
61+
// Data distribution: 1% NULL, days since epoch spanning ~100 years (1920-2020)
6362
prepareTable(
6463
dir,
6564
spark.sql(s"""
@@ -81,8 +80,7 @@ object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
8180
runBenchmarkWithTable("Long to Timestamp casts", values) { v =>
8281
withTempPath { dir =>
8382
withTempTable("parquetV1Table") {
84-
// Generate LONG values representing microseconds since epoch
85-
// Range: 2020-2021 timestamps
83+
// Data distribution: 1% NULL, microseconds since epoch spanning ~1 year from 2020-01-01
8684
prepareTable(
8785
dir,
8886
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,11 @@ object CometCastStringToNumericBenchmark extends CometBenchmarkBase {
6868
runBenchmarkWithTable("String to numeric casts", values) { v =>
6969
withTempPath { dir =>
7070
withTempTable("parquetV1Table") {
71-
// Generate numeric strings with both integer and decimal values
72-
// Also include some special values: nulls (~2%), NaN (~2%), Infinity (~2%)
71+
// Data distribution:
72+
// - 2% NULL, 2% 'NaN', 2% 'Infinity', 2% '-Infinity'
73+
// - 12% small integers (0-98)
74+
// - 40% medium integers (0-999,998)
75+
// - 40% decimals centered around 0 (approx -5000.00 to +5000.00)
7376
prepareTable(
7477
dir,
7578
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
5858
runBenchmarkWithTable("date data generation", values) { v =>
5959
withTempPath { dateDir =>
6060
withTempTable("parquetV1Table") {
61+
// Data distribution: 10% invalid strings, 90% valid date strings spanning ~10 years
6162
prepareTable(
6263
dateDir,
6364
spark.sql(s"""
@@ -80,6 +81,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
8081
runBenchmarkWithTable("timestamp data generation", values) { v =>
8182
withTempPath { timestampDir =>
8283
withTempTable("parquetV1Table") {
84+
// Data distribution: 10% invalid strings, 90% valid timestamp strings (1970 epoch range)
8385
prepareTable(
8486
timestampDir,
8587
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
6262
runBenchmarkWithTable("Date to Numeric casts", values) { v =>
6363
withTempPath { dir =>
6464
withTempTable("parquetV1Table") {
65+
// Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
6566
prepareTable(
6667
dir,
6768
spark.sql(s"""
@@ -83,6 +84,7 @@ object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
8384
runBenchmarkWithTable("Timestamp to Numeric casts", values) { v =>
8485
withTempPath { dir =>
8586
withTempTable("parquetV1Table") {
87+
// Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
8688
prepareTable(
8789
dir,
8890
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
5555
runBenchmarkWithTable("Date to String casts", values) { v =>
5656
withTempPath { dir =>
5757
withTempTable("parquetV1Table") {
58+
// Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
5859
prepareTable(
5960
dir,
6061
spark.sql(s"""
@@ -76,6 +77,7 @@ object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
7677
runBenchmarkWithTable("Timestamp to String casts", values) { v =>
7778
withTempPath { dir =>
7879
withTempTable("parquetV1Table") {
80+
// Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
7981
prepareTable(
8082
dir,
8183
spark.sql(s"""

spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
5757
runBenchmarkWithTable("Date to Timestamp casts", values) { v =>
5858
withTempPath { dir =>
5959
withTempTable("parquetV1Table") {
60+
// Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
6061
prepareTable(
6162
dir,
6263
spark.sql(s"""
@@ -78,6 +79,7 @@ object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
7879
runBenchmarkWithTable("Timestamp to Date casts", values) { v =>
7980
withTempPath { dir =>
8081
withTempTable("parquetV1Table") {
82+
// Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
8183
prepareTable(
8284
dir,
8385
spark.sql(s"""

0 commit comments

Comments
 (0)