refactor: Split read benchmarks and add addParquetScanCases helper

andygrove · claude · andygrove · commit 02e8ca467199 · 2026-02-05T09:07:26.000-07:00
Extract iceberg benchmarks into CometIcebergReadBenchmark and add
addParquetScanCases helper to CometBenchmarkBase to eliminate the
repeated 3-case pattern (Spark / native_datafusion / native_iceberg_compat)
across all parquet benchmarks.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.DecimalType
 
 import org.apache.comet.CometConf
+import org.apache.comet.CometConf.{SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT}
 import org.apache.comet.CometSparkSessionExtensions
 
 trait CometBenchmarkBase
@@ -164,6 +165,32 @@ trait CometBenchmarkBase
     benchmark.run()
   }
 
+  /**
+   * Adds three cases running `query` to `benchmark`: one under `extraConf` alone, then one
+   * per native scan implementation with Comet and Comet exec enabled on top of `extraConf`.
+   * A non-empty `caseSuffix` is appended to every case name in parentheses.
+   */
+  protected def addParquetScanCases(
+      benchmark: Benchmark,
+      query: String,
+      caseSuffix: String = "",
+      extraConf: Map[String, String] = Map.empty): Unit = {
+    val label = if (caseSuffix.isEmpty) "" else s" ($caseSuffix)"
+    // The Comet entries are the right-hand operand of ++, so they win over extraConf.
+    def cometConf(impl: String): Map[String, String] = extraConf ++ Map(
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_ENABLED.key -> "true",
+      CometConf.COMET_NATIVE_SCAN_IMPL.key -> impl)
+    // Runs the query with `conf` passed to withSQLConf.
+    def runQuery(conf: Map[String, String]): Unit =
+      withSQLConf(conf.toSeq: _*)(spark.sql(query).noop())
+
+    benchmark.addCase(s"SQL Parquet - Spark$label")(_ => runQuery(extraConf))
+    Seq(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT).foreach { impl =>
+      benchmark.addCase(s"SQL Parquet - Comet ($impl)$label")(_ => runQuery(cometConf(impl)))
+    }
+  }
+
   protected def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = {
     val testDf = if (partition.isDefined) {
       df.write.partitionBy(partition.get)
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometIcebergReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometIcebergReadBenchmark.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.types._
+
+import org.apache.comet.CometConf
+
+/**
+ * Benchmark to measure Comet Iceberg read performance. To run this benchmark:
+ * `SPARK_GENERATE_BENCHMARK_FILES=1 make
+ * benchmark-org.apache.spark.sql.benchmark.CometIcebergReadBenchmark` Results will be written to
+ * "spark/benchmarks/CometIcebergReadBenchmark-**results.txt".
+ */
+object CometIcebergReadBenchmark extends CometBenchmarkBase {
+
+  /**
+   * Benchmarks a sum over one `dataType` column of the table set up by `prepareIcebergTable`:
+   * one case with only the off-heap settings and one that also sets the Comet flags to true.
+   */
+  def icebergScanBenchmark(values: Int, dataType: DataType): Unit = {
+    val typeName = dataType.sql
+    val tableName = "icebergTable"
+    val bench =
+      new Benchmark(s"SQL Single $typeName Iceberg Column Scan", values, output = output)
+    // Off-heap memory settings applied to both cases.
+    val offHeap = Seq(
+      "spark.memory.offHeap.enabled" -> "true",
+      "spark.memory.offHeap.size" -> "10g")
+    // Comet settings are kept in two groups so the off-heap keys can sit between them.
+    val cometOn = Seq(
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_ENABLED.key -> "true")
+    val icebergNative = Seq(CometConf.COMET_ICEBERG_NATIVE_ENABLED.key -> "true")
+
+    withTempPath { dir =>
+      withTempTable(tableName) {
+        val source = spark.sql(s"SELECT CAST(value as $typeName) id FROM $tbl")
+        prepareIcebergTable(dir, source, tableName)
+
+        // BooleanType is summed through a bigint cast; every other type is summed directly.
+        val aggregate = if (dataType == BooleanType) "sum(cast(id as bigint))" else "sum(id)"
+        val sqlText = s"select $aggregate from $tableName"
+        def register(name: String, conf: Seq[(String, String)]): Unit =
+          bench.addCase(name)(_ => withSQLConf(conf: _*)(spark.sql(sqlText).noop()))
+
+        register("SQL Iceberg - Spark", offHeap)
+        register("SQL Iceberg - Comet Iceberg-Rust", cometOn ++ offHeap ++ icebergNative)
+
+        bench.run()
+      }
+    }
+  }
+
+  /** Runs [[icebergScanBenchmark]] once for each of the column types listed below. */
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val columnTypes =
+      Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)
+    runBenchmarkWithTable("SQL Single Numeric Iceberg Column Scan", 1024 * 1024 * 128) { v =>
+      for (columnType <- columnTypes) icebergScanBenchmark(v, columnType)
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometPartitionColumnBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometPartitionColumnBenchmark.scala
@@ -21,9 +21,6 @@ package org.apache.spark.sql.benchmark
 
 import org.apache.spark.benchmark.Benchmark
 
-import org.apache.comet.CometConf
-import org.apache.comet.CometConf.{SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT}
-
 /**
  * Benchmark to measure partition column scan performance. This exercises the CometConstantVector
  * path where constant columns are exported as 1-element Arrow arrays and expanded on the native
@@ -59,53 +56,16 @@ object CometPartitionColumnBenchmark extends CometBenchmarkBase {
           .parquet(parquetDir)
         spark.read.parquet(parquetDir).createOrReplaceTempView("parquetV1Table")
 
-        sqlBenchmark.addCase("SQL Parquet - Spark") { _ =>
-          spark.sql("select sum(id) from parquetV1Table").noop()
-        }
-
-        sqlBenchmark.addCase("SQL Parquet - Comet Native DataFusion") { _ =>
-          withSQLConf(
-            CometConf.COMET_ENABLED.key -> "true",
-            CometConf.COMET_EXEC_ENABLED.key -> "true",
-            CometConf.COMET_NATIVE_SCAN_IMPL.key -> SCAN_NATIVE_DATAFUSION) {
-            spark.sql("select sum(id) from parquetV1Table").noop()
-          }
-        }
-
-        sqlBenchmark.addCase("SQL Parquet - Comet Native Iceberg Compat") { _ =>
-          withSQLConf(
-            CometConf.COMET_ENABLED.key -> "true",
-            CometConf.COMET_EXEC_ENABLED.key -> "true",
-            CometConf.COMET_NATIVE_SCAN_IMPL.key -> SCAN_NATIVE_ICEBERG_COMPAT) {
-            spark.sql("select sum(id) from parquetV1Table").noop()
-          }
-        }
+        addParquetScanCases(sqlBenchmark, "select sum(id) from parquetV1Table")
 
         // Also benchmark reading partition columns themselves
         val partSumExpr =
           (1 to numPartitionCols).map(i => s"sum(length(p$i))").mkString(", ")
 
-        sqlBenchmark.addCase("SQL Parquet - Spark (read partition cols)") { _ =>
-          spark.sql(s"select $partSumExpr from parquetV1Table").noop()
-        }
-
-        sqlBenchmark.addCase("SQL Parquet - Comet Native DataFusion (partition cols)") { _ =>
-          withSQLConf(
-            CometConf.COMET_ENABLED.key -> "true",
-            CometConf.COMET_EXEC_ENABLED.key -> "true",
-            CometConf.COMET_NATIVE_SCAN_IMPL.key -> SCAN_NATIVE_DATAFUSION) {
-            spark.sql(s"select $partSumExpr from parquetV1Table").noop()
-          }
-        }
-
-        sqlBenchmark.addCase("SQL Parquet - Comet Native Iceberg Compat (partition cols)") { _ =>
-          withSQLConf(
-            CometConf.COMET_ENABLED.key -> "true",
-            CometConf.COMET_EXEC_ENABLED.key -> "true",
-            CometConf.COMET_NATIVE_SCAN_IMPL.key -> SCAN_NATIVE_ICEBERG_COMPAT) {
-            spark.sql(s"select $partSumExpr from parquetV1Table").noop()
-          }
-        }
+        addParquetScanCases(
+          sqlBenchmark,
+          s"select $partSumExpr from parquetV1Table",
+          caseSuffix = "partition cols")
 
         sqlBenchmark.run()
       }
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometReadBenchmark.scala