refactor: Split read benchmarks and add addParquetScanCases helper (#3407)

andygrove · web-flow · commit 454ca6899c1d · 2026-02-06T13:24:50.000-07:00
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.DecimalType
 
 import org.apache.comet.CometConf
+import org.apache.comet.CometConf.{SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT}
 import org.apache.comet.CometSparkSessionExtensions
 
 trait CometBenchmarkBase
@@ -164,6 +165,32 @@ trait CometBenchmarkBase
     benchmark.run()
   }
 
+  protected def addParquetScanCases(
+      benchmark: Benchmark,
+      query: String,
+      caseSuffix: String = "",
+      extraConf: Map[String, String] = Map.empty): Unit = {
+    val suffix = if (caseSuffix.nonEmpty) s" ($caseSuffix)" else ""
+
+    benchmark.addCase(s"SQL Parquet - Spark$suffix") { _ =>
+      withSQLConf(extraConf.toSeq: _*) {
+        spark.sql(query).noop()
+      }
+    }
+
+    for (scanImpl <- Seq(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT)) {
+      benchmark.addCase(s"SQL Parquet - Comet ($scanImpl)$suffix") { _ =>
+        withSQLConf(
+          (extraConf ++ Map(
+            CometConf.COMET_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_ENABLED.key -> "true",
+            CometConf.COMET_NATIVE_SCAN_IMPL.key -> scanImpl)).toSeq: _*) {
+          spark.sql(query).noop()
+        }
+      }
+    }
+  }
+
   protected def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = {
     val testDf = if (partition.isDefined) {
       df.write.partitionBy(partition.get)
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometIcebergReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometIcebergReadBenchmark.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.types._
+
+import org.apache.comet.CometConf
+
+/**
+ * Benchmark to measure Comet Iceberg read performance. To run this benchmark:
+ * `SPARK_GENERATE_BENCHMARK_FILES=1 make
+ * benchmark-org.apache.spark.sql.benchmark.CometIcebergReadBenchmark` Results will be written to
+ * "spark/benchmarks/CometIcebergReadBenchmark-**results.txt".
+ */
+object CometIcebergReadBenchmark extends CometBenchmarkBase {
+
+  def icebergScanBenchmark(values: Int, dataType: DataType): Unit = {
+    val sqlBenchmark =
+      new Benchmark(s"SQL Single ${dataType.sql} Iceberg Column Scan", values, output = output)
+
+    withTempPath { dir =>
+      withTempTable("icebergTable") {
+        prepareIcebergTable(
+          dir,
+          spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM $tbl"),
+          "icebergTable")
+
+        val query = dataType match {
+          case BooleanType => "sum(cast(id as bigint))"
+          case _ => "sum(id)"
+        }
+
+        sqlBenchmark.addCase("SQL Iceberg - Spark") { _ =>
+          withSQLConf(
+            "spark.memory.offHeap.enabled" -> "true",
+            "spark.memory.offHeap.size" -> "10g") {
+            spark.sql(s"select $query from icebergTable").noop()
+          }
+        }
+
+        sqlBenchmark.addCase("SQL Iceberg - Comet Iceberg-Rust") { _ =>
+          withSQLConf(
+            CometConf.COMET_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_ENABLED.key -> "true",
+            "spark.memory.offHeap.enabled" -> "true",
+            "spark.memory.offHeap.size" -> "10g",
+            CometConf.COMET_ICEBERG_NATIVE_ENABLED.key -> "true") {
+            spark.sql(s"select $query from icebergTable").noop()
+          }
+        }
+
+        sqlBenchmark.run()
+      }
+    }
+  }
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    runBenchmarkWithTable("SQL Single Numeric Iceberg Column Scan", 1024 * 1024 * 128) { v =>
+      Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)
+        .foreach(icebergScanBenchmark(v, _))
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometPartitionColumnBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometPartitionColumnBenchmark.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark to measure partition column scan performance. This exercises the CometConstantVector
+ * path where constant columns are exported as 1-element Arrow arrays and expanded on the native
+ * side.
+ *
+ * To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make \
+ *   benchmark-org.apache.spark.sql.benchmark.CometPartitionColumnBenchmark
+ * }}}
+ *
+ * Results will be written to "spark/benchmarks/CometPartitionColumnBenchmark-**results.txt".
+ */
+object CometPartitionColumnBenchmark extends CometBenchmarkBase {
+
+  def partitionColumnScanBenchmark(values: Int, numPartitionCols: Int): Unit = {
+    val sqlBenchmark = new Benchmark(
+      s"Partitioned Scan with $numPartitionCols partition column(s)",
+      values,
+      output = output)
+
+    withTempPath { dir =>
+      withTempTable("parquetV1Table") {
+        val partCols =
+          (1 to numPartitionCols).map(i => s"'part$i' as p$i").mkString(", ")
+        val partNames = (1 to numPartitionCols).map(i => s"p$i")
+        val df = spark.sql(s"SELECT value as id, $partCols FROM $tbl")
+        val parquetDir = dir.getCanonicalPath + "/parquetV1"
+        df.write
+          .partitionBy(partNames: _*)
+          .mode("overwrite")
+          .option("compression", "snappy")
+          .parquet(parquetDir)
+        spark.read.parquet(parquetDir).createOrReplaceTempView("parquetV1Table")
+
+        addParquetScanCases(sqlBenchmark, "select sum(id) from parquetV1Table")
+
+        // Also benchmark reading partition columns themselves
+        val partSumExpr =
+          (1 to numPartitionCols).map(i => s"sum(length(p$i))").mkString(", ")
+
+        addParquetScanCases(
+          sqlBenchmark,
+          s"select $partSumExpr from parquetV1Table",
+          caseSuffix = "partition cols")
+
+        sqlBenchmark.run()
+      }
+    }
+  }
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    runBenchmarkWithTable("Partitioned Column Scan", 1024 * 1024 * 15) { v =>
+      for (numPartCols <- List(1, 5)) {
+        partitionColumnScanBenchmark(v, numPartCols)
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometReadBenchmark.scala