bench: Delta benchmarks and TPC runner infrastructure

schenksj · claude · schenksj · commit 0a21380d6a65 · 2026-04-12T21:25:39.000-04:00
- CometDeltaReadBenchmark: per-type read benchmarks mirroring Iceberg
- CometDeltaBenchmarkTest: end-to-end benchmark harness
- CometBenchmarkBase: add prepareDeltaTable alongside prepareIcebergTable
- create-delta-tables.py: TPC-H/TPC-DS Parquet-to-Delta converter
- comet-delta.toml / comet-delta-hashjoin.toml: TPC engine configs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/benchmarks/tpc/create-delta-tables.py b/benchmarks/tpc/create-delta-tables.py
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Convert TPC-H or TPC-DS Parquet data to Delta Lake tables.
+
+Usage:
+    spark-submit \
+        --packages io.delta:delta-spark_2.12:3.3.2 \
+        --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
+        --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
+        create-delta-tables.py \
+        --benchmark tpch \
+        --parquet-path /path/to/tpch/parquet \
+        --warehouse /path/to/delta-warehouse
+
+    spark-submit \
+        --packages io.delta:delta-spark_2.12:3.3.2 \
+        --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
+        --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
+        create-delta-tables.py \
+        --benchmark tpcds \
+        --parquet-path /path/to/tpcds/parquet \
+        --warehouse /path/to/delta-warehouse
+"""
+
+import argparse
+import os
+import sys
+
+from pyspark.sql import SparkSession
+
+
+TPCH_TABLES = [
+    "customer", "lineitem", "nation", "orders",
+    "part", "partsupp", "region", "supplier"
+]
+
+TPCDS_TABLES = [
+    "call_center", "catalog_page", "catalog_returns", "catalog_sales",
+    "customer", "customer_address", "customer_demographics", "date_dim",
+    "household_demographics", "income_band", "inventory", "item",
+    "promotion", "reason", "ship_mode", "store", "store_returns",
+    "store_sales", "time_dim", "warehouse", "web_page", "web_returns",
+    "web_sales", "web_site"
+]
+
+
+def create_delta_tables(spark, benchmark, parquet_path, warehouse):
+    tables = TPCH_TABLES if benchmark == "tpch" else TPCDS_TABLES
+
+    for table_name in tables:
+        input_path = os.path.join(parquet_path, table_name)
+        output_path = os.path.join(warehouse, table_name)
+
+        if not os.path.exists(input_path) and not input_path.startswith("s3"):
+            print(f"  Skipping {table_name}: {input_path} does not exist")
+            continue
+
+        print(f"  Converting {table_name}: {input_path} -> {output_path}")
+        df = spark.read.parquet(input_path)
+        df.write.format("delta").mode("overwrite").save(output_path)
+        print(f"    {table_name}: {df.count()} rows written")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert TPC Parquet data to Delta Lake tables"
+    )
+    parser.add_argument(
+        "--benchmark", required=True, choices=["tpch", "tpcds"],
+        help="Which TPC benchmark to convert"
+    )
+    parser.add_argument(
+        "--parquet-path", required=True,
+        help="Path to the TPC Parquet data directory"
+    )
+    parser.add_argument(
+        "--warehouse", required=True,
+        help="Path to the Delta warehouse directory"
+    )
+    args = parser.parse_args()
+
+    spark = SparkSession.builder \
+        .appName(f"Create Delta {args.benchmark.upper()} Tables") \
+        .getOrCreate()
+
+    print(f"Converting {args.benchmark.upper()} tables from Parquet to Delta...")
+    create_delta_tables(spark, args.benchmark, args.parquet_path, args.warehouse)
+    print("Done.")
+
+    spark.stop()
+
+
+# Run the conversion only when executed as a script (e.g. via spark-submit),
+# not when the module is imported.
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/tpc/engines/comet-delta-hashjoin.toml b/benchmarks/tpc/engines/comet-delta-hashjoin.toml
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TPC engine config: Comet with the native Delta scan, with joins steered
+# towards shuffled hash joins.
+[engine]
+name = "comet-delta-hashjoin"
+
+# Environment variables that must be set; they are referenced as $NAME below.
+[env]
+required = ["COMET_JAR", "DELTA_JAR", "DELTA_WAREHOUSE"]
+
+[spark_submit]
+jars = ["$COMET_JAR", "$DELTA_JAR"]
+driver_class_path = ["$COMET_JAR", "$DELTA_JAR"]
+
+[spark_conf]
+"spark.driver.extraClassPath" = "$COMET_JAR:$DELTA_JAR"
+"spark.executor.extraClassPath" = "$COMET_JAR:$DELTA_JAR"
+"spark.plugins" = "org.apache.spark.CometPlugin"
+"spark.shuffle.manager" = "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager"
+"spark.comet.expression.Cast.allowIncompatible" = "true"
+"spark.comet.enabled" = "true"
+"spark.comet.exec.enabled" = "true"
+"spark.comet.exec.shuffle.enabled" = "true"
+"spark.comet.scan.deltaNative.enabled" = "true"
+"spark.comet.explainFallback.enabled" = "true"
+"spark.sql.extensions" = "io.delta.sql.DeltaSparkSessionExtension"
+"spark.sql.catalog.spark_catalog" = "org.apache.spark.sql.delta.catalog.DeltaCatalog"
+# Join strategy. Broadcast joins are disabled so every join goes through a
+# shuffle. NOTE: with autoBroadcastJoinThreshold = -1, Spark's size test for
+# choosing a shuffled hash join (plan size < threshold * shuffle partitions)
+# cannot pass, so preferSortMergeJoin = false on its own still plans
+# sort-merge joins. Comet's replaceSortMergeJoin rewrite is what turns those
+# into hash joins.
+"spark.sql.join.preferSortMergeJoin" = "false"
+"spark.sql.autoBroadcastJoinThreshold" = "-1"
+"spark.comet.exec.replaceSortMergeJoin" = "true"
+
+# The benchmark reads its tables from the Delta warehouse directory.
+[tpcbench_args]
+data_path = "$DELTA_WAREHOUSE"
diff --git a/benchmarks/tpc/engines/comet-delta.toml b/benchmarks/tpc/engines/comet-delta.toml
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TPC engine config: Comet with the native Delta scan and Spark's default
+# join strategy selection.
+[engine]
+name = "comet-delta"
+
+# Environment variables that must be set; they are referenced as $NAME below.
+[env]
+required = ["COMET_JAR", "DELTA_JAR", "DELTA_WAREHOUSE"]
+
+[spark_submit]
+jars = ["$COMET_JAR", "$DELTA_JAR"]
+driver_class_path = ["$COMET_JAR", "$DELTA_JAR"]
+
+[spark_conf]
+"spark.driver.extraClassPath" = "$COMET_JAR:$DELTA_JAR"
+"spark.executor.extraClassPath" = "$COMET_JAR:$DELTA_JAR"
+"spark.plugins" = "org.apache.spark.CometPlugin"
+"spark.shuffle.manager" = "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager"
+"spark.comet.expression.Cast.allowIncompatible" = "true"
+"spark.comet.enabled" = "true"
+"spark.comet.exec.enabled" = "true"
+# Set explicitly, as in comet-delta-hashjoin.toml, so the two engines differ
+# only in their join settings.
+"spark.comet.exec.shuffle.enabled" = "true"
+"spark.comet.scan.deltaNative.enabled" = "true"
+"spark.comet.explainFallback.enabled" = "true"
+"spark.sql.extensions" = "io.delta.sql.DeltaSparkSessionExtension"
+"spark.sql.catalog.spark_catalog" = "org.apache.spark.sql.delta.catalog.DeltaCatalog"
+
+# The benchmark reads its tables from the Delta warehouse directory.
+[tpcbench_args]
+data_path = "$DELTA_WAREHOUSE"
diff --git a/spark/src/test/scala/org/apache/comet/CometDeltaBenchmarkTest.scala b/spark/src/test/scala/org/apache/comet/CometDeltaBenchmarkTest.scala
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import java.nio.file.Files
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.CometTestBase
+
+/**
+ * Quick benchmark comparing vanilla Spark+Delta vs Comet+Delta-kernel.
+ *
+ * Each test writes a synthetic Delta table into a temporary directory, runs the same query with
+ * Comet disabled and with Comet's native Delta scan enabled, and prints the wall-clock timings.
+ * Nothing is asserted about the speedup; the numbers are informational only.
+ *
+ * Run with: export SPARK_LOCAL_IP=127.0.0.1 && ./mvnw -Pspark-3.5 -pl spark -am test \
+ * -Dsuites=org.apache.comet.CometDeltaBenchmarkTest -Dmaven.gitcommitid.skip
+ */
+class CometDeltaBenchmarkTest extends CometTestBase {
+
+  /** Iterations executed before timing starts; their timings are discarded. */
+  private val warmupIters = 2
+
+  /** Timed iterations that contribute to the reported average. */
+  private val benchIters = 5
+
+  /** True when delta-spark is on the test classpath; the tests are cancelled otherwise. */
+  private def deltaSparkAvailable: Boolean =
+    try {
+      Class.forName("org.apache.spark.sql.delta.DeltaParquetFileFormat")
+      true
+    } catch {
+      case _: ClassNotFoundException => false
+    }
+
+  /** Adds the Delta session extension, catalog and test-only file-name settings. */
+  override protected def sparkConf: SparkConf = {
+    val conf = super.sparkConf
+    conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+    conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+    conf.set("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
+    conf.set("spark.databricks.delta.testOnly.dataFileNamePrefix", "")
+    conf.set("spark.databricks.delta.testOnly.dvFileNamePrefix", "")
+    conf
+  }
+
+  /** Deletes `file` and, if it is a directory, everything beneath it (best effort). */
+  private def deleteRecursively(file: java.io.File): Unit = {
+    if (file.isDirectory) {
+      // listFiles() returns null on I/O error, hence the Option wrapper.
+      Option(file.listFiles()).foreach(_.foreach(deleteRecursively))
+    }
+    file.delete()
+  }
+
+  /**
+   * Runs `query` `warmupIters + benchIters` times under the given SQL confs.
+   *
+   * @param query
+   *   SQL text to execute; results are collected to the driver and discarded.
+   * @param confs
+   *   SQL conf overrides applied for the duration of the runs.
+   * @return
+   *   elapsed wall-clock milliseconds of the timed (non-warmup) iterations, in run order.
+   */
+  private def timeQuery(query: String, confs: (String, String)*): Seq[Long] = {
+    val timings = new scala.collection.mutable.ArrayBuffer[Long]()
+    withSQLConf(confs: _*) {
+      for (i <- 0 until (warmupIters + benchIters)) {
+        val start = System.nanoTime()
+        spark.sql(query).collect()
+        val elapsed = (System.nanoTime() - start) / 1000000
+        if (i >= warmupIters) timings += elapsed
+      }
+    }
+    timings.toSeq
+  }
+
+  /**
+   * Creates a synthetic Delta table, times one query with and without Comet, and prints a report.
+   *
+   * The table has columns `id` (0 until numRows, long), `score` (id * 1.5, double) and `name`
+   * ("name_" + id), written as `numFiles` partitions. The temporary directory is removed
+   * afterwards, whether or not the benchmark succeeded.
+   *
+   * @param label
+   *   text inserted into the printed headers ("" or e.g. "Filter ").
+   * @param dirPrefix
+   *   prefix of the temporary directory holding the table.
+   * @param numRows
+   *   number of rows to generate.
+   * @param numFiles
+   *   number of partitions the data is repartitioned into before writing.
+   * @param queryFor
+   *   builds the SQL query from the absolute table path.
+   */
+  private def runBenchmark(label: String, dirPrefix: String, numRows: Int, numFiles: Int)(
+      queryFor: String => String): Unit = {
+    assume(deltaSparkAvailable, "delta-spark not on the test classpath")
+
+    val tempDir = Files.createTempDirectory(dirPrefix).toFile
+    try {
+      val tablePath = new java.io.File(tempDir, "bench").getAbsolutePath
+
+      // scalastyle:off println
+      println(s"\n=== Comet Delta ${label}Benchmark: $numRows rows, $numFiles files ===\n")
+      // scalastyle:on println
+
+      // Generate the rows on the executors rather than materialising millions of tuples in a
+      // driver-side collection first.
+      spark
+        .range(numRows.toLong)
+        .selectExpr(
+          "id",
+          "CAST(id AS DOUBLE) * 1.5D AS score",
+          "concat('name_', CAST(id AS STRING)) AS name")
+        .repartition(numFiles)
+        .write
+        .format("delta")
+        .save(tablePath)
+
+      val query = queryFor(tablePath)
+
+      // Vanilla Spark+Delta
+      val vanillaTimes = timeQuery(
+        query,
+        CometConf.COMET_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_ENABLED.key -> "false")
+
+      // Comet native Delta
+      val cometTimes = timeQuery(
+        query,
+        CometConf.COMET_ENABLED.key -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        CometConf.COMET_DELTA_NATIVE_ENABLED.key -> "true")
+
+      val vanillaAvg = vanillaTimes.sum.toDouble / vanillaTimes.size
+      val cometAvg = cometTimes.sum.toDouble / cometTimes.size
+      val speedup = vanillaAvg / cometAvg
+      val vanillaList = vanillaTimes.mkString(", ")
+      val cometList = cometTimes.mkString(", ")
+
+      // scalastyle:off println
+      println(f"\n=== ${label}Results (${benchIters} iterations, ${warmupIters} warmup) ===")
+      println(f"  Vanilla Spark+Delta: ${vanillaAvg}%.0f ms avg ($vanillaList ms)")
+      println(f"  Comet Native Delta:  ${cometAvg}%.0f ms avg ($cometList ms)")
+      println(f"  Speedup: ${speedup}%.2fx")
+      println()
+      // scalastyle:on println
+
+      // Don't assert on speedup - just report numbers.
+      // On debug builds the native path may actually be slower due to no LTO.
+    } finally {
+      deleteRecursively(tempDir)
+    }
+  }
+
+  test("benchmark: SUM aggregation - vanilla vs Comet native Delta") {
+    // 5M rows
+    runBenchmark("", "comet-delta-bench", numRows = 5 * 1000 * 1000, numFiles = 4) { tablePath =>
+      s"SELECT SUM(id), SUM(score) FROM delta.`$tablePath`"
+    }
+  }
+
+  test("benchmark: filter scan - vanilla vs Comet native Delta") {
+    val numRows = 2 * 1000 * 1000
+    runBenchmark("Filter ", "comet-delta-bench-filter", numRows, numFiles = 4) { tablePath =>
+      s"SELECT COUNT(*), SUM(score) FROM delta.`$tablePath` WHERE id > ${numRows / 2}"
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometBenchmarkBase.scala
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometDeltaReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometDeltaReadBenchmark.scala