Final approach

Kazantsev Maksim · Kazantsev Maksim · commit d8c7760f43a7 · 2026-01-06T19:15:50.000+04:00
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -241,25 +241,25 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] with Com
         if (!partitionSchemaSupported) {
           fallbackReasons += s"Partition schema ${scan.readPartitionSchema} is not supported"
         }
-        val columnNameOfCorruptedRecords =
+        val corruptedRecordsColumnName =
           SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
-        val hasNoCorruptedColumn =
-          !scan.readDataSchema.fieldNames.contains(columnNameOfCorruptedRecords)
-        if (!hasNoCorruptedColumn) {
-          fallbackReasons += "Comet doesn't support the processing of corrupted records in Spark"
+        val containsCorruptedRecordsColumn =
+          !scan.readDataSchema.fieldNames.contains(corruptedRecordsColumnName)
+        if (!containsCorruptedRecordsColumn) {
+          fallbackReasons += "Comet doesn't support the processing of corrupted records"
         }
-        val inferSchemaEnabled = scan.options.getBoolean("inferSchema", false)
-        if (inferSchemaEnabled) {
+        val isInferSchemaEnabled = scan.options.getBoolean("inferSchema", false)
+        if (isInferSchemaEnabled) {
           fallbackReasons += "Comet doesn't support inferSchema=true option"
         }
         val delimiter = scan.options.get("delimiter")
-        val isSingleCharDelimiter = delimiter.length == 1
-        if (!isSingleCharDelimiter) {
-          fallbackReasons += s"Comet doesn't support delimiter: '$delimiter' " +
-            s"with more then one character"
+        val isSingleCharacterDelimiter = delimiter.length == 1
+        if (!isSingleCharacterDelimiter) {
+          fallbackReasons +=
+            s"Comet supports only single-character delimiters, but got: '$delimiter'"
         }
-        if (schemaSupported && partitionSchemaSupported && hasNoCorruptedColumn
-          && !inferSchemaEnabled && isSingleCharDelimiter) {
+        if (schemaSupported && partitionSchemaSupported && containsCorruptedRecordsColumn
+          && !isInferSchemaEnabled && isSingleCharacterDelimiter) {
           CometBatchScanExec(
             scanExec.clone().asInstanceOf[BatchScanExec],
             runtimeFilters = scanExec.runtimeFilters)
diff --git a/spark/src/main/scala/org/apache/comet/testing/CsvGenerator.scala b/spark/src/main/scala/org/apache/comet/testing/CsvGenerator.scala
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometCsvNativeScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometCsvNativeScanExec.scala
@@ -29,12 +29,17 @@ import org.apache.spark.sql.execution.datasources.FilePartition
 import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.execution.datasources.v2.csv.CSVScan
 
+import com.google.common.base.Objects
+
 import org.apache.comet.{CometConf, ConfigEntry}
 import org.apache.comet.objectstore.NativeConfig
-import org.apache.comet.serde.{CometOperatorSerde, Compatible, OperatorOuterClass, SupportLevel}
+import org.apache.comet.serde.{CometOperatorSerde, OperatorOuterClass}
 import org.apache.comet.serde.OperatorOuterClass.Operator
 import org.apache.comet.serde.operator.{partition2Proto, schema2Proto}
 
+/**
+ * Native CSV scan operator that delegates file reading to DataFusion.
+ */
 case class CometCsvNativeScanExec(
     override val nativeOp: Operator,
     override val output: Seq[Attribute],
@@ -53,17 +58,28 @@ case class CometCsvNativeScanExec(
   override protected def doCanonicalize(): SparkPlan = {
     CometCsvNativeScanExec(nativeOp, output, originalPlan, serializedPlanOpt)
   }
+
+  override def equals(obj: Any): Boolean = {
+    obj match {
+      case other: CometCsvNativeScanExec =>
+        this.output == other.output &&
+        this.serializedPlanOpt == other.serializedPlanOpt &&
+        this.originalPlan == other.originalPlan
+      case _ =>
+        false
+    }
+  }
+
+  override def hashCode(): Int = {
+    Objects.hashCode(output, serializedPlanOpt, originalPlan)
+  }
 }
 
 object CometCsvNativeScanExec extends CometOperatorSerde[CometBatchScanExec] {
 
   override def enabledConfig: Option[ConfigEntry[Boolean]] = Some(
     CometConf.COMET_CSV_V2_NATIVE_ENABLED)
 
-  override def getSupportLevel(operator: CometBatchScanExec): SupportLevel = {
-    Compatible()
-  }
-
   override def convert(
       op: CometBatchScanExec,
       builder: Operator.Builder,
diff --git a/spark/src/test/scala/org/apache/comet/csv/CometCsvNativeReadSuite.scala b/spark/src/test/scala/org/apache/comet/csv/CometCsvNativeReadSuite.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
 
 import org.apache.comet.CometConf
 
-class CometCsvReadSuite extends CometTestBase {
+class CometCsvNativeReadSuite extends CometTestBase {
   private val TEST_CSV_PATH_NO_HEADER = "src/test/resources/test-data/csv-test-1.csv"
   private val TEST_CSV_PATH_HAS_HEADER = "src/test/resources/test-data/csv-test-2.csv"
 
@@ -73,7 +73,7 @@ class CometCsvReadSuite extends CometTestBase {
         .csv(TEST_CSV_PATH_NO_HEADER)
       checkSparkAnswerAndFallbackReason(
         df,
-        "Comet doesn't support the processing of corrupted records in Spark")
+        "Comet doesn't support the processing of corrupted records")
       df = spark.read
         .options(Map("header" -> "false", "delimiter" -> ",", "inferSchema" -> "true"))
         .csv(TEST_CSV_PATH_NO_HEADER)
@@ -83,7 +83,7 @@ class CometCsvReadSuite extends CometTestBase {
         .csv(TEST_CSV_PATH_NO_HEADER)
       checkSparkAnswerAndFallbackReason(
         df,
-        "Comet doesn't support delimiter: ',,' with more then one character")
+        "Comet supports only single-character delimiters, but got: ',,'")
     }
   }
 }
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometNativeCsvReadBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometNativeCsvReadBenchmark.scala
@@ -19,71 +19,168 @@
 
 package org.apache.spark.sql.benchmark
 
-import java.io.File
-
-import scala.util.Random
-
 import org.apache.spark.benchmark.Benchmark
-import org.apache.spark.sql.benchmark.CometExecBenchmark.withSQLConf
+import org.apache.spark.sql.benchmark.CometNativeCsvReadBenchmark.TPCHSchemas._
+import org.apache.spark.sql.execution.benchmark.TPCDSQueryBenchmarkArguments
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{DataTypes, StructType}
+import org.apache.spark.sql.types._
 
 import org.apache.comet.CometConf
-import org.apache.comet.testing.{CsvGenerator, FuzzDataGenerator, SchemaGenOptions}
 
 /**
- * Benchmark to measure Comet read performance. To run this benchmark:
+ * @param tableName
+ *   Name of the TPC-H table. Must match one of the standard table names: region, nation, part,
+ *   supplier, partsupp, customer, orders, lineitem.
+ *
+ * @param schema
+ *   Table data structure in Spark StructType format.
+ */
+case class NativeCsvReadConfig(tableName: String, schema: StructType)
+
+/**
+ * Benchmark to measure Comet CSV read performance. To run this benchmark:
  * `SPARK_GENERATE_BENCHMARK_FILES=1 make
- * benchmark-org.apache.spark.sql.benchmark.CometNativeCsvReadBenchmark` Results will be written
- * to "spark/benchmarks/CometNativeCsvReadBenchmark-**results.txt".
+ * benchmark-org.apache.spark.sql.benchmark.CometNativeCsvReadBenchmark -- --data-location
+ * /tmp/tpch` Results will be written to
+ * "spark/benchmarks/CometNativeCsvReadBenchmark-**results.txt".
  */
 object CometNativeCsvReadBenchmark extends CometBenchmarkBase {
 
-  private def prepareCsvTable(dir: File, schema: StructType, numRows: Int): Unit = {
-    val random = new Random(42)
-    CsvGenerator.makeCsvFile(random, spark, schema, dir.getCanonicalPath, numRows)
+  private def runNativeCsvBenchmark(
+      dataLocation: String,
+      tableName: String,
+      schema: StructType,
+      valuesPerPartition: Int,
+      numIters: Int): Unit = {
+    val benchmark =
+      new Benchmark(s"Native csv read - `$tableName` table", valuesPerPartition, output = output)
+    val filePath = s"$dataLocation/$tableName.csv"
+    benchmark.addCase("Spark", numIters) { _ =>
+      withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") {
+        spark.read
+          .schema(schema)
+          .options(Map("header" -> "true", "delimiter" -> ","))
+          .csv(filePath)
+          .noop()
+      }
+    }
+    benchmark.addCase("Native", numIters) { _ =>
+      withSQLConf(
+        CometConf.COMET_ENABLED.key -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        CometConf.COMET_CSV_V2_NATIVE_ENABLED.key -> "true",
+        SQLConf.USE_V1_SOURCE_LIST.key -> "") {
+        spark.read
+          .schema(schema)
+          .options(Map("header" -> "true", "delimiter" -> ","))
+          .csv(filePath)
+          .noop()
+      }
+    }
+    benchmark.run()
   }
 
+  private val testCases = Seq(
+    /* NativeCsvReadConfig("orders", ordersSchema),
+    NativeCsvReadConfig("region", regionSchema),*/
+    NativeCsvReadConfig("nation", nationSchema)
+    /*NativeCsvReadConfig("part", partSchema),
+    NativeCsvReadConfig("supplier", supplierSchema),
+    NativeCsvReadConfig("partsupp", partsuppSchema),
+    NativeCsvReadConfig("customer", customerSchema),
+    NativeCsvReadConfig("lineitem", lineitemSchema)*/ )
+
   override def runCometBenchmark(args: Array[String]): Unit = {
-    val numRows = 2000000
-    val benchmark = new Benchmark(s"Native csv read - $numRows rows", numRows, output = output)
-    withTempPath { dir =>
-      val schema = FuzzDataGenerator.generateSchema(
-        SchemaGenOptions(primitiveTypes = Seq(
-          DataTypes.BooleanType,
-          DataTypes.ByteType,
-          DataTypes.ShortType,
-          DataTypes.IntegerType,
-          DataTypes.LongType,
-          DataTypes.FloatType,
-          DataTypes.DoubleType,
-          DataTypes.createDecimalType(10, 2),
-          DataTypes.createDecimalType(36, 18),
-          DataTypes.DateType,
-          DataTypes.StringType)))
-      prepareCsvTable(dir, schema, numRows)
-      benchmark.addCase("Simple csv v2 read - spark") { _ =>
-        withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") {
-          spark.read
-            .schema(schema)
-            .csv(dir.getCanonicalPath)
-            .noop()
-        }
-      }
-      benchmark.addCase("Simple csv v2 read - comet native") { _ =>
-        withSQLConf(
-          CometConf.COMET_ENABLED.key -> "true",
-          CometConf.COMET_EXEC_ENABLED.key -> "true",
-          CometConf.COMET_CSV_V2_NATIVE_ENABLED.key -> "true",
-          CometConf.COMET_EXPLAIN_FALLBACK_ENABLED.key -> "true",
-          SQLConf.USE_V1_SOURCE_LIST.key -> "") {
-          spark.read
-            .schema(schema)
-            .csv(dir.getCanonicalPath)
-            .noop()
-        }
-      }
-      benchmark.run()
+    val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args)
+    val valuesPerPartition = 1024 * 1024 * 2
+    val numIters = 1
+    testCases.foreach { config =>
+      runNativeCsvBenchmark(
+        benchmarkArgs.dataLocation,
+        config.tableName,
+        config.schema,
+        valuesPerPartition,
+        numIters)
     }
   }
+
+  object TPCHSchemas {
+
+    val regionSchema: StructType = new StructType()
+      .add("r_regionkey", IntegerType, nullable = true)
+      .add("r_name", StringType, nullable = true)
+      .add("r_comment", StringType, nullable = true)
+
+    val nationSchema: StructType = new StructType()
+      .add("n_nationkey", IntegerType, nullable = true)
+      .add("n_name", StringType, nullable = true)
+      .add("n_regionkey", IntegerType, nullable = true)
+      .add("n_comment", StringType, nullable = true)
+
+    val partSchema: StructType = new StructType()
+      .add("p_partkey", IntegerType, nullable = true)
+      .add("p_name", StringType, nullable = true)
+      .add("p_mfgr", StringType, nullable = true)
+      .add("p_brand", StringType, nullable = true)
+      .add("p_type", StringType, nullable = true)
+      .add("p_size", IntegerType, nullable = true)
+      .add("p_container", StringType, nullable = true)
+      .add("p_retailprice", DoubleType, nullable = true)
+      .add("p_comment", StringType, nullable = true)
+
+    val supplierSchema: StructType = new StructType()
+      .add("s_suppkey", IntegerType, nullable = true)
+      .add("s_name", StringType, nullable = true)
+      .add("s_address", StringType, nullable = true)
+      .add("s_nationkey", IntegerType, nullable = true)
+      .add("s_phone", StringType, nullable = true)
+      .add("s_acctbal", DoubleType, nullable = true)
+      .add("s_comment", StringType, nullable = true)
+
+    val partsuppSchema: StructType = new StructType()
+      .add("ps_partkey", IntegerType, nullable = true)
+      .add("ps_suppkey", IntegerType, nullable = true)
+      .add("ps_availqty", IntegerType, nullable = true)
+      .add("ps_supplycost", DoubleType, nullable = true)
+      .add("ps_comment", StringType, nullable = true)
+
+    val customerSchema: StructType = new StructType()
+      .add("c_custkey", IntegerType, nullable = true)
+      .add("c_name", StringType, nullable = true)
+      .add("c_address", StringType, nullable = true)
+      .add("c_nationkey", IntegerType, nullable = true)
+      .add("c_phone", StringType, nullable = true)
+      .add("c_acctbal", DoubleType, nullable = true)
+      .add("c_mktsegment", StringType, nullable = true)
+      .add("c_comment", StringType, nullable = true)
+
+    val ordersSchema: StructType = new StructType()
+      .add("o_orderkey", IntegerType, nullable = true)
+      .add("o_custkey", IntegerType, nullable = true)
+      .add("o_orderstatus", StringType, nullable = true)
+      .add("o_totalprice", DoubleType, nullable = true)
+      .add("o_orderdate", DateType, nullable = true)
+      .add("o_orderpriority", StringType, nullable = true)
+      .add("o_clerk", StringType, nullable = true)
+      .add("o_shippriority", IntegerType, nullable = true)
+      .add("o_comment", StringType, nullable = true)
+
+    val lineitemSchema: StructType = new StructType()
+      .add("l_orderkey", IntegerType, nullable = true)
+      .add("l_partkey", IntegerType, nullable = true)
+      .add("l_suppkey", IntegerType, nullable = true)
+      .add("l_linenumber", IntegerType, nullable = true)
+      .add("l_quantity", IntegerType, nullable = true)
+      .add("l_extendedprice", DoubleType, nullable = true)
+      .add("l_discount", DoubleType, nullable = true)
+      .add("l_tax", DoubleType, nullable = true)
+      .add("l_returnflag", StringType, nullable = true)
+      .add("l_linestatus", StringType, nullable = true)
+      .add("l_shipdate", DateType, nullable = true)
+      .add("l_commitdate", DateType, nullable = true)
+      .add("l_receiptdate", DateType, nullable = true)
+      .add("l_shipinstruct", StringType, nullable = true)
+      .add("l_shipmode", StringType, nullable = true)
+      .add("l_comment", StringType, nullable = true)
+  }
 }