apache
diff --git a/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎AGENTS.md‎
Lines changed: 35 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 4 deletions b/‎README.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎common/src/main/java/org/apache/comet/udf/CometUdfBridge.java‎
Lines changed: 42 additions & 54 deletions b/‎common/src/main/java/org/apache/comet/udf/CometUdfBridge.java‎
Lines changed: 42 additions & 54 deletions
diff --git a/‎dev/diffs/3.4.3.diff‎
Lines changed: 1 addition & 1 deletion b/‎dev/diffs/3.4.3.diff‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/diffs/3.5.8.diff‎
Lines changed: 1 addition & 1 deletion b/‎dev/diffs/3.5.8.diff‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/diffs/4.0.2.diff‎
Lines changed: 6 additions & 16 deletions b/‎dev/diffs/4.0.2.diff‎
Lines changed: 6 additions & 16 deletions
@@ -329,6 +329,7 @@ jobs:
               org.apache.comet.parquet.ParquetReadV1Suite
               org.apache.comet.parquet.ParquetReadV2Suite
               org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite
+              org.apache.comet.parquet.ParquetTimestampLtzAsNtzSuite
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
               org.apache.spark.sql.comet.ParquetEncryptionITCase
 
@@ -177,6 +177,7 @@ jobs:
               org.apache.comet.parquet.ParquetReadV1Suite
               org.apache.comet.parquet.ParquetReadV2Suite
               org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite
+              org.apache.comet.parquet.ParquetTimestampLtzAsNtzSuite
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
               org.apache.spark.sql.comet.ParquetEncryptionITCase
 
@@ -0,0 +1,35 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Agent Guidelines for Apache DataFusion Comet
+
+Read the [Contributor Guide](docs/source/contributor-guide/index.md) before making changes.
+Relevant entry points:
+
+- [Development Guide](docs/source/contributor-guide/development.md): build, test, and common
+  pitfalls (including why `-pl` must not be used and the JVM/native build order).
+- [Spark SQL Tests](docs/source/contributor-guide/spark-sql-tests.md): the only supported way
+  to modify files under `dev/diffs/`. **Never hand-edit a diff file.** Clone Spark, apply the
+  existing diff, modify the Spark source, then regenerate the diff as documented there.
+- [Adding a New Expression](docs/source/contributor-guide/adding_a_new_expression.md) /
+  [Adding a New Operator](docs/source/contributor-guide/adding_a_new_operator.md).
+- [Debugging Guide](docs/source/contributor-guide/debugging.md).
+
+When opening a pull request, use the [PR template](.github/pull_request_template.md) and fill
+in every section.
@@ -40,17 +40,17 @@ Apache DataFusion Comet is a high-performance accelerator for Apache Spark, buil
 performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
 Spark ecosystem without requiring any code changes.
 
-**Comet provides a 2x speedup for TPC-H @ 1TB, resulting in 50% cost savings.**
+**Comet provides a ~2x speedup for TPC-DS @ SF 1000 (1TB), resulting in ~50% cost savings.**
 
 That 2x speedup gives you a choice: finish the same Spark workload in half the time on the cluster you already have,
 or match your current Spark performance on roughly half the resources. Either way, the gain translates directly into
 lower cloud bills, reduced on-prem capacity, and lower energy usage, with no changes to your existing Spark SQL,
 DataFrame, or PySpark code. Comet runs on commodity hardware: no GPUs, FPGAs, or other specialized accelerators are
 required, so the savings come from better utilization of the infrastructure you already run on.
 
-![](docs/source/_static/images/benchmark-results/0.15.0/tpch_allqueries.png)
+![](docs/source/_static/images/benchmark-results/0.16.0/tpcds_allqueries.png)
 
-![](docs/source/_static/images/benchmark-results/0.15.0/tpch_queries_compare.png)
+![](docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_rel.png)
 
 See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html) for more details.
 
@@ -81,7 +81,7 @@ benefits of Comet's acceleration capabilities without disrupting your Spark appl
 
 ## Getting Started
 
-Comet supports Apache Spark 3.4, 3.5, and 4.0, and provides experimental support for Spark 4.1 and 4.2. See the
+Comet supports Apache Spark 3.4, 3.5, 4.0, and 4.1, and provides experimental support for Spark 4.2. See the
 [installation guide](https://datafusion.apache.org/comet/user-guide/installation.html) for the detailed
 version, Java, and Scala compatibility matrix.
 
 
@@ -19,8 +19,7 @@
 
 package org.apache.comet.udf;
 
-import java.util.LinkedHashMap;
-import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.arrow.c.ArrowArray;
 import org.apache.arrow.c.ArrowSchema;
@@ -38,23 +37,10 @@
  */
 public class CometUdfBridge {
 
-  // Per-thread, bounded LRU of UDF instances keyed by class name. Comet
-  // native execution threads (Tokio/DataFusion worker pool) are reused
-  // across tasks within an executor, so the effective lifetime of cached
-  // entries is the worker thread (i.e. the executor JVM). Fine for
-  // stateless UDFs; future stateful UDFs would need explicit per-task
-  // isolation.
-  private static final int CACHE_CAPACITY = 64;
-
-  private static final ThreadLocal<LinkedHashMap<String, CometUDF>> INSTANCES =
-      ThreadLocal.withInitial(
-          () ->
-              new LinkedHashMap<String, CometUDF>(CACHE_CAPACITY, 0.75f, true) {
-                @Override
-                protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
-                  return size() > CACHE_CAPACITY;
-                }
-              });
+  // Process-wide, unbounded cache of UDF instances keyed by class name only (the first caller's
+  // classloader wins). CometUDF implementations are required to be stateless (see CometUDF), so
+  // a single shared instance per class is safe across native worker threads.
+  private static final ConcurrentHashMap<String, CometUDF> INSTANCES = new ConcurrentHashMap<>();
 
   /**
    * Called from native via JNI.
@@ -64,19 +50,15 @@ protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
    * @param inputSchemaPtrs addresses of pre-allocated FFI_ArrowSchema structs (one per input)
    * @param outArrayPtr address of pre-allocated FFI_ArrowArray for the result
    * @param outSchemaPtr address of pre-allocated FFI_ArrowSchema for the result
-   * @param numRows number of rows in the current batch. Mirrors DataFusion's {@code
-   *     ScalarFunctionArgs.number_rows} and gives UDFs an explicit batch-size signal for cases
-   *     where no input arg is a batch-length array (e.g. a zero-arg non-deterministic ScalaUDF).
-   *     UDFs that already read size from their input vectors can ignore it.
-   * @param taskContext Spark {@link TaskContext} captured on the driving Spark task thread and
-   *     passed through from native. May be {@code null} when the bridge is invoked outside a Spark
-   *     task (unit tests, direct native driver runs). When non-null and the current thread has no
-   *     {@code TaskContext} of its own, the bridge installs it as the thread-local for the duration
-   *     of the UDF call so the UDF body (including partition-sensitive built-ins like {@code Rand}
-   *     / {@code Uuid} / {@code MonotonicallyIncreasingID} that read the partition index via {@code
-   *     TaskContext.get().partitionId()}) sees the real context rather than null. The thread-local
-   *     is cleared in a {@code finally} so Tokio workers don't leak a stale TaskContext across
-   *     invocations.
+   * @param numRows row count of the current batch. Mirrors DataFusion's {@code
+   *     ScalarFunctionArgs.number_rows}; the only batch-size signal a zero-input UDF (e.g. a
+   *     zero-arg non-deterministic ScalaUDF) ever sees.
+   * @param taskContext propagated Spark {@link TaskContext} from the driving Spark task thread, or
+   *     {@code null} outside a Spark task. When non-null it is the ground truth for the call:
+   *     installed as the thread-local on entry, with the prior value (if any) restored in {@code
+   *     finally}; when {@code null} the thread-local is left untouched. Lets partition-sensitive
+   *     built-ins ({@code Rand}, {@code Uuid}, {@code MonotonicallyIncreasingID}) work from Tokio
+   *     workers and avoids reusing a stale TaskContext left on a worker by a previous task.
    */
   public static void evaluate(
       String udfClassName,
@@ -86,17 +68,23 @@ public static void evaluate(
       long outSchemaPtr,
       int numRows,
       TaskContext taskContext) {
-    boolean installedTaskContext = false;
-    if (taskContext != null && TaskContext.get() == null) {
+    // Save-and-restore rather than only-install-if-null: the propagated context is the ground
+    // truth for this call. Any value already on the thread is either (a) the same object on a
+    // Spark task thread, or (b) stale from a prior task on a reused Tokio worker.
+    TaskContext prior = TaskContext.get();
+    if (taskContext != null) {
       CometTaskContextShim.set(taskContext);
-      installedTaskContext = true;
     }
     try {
       evaluateInternal(
           udfClassName, inputArrayPtrs, inputSchemaPtrs, outArrayPtr, outSchemaPtr, numRows);
     } finally {
-      if (installedTaskContext) {
-        CometTaskContextShim.unset();
+      if (taskContext != null) {
+        if (prior != null) {
+          CometTaskContextShim.set(prior);
+        } else {
+          CometTaskContextShim.unset();
+        }
       }
     }
   }
@@ -108,23 +96,23 @@ private static void evaluateInternal(
       long outArrayPtr,
       long outSchemaPtr,
       int numRows) {
-    LinkedHashMap<String, CometUDF> cache = INSTANCES.get();
-    CometUDF udf = cache.get(udfClassName);
-    if (udf == null) {
-      try {
-        // Resolve via the executor's context classloader so user-supplied UDF jars
-        // (added via spark.jars / --jars) are visible.
-        ClassLoader cl = Thread.currentThread().getContextClassLoader();
-        if (cl == null) {
-          cl = CometUdfBridge.class.getClassLoader();
-        }
-        udf =
-            (CometUDF) Class.forName(udfClassName, true, cl).getDeclaredConstructor().newInstance();
-      } catch (ReflectiveOperationException e) {
-        throw new RuntimeException("Failed to instantiate CometUDF: " + udfClassName, e);
-      }
-      cache.put(udfClassName, udf);
-    }
+    CometUDF udf =
+        INSTANCES.computeIfAbsent(
+            udfClassName,
+            name -> {
+              try {
+                // Resolve via the executor's context classloader so user-supplied UDF jars
+                // (added via spark.jars / --jars) are visible.
+                ClassLoader cl = Thread.currentThread().getContextClassLoader();
+                if (cl == null) {
+                  cl = CometUdfBridge.class.getClassLoader();
+                }
+                return (CometUDF)
+                    Class.forName(name, true, cl).getDeclaredConstructor().newInstance();
+              } catch (ReflectiveOperationException e) {
+                throw new RuntimeException("Failed to instantiate CometUDF: " + name, e);
+              }
+            });
 
     BufferAllocator allocator = org.apache.comet.package$.MODULE$.CometArrowAllocator();
 
 
@@ -2170,7 +2170,7 @@ index 29cb224c878..ee5a87fa200 100644
 
 -  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ") {
 +  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3720")) {
++    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/4219")) {
      val data = (1 to 1000).map { i =>
        val ts = new java.sql.Timestamp(i)
        Row(ts)
 
@@ -2137,7 +2137,7 @@ index f6472ba3d9d..5ea2d938664 100644
 
 -  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ") {
 +  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3720")) {
++    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/4219")) {
      val data = (1 to 1000).map { i =>
        val ts = new java.sql.Timestamp(i)
        Row(ts)
 
@@ -2729,7 +2729,7 @@ index 4474ec1fd42..05fa0257c82 100644
        checkAnswer(
          // "fruit" column in this file is encoded using DELTA_LENGTH_BYTE_ARRAY.
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
-index bba71f1c48d..35247c13ad9 100644
+index bba71f1c48d..5a111a937a9 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
@@ -27,6 +27,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat
@@ -2740,17 +2740,7 @@ index bba71f1c48d..35247c13ad9 100644
  import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
  import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
  import org.apache.spark.sql.catalyst.util.ArrayData
-@@ -185,7 +186,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
-     }
-   }
- 
--  test("SPARK-47447: read TimestampLTZ as TimestampNTZ") {
-+  test("SPARK-47447: read TimestampLTZ as TimestampNTZ",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3720")) {
-     val providedSchema = StructType(Seq(StructField("time", TimestampNTZType, false)))
- 
-     Seq("INT96", "TIMESTAMP_MICROS", "TIMESTAMP_MILLIS").foreach { tsType =>
-@@ -996,7 +998,11 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -996,7 +997,11 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
          Seq(Some("A"), Some("A"), None).toDF().repartition(1)
            .write.parquet(path.getAbsolutePath)
          val df = spark.read.parquet(path.getAbsolutePath)
@@ -2763,7 +2753,7 @@ index bba71f1c48d..35247c13ad9 100644
        }
      }
    }
-@@ -1042,7 +1048,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -1042,7 +1047,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
      testMigration(fromTsType = "TIMESTAMP_MICROS", toTsType = "INT96")
    }
 
@@ -2773,7 +2763,7 @@ index bba71f1c48d..35247c13ad9 100644
      def readParquet(schema: String, path: File): DataFrame = {
        spark.read.schema(schema).parquet(path.toString)
      }
-@@ -1060,7 +1067,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -1060,7 +1066,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
          checkAnswer(readParquet(schema2, path), df)
        }
 
@@ -2783,7 +2773,7 @@ index bba71f1c48d..35247c13ad9 100644
          val schema1 = "a DECIMAL(3, 2), b DECIMAL(18, 3), c DECIMAL(37, 3)"
          checkAnswer(readParquet(schema1, path), df)
          val schema2 = "a DECIMAL(3, 0), b DECIMAL(18, 1), c DECIMAL(37, 1)"
-@@ -1084,7 +1092,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -1084,7 +1091,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
        val df = sql(s"SELECT 1 a, 123456 b, ${Int.MaxValue.toLong * 10} c, CAST('1.2' AS BINARY) d")
        df.write.parquet(path.toString)
 
@@ -2793,7 +2783,7 @@ index bba71f1c48d..35247c13ad9 100644
          checkAnswer(readParquet("a DECIMAL(3, 2)", path), sql("SELECT 1.00"))
          checkAnswer(readParquet("a DECIMAL(11, 2)", path), sql("SELECT 1.00"))
          checkAnswer(readParquet("b DECIMAL(3, 2)", path), Row(null))
-@@ -1131,7 +1140,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -1131,7 +1139,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
      }
    }