apache
diff --git a/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 0 deletions b/‎README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎common/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 12 additions & 5 deletions b/‎common/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 12 additions & 5 deletions
diff --git a/‎dev/ensure-jars-have-correct-contents.sh‎
Lines changed: 1 addition & 0 deletions b/‎dev/ensure-jars-have-correct-contents.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/contributor-guide/adding_a_new_expression.md‎
Lines changed: 23 additions & 0 deletions b/‎docs/source/contributor-guide/adding_a_new_expression.md‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎docs/source/user-guide/latest/configs.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/user-guide/latest/configs.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎native/core/src/execution/operators/parquet_writer.rs‎
Lines changed: 67 additions & 53 deletions b/‎native/core/src/execution/operators/parquet_writer.rs‎
Lines changed: 67 additions & 53 deletions
diff --git a/‎native/core/src/execution/planner.rs‎
Lines changed: 7 additions & 0 deletions b/‎native/core/src/execution/planner.rs‎
Lines changed: 7 additions & 0 deletions
@@ -122,6 +122,7 @@ jobs:
               org.apache.comet.exec.CometAsyncShuffleSuite
               org.apache.comet.exec.DisableAQECometShuffleSuite
               org.apache.comet.exec.DisableAQECometAsyncShuffleSuite
+              org.apache.spark.shuffle.sort.SpillSorterSuite
           - name: "parquet"
             value: |
               org.apache.comet.parquet.CometParquetWriterSuite
@@ -160,6 +161,7 @@ jobs:
             value: |
               org.apache.comet.CometExpressionSuite
               org.apache.comet.CometExpressionCoverageSuite
+              org.apache.comet.CometHashExpressionSuite
               org.apache.comet.CometTemporalExpressionSuite
               org.apache.comet.CometArrayExpressionSuite
               org.apache.comet.CometCastSuite
 
@@ -85,6 +85,7 @@ jobs:
               org.apache.comet.exec.CometAsyncShuffleSuite
               org.apache.comet.exec.DisableAQECometShuffleSuite
               org.apache.comet.exec.DisableAQECometAsyncShuffleSuite
+              org.apache.spark.shuffle.sort.SpillSorterSuite
           - name: "parquet"
             value: |
               org.apache.comet.parquet.CometParquetWriterSuite
@@ -123,6 +124,7 @@ jobs:
             value: |
               org.apache.comet.CometExpressionSuite
               org.apache.comet.CometExpressionCoverageSuite
+              org.apache.comet.CometHashExpressionSuite
               org.apache.comet.CometTemporalExpressionSuite
               org.apache.comet.CometArrayExpressionSuite
               org.apache.comet.CometCastSuite
 
@@ -21,11 +21,14 @@ under the License.
 
 [![Apache licensed][license-badge]][license-url]
 [![Discord chat][discord-badge]][discord-url]
+[![Pending PRs][pending-pr-badge]][pending-pr-url]
 
 [license-badge]: https://img.shields.io/badge/license-Apache%20v2-blue.svg
 [license-url]: https://github.com/apache/datafusion-comet/blob/main/LICENSE.txt
 [discord-badge]: https://img.shields.io/discord/885562378132000778.svg?logo=discord&style=flat-square
 [discord-url]: https://discord.gg/3EAr4ZX6JK
+[pending-pr-badge]: https://img.shields.io/github/issues-search/apache/datafusion-comet?query=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess&label=Pending%20PRs&logo=github
+[pending-pr-url]: https://github.com/apache/datafusion-comet/pulls?q=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess+sort%3Aupdated-desc
 
 <img src="docs/source/_static/images/DataFusionComet-Logo-Light.png" width="512" alt="logo"/>
 
 
@@ -111,6 +111,10 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  // Deprecated: native_comet uses mutable buffers incompatible with Arrow FFI best practices
+  // and does not support complex types. Use native_iceberg_compat or auto instead.
+  // See: https://github.com/apache/datafusion-comet/issues/2186
+  @deprecated("Use SCAN_AUTO instead", "0.9.0")
   val SCAN_NATIVE_COMET = "native_comet"
   val SCAN_NATIVE_DATAFUSION = "native_datafusion"
   val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"
@@ -121,11 +125,14 @@ object CometConf extends ShimCometConf {
     .doc(
       s"The implementation of Comet Native Scan to use. Available modes are `$SCAN_NATIVE_COMET`," +
         s"`$SCAN_NATIVE_DATAFUSION`, and `$SCAN_NATIVE_ICEBERG_COMPAT`. " +
-        s"`$SCAN_NATIVE_COMET` is for the original Comet native scan which uses a jvm based " +
-        "parquet file reader and native column decoding. Supports simple types only " +
-        s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation of scan based on DataFusion" +
-        s"`$SCAN_NATIVE_ICEBERG_COMPAT` is a native implementation that exposes apis to read " +
-        s"parquet columns natively. `$SCAN_AUTO` chooses the best scan.")
+        s"`$SCAN_NATIVE_COMET` (DEPRECATED) is for the original Comet native scan which " +
+        "uses a jvm based parquet file reader and native column decoding. " +
+        "Supports simple types only. " +
+        s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation of scan based on " +
+        "DataFusion. " +
+        s"`$SCAN_NATIVE_ICEBERG_COMPAT` is the recommended native implementation that " +
+        "exposes apis to read parquet columns natively and supports complex types. " +
+        s"`$SCAN_AUTO` (default) chooses the best scan.")
     .internal()
     .stringConf
     .transform(_.toLowerCase(Locale.ROOT))
 
@@ -86,6 +86,7 @@ allowed_expr+="|^org/apache/spark/shuffle/$"
 allowed_expr+="|^org/apache/spark/shuffle/sort/$"
 allowed_expr+="|^org/apache/spark/shuffle/sort/CometShuffleExternalSorter.*$"
 allowed_expr+="|^org/apache/spark/shuffle/sort/RowPartition.class$"
+allowed_expr+="|^org/apache/spark/shuffle/sort/SpillSorter.*$"
 allowed_expr+="|^org/apache/spark/shuffle/comet/.*$"
 allowed_expr+="|^org/apache/spark/sql/$"
 # allow ExplainPlanGenerator trait since it may not be available in older Spark versions
 
@@ -236,6 +236,29 @@ test("unhex") {
 }
 ```
 
+#### Testing with Literal Values
+
+When writing tests that use literal values (e.g., `SELECT my_func('literal')`), Spark's constant folding optimizer may evaluate the expression at planning time rather than execution time. This means your Comet implementation might not actually be exercised during the test.
+
+To ensure literal expressions are executed by Comet, exclude Spark's `ConstantFolding` optimizer rule for the duration of the test:
+
+```scala
+test("my_func with literals") {
+  withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+      "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
+    checkSparkAnswerAndOperator("SELECT my_func('literal_value')")
+  }
+}
+```
+
+This is particularly important for:
+
+- Edge case tests using specific literal values (e.g., null handling, overflow conditions)
+- Tests verifying behavior with special input values
+- Any test where the expression inputs are entirely literal
+
+When possible, prefer testing with column references from tables (as shown in the `unhex` example above), which naturally avoids the constant folding issue.
+
 ### Adding the Expression To the Protobuf Definition
 
 Once you have the expression implemented in Scala, you might need to update the protobuf definition to include the new expression. You may not need to do this if the expression is already covered by the existing protobuf definition (e.g. you're adding a new scalar function that uses the `ScalarFunc` message).
 
@@ -333,6 +333,7 @@ These settings can be used to determine which parts of the plan are accelerated
 | `spark.comet.expression.TruncTimestamp.enabled` | Enable Comet acceleration for `TruncTimestamp` | true |
 | `spark.comet.expression.UnaryMinus.enabled` | Enable Comet acceleration for `UnaryMinus` | true |
 | `spark.comet.expression.Unhex.enabled` | Enable Comet acceleration for `Unhex` | true |
+| `spark.comet.expression.UnixDate.enabled` | Enable Comet acceleration for `UnixDate` | true |
 | `spark.comet.expression.UnscaledValue.enabled` | Enable Comet acceleration for `UnscaledValue` | true |
 | `spark.comet.expression.Upper.enabled` | Enable Comet acceleration for `Upper` | true |
 | `spark.comet.expression.WeekDay.enabled` | Enable Comet acceleration for `WeekDay` | true |
 
@@ -19,16 +19,20 @@
 
 use std::{
     any::Any,
+    collections::HashMap,
     fmt,
     fmt::{Debug, Formatter},
     fs::File,
     io::Cursor,
     sync::Arc,
 };
 
-use opendal::{services::Hdfs, Operator};
-use url::Url;
+use opendal::Operator;
 
+use crate::execution::shuffle::CompressionCodec;
+use crate::parquet::parquet_support::{
+    create_hdfs_operator, is_hdfs_scheme, prepare_object_store_with_configs,
+};
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use async_trait::async_trait;
@@ -50,8 +54,7 @@ use parquet::{
     basic::{Compression, ZstdLevel},
     file::properties::WriterProperties,
 };
-
-use crate::execution::shuffle::CompressionCodec;
+use url::Url;
 
 /// Enum representing different types of Arrow writers based on storage backend
 enum ParquetWriter {
@@ -200,6 +203,8 @@ pub struct ParquetWriterExec {
     partition_id: i32,
     /// Column names to use in the output Parquet file
     column_names: Vec<String>,
+    /// Object store configuration options
+    object_store_options: HashMap<String, String>,
     /// Metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache for plan properties
@@ -218,6 +223,7 @@ impl ParquetWriterExec {
         compression: CompressionCodec,
         partition_id: i32,
         column_names: Vec<String>,
+        object_store_options: HashMap<String, String>,
     ) -> Result<Self> {
         // Preserve the input's partitioning so each partition writes its own file
         let input_partitioning = input.output_partitioning().clone();
@@ -238,6 +244,7 @@ impl ParquetWriterExec {
             compression,
             partition_id,
             column_names,
+            object_store_options,
             metrics: ExecutionPlanMetricsSet::new(),
             cache,
         })
@@ -255,10 +262,11 @@ impl ParquetWriterExec {
     /// Create an Arrow writer based on the storage scheme
     ///
     /// # Arguments
-    /// * `storage_scheme` - The storage backend ("hdfs", "s3", or "local")
     /// * `output_file_path` - The full path to the output file
     /// * `schema` - The Arrow schema for the Parquet file
     /// * `props` - Writer properties including compression
+    /// * `runtime_env` - Runtime environment for object store registration
+    /// * `object_store_options` - Configuration options for object store
     ///
     /// # Returns
     /// * `Ok(ParquetWriter)` - A writer appropriate for the storage scheme
@@ -267,71 +275,61 @@ impl ParquetWriterExec {
         output_file_path: &str,
         schema: SchemaRef,
         props: WriterProperties,
+        runtime_env: Arc<datafusion::execution::runtime_env::RuntimeEnv>,
+        object_store_options: &HashMap<String, String>,
     ) -> Result<ParquetWriter> {
-        // Determine storage scheme from output_file_path
-        let storage_scheme = if output_file_path.starts_with("hdfs://") {
-            "hdfs"
-        } else if output_file_path.starts_with("s3://") || output_file_path.starts_with("s3a://") {
-            "s3"
-        } else {
-            "local"
-        };
+        // Parse URL and match on storage scheme directly
+        let url = Url::parse(output_file_path).map_err(|e| {
+            DataFusionError::Execution(format!("Failed to parse URL '{}': {}", output_file_path, e))
+        })?;
 
-        match storage_scheme {
-            "hdfs" => {
-                // Parse the output_file_path to extract namenode and path
-                // Expected format: hdfs://namenode:port/path/to/file
-                let url = Url::parse(output_file_path).map_err(|e| {
+        if is_hdfs_scheme(&url, object_store_options) {
+            // HDFS storage
+            {
+                // Use prepare_object_store_with_configs to create and register the object store
+                let (_object_store_url, object_store_path) = prepare_object_store_with_configs(
+                    runtime_env,
+                    output_file_path.to_string(),
+                    object_store_options,
+                )
+                .map_err(|e| {
                     DataFusionError::Execution(format!(
-                        "Failed to parse HDFS URL '{}': {}",
+                        "Failed to prepare object store for '{}': {}",
                         output_file_path, e
                     ))
                 })?;
 
-                // Extract namenode (scheme + host + port)
-                let namenode = format!(
-                    "{}://{}{}",
-                    url.scheme(),
-                    url.host_str().unwrap_or("localhost"),
-                    url.port()
-                        .map(|p| format!(":{}", p))
-                        .unwrap_or_else(|| ":9000".to_string())
-                );
-
-                // Extract the path (without the scheme and host)
-                let hdfs_path = url.path().to_string();
-
                 // For remote storage (HDFS, S3), write to an in-memory buffer
                 let buffer = Vec::new();
                 let cursor = Cursor::new(buffer);
                 let arrow_parquet_buffer_writer = ArrowWriter::try_new(cursor, schema, Some(props))
                     .map_err(|e| {
-                        DataFusionError::Execution(format!(
-                            "Failed to create {} writer: {}",
-                            storage_scheme, e
-                        ))
+                        DataFusionError::Execution(format!("Failed to create HDFS writer: {}", e))
                     })?;
 
-                let builder = Hdfs::default().name_node(&namenode);
-                let op = Operator::new(builder)
-                    .map_err(|e| {
-                        DataFusionError::Execution(format!(
-                            "Failed to create HDFS operator for '{}' (namenode: {}): {}",
-                            output_file_path, namenode, e
-                        ))
-                    })?
-                    .finish();
+                // Create the HDFS operator from the parsed URL using the shared helper function
+                let op = create_hdfs_operator(&url).map_err(|e| {
+                    DataFusionError::Execution(format!(
+                        "Failed to create HDFS operator for '{}': {}",
+                        output_file_path, e
+                    ))
+                })?;
 
                 // HDFS writer will be created lazily on first write
-                // Use only the path part for the HDFS writer
+                // Use the path from prepare_object_store_with_configs
                 Ok(ParquetWriter::Remote(
                     arrow_parquet_buffer_writer,
                     None,
                     op,
-                    hdfs_path,
+                    object_store_path.to_string(),
                 ))
             }
-            "local" => {
+        } else if output_file_path.starts_with("file://")
+            || output_file_path.starts_with("file:")
+            || !output_file_path.contains("://")
+        {
+            // Local file system
+            {
                 // For a local file system, write directly to file
                 // Strip file:// or file: prefix if present
                 let local_path = output_file_path
@@ -368,10 +366,12 @@ impl ParquetWriterExec {
                 })?;
                 Ok(ParquetWriter::LocalFile(writer))
             }
-            _ => Err(DataFusionError::Execution(format!(
-                "Unsupported storage scheme: {}",
-                storage_scheme
-            ))),
+        } else {
+            // Unsupported storage scheme
+            Err(DataFusionError::Execution(format!(
+                "Unsupported storage scheme in path: {}",
+                output_file_path
+            )))
         }
     }
 }
@@ -435,6 +435,7 @@ impl ExecutionPlan for ParquetWriterExec {
                 self.compression.clone(),
                 self.partition_id,
                 self.column_names.clone(),
+                self.object_store_options.clone(),
             )?)),
             _ => Err(DataFusionError::Internal(
                 "ParquetWriterExec requires exactly one child".to_string(),
@@ -454,6 +455,7 @@ impl ExecutionPlan for ParquetWriterExec {
         let bytes_written = MetricBuilder::new(&self.metrics).counter("bytes_written", partition);
         let rows_written = MetricBuilder::new(&self.metrics).counter("rows_written", partition);
 
+        let runtime_env = context.runtime_env();
         let input = self.input.execute(partition, context)?;
         let input_schema = self.input.schema();
         let work_dir = self.work_dir.clone();
@@ -488,7 +490,14 @@ impl ExecutionPlan for ParquetWriterExec {
             .set_compression(compression)
             .build();
 
-        let mut writer = Self::create_arrow_writer(&part_file, Arc::clone(&output_schema), props)?;
+        let object_store_options = self.object_store_options.clone();
+        let mut writer = Self::create_arrow_writer(
+            &part_file,
+            Arc::clone(&output_schema),
+            props,
+            runtime_env,
+            &object_store_options,
+        )?;
 
         // Clone schema for use in async closure
         let schema_for_write = Arc::clone(&output_schema);
@@ -732,10 +741,14 @@ mod tests {
         // Create ParquetWriter using the create_arrow_writer method
         // Use full HDFS URL format
         let full_output_path = format!("hdfs://namenode:9000{}", output_path);
+        let session_ctx = datafusion::prelude::SessionContext::new();
+        let runtime_env = session_ctx.runtime_env();
         let mut writer = ParquetWriterExec::create_arrow_writer(
             &full_output_path,
             create_test_record_batch(1)?.schema(),
             props,
+            runtime_env,
+            &HashMap::new(),
         )?;
 
         // Write 5 batches in a loop
@@ -802,6 +815,7 @@ mod tests {
             CompressionCodec::None,
             0, // partition_id
             column_names,
+            HashMap::new(), // object_store_options
         )?;
 
         // Create a session context and execute the plan
 
@@ -1248,6 +1248,12 @@ impl PhysicalPlanner {
                     ))),
                 }?;
 
+                let object_store_options: HashMap<String, String> = writer
+                    .object_store_options
+                    .iter()
+                    .map(|(k, v)| (k.clone(), v.clone()))
+                    .collect();
+
                 let parquet_writer = Arc::new(ParquetWriterExec::try_new(
                     Arc::clone(&child.native_plan),
                     writer.output_path.clone(),
@@ -1261,6 +1267,7 @@ impl PhysicalPlanner {
                     codec,
                     self.partition,
                     writer.column_names.clone(),
+                    object_store_options,
                 )?);
 
                 Ok((