apache
diff --git a/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 13 additions & 5 deletions b/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 13 additions & 5 deletions
diff --git a/‎.github/workflows/spark_sql_test.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/spark_sql_test.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎dev/benchmarks/comet-tpch.sh‎
Lines changed: 0 additions & 1 deletion b/‎dev/benchmarks/comet-tpch.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 5 additions & 5 deletions b/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs/source/user-guide/latest/configs.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/source/user-guide/latest/configs.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎native/Cargo.lock‎
Lines changed: 7 additions & 6 deletions b/‎native/Cargo.lock‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎native/core/src/execution/expressions/mod.rs‎
Lines changed: 1 addition & 0 deletions b/‎native/core/src/execution/expressions/mod.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎native/core/src/execution/expressions/strings.rs‎
Lines changed: 125 additions & 0 deletions b/‎native/core/src/execution/expressions/strings.rs‎
Lines changed: 125 additions & 0 deletions
@@ -161,6 +161,7 @@ jobs:
               org.apache.comet.CometStringExpressionSuite
               org.apache.comet.CometBitwiseExpressionSuite
               org.apache.comet.CometMapExpressionSuite
+              org.apache.comet.CometJsonExpressionSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
 
@@ -57,11 +57,9 @@ jobs:
             java_version: "17"
             maven_opts: "-Pspark-3.5 -Pscala-2.13"
 
-            # TODO fails with OOM
-            # https://github.com/apache/datafusion-comet/issues/1949
-#          - name: "Spark 4.0, JDK 17, Scala 2.13"
-#            java_version: "17"
-#            maven_opts: "-Pspark-4.0 -Pscala-2.13"
+          - name: "Spark 4.0, JDK 17, Scala 2.13"
+            java_version: "17"
+            maven_opts: "-Pspark-4.0 -Pscala-2.13"
 
         suite:
           - name: "fuzz"
@@ -126,12 +124,22 @@ jobs:
               org.apache.comet.CometStringExpressionSuite
               org.apache.comet.CometBitwiseExpressionSuite
               org.apache.comet.CometMapExpressionSuite
+              org.apache.comet.CometJsonExpressionSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
           - name: "sql"
             value: |
               org.apache.spark.sql.CometToPrettyStringSuite
+
+        exclude:
+          # Skip fuzz suite for Spark 4.0
+          # https://github.com/apache/datafusion-comet/issues/2965
+          - profile:
+              name: "Spark 4.0, JDK 17, Scala 2.13"
+            suite:
+              name: "fuzz"
+
       fail-fast: false
     name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
     runs-on: ${{ matrix.os }}
 
@@ -59,6 +59,10 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}
 
@@ -50,5 +50,4 @@ $SPARK_HOME/bin/spark-submit \
     --data $TPCH_DATA \
     --queries $TPCH_QUERIES \
     --output . \
-    --write /tmp \
     --iterations 1
@@ -32,12 +32,11 @@ Comet has the following limitations when reading Parquet files:
 
 ## ANSI Mode
 
-Comet will fall back to Spark for the following expressions when ANSI mode is enabled. Thes expressions can be enabled by setting
+Comet will fall back to Spark for the following expressions when ANSI mode is enabled. These expressions can be enabled by setting
 `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
 the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
 
 - Average
-- Sum
 - Cast (in some cases)
 
 There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
@@ -159,6 +158,8 @@ The following cast operations are generally compatible with Spark except for the
 | string | short |  |
 | string | integer |  |
 | string | long |  |
+| string | float |  |
+| string | double |  |
 | string | binary |  |
 | string | date | Only supports years between 262143 BC and 262142 AD |
 | binary | string |  |
@@ -181,9 +182,8 @@ The following cast operations are not compatible with Spark for all inputs and a
 |-|-|-|
 | float | decimal  | There can be rounding differences |
 | double | decimal  | There can be rounding differences |
-| string | float  | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
-| string | double  | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
-| string | decimal  | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
+| string | decimal  | Does not support fullwidth unicode digits (e.g. \\uFF10) or strings containing null bytes (e.g. \\u0000) |
 | string | timestamp  | Not all valid formats are supported |
 <!-- prettier-ignore-end -->
 <!--END:INCOMPAT_CAST_TABLE-->
 
@@ -264,6 +264,7 @@ These settings can be used to determine which parts of the plan are accelerated
 | `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true |
 | `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true |
 | `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true |
+| `spark.comet.expression.JsonToStructs.enabled` | Enable Comet acceleration for `JsonToStructs` | true |
 | `spark.comet.expression.KnownFloatingPointNormalized.enabled` | Enable Comet acceleration for `KnownFloatingPointNormalized` | true |
 | `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true |
 | `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true |
@@ -306,6 +307,7 @@ These settings can be used to determine which parts of the plan are accelerated
 | `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true |
 | `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true |
 | `spark.comet.expression.Sinh.enabled` | Enable Comet acceleration for `Sinh` | true |
+| `spark.comet.expression.Size.enabled` | Enable Comet acceleration for `Size` | true |
 | `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true |
 | `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true |
 | `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true |
 
@@ -22,6 +22,7 @@ pub mod bitwise;
 pub mod comparison;
 pub mod logical;
 pub mod nullcheck;
+pub mod strings;
 pub mod subquery;
 
 pub use datafusion_comet_spark_expr::EvalMode;
 
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! String expression builders
+
+use std::cmp::max;
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use datafusion::common::ScalarValue;
+use datafusion::physical_expr::expressions::{LikeExpr, Literal};
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion_comet_proto::spark_expression::Expr;
+use datafusion_comet_spark_expr::{FromJson, RLike, SubstringExpr};
+
+use crate::execution::{
+    expressions::extract_expr,
+    operators::ExecutionError,
+    planner::{expression_registry::ExpressionBuilder, PhysicalPlanner},
+    serde::to_arrow_datatype,
+};
+
+/// Builder for Substring expressions.
+///
+/// Translates a serialized Spark `Substring` expression into a native
+/// [`SubstringExpr`], adjusting Spark's start/length conventions on the way.
+pub struct SubstringBuilder;
+
+impl ExpressionBuilder for SubstringBuilder {
+    /// Builds the physical substring expression.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ExecutionError::GeneralError`] if the serialized expression has
+    /// no child, and propagates any error raised while planning the child.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let expr = extract_expr!(spark_expr, Substring);
+        // A malformed plan must surface as an error, not as a panic in the executor.
+        let child = planner.create_expr(
+            expr.child.as_ref().ok_or_else(|| {
+                ExecutionError::GeneralError("Substring missing child".to_string())
+            })?,
+            input_schema,
+        )?;
+        // Spark Substring's start is 1-based when start > 0; zero and negative
+        // starts are passed through unchanged.
+        let start = expr.start - i32::from(expr.start > 0);
+        // substring negative len is treated as 0 in Spark
+        let len = max(expr.len, 0);
+
+        Ok(Arc::new(SubstringExpr::new(
+            child,
+            // Lossless widening; avoids an unchecked `as` cast.
+            i64::from(start),
+            // `len` was clamped to be non-negative above, so this cast cannot wrap.
+            len as u64,
+        )))
+    }
+}
+
+/// Builder for Like expressions.
+///
+/// Translates a serialized Spark `Like` expression into a DataFusion
+/// [`LikeExpr`] over the planned left (input) and right (pattern) operands.
+pub struct LikeBuilder;
+
+impl ExpressionBuilder for LikeBuilder {
+    /// Builds the physical LIKE expression.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ExecutionError::GeneralError`] if either operand is missing from
+    /// the serialized expression, and propagates any error raised while planning
+    /// the operands.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let expr = extract_expr!(spark_expr, Like);
+        // A malformed plan must surface as an error, not as a panic in the executor.
+        let left = planner.create_expr(
+            expr.left.as_ref().ok_or_else(|| {
+                ExecutionError::GeneralError("Like missing left operand".to_string())
+            })?,
+            Arc::clone(&input_schema),
+        )?;
+        let right = planner.create_expr(
+            expr.right.as_ref().ok_or_else(|| {
+                ExecutionError::GeneralError("Like missing right operand".to_string())
+            })?,
+            input_schema,
+        )?;
+
+        // The two flags are `negated` and `case_insensitive`: a plain,
+        // case-sensitive LIKE.
+        Ok(Arc::new(LikeExpr::new(false, false, left, right)))
+    }
+}
+
+/// Builder for Rlike (regex like) expressions.
+///
+/// Translates a serialized Spark `RLike` expression into a native [`RLike`]
+/// expression. The pattern operand must plan to a non-null UTF-8 string literal.
+pub struct RlikeBuilder;
+
+impl ExpressionBuilder for RlikeBuilder {
+    /// Builds the physical RLIKE expression.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ExecutionError::GeneralError`] if either operand is missing, or
+    /// if the pattern is not a non-null `Utf8` literal. Errors raised while
+    /// planning the operands or constructing [`RLike`] are propagated.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let expr = extract_expr!(spark_expr, Rlike);
+        // A malformed plan must surface as an error, not as a panic in the executor.
+        let left = planner.create_expr(
+            expr.left.as_ref().ok_or_else(|| {
+                ExecutionError::GeneralError("RLike missing left operand".to_string())
+            })?,
+            Arc::clone(&input_schema),
+        )?;
+        let right = planner.create_expr(
+            expr.right.as_ref().ok_or_else(|| {
+                ExecutionError::GeneralError("RLike missing right operand".to_string())
+            })?,
+            input_schema,
+        )?;
+
+        // Fold the downcast into the match so a non-literal pattern reaches the
+        // error arm below instead of panicking on `unwrap()`.
+        match right
+            .as_any()
+            .downcast_ref::<Literal>()
+            .map(Literal::value)
+        {
+            Some(ScalarValue::Utf8(Some(pattern))) => {
+                Ok(Arc::new(RLike::try_new(left, pattern)?))
+            }
+            _ => Err(ExecutionError::GeneralError(
+                "RLike only supports scalar patterns".to_string(),
+            )),
+        }
+    }
+}
+
+/// Builder for FromJson expressions.
+///
+/// Translates a serialized Spark `FromJson` expression into a native
+/// [`FromJson`] expression using the planned child, the target schema converted
+/// to an Arrow data type, and the timezone carried by the expression.
+pub struct FromJsonBuilder;
+
+impl ExpressionBuilder for FromJsonBuilder {
+    /// Builds the physical FromJson expression.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ExecutionError::GeneralError`] if the child or the schema is
+    /// missing, and propagates any error raised while planning the child.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let expr = extract_expr!(spark_expr, FromJson);
+
+        // The child is validated and planned first, before the schema is inspected.
+        let Some(child_proto) = expr.child.as_ref() else {
+            return Err(ExecutionError::GeneralError(
+                "FromJson missing child".to_string(),
+            ));
+        };
+        let child = planner.create_expr(child_proto, input_schema)?;
+
+        let Some(schema_proto) = expr.schema.as_ref() else {
+            return Err(ExecutionError::GeneralError(
+                "FromJson missing schema".to_string(),
+            ));
+        };
+        let schema = to_arrow_datatype(schema_proto);
+
+        let from_json = FromJson::new(child, schema, &expr.timezone);
+        Ok(Arc::new(from_json))
+    }
+}