Skip to content

Commit 9500bbb

Browse files
Merge branch 'apache:main' into main
2 parents 231aa90 + 0ae6515 commit 9500bbb

64 files changed

Lines changed: 4259 additions & 742 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/pr_build_linux.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ jobs:
161161
org.apache.comet.CometStringExpressionSuite
162162
org.apache.comet.CometBitwiseExpressionSuite
163163
org.apache.comet.CometMapExpressionSuite
164+
org.apache.comet.CometJsonExpressionSuite
164165
org.apache.comet.expressions.conditional.CometIfSuite
165166
org.apache.comet.expressions.conditional.CometCoalesceSuite
166167
org.apache.comet.expressions.conditional.CometCaseWhenSuite

.github/workflows/pr_build_macos.yml

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,9 @@ jobs:
5757
java_version: "17"
5858
maven_opts: "-Pspark-3.5 -Pscala-2.13"
5959

60-
# TODO fails with OOM
61-
# https://github.com/apache/datafusion-comet/issues/1949
62-
# - name: "Spark 4.0, JDK 17, Scala 2.13"
63-
# java_version: "17"
64-
# maven_opts: "-Pspark-4.0 -Pscala-2.13"
60+
- name: "Spark 4.0, JDK 17, Scala 2.13"
61+
java_version: "17"
62+
maven_opts: "-Pspark-4.0 -Pscala-2.13"
6563

6664
suite:
6765
- name: "fuzz"
@@ -126,12 +124,22 @@ jobs:
126124
org.apache.comet.CometStringExpressionSuite
127125
org.apache.comet.CometBitwiseExpressionSuite
128126
org.apache.comet.CometMapExpressionSuite
127+
org.apache.comet.CometJsonExpressionSuite
129128
org.apache.comet.expressions.conditional.CometIfSuite
130129
org.apache.comet.expressions.conditional.CometCoalesceSuite
131130
org.apache.comet.expressions.conditional.CometCaseWhenSuite
132131
- name: "sql"
133132
value: |
134133
org.apache.spark.sql.CometToPrettyStringSuite
134+
135+
exclude:
136+
# Skip fuzz suite for Spark 4.0
137+
# https://github.com/apache/datafusion-comet/issues/2965
138+
- profile:
139+
name: "Spark 4.0, JDK 17, Scala 2.13"
140+
suite:
141+
name: "fuzz"
142+
135143
fail-fast: false
136144
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
137145
runs-on: ${{ matrix.os }}

.github/workflows/spark_sql_test.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ jobs:
5959
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
6060
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
6161
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
62+
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
63+
exclude:
64+
- spark-version: {short: '4.0', full: '4.0.1', java: 17}
65+
module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
6266
fail-fast: false
6367
name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
6468
runs-on: ${{ matrix.os }}

dev/benchmarks/comet-tpch.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,4 @@ $SPARK_HOME/bin/spark-submit \
5050
--data $TPCH_DATA \
5151
--queries $TPCH_QUERIES \
5252
--output . \
53-
--write /tmp \
5453
--iterations 1

docs/source/user-guide/latest/compatibility.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,11 @@ Comet has the following limitations when reading Parquet files:
3232

3333
## ANSI Mode
3434

35-
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. Thes expressions can be enabled by setting
35+
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. These expressions can be enabled by setting
3636
`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
3737
the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
3838

3939
- Average
40-
- Sum
4140
- Cast (in some cases)
4241

4342
There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
@@ -159,6 +158,8 @@ The following cast operations are generally compatible with Spark except for the
159158
| string | short | |
160159
| string | integer | |
161160
| string | long | |
161+
| string | float | |
162+
| string | double | |
162163
| string | binary | |
163164
| string | date | Only supports years between 262143 BC and 262142 AD |
164165
| binary | string | |
@@ -181,9 +182,8 @@ The following cast operations are not compatible with Spark for all inputs and a
181182
|-|-|-|
182183
| float | decimal | There can be rounding differences |
183184
| double | decimal | There can be rounding differences |
184-
| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
185-
| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
186-
| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
185+
| string | decimal | Does not support fullwidth unicode digits (e.g \\uFF10)
186+
or strings containing null bytes (e.g \\u0000) |
187187
| string | timestamp | Not all valid formats are supported |
188188
<!-- prettier-ignore-end -->
189189
<!--END:INCOMPAT_CAST_TABLE-->

docs/source/user-guide/latest/configs.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ These settings can be used to determine which parts of the plan are accelerated
264264
| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true |
265265
| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true |
266266
| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true |
267+
| `spark.comet.expression.JsonToStructs.enabled` | Enable Comet acceleration for `JsonToStructs` | true |
267268
| `spark.comet.expression.KnownFloatingPointNormalized.enabled` | Enable Comet acceleration for `KnownFloatingPointNormalized` | true |
268269
| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true |
269270
| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true |
@@ -306,6 +307,7 @@ These settings can be used to determine which parts of the plan are accelerated
306307
| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true |
307308
| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true |
308309
| `spark.comet.expression.Sinh.enabled` | Enable Comet acceleration for `Sinh` | true |
310+
| `spark.comet.expression.Size.enabled` | Enable Comet acceleration for `Size` | true |
309311
| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true |
310312
| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true |
311313
| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true |

native/Cargo.lock

Lines changed: 7 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

native/core/src/execution/expressions/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ pub mod bitwise;
2222
pub mod comparison;
2323
pub mod logical;
2424
pub mod nullcheck;
25+
pub mod strings;
2526
pub mod subquery;
2627

2728
pub use datafusion_comet_spark_expr::EvalMode;
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! String expression builders
19+
20+
use std::cmp::max;
21+
use std::sync::Arc;
22+
23+
use arrow::datatypes::SchemaRef;
24+
use datafusion::common::ScalarValue;
25+
use datafusion::physical_expr::expressions::{LikeExpr, Literal};
26+
use datafusion::physical_expr::PhysicalExpr;
27+
use datafusion_comet_proto::spark_expression::Expr;
28+
use datafusion_comet_spark_expr::{FromJson, RLike, SubstringExpr};
29+
30+
use crate::execution::{
31+
expressions::extract_expr,
32+
operators::ExecutionError,
33+
planner::{expression_registry::ExpressionBuilder, PhysicalPlanner},
34+
serde::to_arrow_datatype,
35+
};
36+
37+
/// Builder for Substring expressions
38+
pub struct SubstringBuilder;
39+
40+
impl ExpressionBuilder for SubstringBuilder {
41+
fn build(
42+
&self,
43+
spark_expr: &Expr,
44+
input_schema: SchemaRef,
45+
planner: &PhysicalPlanner,
46+
) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
47+
let expr = extract_expr!(spark_expr, Substring);
48+
let child = planner.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
49+
// Spark Substring's start is 1-based when start > 0
50+
let start = expr.start - i32::from(expr.start > 0);
51+
// substring negative len is treated as 0 in Spark
52+
let len = max(expr.len, 0);
53+
54+
Ok(Arc::new(SubstringExpr::new(
55+
child,
56+
start as i64,
57+
len as u64,
58+
)))
59+
}
60+
}
61+
62+
/// Builder for Like expressions
63+
pub struct LikeBuilder;
64+
65+
impl ExpressionBuilder for LikeBuilder {
66+
fn build(
67+
&self,
68+
spark_expr: &Expr,
69+
input_schema: SchemaRef,
70+
planner: &PhysicalPlanner,
71+
) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
72+
let expr = extract_expr!(spark_expr, Like);
73+
let left = planner.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
74+
let right = planner.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
75+
76+
Ok(Arc::new(LikeExpr::new(false, false, left, right)))
77+
}
78+
}
79+
80+
/// Builder for Rlike (regex like) expressions
81+
pub struct RlikeBuilder;
82+
83+
impl ExpressionBuilder for RlikeBuilder {
84+
fn build(
85+
&self,
86+
spark_expr: &Expr,
87+
input_schema: SchemaRef,
88+
planner: &PhysicalPlanner,
89+
) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
90+
let expr = extract_expr!(spark_expr, Rlike);
91+
let left = planner.create_expr(expr.left.as_ref().unwrap(), Arc::clone(&input_schema))?;
92+
let right = planner.create_expr(expr.right.as_ref().unwrap(), input_schema)?;
93+
94+
match right.as_any().downcast_ref::<Literal>().unwrap().value() {
95+
ScalarValue::Utf8(Some(pattern)) => Ok(Arc::new(RLike::try_new(left, pattern)?)),
96+
_ => Err(ExecutionError::GeneralError(
97+
"RLike only supports scalar patterns".to_string(),
98+
)),
99+
}
100+
}
101+
}
102+
103+
pub struct FromJsonBuilder;
104+
105+
impl ExpressionBuilder for FromJsonBuilder {
106+
fn build(
107+
&self,
108+
spark_expr: &Expr,
109+
input_schema: SchemaRef,
110+
planner: &PhysicalPlanner,
111+
) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
112+
let expr = extract_expr!(spark_expr, FromJson);
113+
let child = planner.create_expr(
114+
expr.child.as_ref().ok_or_else(|| {
115+
ExecutionError::GeneralError("FromJson missing child".to_string())
116+
})?,
117+
input_schema,
118+
)?;
119+
let schema =
120+
to_arrow_datatype(expr.schema.as_ref().ok_or_else(|| {
121+
ExecutionError::GeneralError("FromJson missing schema".to_string())
122+
})?);
123+
Ok(Arc::new(FromJson::new(child, schema, &expr.timezone)))
124+
}
125+
}

0 commit comments

Comments
 (0)