Skip to content

Commit 0f2e41e

Browse files
committed
Merge branch 'main' into feature/array-position
2 parents 736ca63 + eb502ff commit 0f2e41e

151 files changed

Lines changed: 642 additions & 349 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.claude/skills/review-comet-pr/SKILL.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,6 @@ Categories include: `aggregate/`, `array/`, `string/`, `math/`, `struct/`, `map/
149149
**SQL file structure:**
150150

151151
```sql
152-
-- ConfigMatrix: parquet.enable.dictionary=false,true
153-
154152
-- Create test data
155153
statement
156154
CREATE TABLE test_crc32(col string, a int, b float) USING parquet

docs/source/contributor-guide/adding_a_new_expression.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,6 @@ It is important to verify that the new expression is correctly recognized by the
217217
Create a `.sql` file under the appropriate subdirectory in `spark/src/test/resources/sql-tests/expressions/` (e.g., `string/`, `math/`, `array/`). The file should create a table with test data, then run queries that exercise the expression. Here is an example for the `unhex` expression:
218218

219219
```sql
220-
-- ConfigMatrix: parquet.enable.dictionary=false,true
221-
222220
statement
223221
CREATE TABLE test_unhex(col string) USING parquet
224222

docs/source/contributor-guide/sql-file-tests.md

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ A test file consists of SQL comments, directives, statements, and queries separa
7676
lines. Here is a minimal example:
7777

7878
```sql
79-
-- ConfigMatrix: parquet.enable.dictionary=false,true
80-
8179
statement
8280
CREATE TABLE test_abs(v double) USING parquet
8381

@@ -106,16 +104,19 @@ Runs the entire file once per combination of values. Multiple `ConfigMatrix` lin
106104
cross product of all combinations.
107105

108106
```sql
109-
-- ConfigMatrix: parquet.enable.dictionary=false,true
107+
-- ConfigMatrix: spark.sql.optimizer.inSetConversionThreshold=100,0
110108
```
111109

112110
This generates two test cases:
113111

114112
```
115-
sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=false]
116-
sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=true]
113+
sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=100]
114+
sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=0]
117115
```
118116

117+
Only add a `ConfigMatrix` directive when there is a real reason to run the test under
118+
multiple configurations. Do not add `ConfigMatrix` directives speculatively.
119+
119120
#### `MinSparkVersion`
120121

121122
Skips the file when running on a Spark version older than the specified version.
@@ -223,12 +224,9 @@ SELECT array(1, 2, 3)[10]
223224

224225
2. Add the Apache license header as a SQL comment.
225226

226-
3. Add a `ConfigMatrix` directive if the test should run with multiple Parquet configurations.
227-
Most expression tests use:
228-
229-
```sql
230-
-- ConfigMatrix: parquet.enable.dictionary=false,true
231-
```
227+
3. Add a `ConfigMatrix` directive only if the test needs to run under multiple configurations
228+
(e.g., testing behavior that varies with a specific Spark config). Do not add `ConfigMatrix`
229+
directives speculatively.
232230

233231
4. Create tables and insert test data using `statement` blocks. Include edge cases such as
234232
`NULL`, boundary values, and negative numbers.

docs/source/user-guide/latest/compatibility.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,19 @@ Cast operations in Comet fall into three levels of support:
136136
Spark.
137137
- **N/A**: Spark does not support this cast.
138138

139+
### String to Decimal
140+
141+
Comet's native `CAST(string AS DECIMAL)` implementation matches Apache Spark's behavior,
142+
including:
143+
144+
- Leading and trailing ASCII whitespace is trimmed before parsing.
145+
- Null bytes (`\u0000`) at the start or end of a string are trimmed, matching Spark's
146+
`UTF8String` behavior. Null bytes embedded in the middle of a string produce `NULL`.
147+
- Fullwidth Unicode digits (U+FF10–U+FF19, e.g. `123.45`) are treated as their ASCII
148+
equivalents, so `CAST('123.45' AS DECIMAL(10,2))` returns `123.45`.
149+
- Scientific notation (e.g. `1.23E+5`) is supported.
150+
- Special values (`inf`, `infinity`, `nan`) produce `NULL`.
151+
139152
### String to Timestamp
140153

141154
Comet's native `CAST(string AS TIMESTAMP)` implementation supports all timestamp formats accepted

native/Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

native/core/src/execution/expressions/temporal.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ use datafusion::logical_expr::ScalarUDF;
2525
use datafusion::physical_expr::{PhysicalExpr, ScalarFunctionExpr};
2626
use datafusion_comet_proto::spark_expression::Expr;
2727
use datafusion_comet_spark_expr::{
28-
SparkHour, SparkMinute, SparkSecond, SparkUnixTimestamp, TimestampTruncExpr,
28+
SparkHour, SparkHoursTransform, SparkMinute, SparkSecond, SparkUnixTimestamp,
29+
TimestampTruncExpr,
2930
};
3031

3132
use crate::execution::{
@@ -160,3 +161,29 @@ impl ExpressionBuilder for TruncTimestampBuilder {
160161
Ok(Arc::new(TimestampTruncExpr::new(child, format, timezone)))
161162
}
162163
}
164+
165+
pub struct HoursTransformBuilder;
166+
167+
impl ExpressionBuilder for HoursTransformBuilder {
168+
fn build(
169+
&self,
170+
spark_expr: &Expr,
171+
input_schema: SchemaRef,
172+
planner: &PhysicalPlanner,
173+
) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
174+
let expr = extract_expr!(spark_expr, HoursTransform);
175+
let child = planner.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&input_schema))?;
176+
let args = vec![child];
177+
let comet_hours_transform = Arc::new(ScalarUDF::new_from_impl(SparkHoursTransform::new()));
178+
let field_ref = Arc::new(Field::new("hours_transform", DataType::Int32, true));
179+
let expr: ScalarFunctionExpr = ScalarFunctionExpr::new(
180+
"hours_transform",
181+
comet_hours_transform,
182+
args,
183+
field_ref,
184+
Arc::new(ConfigOptions::default()),
185+
);
186+
187+
Ok(Arc::new(expr))
188+
}
189+
}

native/core/src/execution/planner/expression_registry.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ pub enum ExpressionType {
110110
Second,
111111
TruncTimestamp,
112112
UnixTimestamp,
113+
HoursTransform,
113114
}
114115

115116
/// Registry for expression builders
@@ -310,6 +311,10 @@ impl ExpressionRegistry {
310311
ExpressionType::TruncTimestamp,
311312
Box::new(TruncTimestampBuilder),
312313
);
314+
self.builders.insert(
315+
ExpressionType::HoursTransform,
316+
Box::new(HoursTransformBuilder),
317+
);
313318
}
314319

315320
/// Extract expression type from Spark protobuf expression
@@ -382,6 +387,7 @@ impl ExpressionRegistry {
382387
Some(ExprStruct::Second(_)) => Ok(ExpressionType::Second),
383388
Some(ExprStruct::TruncTimestamp(_)) => Ok(ExpressionType::TruncTimestamp),
384389
Some(ExprStruct::UnixTimestamp(_)) => Ok(ExpressionType::UnixTimestamp),
390+
Some(ExprStruct::HoursTransform(_)) => Ok(ExpressionType::HoursTransform),
385391

386392
Some(other) => Err(ExecutionError::GeneralError(format!(
387393
"Unsupported expression type: {:?}",

native/proto/src/proto/expr.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ message Expr {
8888
UnixTimestamp unix_timestamp = 65;
8989
FromJson from_json = 66;
9090
ToCsv to_csv = 67;
91+
HoursTransform hours_transform = 68;
9192
}
9293

9394
// Optional QueryContext for error reporting (contains SQL text and position)
@@ -356,6 +357,10 @@ message Hour {
356357
string timezone = 2;
357358
}
358359

360+
message HoursTransform {
361+
Expr child = 1;
362+
}
363+
359364
message Minute {
360365
Expr child = 1;
361366
string timezone = 2;

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,40 @@ fn cast_string_to_decimal256_impl(
438438
))
439439
}
440440

441+
/// Normalize fullwidth Unicode digits (U+FF10–U+FF19) to their ASCII equivalents.
442+
///
443+
/// Spark's UTF8String parser treats fullwidth digits as numerically equivalent to
444+
/// ASCII digits, e.g. "123.45" parses as 123.45. Each fullwidth digit encodes
445+
/// to exactly three UTF-8 bytes: [0xEF, 0xBC, 0x90+n] for digit n. The ASCII
446+
/// equivalent is 0x30+n, so the conversion is: third_byte - 0x60.
447+
///
448+
/// All other bytes (ASCII or other multi-byte sequences) are passed through
449+
/// unchanged, so the output is valid UTF-8 whenever the input is.
450+
fn normalize_fullwidth_digits(s: &str) -> String {
451+
let bytes = s.as_bytes();
452+
let mut out = Vec::with_capacity(s.len());
453+
let mut i = 0;
454+
while i < bytes.len() {
455+
if i + 2 < bytes.len()
456+
&& bytes[i] == 0xEF
457+
&& bytes[i + 1] == 0xBC
458+
&& bytes[i + 2] >= 0x90
459+
&& bytes[i + 2] <= 0x99
460+
{
461+
// e.g. 0x91 - 0x60 = 0x31 = b'1'
462+
out.push(bytes[i + 2] - 0x60);
463+
i += 3;
464+
} else {
465+
out.push(bytes[i]);
466+
i += 1;
467+
}
468+
}
469+
// SAFETY: we only replace valid 3-byte UTF-8 sequences [EF BC 9X] with a
470+
// single ASCII byte; all other bytes are copied unchanged, preserving the
471+
// UTF-8 invariant of the input.
472+
unsafe { String::from_utf8_unchecked(out) }
473+
}
474+
441475
/// Parse a decimal string into mantissa and scale
442476
/// e.g., "123.45" -> (12345, 2), "-0.001" -> (-1, 3) , 0e50 -> (0,50) etc
443477
/// Parse a string to decimal following Spark's behavior
@@ -446,16 +480,30 @@ fn parse_string_to_decimal(input_str: &str, precision: u8, scale: i8) -> SparkRe
446480
let mut start = 0;
447481
let mut end = string_bytes.len();
448482

449-
// trim whitespaces
450-
while start < end && string_bytes[start].is_ascii_whitespace() {
483+
// Trim ASCII whitespace and null bytes from both ends. Spark's UTF8String
484+
// trims null bytes the same way it trims whitespace: "123\u0000" and
485+
// "\u0000123" both parse as 123. Null bytes in the middle are not trimmed
486+
// and will fail the digit validation in parse_decimal_str, producing NULL.
487+
while start < end && (string_bytes[start].is_ascii_whitespace() || string_bytes[start] == 0) {
451488
start += 1;
452489
}
453-
while end > start && string_bytes[end - 1].is_ascii_whitespace() {
490+
while end > start && (string_bytes[end - 1].is_ascii_whitespace() || string_bytes[end - 1] == 0)
491+
{
454492
end -= 1;
455493
}
456494

457495
let trimmed = &input_str[start..end];
458496

497+
// Normalize fullwidth digits to ASCII. Fast path skips the allocation for
498+
// pure-ASCII strings, which is the common case.
499+
let normalized;
500+
let trimmed = if trimmed.bytes().any(|b| b > 0x7F) {
501+
normalized = normalize_fullwidth_digits(trimmed);
502+
normalized.as_str()
503+
} else {
504+
trimmed
505+
};
506+
459507
if trimmed.is_empty() {
460508
return Ok(None);
461509
}

0 commit comments

Comments
 (0)