apache
diff --git a/‎.claude/skills/review-comet-pr/SKILL.md‎
Lines changed: 0 additions & 2 deletions b/‎.claude/skills/review-comet-pr/SKILL.md‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎docs/source/contributor-guide/adding_a_new_expression.md‎
Lines changed: 0 additions & 2 deletions b/‎docs/source/contributor-guide/adding_a_new_expression.md‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎docs/source/contributor-guide/sql-file-tests.md‎
Lines changed: 9 additions & 11 deletions b/‎docs/source/contributor-guide/sql-file-tests.md‎
Lines changed: 9 additions & 11 deletions
diff --git a/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 13 additions & 0 deletions b/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎native/Cargo.lock‎
Lines changed: 6 additions & 6 deletions b/‎native/Cargo.lock‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎native/core/src/execution/expressions/temporal.rs‎
Lines changed: 28 additions & 1 deletion b/‎native/core/src/execution/expressions/temporal.rs‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎native/core/src/execution/planner/expression_registry.rs‎
Lines changed: 6 additions & 0 deletions b/‎native/core/src/execution/planner/expression_registry.rs‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎native/proto/src/proto/expr.proto‎
Lines changed: 5 additions & 0 deletions b/‎native/proto/src/proto/expr.proto‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎native/spark-expr/src/conversion_funcs/string.rs‎
Lines changed: 51 additions & 3 deletions b/‎native/spark-expr/src/conversion_funcs/string.rs‎
Lines changed: 51 additions & 3 deletions
@@ -149,8 +149,6 @@ Categories include: `aggregate/`, `array/`, `string/`, `math/`, `struct/`, `map/
 **SQL file structure:**
 
 ```sql
--- ConfigMatrix: parquet.enable.dictionary=false,true
-
 -- Create test data
 statement
 CREATE TABLE test_crc32(col string, a int, b float) USING parquet
 
@@ -217,8 +217,6 @@ It is important to verify that the new expression is correctly recognized by the
 Create a `.sql` file under the appropriate subdirectory in `spark/src/test/resources/sql-tests/expressions/` (e.g., `string/`, `math/`, `array/`). The file should create a table with test data, then run queries that exercise the expression. Here is an example for the `unhex` expression:
 
 ```sql
--- ConfigMatrix: parquet.enable.dictionary=false,true
-
 statement
 CREATE TABLE test_unhex(col string) USING parquet
 
 
@@ -76,8 +76,6 @@ A test file consists of SQL comments, directives, statements, and queries separa
 lines. Here is a minimal example:
 
 ```sql
--- ConfigMatrix: parquet.enable.dictionary=false,true
-
 statement
 CREATE TABLE test_abs(v double) USING parquet
 
@@ -106,16 +104,19 @@ Runs the entire file once per combination of values. Multiple `ConfigMatrix` lin
 cross product of all combinations.
 
 ```sql
--- ConfigMatrix: parquet.enable.dictionary=false,true
+-- ConfigMatrix: spark.sql.optimizer.inSetConversionThreshold=100,0
 ```
 
 This generates two test cases:
 
 ```
-sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=false]
-sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=true]
+sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=100]
+sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=0]
 ```
 
+Only add a `ConfigMatrix` directive when there is a real reason to run the test under
+multiple configurations. Do not add `ConfigMatrix` directives speculatively.
+
 #### `MinSparkVersion`
 
 Skips the file when running on a Spark version older than the specified version.
@@ -223,12 +224,9 @@ SELECT array(1, 2, 3)[10]
 
 2. Add the Apache license header as a SQL comment.
 
-3. Add a `ConfigMatrix` directive if the test should run with multiple Parquet configurations.
-   Most expression tests use:
-
-   ```sql
-   -- ConfigMatrix: parquet.enable.dictionary=false,true
-   ```
+3. Add a `ConfigMatrix` directive only if the test needs to run under multiple configurations
+   (e.g., testing behavior that varies with a specific Spark config). Do not add `ConfigMatrix`
+   directives speculatively.
 
 4. Create tables and insert test data using `statement` blocks. Include edge cases such as
    `NULL`, boundary values, and negative numbers.
 
@@ -136,6 +136,19 @@ Cast operations in Comet fall into three levels of support:
   Spark.
 - **N/A**: Spark does not support this cast.
 
+### String to Decimal
+
+Comet's native `CAST(string AS DECIMAL)` implementation matches Apache Spark's behavior,
+including:
+
+- Leading and trailing ASCII whitespace is trimmed before parsing.
+- Null bytes (`\u0000`) at the start or end of a string are trimmed, matching Spark's
+  `UTF8String` behavior. Null bytes embedded in the middle of a string produce `NULL`.
+- Fullwidth Unicode digits (U+FF10–U+FF19, e.g. `１２３.４５`) are treated as their ASCII
+  equivalents, so `CAST('１２３.４５' AS DECIMAL(10,2))` returns `123.45`.
+- Scientific notation (e.g. `1.23E+5`) is supported.
+- Special values (`inf`, `infinity`, `nan`) produce `NULL`.
+
 ### String to Timestamp
 
 Comet's native `CAST(string AS TIMESTAMP)` implementation supports all timestamp formats accepted
 
@@ -25,7 +25,8 @@ use datafusion::logical_expr::ScalarUDF;
 use datafusion::physical_expr::{PhysicalExpr, ScalarFunctionExpr};
 use datafusion_comet_proto::spark_expression::Expr;
 use datafusion_comet_spark_expr::{
-    SparkHour, SparkMinute, SparkSecond, SparkUnixTimestamp, TimestampTruncExpr,
+    SparkHour, SparkHoursTransform, SparkMinute, SparkSecond, SparkUnixTimestamp,
+    TimestampTruncExpr,
 };
 
 use crate::execution::{
@@ -160,3 +161,29 @@ impl ExpressionBuilder for TruncTimestampBuilder {
         Ok(Arc::new(TimestampTruncExpr::new(child, format, timezone)))
     }
 }
+
+/// Builder for Spark's `HoursTransform` expression, backed by the `SparkHoursTransform` UDF.
+pub struct HoursTransformBuilder;
+
+impl ExpressionBuilder for HoursTransformBuilder {
+    /// Plans the child into a nullable-Int32 `hours_transform` call; a missing child is an error.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let expr = extract_expr!(spark_expr, HoursTransform);
+        let child = expr.child.as_ref().ok_or_else(|| {
+            ExecutionError::GeneralError("HoursTransform expression has no child".to_string())
+        })?;
+        let child = planner.create_expr(child, Arc::clone(&input_schema))?;
+        Ok(Arc::new(ScalarFunctionExpr::new(
+            "hours_transform",
+            Arc::new(ScalarUDF::new_from_impl(SparkHoursTransform::new())),
+            vec![child],
+            Arc::new(Field::new("hours_transform", DataType::Int32, true)),
+            Arc::new(ConfigOptions::default()),
+        )))
+    }
+}
@@ -110,6 +110,7 @@ pub enum ExpressionType {
     Second,
     TruncTimestamp,
     UnixTimestamp,
+    HoursTransform,
 }
 
 /// Registry for expression builders
@@ -310,6 +311,10 @@ impl ExpressionRegistry {
             ExpressionType::TruncTimestamp,
             Box::new(TruncTimestampBuilder),
         );
+        self.builders.insert(
+            ExpressionType::HoursTransform,
+            Box::new(HoursTransformBuilder),
+        );
     }
 
     /// Extract expression type from Spark protobuf expression
@@ -382,6 +387,7 @@ impl ExpressionRegistry {
             Some(ExprStruct::Second(_)) => Ok(ExpressionType::Second),
             Some(ExprStruct::TruncTimestamp(_)) => Ok(ExpressionType::TruncTimestamp),
             Some(ExprStruct::UnixTimestamp(_)) => Ok(ExpressionType::UnixTimestamp),
+            Some(ExprStruct::HoursTransform(_)) => Ok(ExpressionType::HoursTransform),
 
             Some(other) => Err(ExecutionError::GeneralError(format!(
                 "Unsupported expression type: {:?}",
 
@@ -88,6 +88,7 @@ message Expr {
     UnixTimestamp unix_timestamp = 65;
     FromJson from_json = 66;
     ToCsv to_csv = 67;
+    HoursTransform hours_transform = 68;
   }
 
   // Optional QueryContext for error reporting (contains SQL text and position)
@@ -356,6 +357,10 @@ message Hour {
   string timezone = 2;
 }
 
+message HoursTransform {  // Single-input expression; planned natively as `hours_transform`.
+  Expr child = 1;  // Operand the transform is evaluated on.
+}
+
 message Minute {
   Expr child = 1;
   string timezone = 2;
 
@@ -438,6 +438,40 @@ fn cast_string_to_decimal256_impl(
     ))
 }
 
+/// Normalize fullwidth Unicode digits (U+FF10–U+FF19) to their ASCII equivalents.
+///
+/// Spark's UTF8String parser treats fullwidth digits as numerically equivalent to
+/// ASCII digits, e.g. "１２３.４５" parses as 123.45. Fullwidth digit `n` is the code
+/// point U+FF10 + n, so its ASCII counterpart is `b'0' + (code_point - 0xFF10)`.
+///
+/// Every other character, ASCII or multi-byte, is copied through unchanged.
+///
+/// The conversion walks `char`s instead of raw bytes, so no `unsafe` is required:
+/// the result is assembled with `String::push` and is valid UTF-8 by construction.
+///
+/// # Arguments
+///
+/// * `s` - the string to normalize.
+///
+/// # Returns
+///
+/// A newly allocated `String` in which each fullwidth digit has been replaced by the
+/// matching ASCII digit; it is never longer (in bytes) than `s`.
+fn normalize_fullwidth_digits(s: &str) -> String {
+    // Each 3-byte fullwidth digit shrinks to 1 byte, so `s.len()` is an upper bound
+    // and the buffer never has to grow.
+    let mut out = String::with_capacity(s.len());
+    for c in s.chars() {
+        let mapped = match c {
+            // The offset from U+FF10 is 0..=9 in this arm, so the cast to u8 is lossless.
+            '\u{FF10}'..='\u{FF19}' => char::from(b'0' + (c as u32 - 0xFF10) as u8),
+            other => other,
+        };
+        out.push(mapped);
+    }
+    out
+}
+
 /// Parse a decimal string into mantissa and scale
 /// e.g., "123.45" -> (12345, 2), "-0.001" -> (-1, 3) , 0e50 -> (0,50) etc
 /// Parse a string to decimal following Spark's behavior
@@ -446,16 +480,30 @@ fn parse_string_to_decimal(input_str: &str, precision: u8, scale: i8) -> SparkRe
     let mut start = 0;
     let mut end = string_bytes.len();
 
-    // trim whitespaces
-    while start < end && string_bytes[start].is_ascii_whitespace() {
+    // Trim ASCII whitespace and null bytes from both ends. Spark's UTF8String
+    // trims null bytes the same way it trims whitespace: "123\u0000" and
+    // "\u0000123" both parse as 123. Null bytes in the middle are not trimmed
+    // and will fail the digit validation in parse_decimal_str, producing NULL.
+    while start < end && (string_bytes[start].is_ascii_whitespace() || string_bytes[start] == 0) {
         start += 1;
     }
-    while end > start && string_bytes[end - 1].is_ascii_whitespace() {
+    while end > start && (string_bytes[end - 1].is_ascii_whitespace() || string_bytes[end - 1] == 0)
+    {
         end -= 1;
     }
 
     let trimmed = &input_str[start..end];
 
+    // Normalize fullwidth digits to ASCII. Fast path skips the allocation for
+    // pure-ASCII strings, which is the common case.
+    let normalized;
+    let trimmed = if trimmed.bytes().any(|b| b > 0x7F) {
+        normalized = normalize_fullwidth_digits(trimmed);
+        normalized.as_str()
+    } else {
+        trimmed
+    };
+
     if trimmed.is_empty() {
         return Ok(None);
     }