apache
diff --git a/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 13 additions & 0 deletions b/‎docs/source/user-guide/latest/compatibility.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎docs/source/user-guide/latest/tuning.md‎
Lines changed: 6 additions & 1 deletion b/‎docs/source/user-guide/latest/tuning.md‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎native/core/src/execution/expressions/temporal.rs‎
Lines changed: 28 additions & 1 deletion b/‎native/core/src/execution/expressions/temporal.rs‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎native/core/src/execution/jni_api.rs‎
Lines changed: 22 additions & 2 deletions b/‎native/core/src/execution/jni_api.rs‎
Lines changed: 22 additions & 2 deletions
diff --git a/‎native/core/src/execution/memory_pools/config.rs‎
Lines changed: 4 additions & 1 deletion b/‎native/core/src/execution/memory_pools/config.rs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎native/core/src/execution/memory_pools/mod.rs‎
Lines changed: 28 additions & 14 deletions b/‎native/core/src/execution/memory_pools/mod.rs‎
Lines changed: 28 additions & 14 deletions
diff --git a/‎native/core/src/execution/planner/expression_registry.rs‎
Lines changed: 6 additions & 0 deletions b/‎native/core/src/execution/planner/expression_registry.rs‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎native/proto/src/proto/expr.proto‎
Lines changed: 5 additions & 0 deletions b/‎native/proto/src/proto/expr.proto‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎native/spark-expr/src/conversion_funcs/string.rs‎
Lines changed: 51 additions & 3 deletions b/‎native/spark-expr/src/conversion_funcs/string.rs‎
Lines changed: 51 additions & 3 deletions
@@ -136,6 +136,19 @@ Cast operations in Comet fall into three levels of support:
   Spark.
 - **N/A**: Spark does not support this cast.
 
+### String to Decimal
+
+Comet's native `CAST(string AS DECIMAL)` implementation matches Apache Spark's behavior,
+including:
+
+- Leading and trailing ASCII whitespace is trimmed before parsing.
+- Null bytes (`\u0000`) at the start or end of a string are trimmed, matching Spark's
+  `UTF8String` behavior. Null bytes embedded in the middle of a string produce `NULL`.
+- Fullwidth Unicode digits (U+FF10–U+FF19, e.g. `１２３.４５`) are treated as their ASCII
+  equivalents, so `CAST('１２３.４５' AS DECIMAL(10,2))` returns `123.45`.
+- Scientific notation (e.g. `1.23E+5`) is supported.
+- Special values (`inf`, `infinity`, `nan`) produce `NULL`.
+
 ### String to Timestamp
 
 Comet's native `CAST(string AS TIMESTAMP)` implementation supports all timestamp formats accepted
 
@@ -61,7 +61,12 @@ The valid pool types are:
 - `fair_unified` (default when `spark.memory.offHeap.enabled=true` is set)
 - `greedy_unified`
 
-The `fair_unified` pool types prevents operators from using more than an even fraction of the available memory
+Both pool types are shared across all native execution contexts within the same Spark task. When
+Comet executes a shuffle, it runs two native execution contexts concurrently (e.g. one for
+pre-shuffle operators and one for the shuffle writer). The shared pool ensures that the combined
+memory usage stays within the per-task limit.
+
+The `fair_unified` pool prevents operators from using more than an even fraction of the available memory
 (i.e. `pool_size / num_reservations`). This pool works best when you know beforehand that
 the query has multiple operators that will likely all need to spill. It can sometimes cause spills even
 when sufficient memory is available, in order to leave enough memory for other operators.
 
@@ -25,7 +25,8 @@ use datafusion::logical_expr::ScalarUDF;
 use datafusion::physical_expr::{PhysicalExpr, ScalarFunctionExpr};
 use datafusion_comet_proto::spark_expression::Expr;
 use datafusion_comet_spark_expr::{
-    SparkHour, SparkMinute, SparkSecond, SparkUnixTimestamp, TimestampTruncExpr,
+    SparkHour, SparkHoursTransform, SparkMinute, SparkSecond, SparkUnixTimestamp,
+    TimestampTruncExpr,
 };
 
 use crate::execution::{
@@ -160,3 +161,29 @@ impl ExpressionBuilder for TruncTimestampBuilder {
         Ok(Arc::new(TimestampTruncExpr::new(child, format, timezone)))
     }
 }
+
+/// Expression builder for the Spark `HoursTransform` expression.
+///
+/// Plans the single child expression and wraps it in a [`ScalarFunctionExpr`]
+/// backed by the [`SparkHoursTransform`] UDF.
+pub struct HoursTransformBuilder;
+
+impl ExpressionBuilder for HoursTransformBuilder {
+    /// Builds the `hours_transform` physical expression.
+    ///
+    /// The result field is declared as a nullable `Int32` named `hours_transform`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ExecutionError::GeneralError`] when the protobuf message carries no
+    /// child expression, and propagates any error raised while planning the child.
+    fn build(
+        &self,
+        spark_expr: &Expr,
+        input_schema: SchemaRef,
+        planner: &PhysicalPlanner,
+    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+        let hours_transform = extract_expr!(spark_expr, HoursTransform);
+        // A malformed plan may omit the child; report it instead of panicking.
+        let child_expr = hours_transform.child.as_ref().ok_or_else(|| {
+            ExecutionError::GeneralError(
+                "HoursTransform expression is missing its child".to_string(),
+            )
+        })?;
+        // `input_schema` is owned and not needed afterwards, so it is moved rather than cloned.
+        let child = planner.create_expr(child_expr, input_schema)?;
+
+        let udf = Arc::new(ScalarUDF::new_from_impl(SparkHoursTransform::new()));
+        let return_field = Arc::new(Field::new("hours_transform", DataType::Int32, true));
+        let scalar_expr = ScalarFunctionExpr::new(
+            "hours_transform",
+            udf,
+            vec![child],
+            return_field,
+            Arc::new(ConfigOptions::default()),
+        );
+
+        Ok(Arc::new(scalar_expr))
+    }
+}
@@ -26,6 +26,8 @@ use crate::{
     },
     jvm_bridge::JVMClasses,
 };
+use std::collections::HashSet;
+
 use arrow::array::{Array, RecordBatch, UInt32Array};
 use arrow::compute::{take, TakeOptions};
 use arrow::datatypes::DataType as ArrowDataType;
@@ -141,15 +143,33 @@ fn unregister_and_total(thread_id: u64, context_id: i64) -> usize {
             map.remove(&thread_id);
             return 0;
         }
-        return pools.values().map(|p| p.reserved()).sum::<usize>();
+        let mut seen = HashSet::new();
+        return pools
+            .values()
+            .filter_map(|p| {
+                let ptr = Arc::as_ptr(p) as *const ();
+                seen.insert(ptr).then(|| p.reserved())
+            })
+            .sum::<usize>();
     }
     0
 }
 
 fn total_reserved_for_thread(thread_id: u64) -> usize {
     let map = get_thread_memory_pools().lock();
     map.get(&thread_id)
-        .map(|pools| pools.values().map(|p| p.reserved()).sum::<usize>())
+        .map(|pools| {
+            // Deduplicate pools that share the same underlying allocation
+            // (e.g. task-shared pools registered by multiple execution contexts)
+            let mut seen = HashSet::new();
+            pools
+                .values()
+                .filter_map(|p| {
+                    let ptr = Arc::as_ptr(p) as *const ();
+                    seen.insert(ptr).then(|| p.reserved())
+                })
+                .sum::<usize>()
+        })
         .unwrap_or(0)
 }
 
 
@@ -34,7 +34,10 @@ impl MemoryPoolType {
     pub(crate) fn is_task_shared(&self) -> bool {
         matches!(
             self,
-            MemoryPoolType::GreedyTaskShared | MemoryPoolType::FairSpillTaskShared
+            MemoryPoolType::GreedyTaskShared
+                | MemoryPoolType::FairSpillTaskShared
+                | MemoryPoolType::FairUnified
+                | MemoryPoolType::GreedyUnified
         )
     }
 }
 
@@ -42,22 +42,36 @@ pub(crate) fn create_memory_pool(
     const NUM_TRACKED_CONSUMERS: usize = 10;
     match memory_pool_config.pool_type {
         MemoryPoolType::GreedyUnified => {
-            // Set Comet memory pool for native
-            let memory_pool =
-                CometUnifiedMemoryPool::new(comet_task_memory_manager, task_attempt_id);
-            Arc::new(TrackConsumersPool::new(
-                memory_pool,
-                NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
-            ))
+            let mut memory_pool_map = TASK_SHARED_MEMORY_POOLS.lock().unwrap();
+            let per_task_memory_pool =
+                memory_pool_map.entry(task_attempt_id).or_insert_with(|| {
+                    let pool: Arc<dyn MemoryPool> = Arc::new(TrackConsumersPool::new(
+                        CometUnifiedMemoryPool::new(
+                            Arc::clone(&comet_task_memory_manager),
+                            task_attempt_id,
+                        ),
+                        NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
+                    ));
+                    PerTaskMemoryPool::new(pool)
+                });
+            per_task_memory_pool.num_plans += 1;
+            Arc::clone(&per_task_memory_pool.memory_pool)
         }
         MemoryPoolType::FairUnified => {
-            // Set Comet fair memory pool for native
-            let memory_pool =
-                CometFairMemoryPool::new(comet_task_memory_manager, memory_pool_config.pool_size);
-            Arc::new(TrackConsumersPool::new(
-                memory_pool,
-                NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
-            ))
+            let mut memory_pool_map = TASK_SHARED_MEMORY_POOLS.lock().unwrap();
+            let per_task_memory_pool =
+                memory_pool_map.entry(task_attempt_id).or_insert_with(|| {
+                    let pool: Arc<dyn MemoryPool> = Arc::new(TrackConsumersPool::new(
+                        CometFairMemoryPool::new(
+                            Arc::clone(&comet_task_memory_manager),
+                            memory_pool_config.pool_size,
+                        ),
+                        NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
+                    ));
+                    PerTaskMemoryPool::new(pool)
+                });
+            per_task_memory_pool.num_plans += 1;
+            Arc::clone(&per_task_memory_pool.memory_pool)
         }
         MemoryPoolType::Greedy => Arc::new(TrackConsumersPool::new(
             GreedyMemoryPool::new(memory_pool_config.pool_size),
 
@@ -110,6 +110,7 @@ pub enum ExpressionType {
     Second,
     TruncTimestamp,
     UnixTimestamp,
+    HoursTransform,
 }
 
 /// Registry for expression builders
@@ -310,6 +311,10 @@ impl ExpressionRegistry {
             ExpressionType::TruncTimestamp,
             Box::new(TruncTimestampBuilder),
         );
+        self.builders.insert(
+            ExpressionType::HoursTransform,
+            Box::new(HoursTransformBuilder),
+        );
     }
 
     /// Extract expression type from Spark protobuf expression
@@ -382,6 +387,7 @@ impl ExpressionRegistry {
             Some(ExprStruct::Second(_)) => Ok(ExpressionType::Second),
             Some(ExprStruct::TruncTimestamp(_)) => Ok(ExpressionType::TruncTimestamp),
             Some(ExprStruct::UnixTimestamp(_)) => Ok(ExpressionType::UnixTimestamp),
+            Some(ExprStruct::HoursTransform(_)) => Ok(ExpressionType::HoursTransform),
 
             Some(other) => Err(ExecutionError::GeneralError(format!(
                 "Unsupported expression type: {:?}",
 
@@ -88,6 +88,7 @@ message Expr {
     UnixTimestamp unix_timestamp = 65;
     FromJson from_json = 66;
     ToCsv to_csv = 67;
+    HoursTransform hours_transform = 68;
   }
 
   // Optional QueryContext for error reporting (contains SQL text and position)
@@ -356,6 +357,10 @@ message Hour {
   string timezone = 2;
 }
 
+message HoursTransform {
+  Expr child = 1;
+}
+
 message Minute {
   Expr child = 1;
   string timezone = 2;
 
@@ -438,6 +438,40 @@ fn cast_string_to_decimal256_impl(
     ))
 }
 
+/// Normalize fullwidth Unicode digits (U+FF10–U+FF19) to their ASCII equivalents.
+///
+/// Spark's UTF8String parser treats fullwidth digits as numerically equivalent to
+/// ASCII digits, e.g. "１２３.４５" parses as 123.45. Every other character (ASCII or
+/// multi-byte) is copied through unchanged.
+///
+/// The mapping is performed per `char` rather than per byte, so the result is
+/// valid UTF-8 by construction and no `unsafe` reconstruction is needed.
+fn normalize_fullwidth_digits(s: &str) -> String {
+    // Each replaced character shrinks from three bytes to one, so the output
+    // never exceeds the input length.
+    let mut out = String::with_capacity(s.len());
+    out.extend(s.chars().map(|c| match c {
+        // U+FF10 is fullwidth zero, so the offset from it is the digit value (0..=9).
+        '\u{FF10}'..='\u{FF19}' => char::from_digit(c as u32 - 0xFF10, 10).unwrap_or(c),
+        other => other,
+    }));
+    out
+}
+
 /// Parse a decimal string into mantissa and scale
 /// e.g., "123.45" -> (12345, 2), "-0.001" -> (-1, 3) , 0e50 -> (0,50) etc
 /// Parse a string to decimal following Spark's behavior
@@ -446,16 +480,30 @@ fn parse_string_to_decimal(input_str: &str, precision: u8, scale: i8) -> SparkRe
     let mut start = 0;
     let mut end = string_bytes.len();
 
-    // trim whitespaces
-    while start < end && string_bytes[start].is_ascii_whitespace() {
+    // Trim ASCII whitespace and null bytes from both ends. Spark's UTF8String
+    // trims null bytes the same way it trims whitespace: "123\u0000" and
+    // "\u0000123" both parse as 123. Null bytes in the middle are not trimmed
+    // and will fail the digit validation in parse_decimal_str, producing NULL.
+    while start < end && (string_bytes[start].is_ascii_whitespace() || string_bytes[start] == 0) {
         start += 1;
     }
-    while end > start && string_bytes[end - 1].is_ascii_whitespace() {
+    while end > start && (string_bytes[end - 1].is_ascii_whitespace() || string_bytes[end - 1] == 0)
+    {
         end -= 1;
     }
 
     let trimmed = &input_str[start..end];
 
+    // Normalize fullwidth digits to ASCII. Fast path skips the allocation for
+    // pure-ASCII strings, which is the common case.
+    let normalized;
+    let trimmed = if trimmed.bytes().any(|b| b > 0x7F) {
+        normalized = normalize_fullwidth_digits(trimmed);
+        normalized.as_str()
+    } else {
+        trimmed
+    };
+
     if trimmed.is_empty() {
         return Ok(None);
     }
Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,10 @@ impl MemoryPoolType {`
`34`	`34`	`pub(crate) fn is_task_shared(&self) -> bool {`
`35`	`35`	`matches!(`
`36`	`36`	`self,`
`37`		`- MemoryPoolType::GreedyTaskShared \| MemoryPoolType::FairSpillTaskShared`
	`37`	`+ MemoryPoolType::GreedyTaskShared`
	`38`	`+ \| MemoryPoolType::FairSpillTaskShared`
	`39`	`+ \| MemoryPoolType::FairUnified`
	`40`	`+ \| MemoryPoolType::GreedyUnified`
`38`	`41`	`)`
`39`	`42`	`}`
`40`	`43`	`}`