@@ -438,6 +438,40 @@ fn cast_string_to_decimal256_impl(
438438 ) )
439439}
440440
/// Normalize fullwidth Unicode digits (U+FF10–U+FF19) to their ASCII equivalents.
///
/// Spark's UTF8String parser treats fullwidth digits as numerically equivalent to
/// ASCII digits, e.g. "\u{FF11}\u{FF12}\u{FF13}.\u{FF14}\u{FF15}" parses as 123.45.
/// The fullwidth digits occupy ten consecutive code points starting at U+FF10, so
/// the numeric value of a fullwidth digit is its offset from U+FF10.
///
/// Every other character (ASCII or otherwise) is copied through unchanged. The
/// result is built character by character, so it is valid UTF-8 by construction
/// and no `unsafe` is required.
///
/// Returns a newly allocated `String`; the output is never longer (in bytes)
/// than the input because each 3-byte fullwidth digit shrinks to 1 byte.
fn normalize_fullwidth_digits(s: &str) -> String {
    // Code point of FULLWIDTH DIGIT ZERO; digits 1-9 follow contiguously.
    const FULLWIDTH_ZERO: u32 = 0xFF10;

    // The input byte length is an upper bound on the output byte length,
    // so this single reservation avoids any reallocation.
    let mut out = String::with_capacity(s.len());
    out.extend(s.chars().map(|c| match c {
        // The offset is always 0..=9 here, so `from_digit` cannot return
        // `None`; falling back to `c` keeps the arm total without a panic path.
        '\u{FF10}'..='\u{FF19}' => {
            char::from_digit(u32::from(c) - FULLWIDTH_ZERO, 10).unwrap_or(c)
        }
        other => other,
    }));
    out
}
474+
441475/// Parse a decimal string into mantissa and scale
442476/// e.g., "123.45" -> (12345, 2), "-0.001" -> (-1, 3) , 0e50 -> (0,50) etc
443477/// Parse a string to decimal following Spark's behavior
@@ -446,16 +480,30 @@ fn parse_string_to_decimal(input_str: &str, precision: u8, scale: i8) -> SparkRe
446480 let mut start = 0 ;
447481 let mut end = string_bytes. len ( ) ;
448482
449- // trim whitespaces
450- while start < end && string_bytes[ start] . is_ascii_whitespace ( ) {
483+ // Trim ASCII whitespace and null bytes from both ends. Spark's UTF8String
484+ // trims null bytes the same way it trims whitespace: "123\u0000" and
485+ // "\u0000123" both parse as 123. Null bytes in the middle are not trimmed
486+ // and will fail the digit validation in parse_decimal_str, producing NULL.
487+ while start < end && ( string_bytes[ start] . is_ascii_whitespace ( ) || string_bytes[ start] == 0 ) {
451488 start += 1 ;
452489 }
453- while end > start && string_bytes[ end - 1 ] . is_ascii_whitespace ( ) {
490+ while end > start && ( string_bytes[ end - 1 ] . is_ascii_whitespace ( ) || string_bytes[ end - 1 ] == 0 )
491+ {
454492 end -= 1 ;
455493 }
456494
457495 let trimmed = & input_str[ start..end] ;
458496
497+ // Normalize fullwidth digits to ASCII. Fast path skips the allocation for
498+ // pure-ASCII strings, which is the common case.
499+ let normalized;
500+ let trimmed = if trimmed. bytes ( ) . any ( |b| b > 0x7F ) {
501+ normalized = normalize_fullwidth_digits ( trimmed) ;
502+ normalized. as_str ( )
503+ } else {
504+ trimmed
505+ } ;
506+
459507 if trimmed. is_empty ( ) {
460508 return Ok ( None ) ;
461509 }
0 commit comments