@@ -329,7 +329,6 @@ where
329329 )
330330 . map ( |( value, regex, start, nth, flags, subexp) | match regex {
331331 None => Ok ( None ) ,
332- Some ( "" ) => Ok ( Some ( 0 ) ) ,
333332 Some ( regex) => get_index (
334333 value,
335334 regex,
@@ -395,11 +394,8 @@ where
395394{
396395 let value = match value {
397396 None => return Ok ( None ) ,
398- Some ( "" ) => return Ok ( Some ( 0 ) ) ,
399397 Some ( value) => value,
400398 } ;
401- let pattern: & Regex = compile_and_cache_regex ( pattern, flags, regex_cache) ?;
402- // println!("get_index: value = {}, pattern = {}, start = {}, n = {}, subexpr = {}, flags = {:?}", value, pattern, start, n, subexpr, flags);
403399 if start < 1 {
404400 return Err ( ArrowError :: ComputeError (
405401 "regexp_instr() requires start to be 1-based" . to_string ( ) ,
@@ -412,8 +408,22 @@ where
412408 ) ) ;
413409 }
414410
415- // --- Simplified byte_start_offset calculation ---
416411 let total_chars = value. chars ( ) . count ( ) as i64 ;
412+ if pattern. is_empty ( ) {
413+ compile_and_cache_regex ( pattern, flags, regex_cache) ?;
414+ if subexpr > 0 {
415+ return Ok ( Some ( 0 ) ) ;
416+ }
417+
418+ let match_position = start. saturating_add ( n) . saturating_sub ( 1 ) ;
419+ return Ok ( Some ( if match_position <= total_chars + 1 {
420+ match_position
421+ } else {
422+ 0
423+ } ) ) ;
424+ }
425+
426+ let pattern: & Regex = compile_and_cache_regex ( pattern, flags, regex_cache) ?;
417427 let byte_start_offset: usize = if start > total_chars {
418428 // If start is beyond the total characters, it means we start searching
419429 // after the string effectively. No matches possible.
@@ -426,7 +436,6 @@ where
426436 . map ( |( idx, _) | idx)
427437 . unwrap_or ( 0 ) // Should not happen if start is valid and <= total_chars
428438 } ;
429- // --- End simplified calculation ---
430439
431440 let search_slice = & value[ byte_start_offset..] ;
432441
@@ -492,7 +501,7 @@ mod tests {
492501 fn test_case_sensitive_regexp_instr_nulls ( ) {
493502 let v = "" ;
494503 let r = "" ;
495- let expected = 0 ;
504+ let expected = 1 ;
496505 let regex_sv = ScalarValue :: Utf8 ( Some ( r. to_string ( ) ) ) ;
497506 let re = regexp_instr_with_scalar_values ( & [ v. to_string ( ) . into ( ) , regex_sv] ) ;
498507 // let res_exp = re.unwrap();
@@ -511,10 +520,11 @@ mod tests {
511520 "no match here" ,
512521 "abc" ,
513522 "ДатаФусион数据融合📊🔥" ,
523+ "abc" ,
514524 ] ;
515- let regex = [ "o" , "d" , "123" , "z" , "gg" , "📊" ] ;
525+ let regex = [ "o" , "d" , "123" , "z" , "gg" , "📊" , "" ] ;
516526
517- let expected: Vec < i64 > = vec ! [ 5 , 4 , 4 , 0 , 0 , 15 ] ;
527+ let expected: Vec < i64 > = vec ! [ 5 , 4 , 4 , 0 , 0 , 15 , 1 ] ;
518528
519529 izip ! ( values. iter( ) , regex. iter( ) )
520530 . enumerate ( )
@@ -772,10 +782,11 @@ mod tests {
772782 "xyz123xyz" ,
773783 "no match here" ,
774784 "" ,
785+ "abc" ,
775786 ] ) ;
776- let regex = A :: from ( vec ! [ "o" , "d" , "123" , "z" , "gg" ] ) ;
787+ let regex = A :: from ( vec ! [ "o" , "d" , "123" , "z" , "gg" , "" ] ) ;
777788
778- let expected = Int64Array :: from ( vec ! [ 5 , 4 , 4 , 0 , 0 ] ) ;
789+ let expected = Int64Array :: from ( vec ! [ 5 , 4 , 4 , 0 , 0 , 1 ] ) ;
779790 let re = regexp_instr_func ( & [ Arc :: new ( values) , Arc :: new ( regex) ] ) . unwrap ( ) ;
780791 assert_eq ! ( re. as_ref( ) , & expected) ;
781792 }
@@ -784,10 +795,10 @@ mod tests {
784795 where
785796 A : From < Vec < & ' static str > > + Array + ' static ,
786797 {
787- let values = A :: from ( vec ! [ "abcabcabc" , "abcabcabc" , "" ] ) ;
788- let regex = A :: from ( vec ! [ "abc" , "abc" , "gg" ] ) ;
789- let start = Int64Array :: from ( vec ! [ 4 , 5 , 5 ] ) ;
790- let expected = Int64Array :: from ( vec ! [ 4 , 7 , 0 ] ) ;
798+ let values = A :: from ( vec ! [ "abcabcabc" , "abcabcabc" , "" , "abc" ] ) ;
799+ let regex = A :: from ( vec ! [ "abc" , "abc" , "gg" , "" ] ) ;
800+ let start = Int64Array :: from ( vec ! [ 4 , 5 , 5 , 2 ] ) ;
801+ let expected = Int64Array :: from ( vec ! [ 4 , 7 , 0 , 2 ] ) ;
791802
792803 let re = regexp_instr_func ( & [ Arc :: new ( values) , Arc :: new ( regex) , Arc :: new ( start) ] )
793804 . unwrap ( ) ;
@@ -798,11 +809,17 @@ mod tests {
798809 where
799810 A : From < Vec < & ' static str > > + Array + ' static ,
800811 {
801- let values = A :: from ( vec ! [ "abcabcabc" , "abcabcabc" , "abcabcabc" , "abcabcabc" ] ) ;
802- let regex = A :: from ( vec ! [ "abc" , "abc" , "abc" , "abc" ] ) ;
803- let start = Int64Array :: from ( vec ! [ 1 , 1 , 1 , 1 ] ) ;
804- let nth = Int64Array :: from ( vec ! [ 1 , 2 , 3 , 4 ] ) ;
805- let expected = Int64Array :: from ( vec ! [ 1 , 4 , 7 , 0 ] ) ;
812+ let values = A :: from ( vec ! [
813+ "abcabcabc" ,
814+ "abcabcabc" ,
815+ "abcabcabc" ,
816+ "abcabcabc" ,
817+ "abc" ,
818+ ] ) ;
819+ let regex = A :: from ( vec ! [ "abc" , "abc" , "abc" , "abc" , "" ] ) ;
820+ let start = Int64Array :: from ( vec ! [ 1 , 1 , 1 , 1 , 2 ] ) ;
821+ let nth = Int64Array :: from ( vec ! [ 1 , 2 , 3 , 4 , 3 ] ) ;
822+ let expected = Int64Array :: from ( vec ! [ 1 , 4 , 7 , 0 , 4 ] ) ;
806823
807824 let re = regexp_instr_func ( & [
808825 Arc :: new ( values) ,
0 commit comments