@@ -68,7 +68,7 @@ fn is_middle_dot_numeric_word(chars: &[char]) -> bool {
6868 . all ( |c| c. is_ascii_digit ( ) || matches ! ( * c, '\u{00B7}' | '\u{22C5}' | '\u{2212}' | '-' ) )
6969}
7070
71- fn has_adjacent_korean_word ( tokens : & [ Token < ' _ > ] , index : usize ) -> bool {
71+ fn adjacent_korean_word_flags ( tokens : & [ Token < ' _ > ] , index : usize ) -> ( bool , bool ) {
7272 let prev_has_korean = index
7373 . checked_sub ( 1 )
7474 . and_then ( |mut i| {
@@ -95,6 +95,11 @@ fn has_adjacent_korean_word(tokens: &[Token<'_>], index: usize) -> bool {
9595 }
9696 } ;
9797
98+ ( prev_has_korean, next_has_korean)
99+ }
100+
101+ fn has_adjacent_korean_word ( tokens : & [ Token < ' _ > ] , index : usize ) -> bool {
102+ let ( prev_has_korean, next_has_korean) = adjacent_korean_word_flags ( tokens, index) ;
98103 prev_has_korean || next_has_korean
99104}
100105
@@ -132,6 +137,37 @@ fn is_strong_mixed_math_candidate(chars: &[char], text: &str) -> bool {
132137 || has_combining_mark
133138}
134139
140+ fn should_wrap_math_sentence ( chars : & [ char ] , text : & str ) -> bool {
141+ if chars. len ( ) <= 1 {
142+ return false ;
143+ }
144+
145+ let has_letters = chars. iter ( ) . any ( |c| c. is_ascii_alphabetic ( ) ) ;
146+ let has_digits = chars. iter ( ) . any ( |c| c. is_ascii_digit ( ) ) ;
147+ let has_math_symbol = chars
148+ . iter ( )
149+ . any ( |c| math_symbol_shortcut:: is_math_symbol_char ( * c) ) ;
150+ let has_superscript = chars. iter ( ) . any ( |c| is_superscript ( * c) ) ;
151+ let has_subscript = chars. iter ( ) . any ( |c| is_subscript ( * c) ) ;
152+ let has_combining_mark = chars. iter ( ) . any ( |c| is_combining_math_mark ( * c) ) ;
153+ let has_math_operator = chars. iter ( ) . any ( |c| {
154+ matches ! (
155+ c,
156+ '+' | '=' | '>' | '<' | '.' | ',' | '-' | '\u{2212}' | '/' | '!'
157+ )
158+ } ) ;
159+ let has_brackets = chars
160+ . iter ( )
161+ . any ( |c| matches ! ( c, '(' | ')' | '[' | ']' | '{' | '}' ) ) ;
162+
163+ is_strong_mixed_math_candidate ( chars, text)
164+ || ( has_digits && ( has_math_operator || has_math_symbol || has_brackets) )
165+ || ( has_letters && has_digits)
166+ || ( has_letters && has_brackets)
167+ || ( has_letters && has_math_operator)
168+ || ( has_superscript || has_subscript || has_combining_mark)
169+ }
170+
135171fn try_encode_math_slice ( chars : & [ char ] ) -> Option < Vec < u8 > > {
136172 if chars. is_empty ( ) || chars. iter ( ) . any ( |c| is_korean_char ( * c) ) {
137173 return None ;
@@ -247,14 +283,14 @@ fn is_math_expression(chars: &[char], text: &str) -> bool {
247283 . first ( )
248284 . is_some_and ( |c| math_symbol_shortcut:: is_math_symbol_char ( * c) ) ;
249285
250- // Number-base notation like 1010₂ should not be treated as generic math expression .
286+ // Number-base notation like 1010₂ is a math expression and should use the math engine .
251287 if chars. first ( ) . is_some_and ( |c| c. is_ascii_digit ( ) )
252288 && chars. iter ( ) . any ( |c| matches ! ( * c, '\u{2080}' ..='\u{2089}' ) )
253289 && chars
254290 . iter ( )
255291 . all ( |c| c. is_ascii_digit ( ) || matches ! ( * c, '\u{2080}' ..='\u{2089}' ) )
256292 {
257- return false ;
293+ return true ;
258294 }
259295
260296 // Common phone/date/range tokens like 02-799-1000 should stay non-math.
@@ -577,7 +613,25 @@ impl TokenRule for MathExpressionTokenRule {
577613
578614 // Try to encode via math engine
579615 match math:: encoder:: encode_math_expression ( text) {
580- Ok ( bytes) => Ok ( TokenAction :: Replace ( Token :: PreEncoded ( bytes) ) ) ,
616+ Ok ( bytes) => {
617+ let ( prev_has_korean, next_has_korean) = adjacent_korean_word_flags ( tokens, index) ;
618+ let should_wrap = should_wrap_math_sentence ( & word. chars , text) ;
619+ let mut wrapped = Vec :: with_capacity (
620+ bytes. len ( )
621+ + usize:: from ( prev_has_korean && should_wrap)
622+ + usize:: from ( next_has_korean && should_wrap) ,
623+ ) ;
624+
625+ if prev_has_korean && should_wrap {
626+ wrapped. push ( 0 ) ;
627+ }
628+ wrapped. extend_from_slice ( & bytes) ;
629+ if next_has_korean && should_wrap {
630+ wrapped. push ( 0 ) ;
631+ }
632+
633+ Ok ( TokenAction :: Replace ( Token :: PreEncoded ( wrapped) ) )
634+ }
581635 Err ( _) => {
582636 // If math encoding fails, let the character-level rules handle it
583637 Ok ( TokenAction :: Noop )
@@ -679,6 +733,12 @@ mod tests {
679733 assert ! ( is_math_expression( & chars, "⅔" ) ) ;
680734 }
681735
#[test]
fn test_is_math_base_notation() {
    // Digits followed by a subscript digit (base notation) must be
    // classified as a math expression.
    let sample = "1010₂";
    let sample_chars: Vec<char> = sample.chars().collect();
    assert!(is_math_expression(&sample_chars, sample));
}
741+
682742 #[ test]
683743 fn split_mixed_math_word_extracts_math_prefix ( ) {
684744 let chars: Vec < char > = "tan의" . chars ( ) . collect ( ) ;
0 commit comments