Skip to content

Commit c058df6

Browse files
committed
Add fraction parsing module with LaTeX/Unicode support
1 parent ea81685 commit c058df6

3 files changed

Lines changed: 224 additions & 4 deletions

File tree

libs/braillify/src/char_struct.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::{math_symbol_shortcut::is_math_symbol_char, symbol_shortcut::is_symbol_char};
1+
use crate::{math_symbol_shortcut::is_math_symbol_char, symbol_shortcut::is_symbol_char, fraction::is_unicode_fraction};
22

33
/// Character in Korean
44
#[derive(Debug)]
@@ -58,6 +58,7 @@ pub enum CharType {
5858
Number(char),
5959
Symbol(char),
6060
MathSymbol(char),
61+
Fraction(char),
6162
Space(char),
6263
}
6364

@@ -75,6 +76,9 @@ impl CharType {
7576
if is_math_symbol_char(c) {
7677
return Ok(Self::MathSymbol(c));
7778
}
79+
if is_unicode_fraction(c) {
80+
return Ok(Self::Fraction(c));
81+
}
7882
let code = c as u32;
7983
if (0x3131..=0x3163).contains(&code) {
8084
return Ok(Self::KoreanPart(c));
@@ -144,6 +148,9 @@ mod test {
144148
CharType::Space(ch) => {
145149
assert!(ch.is_whitespace());
146150
}
151+
CharType::Fraction(ch) => {
152+
assert!(is_unicode_fraction(ch));
153+
}
147154
}
148155
}
149156
}

libs/braillify/src/fraction.rs

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
use unicode_normalization::UnicodeNormalization;
2+
3+
const FRACTION_SLASH: char = '\u{2044}';
4+
5+
/// Advances `iter` past any run of whitespace at its current position.
///
/// The first non-whitespace character is left unconsumed (it is only peeked
/// at). Does nothing if the iterator is exhausted or already positioned on a
/// non-whitespace character.
fn consume_whitespace(iter: &mut std::iter::Peekable<std::str::Chars>) {
    loop {
        match iter.peek() {
            // Still inside the whitespace run: drop the character and keep going.
            Some(ch) if ch.is_whitespace() => {
                iter.next();
            }
            // Either a meaningful character or end of input: stop without consuming.
            _ => return,
        }
    }
}
14+
15+
fn encode_number_string(s: &str, part_name: &str) -> Result<Vec<u8>, String> {
16+
let mut result = Vec::new();
17+
for c in s.chars() {
18+
if !c.is_ascii_digit() {
19+
return Err(format!("Invalid {} part (non-ascii digit): {}", part_name, c));
20+
}
21+
result.extend(crate::number::encode_number(c));
22+
}
23+
Ok(result)
24+
}
25+
26+
pub fn encode_fraction(numerator: &str, denominator: &str) -> Result<Vec<u8>, String> {
27+
let mut result = vec![60];
28+
result.extend(encode_number_string(denominator, "fraction denominator")?);
29+
result.push(12);
30+
result.push(60);
31+
result.extend(encode_number_string(numerator, "fraction numerator")?);
32+
Ok(result)
33+
}
34+
35+
pub fn encode_fraction_in_context(numerator: &str, denominator: &str) -> Result<Vec<u8>, String> {
36+
let mut result = vec![60];
37+
result.extend(encode_number_string(numerator, "fraction numerator")?);
38+
result.push(56);
39+
result.push(12);
40+
result.push(60);
41+
result.extend(encode_number_string(denominator, "fraction denominator")?);
42+
Ok(result)
43+
}
44+
45+
pub fn encode_mixed_fraction(whole: &str, numerator: &str, denominator: &str) -> Result<Vec<u8>, String> {
46+
let mut result = vec![60];
47+
result.extend(encode_number_string(whole, "whole number")?);
48+
result.extend(encode_fraction(numerator, denominator)?);
49+
Ok(result)
50+
}
51+
52+
/// Maps an ASCII, superscript, or subscript digit to its ASCII digit.
///
/// Returns `None` for every other character, including other Unicode digits.
fn normalize_digit(c: char) -> Option<char> {
    // Each table is ordered by digit value, so a character's index in any
    // table is the value of the digit it represents.
    const ASCII: [char; 10] = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
    const SUPERSCRIPT: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];
    const SUBSCRIPT: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];

    [ASCII, SUPERSCRIPT, SUBSCRIPT]
        .iter()
        .find_map(|table| table.iter().position(|&candidate| candidate == c))
        .map(|value| ASCII[value])
}
67+
68+
fn read_braced_content(
69+
iter: &mut std::iter::Peekable<std::str::Chars>
70+
) -> Option<String> {
71+
consume_whitespace(iter);
72+
73+
if iter.next()? != '{' { return None; }
74+
75+
let mut content = String::new();
76+
while let Some(c) = iter.peek() {
77+
match c {
78+
'}' => {
79+
iter.next();
80+
return if content.is_empty() { None } else { Some(content) };
81+
}
82+
_ if c.is_whitespace() => {
83+
iter.next();
84+
}
85+
_ => {
86+
if let Some(digit) = normalize_digit(*c) {
87+
content.push(digit);
88+
iter.next();
89+
} else {
90+
return None;
91+
}
92+
}
93+
}
94+
}
95+
None
96+
}
97+
98+
pub fn parse_latex_fraction(s: &str) -> Option<(Option<String>, String, String)> {
99+
let mut iter = s.trim().chars().peekable();
100+
101+
if iter.next()? != '$' { return None; }
102+
103+
consume_whitespace(&mut iter);
104+
105+
let mut whole_part_str = String::new();
106+
while let Some(digit) = iter.peek().and_then(|c| normalize_digit(*c)) {
107+
whole_part_str.push(digit);
108+
iter.next();
109+
}
110+
let whole_part = if whole_part_str.is_empty() { None } else { Some(whole_part_str) };
111+
112+
consume_whitespace(&mut iter);
113+
114+
if iter.next() != Some('\\') ||
115+
iter.next() != Some('f') ||
116+
iter.next() != Some('r') ||
117+
iter.next() != Some('a') ||
118+
iter.next() != Some('c') {
119+
return None;
120+
}
121+
122+
let numerator = read_braced_content(&mut iter)?;
123+
let denominator = read_braced_content(&mut iter)?;
124+
125+
consume_whitespace(&mut iter);
126+
127+
if iter.next()? != '$' { return None; }
128+
129+
consume_whitespace(&mut iter);
130+
131+
if iter.next().is_some() {
132+
return None;
133+
}
134+
135+
Some((whole_part, numerator, denominator))
136+
}
137+
138+
/// Splits a single-character fraction (such as `½`) into its numerator and
/// denominator.
///
/// The character is NFKD-decomposed and must yield exactly two parts
/// separated by one `FRACTION_SLASH`, each of which (after trimming) is a
/// non-empty run of ASCII digits.
///
/// Returns `Some((numerator, denominator))` on success and `None` for any
/// character whose decomposition does not have that shape.
pub fn parse_unicode_fraction(c: char) -> Option<(String, String)> {
    let decomposed: String = c.nfkd().collect();

    // Exactly one slash means exactly two pieces; a third piece (or a missing
    // second one) disqualifies the character.
    let mut pieces = decomposed.split(FRACTION_SLASH);
    let (numerator, denominator) = match (pieces.next(), pieces.next(), pieces.next()) {
        (Some(num), Some(den), None) => (num.trim(), den.trim()),
        _ => return None,
    };

    let is_number = |part: &str| !part.is_empty() && part.chars().all(|ch| ch.is_ascii_digit());

    if is_number(numerator) && is_number(denominator) {
        Some((numerator.to_string(), denominator.to_string()))
    } else {
        None
    }
}
163+
164+
pub fn is_unicode_fraction(c: char) -> bool {
165+
parse_unicode_fraction(c).is_some()
166+
}

libs/braillify/src/lib.rs

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use jauem::choseong::encode_choseong;
22
use moeum::jungsong::encode_jungsong;
33
use utils::has_choseong_o;
4+
use once_cell::sync::Lazy;
5+
use regex::Regex;
46

57
use crate::{
68
char_struct::CharType,
@@ -11,6 +13,11 @@ use crate::{
1113
split::split_korean_jauem,
1214
};
1315

16+
/// Matches a plain-text fraction `<digits>/<digits>` anchored at the start of
/// the input.
///
/// Capture group 1 is the numerator and capture group 2 is the denominator.
/// The pattern is compiled lazily on first use; initialisation panics if the
/// pattern fails to compile.
// NOTE(review): `\d` is Unicode-aware in the `regex` crate by default, while
// the caller uses the byte length of the match as a character offset —
// confirm that only ASCII digits can reach this pattern.
static FRACTION_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"^(\d+)\/(\d+)"#)
        .expect("Failed to compile FRACTION_REGEX")
});
20+
1421
mod char_shortcut;
1522
mod char_struct;
1623
#[cfg(feature = "cli")]
@@ -29,6 +36,7 @@ mod symbol_shortcut;
2936
mod unicode;
3037
mod utils;
3138
mod word_shortcut;
39+
mod fraction;
3240

3341
pub struct Encoder {
3442
is_english: bool,
@@ -74,6 +82,16 @@ impl Encoder {
7482
skip_count: &mut usize,
7583
result: &mut Vec<u8>,
7684
) -> Result<(), String> {
85+
if word.starts_with('$') && word.ends_with('$') {
86+
if let Some((whole, num, den)) = fraction::parse_latex_fraction(word) {
87+
if let Some(w) = whole {
88+
result.extend(fraction::encode_mixed_fraction(&w, &num, &den)?);
89+
} else {
90+
result.extend(fraction::encode_fraction(&num, &den)?);
91+
}
92+
return Ok(());
93+
}
94+
}
7795
if let Some((_, code, rest)) = word_shortcut::split_word_shortcut(word) {
7896
result.extend(code);
7997
if !rest.is_empty() {
@@ -281,15 +299,44 @@ impl Encoder {
281299
}
282300
CharType::Number(c) => {
283301
if !is_number {
284-
// 제43항 숫자 사이에 마침표, 쉼표, 연결표가 붙어 나올 때에는 뒤의 숫자에 수표를 적지 않는다.
302+
let remaining_word: String = word_chars[i..].iter().collect();
303+
304+
if let Some(captures) = FRACTION_REGEX.captures(&remaining_word) {
305+
let numerator = &captures[1];
306+
let denominator = &captures[2];
307+
let match_len = captures[0].len();
308+
let k = i + match_len;
309+
310+
let is_date_or_range =
311+
(numerator.len() > 1 || denominator.len() > 1) ||
312+
(k < word_len && word_chars[k] == '/') ||
313+
(k < word_len && word_chars[k] == '~');
314+
315+
if !is_date_or_range {
316+
result.extend(fraction::encode_fraction_in_context(numerator, denominator)?);
317+
*skip_count = match_len - 1;
318+
is_number = true;
319+
continue;
320+
}
321+
}
322+
// 제43항 숫자 사이에 마침표, 쉼표, 연결표가 붙어 나올 때에는 뒤의 숫자에 수표를 적지 않는다.
285323
if !(i > 0 && ['.', ','].contains(&word_chars[i - 1])) {
286324
// 제40항 숫자는 수표 ⠼을 앞세워 다음과 같이 적는다.
287325
result.push(60);
288326
}
289327
is_number = true;
290-
}
328+
}
291329
result.extend(number::encode_number(c));
292-
}
330+
},
331+
CharType::Fraction(c) => {
332+
if let Some((num_str, den_str)) = fraction::parse_unicode_fraction(c) {
333+
result.extend(fraction::encode_fraction(
334+
&num_str,
335+
&den_str
336+
)?);
337+
is_number = true;
338+
}
339+
},
293340
CharType::Symbol(c) => {
294341
if c == ','
295342
&& is_number

0 commit comments

Comments
 (0)