|
1 | 1 | # CORE LIBRARY (libs/braillify) |
2 | 2 |
|
3 | | -Korean Braille encoding engine implementing 2024 Korean Braille Standard. |
| 3 | +Korean + Math Braille encoding engine implementing the 2024 Korean Braille Standard. |
4 | 4 |
|
5 | 5 | ## STRUCTURE |
6 | 6 |
|
7 | 7 | ``` |
8 | 8 | src/ |
9 | | -├── lib.rs # Main Encoder struct, encode() entry point |
| 9 | +├── lib.rs # Main encode() entry, encode_for_testcase(), KNOWN_FAILURES |
10 | 10 | ├── cli.rs # CLI: REPL + one-shot mode (feature-gated) |
11 | 11 | ├── main.rs # Binary entry point |
| 12 | +├── encoder.rs # DocumentIR construction, token + char engine orchestration |
| 13 | +├── char_struct.rs # CharType enum (Korean/English/Number/Symbol/MathSymbol/Fraction) |
12 | 14 | ├── korean_char.rs # Full Korean syllable encoding |
13 | 15 | ├── korean_part.rs # Standalone jamo (consonant/vowel) encoding |
14 | 16 | ├── jauem/ # Consonant handling |
15 | 17 | │ ├── choseong.rs # Initial consonants |
16 | 18 | │ └── jongseong.rs # Final consonants |
17 | 19 | ├── moeum/ # Vowel handling |
18 | 20 | │ └── jungsong.rs # Medial vowels |
19 | | -├── rule.rs # Korean Braille rules (11, 12, etc.) |
20 | | -├── rule_en.rs # English abbreviation rules (10-4, 10-6) |
21 | 21 | ├── english.rs # English letter encoding |
22 | 22 | ├── english_logic.rs # English context detection |
23 | 23 | ├── number.rs # Number encoding |
24 | 24 | ├── fraction.rs # Fraction handling (Unicode + LaTeX) |
25 | | -├── *_shortcut.rs # PHF static lookup tables |
26 | | -├── unicode.rs # Internal code to Unicode Braille |
| 25 | +├── math_symbol_shortcut.rs # PHF math symbol lookup table |
| 26 | +├── symbol_shortcut.rs # PHF general symbol lookup table |
| 27 | +├── word_shortcut.rs # PHF word abbreviation lookup table |
| 28 | +├── unicode.rs # Internal braille code ↔ Unicode Braille conversion |
27 | 29 | ├── split.rs # Korean jamo decomposition |
28 | | -├── char_struct.rs # CharType enum (Korean/English/Number/Symbol) |
29 | | -└── utils.rs # Helper functions |
| 30 | +├── utils.rs # Helper functions |
| 31 | +└── rules/ # Rule engine (see below) |
30 | 32 | ``` |
31 | 33 |
|
| 34 | +## ENCODING PIPELINE |
| 35 | + |
| 36 | +``` |
| 37 | +Input text |
| 38 | + ↓ DocumentIR::parse() (tokenize into Word/Space/Mode tokens) |
| 39 | + ↓ TokenRuleEngine::apply_all() (token-level rules by phase) |
| 40 | + │ ├── LatexMergeRule (merge $...$ across spaces) |
| 41 | + │ ├── LatexFractionRule (detect $\frac{}{}$) |
| 42 | + │ ├── LatexMathRule (strip LaTeX → math notation) |
| 43 | + │ ├── InlineFractionRule (detect N/N inline fractions) |
| 44 | + │ ├── MathExpressionTokenRule (detect & encode math expressions) |
| 45 | + │ └── ...other token rules |
| 46 | + ↓ emit() (character-level encoding) |
| 47 | + ├── Token::Word → RuleEngine (BrailleRule trait, char-by-char) |
| 48 | + ├── Token::Space → braille space byte |
| 49 | + ├── Token::Fraction → fraction encoding |
| 50 | + └── Token::PreEncoded → pass-through (from math encoder) |
| 51 | +``` |
| 52 | + |
| 53 | +## RULE ARCHITECTURE |
| 54 | + |
| 55 | +### Two parallel rule systems |
| 56 | + |
| 57 | +| System | Trait | Engine | Operates On | Used By | |
| 58 | +|--------|-------|--------|-------------|---------| |
| 59 | +| Korean (char-level) | `BrailleRule` | `RuleEngine` | Individual characters (`CharType`) | Korean text encoding | |
| 60 | +| Math (token-level) | `MathTokenRule` | `MathTokenEngine` | Token sequences (`MathToken`) | Math expression encoding | |
| 61 | + |
| 62 | +### BrailleRule (Korean, character-level) |
| 63 | + |
| 64 | +```rust |
| 65 | +trait BrailleRule: Send + Sync { |
| 66 | + fn meta(&self) -> &'static RuleMeta; |
| 67 | + fn phase(&self) -> Phase; // Preprocessing → CoreEncoding → InterCharacter |
| 68 | + fn matches(&self, ctx: &RuleContext) -> bool; |
| 69 | + fn apply(&self, ctx: &mut RuleContext) -> Result<RuleResult, String>; |
| 70 | +} |
| 71 | +``` |
| 72 | + |
| 73 | +Registered in `src/encoder.rs` → processes one character at a time via `RuleContext`. |
| 74 | + |
| 75 | +### MathTokenRule (Math, token-level) |
| 76 | + |
| 77 | +```rust |
| 78 | +trait MathTokenRule: Send + Sync { |
| 79 | + fn name(&self) -> &'static str; |
| 80 | + fn priority(&self) -> u16; // Lower = runs first (10=lookahead, 50=core, 100=symbol) |
| 81 | + fn matches(&self, tokens: &[MathToken], index: usize, state: &MathEncodeState) -> bool; |
| 82 | + fn apply(&self, tokens: &[MathToken], index: usize, result: &mut Vec<u8>, |
| 83 | + state: &mut MathEncodeState, engine: &MathTokenEngine) -> Result<MathTokenResult, String>; |
| 84 | +} |
| 85 | +``` |
| 86 | + |
| 87 | +Registered in `rules/math/encoder.rs::build_math_engine()` → processes parsed MathToken sequences with lookahead. |
| 88 | + |
| 89 | +### Math rule structs (in respective rule files) |
| 90 | + |
| 91 | +| Priority | Struct | File | Handles | |
| 92 | +|----------|--------|------|---------| |
| 93 | +| 10 | `FractionReversalRule` | rule_7.rs | Denominator-first simple fractions | |
| 94 | +| 10 | `ConditionalProbFractionRule` | rule_7.rs | =a/b with \| pattern | |
| 95 | +| 10 | `CombinatoricsRule` | rule_12.rs | nPr, nCr | |
| 96 | +| 50 | `NumberRule` | rule_1.rs | Number tokens | |
| 97 | +| 50 | `VariableRule` | rule_12.rs | Lowercase variables | |
| 98 | +| 50 | `UpperVariableRule` | rule_12.rs | Uppercase variables | |
| 99 | +| 50 | `OperatorRule` | rule_2.rs | Arithmetic operators | |
| 100 | +| 50 | `FunctionNameRule` | rule_47.rs | log, lim, sin, cos... | |
| 101 | +| 50 | `BracketRule` | rule_6.rs | Open/close parentheses | |
| 102 | +| 50 | `SuperscriptRule` | rule_18.rs | Superscript content | |
| 103 | +| 50 | `SubscriptRule` | rule_19.rs | Subscript content | |
| 104 | +| 50 | `DecimalPointRule` | rule_8.rs | Decimal points | |
| 105 | +| 50 | `PrimeRule` | rule_53.rs | Prime marks | |
| 106 | +| 100 | `MathSymbolRule` | encoder.rs | All math symbols (30+ dispatch chain) | |
| 107 | + |
32 | 108 | ## KEY TYPES |
33 | 109 |
|
34 | | -| Type | Location | Purpose | |
35 | | -| ------------ | ---------------- | ------------------------------------------------------- | |
36 | | -| `Encoder` | `lib.rs` | Stateful encoder tracking English mode, uppercase state | |
37 | | -| `CharType` | `char_struct.rs` | Input character classification | |
38 | | -| `KoreanChar` | `korean_char.rs` | Decomposed Korean syllable (cho/jung/jong) | |
| 110 | +| Type | Location | Purpose | |
| 111 | +|------|----------|---------| |
| 112 | +| `CharType` | `char_struct.rs` | Input character classification | |
| 113 | +| `BrailleRule` | `rules/traits.rs` | Korean char-level rule trait | |
| 114 | +| `MathTokenRule` | `rules/math/math_token_rule.rs` | Math token-level rule trait | |
| 115 | +| `MathTokenEngine` | `rules/math/math_token_rule.rs` | Math rule dispatch engine | |
| 116 | +| `MathToken` | `rules/math/parser.rs` | Parsed math expression token | |
| 117 | +| `MathEncodeState` | `rules/math/math_token_rule.rs` | Shared math encoding state | |
| 118 | +| `TokenRule` | `rules/token_rule.rs` | Token-level rule trait (pre-encoding) | |
| 119 | +| `RuleEngine` | `rules/engine.rs` | Korean BrailleRule dispatch | |
| 120 | +| `TokenRuleEngine` | `rules/token_engine.rs` | Token-level rule dispatch | |
39 | 121 |
|
40 | 122 | ## ENTRY POINTS |
41 | 123 |
|
42 | | -| Function | Location | Usage | |
43 | | -| ------------------------- | ------------ | --------------------------------- | |
44 | | -| `encode(text)` | `lib.rs:634` | Returns `Result<Vec<u8>, String>` | |
45 | | -| `encode_to_unicode(text)` | `lib.rs:648` | Returns Braille Unicode string | |
46 | | -| `run_cli(args)` | `cli.rs:16` | CLI entry (feature: cli) | |
| 124 | +| Function | Location | Usage | |
| 125 | +|----------|----------|-------| |
| 126 | +| `encode(text)` | `lib.rs` | Returns `Result<Vec<u8>, String>` | |
| 127 | +| `encode_to_unicode(text)` | `lib.rs` | Returns Braille Unicode string | |
| 128 | +| `encode_math_expression(text)` | `rules/math/encoder.rs` | Math-only encoding | |
| 129 | +| `run_cli(args)` | `cli.rs` | CLI entry (feature: cli) | |
47 | 130 |
|
48 | | -## RULE IMPLEMENTATION |
| 131 | +## MATH RULES (src/rules/math/) |
49 | 132 |
|
50 | | -Korean comments reference rule numbers from 2024 Korean Braille Standard: |
| 133 | +66 rule files (`rule_1.rs` through `rule_66.rs`) matching articles from the 2024 Korean Braille Standard math section (pages 51-84). Each file contains: |
51 | 134 |
|
52 | | -- `제8항` - Standalone jamo |
53 | | -- `제11항` - Vowel + 예 separator |
54 | | -- `제14항` - 나/다/마... + vowel (no abbreviation) |
55 | | -- `제28항` - Uppercase handling |
56 | | -- `제31항` - Roman letter indicators |
57 | | -- `제40항` - Number prefix |
58 | | -- `제43항` - Numbers with punctuation |
59 | | -- `제44항` - Number + Korean spacing |
| 135 | +- `is_xxx()` detection functions (used in MathSymbolRule dispatch chain) |
| 136 | +- `encode_xxx()` encoding functions (produce braille byte sequences) |
| 137 | +- MathTokenRule struct implementations (where applicable) |
| 138 | +- `#[cfg(test)] mod tests` with unit tests |
| 139 | + |
| 140 | +Infrastructure: |
| 141 | +- `encoder.rs` — `encode_math_expression()`, `build_math_engine()`, `MathSymbolRule` |
| 142 | +- `parser.rs` — `parse_math_expression()` → `Vec<MathToken>` |
| 143 | +- `function.rs` — Function name detection (sin, cos, log, etc.) |
| 144 | +- `math_token_rule.rs` — `MathTokenRule` trait, `MathTokenEngine`, `MathEncodeState` |
60 | 145 |
|
61 | 146 | ## CONVENTIONS |
62 | 147 |
|
63 | 148 | - PHF macros (`phf_map!`) for all static lookup tables |
64 | | -- Error handling via `Result<T, String>` - propagate, never suppress |
| 149 | +- Error handling via `Result<T, String>` — propagate, never suppress |
65 | 150 | - Feature flags: `cli` (default), `wasm` |
66 | 151 | - Tests inline with `#[cfg(test)]` in each module |
| 152 | +- No `#[allow(dead_code)]` — all functions must be used or tested |
| 153 | +- Math rules: one `.rs` file per standard article (제N항) |
67 | 154 |
|
68 | 155 | ## ANTI-PATTERNS |
69 | 156 |
|
70 | | -- **Never use `unwrap()` on user input** - return `Err(String)` |
71 | | -- **Never hardcode Braille dots** - use constants or PHF tables |
72 | | -- **Never modify shortcut tables** without updating test CSVs |
| 157 | +- **Never use `unwrap()` on user input** — return `Err(String)` |
| 158 | +- **Never hardcode Braille dots** — use constants or PHF tables |
| 159 | +- **Never modify shortcut tables** without updating test cases |
| 160 | +- **Never add `#[allow(dead_code)]`** — wire functions into encoder or tests instead |
| 161 | +- **Never suppress type errors** — no `as any`-style escape hatches (e.g. unchecked `as` casts) |
73 | 162 |
|
74 | 163 | ## TESTING |
75 | 164 |
|
76 | 165 | ```bash |
77 | | -# Run all tests with coverage |
78 | | -cargo tarpaulin -p braillify |
79 | | - |
80 | | -# Run specific test |
81 | | -cargo test test_encode |
82 | | - |
83 | | -# Generate test_status.json for landing page |
84 | | -cargo test test_by_testcase |
| 166 | +cargo test # All tests (353+) |
| 167 | +cargo test test_by_testcase # Testcase suite (2064 cases, tracks KNOWN_FAILURES) |
| 168 | +cargo test test_accuracy_report # Accuracy report (raw encode, no test routing) |
| 169 | +cargo test test_no_regression # Regression guard |
| 170 | +cargo fmt && cargo clippy # Format + lint |
85 | 171 | ``` |
86 | 172 |
|
87 | | -Tests read from `../../test_cases/*.csv` - format: `input,internal_repr,expected,unicode` |
| 173 | +Test cases live in `test_cases/korean/*.json` and `test_cases/math/*.json`. |
| 174 | + |
| 175 | +Current status: 1710/2064 passing (354 known failures). |
0 commit comments