|
2 | 2 | Copyright © 2023-2025 François G. Dorais. All rights reserved. |
3 | 3 | Released under Apache 2.0 license as described in the file LICENSE. |
4 | 4 | -/ |
5 | | - |
| 5 | +module |
6 | 6 | import UnicodeBasic.CharacterDatabase |
7 | 7 | import UnicodeBasic.Hangul |
8 | | -import UnicodeBasic.Types |
| 8 | +public import UnicodeBasic.Types |
| 9 | + |
| 10 | +public section |
9 | 11 |
|
10 | 12 | namespace Unicode |
11 | 13 |
|
@@ -143,29 +145,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where |
143 | 145 | protected def UnicodeData.txt := include_str "../data/UnicodeData.txt" |
144 | 146 |
|
145 | 147 | /-- Parse `UnicodeData.txt` -/ |
146 | | -private unsafe def UnicodeData.init : IO (Array UnicodeData) := do |
147 | | - let stream := UCDStream.ofString UnicodeData.txt |
148 | | - let mut arr := #[] |
149 | | - for record in stream do |
150 | | - arr := arr.push { |
151 | | - code := ofHexString! record[0]! |
152 | | - name := record[1]! |
153 | | - gc := GC.ofAbbrev! record[2]! |
154 | | - cc := record[3]!.toNat! |
155 | | - bidi := BidiClass.ofAbbrev! record[4]! |
156 | | - decomp := getDecompositionMapping? record[5]! |
157 | | - numeric := getNumericType? record[6]! record[7]! record[8]! |
158 | | - bidiMirrored := record[9]! == "Y" |
159 | | - uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! |
160 | | - lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! |
161 | | - titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! |
162 | | - } |
163 | | - return arr |
164 | | - |
165 | | -where |
166 | | - |
167 | | - /-- Get decomposition mapping -/ |
168 | | - getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do |
| 148 | +unsafe initialize UnicodeData.data : Array UnicodeData ← |
| 149 | + let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do |
169 | 150 | /- |
170 | 151 | The value of the `Decomposition_Mapping` property for a character is |
171 | 152 | provided in field 5 of `UnicodeData.txt`. This is a string-valued |
@@ -218,8 +199,10 @@ where |
218 | 199 | some ⟨tag, cs⟩ |
219 | 200 | | [] => unreachable! |
220 | 201 |
|
221 | | - /-- Get numeric type -/ |
222 | | - getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do |
| 202 | + let getDigitUnsafe (char : Char) : Fin 10 := |
| 203 | + unsafeCast (char.val - '0'.val).toNat |
| 204 | + |
| 205 | + let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do |
223 | 206 | /- |
224 | 207 | If the character has the property value `Numeric_Type=Decimal`, then the |
225 | 208 | `Numeric_Value` of that digit is represented with an integer value |
@@ -263,14 +246,23 @@ where |
263 | 246 | else |
264 | 247 | return .decimal <| getDigitUnsafe <| s₁.front |
265 | 248 |
|
266 | | - /-- Get decimal digit -/ |
267 | | - @[inline] |
268 | | - getDigitUnsafe (char : Char) : Fin 10 := |
269 | | - unsafeCast (char.val - '0'.val).toNat |
270 | | - |
271 | | -/-- Parsed data from `UnicodeData.txt` -/ |
272 | | -@[init UnicodeData.init] |
273 | | -protected def UnicodeData.data : Array UnicodeData := #[] |
| 249 | + let stream := UCDStream.ofString UnicodeData.txt |
| 250 | + let mut arr := #[] |
| 251 | + for record in stream do |
| 252 | + arr := arr.push { |
| 253 | + code := ofHexString! record[0]! |
| 254 | + name := record[1]! |
| 255 | + gc := GC.ofAbbrev! record[2]! |
| 256 | + cc := record[3]!.toNat! |
| 257 | + bidi := BidiClass.ofAbbrev! record[4]! |
| 258 | + decomp := getDecompositionMapping? record[5]! |
| 259 | + numeric := getNumericType? record[6]! record[7]! record[8]! |
| 260 | + bidiMirrored := record[9]! == "Y" |
| 261 | + uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! |
| 262 | + lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! |
| 263 | + titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! |
| 264 | + } |
| 265 | + return arr |
274 | 266 |
|
275 | 267 | /-- Get code point data from `UnicodeData.txt` -/ |
276 | 268 | partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do |
@@ -370,12 +362,12 @@ structure UnicodeDataStream where |
370 | 362 | default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter |
371 | 363 | deriving Inhabited |
372 | 364 |
|
373 | | -private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do |
| 365 | +def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do |
374 | 366 | let c := s.code |
375 | 367 | let i := s.index |
376 | 368 | if c > Unicode.max then |
377 | 369 | none |
378 | | - else if h : i < UnicodeData.data.size.toUSize then |
| 370 | + else if h : i.toNat < UnicodeData.data.size then |
379 | 371 | let d := UnicodeData.data[i] |
380 | 372 | let n := d.name |
381 | 373 | if c < d.code then |
@@ -410,5 +402,3 @@ private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeDat |
410 | 402 |
|
411 | 403 | instance : Std.Stream UnicodeDataStream UnicodeData where |
412 | 404 | next? := UnicodeDataStream.next? |
413 | | - |
414 | | -end Unicode |
0 commit comments