Skip to content

Commit e0cbe1a

Browse files
committed
feat: modularize
1 parent d7f2437 commit e0cbe1a

9 files changed

Lines changed: 87 additions & 80 deletions

File tree

UnicodeBasic.lean

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
public import UnicodeBasic.Types
7+
public import UnicodeBasic.TableLookup
58

6-
import UnicodeBasic.Types
7-
import UnicodeBasic.TableLookup
9+
public section
810

911
/-!
1012
# General API #
@@ -855,3 +857,4 @@ def isAlphabetic (char : Char) : Bool :=
855857
abbrev isAlpha := isAlphabetic
856858

857859
end Unicode
860+
end

UnicodeBasic/CharacterDatabase.lean

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
Copyright © 2023 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
namespace Unicode
7+
public section
78

89
/-- Stream type for Unicode Character Database (UCD) files
910
@@ -63,5 +64,5 @@ instance : Std.Stream UCDStream (Array String.Slice) where
6364
next? := UCDStream.next?
6465

6566
end UCDStream
66-
67+
end
6768
end Unicode

UnicodeBasic/Hangul.lean

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,34 @@
22
Copyright © 2024 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
namespace Unicode.Hangul
7+
public section
78

8-
private def shortNameL : Array String :=
9+
def shortNameL : Array String :=
910
#["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
1011
"SS", "", "J", "JJ", "C", "K", "T", "P", "H"]
1112

12-
private def shortNameV : Array String :=
13+
def shortNameV : Array String :=
1314
#["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
1415
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
1516
"I"]
1617

17-
private def shortNameT : Array String :=
18+
def shortNameT : Array String :=
1819
#["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
1920
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
2021
"SS", "NG", "J", "C", "K", "T", "P", "H"]
2122

22-
private abbrev sizeL := shortNameL.size -- 19
23-
private abbrev sizeV := shortNameV.size -- 21
24-
private abbrev sizeT := shortNameT.size -- 28
25-
private abbrev sizeLV := sizeL * sizeV -- 399
26-
private abbrev sizeVT := sizeV * sizeT -- 588
27-
private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
23+
@[expose] abbrev sizeL := shortNameL.size -- 19
24+
@[expose] abbrev sizeV := shortNameV.size -- 21
25+
@[expose] abbrev sizeT := shortNameT.size -- 28
26+
@[expose] abbrev sizeLV := sizeL * sizeV -- 399
27+
@[expose] abbrev sizeVT := sizeV * sizeT -- 588
28+
@[expose] abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
2829

29-
private abbrev baseL : UInt32 := 0x1100
30-
private abbrev baseV : UInt32 := 0x1161
31-
private abbrev baseT : UInt32 := 0x11A7
30+
@[expose] abbrev baseL : UInt32 := 0x1100
31+
@[expose] abbrev baseV : UInt32 := 0x1161
32+
@[expose] abbrev baseT : UInt32 := 0x11A7
3233

3334
/-- Number of Hangul syllables -/
3435
def Syllable.size := sizeLVT
@@ -113,4 +114,5 @@ def getSyllable! (code : UInt32) : Syllable :=
113114
| some s => s
114115
| none => panic! "not a Hangul syllable"
115116

117+
end
116118
end Unicode.Hangul

UnicodeBasic/TableLookup.lean

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
Copyright © 2024-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
import UnicodeBasic.CharacterDatabase
77
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
8+
public import UnicodeBasic.Types
9+
public section
910

1011
namespace Unicode
1112

UnicodeBasic/Types.lean

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
6+
public section
67
/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)
78
89
This function translates to a no-op in the compiler. However, it does not
@@ -204,6 +205,7 @@ deriving DecidableEq
204205
/-- General category (GC)
205206
206207
Unicode property: `General_Category` -/
208+
@[expose]
207209
def GC := UInt32 deriving DecidableEq, Inhabited
208210

209211
namespace GC
@@ -271,22 +273,22 @@ protected def Cn : GC := (0x20000000 : UInt32)
271273
protected def C : GC := .Cc ||| .Cf ||| .Cs ||| .Co ||| .Cn
272274

273275
def mk : (major : MajorGeneralCategory) → Option (MinorGeneralCategory major) → GC
274-
| .letter, none => .L
276+
| .letter, .none => .L
275277
| _, some .casedLetter => .LC
276278
| _, some .uppercaseLetter => .Lu
277279
| _, some .lowercaseLetter => .Ll
278280
| _, some .titlecaseLetter => .Lt
279281
| _, some .modifierLetter => .Lm
280282
| _, some .otherLetter => .Lo
281-
| .mark, none => .M
283+
| .mark, .none => .M
282284
| _, some .nonspacingMark => .Mn
283285
| _, some .spacingMark => .Mc
284286
| _, some .enclosingMark => .Me
285-
| .number, none => .N
287+
| .number, .none => .N
286288
| _, some .decimalNumber => .Nd
287289
| _, some .letterNumber => .Nl
288290
| _, some .otherNumber => .No
289-
| .punctuation, none => .P
291+
| .punctuation, .none => .P
290292
| _, some .connectorPunctuation => .Pc
291293
| _, some .dashPunctuation => .Pd
292294
| _, some .groupPunctuation => .PG
@@ -296,16 +298,16 @@ def mk : (major : MajorGeneralCategory) → Option (MinorGeneralCategory major)
296298
| _, some .initialPunctuation => .Pi
297299
| _, some .finalPunctuation => .Pf
298300
| _, some .otherPunctuation => .Po
299-
| .symbol, none => .S
301+
| .symbol, .none => .S
300302
| _, some .mathSymbol => .Sm
301303
| _, some .currencySymbol => .Sc
302304
| _, some .modifierSymbol => .Sk
303305
| _, some .otherSymbol => .So
304-
| .separator, none => .Z
306+
| .separator, .none => .Z
305307
| _, some .spaceSeparator => .Zs
306308
| _, some .lineSeparator => .Zl
307309
| _, some .paragraphSeparator => .Zp
308-
| .other, none => .C
310+
| .other, .none => .C
309311
| _, some .control => .Cc
310312
| _, some .format => .Cf
311313
| _, some .surrogate => .Cs
@@ -412,11 +414,12 @@ def toAbbrev! (x : GC) : String :=
412414
| [a] => a
413415
| _ => panic! "invalid general category"
414416

415-
open Std.Format Repr in instance : Repr GC where
416-
reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
417+
open Std.Format Repr in
418+
def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
419+
instance : Repr GC where reprPrec
417420

418-
instance : ToString GC where
419-
toString x := " | ".intercalate (reprAux x)
421+
def toString (x : GC) := " | ".intercalate (reprAux x)
422+
instance : ToString GC where toString
420423

421424
def ofAbbrev? (s : String.Slice) : Option GC :=
422425
match s.chars.take 3 |>.toList with
@@ -464,8 +467,8 @@ def ofAbbrev? (s : String.Slice) : Option GC :=
464467

465468
def ofAbbrev! (s : String.Slice) : GC :=
466469
match ofAbbrev? s with
467-
| some c => c
468-
| none => panic! "invalid general category"
470+
| .some c => c
471+
| .none => panic! "invalid general category"
469472

470473
def ofString? (s : String.Slice) : Option GC := do
471474
let mut c := .none
@@ -475,8 +478,8 @@ def ofString? (s : String.Slice) : Option GC := do
475478

476479
def ofString! (s : String.Slice) : GC :=
477480
match ofString? s with
478-
| some c => c
479-
| none => panic! "invalid general category"
481+
| .some c => c
482+
| .none => panic! "invalid general category"
480483

481484
end GC
482485

@@ -1012,3 +1015,4 @@ instance : Repr BidiClass where
10121015
reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}"
10131016

10141017
end Unicode
1018+
end

UnicodeData.lean

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6-
import UnicodeData.Aliases
7-
import UnicodeData.Basic
8-
import UnicodeData.PropList
6+
module
7+
public import UnicodeData.Aliases
8+
public import UnicodeData.Basic
9+
public import UnicodeData.PropList

UnicodeData/Aliases.lean

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ Copyright © 2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6-
import Std.Data.HashMap
6+
module
7+
public import Std.Data.HashMap
78
import UnicodeBasic.Types
89
import UnicodeBasic.CharacterDatabase
910

11+
public section
1012
namespace Unicode
1113

1214
/-- Type for list of aliases -/
@@ -125,3 +127,4 @@ def PropertyValueAliases.getShortName! (prop val : String.Slice) : String.Slice
125127
getShortName? prop val |>.get!
126128

127129
end Unicode
130+
end

UnicodeData/Basic.lean

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6+
module
67
import UnicodeBasic.CharacterDatabase
78
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
9+
public import UnicodeBasic.Types
910

11+
public section
1012
namespace Unicode
1113

1214
/-- Structure for data from `UnicodeData.txt` -/
@@ -143,29 +145,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where
143145
protected def UnicodeData.txt := include_str "../data/UnicodeData.txt"
144146

145147
/-- Parse `UnicodeData.txt` -/
146-
private unsafe def UnicodeData.init : IO (Array UnicodeData) := do
147-
let stream := UCDStream.ofString UnicodeData.txt
148-
let mut arr := #[]
149-
for record in stream do
150-
arr := arr.push {
151-
code := ofHexString! record[0]!
152-
name := record[1]!
153-
gc := GC.ofAbbrev! record[2]!
154-
cc := record[3]!.toNat!
155-
bidi := BidiClass.ofAbbrev! record[4]!
156-
decomp := getDecompositionMapping? record[5]!
157-
numeric := getNumericType? record[6]! record[7]! record[8]!
158-
bidiMirrored := record[9]! == "Y"
159-
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
160-
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
161-
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
162-
}
163-
return arr
164-
165-
where
166-
167-
/-- Get decomposition mapping -/
168-
getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
148+
unsafe initialize UnicodeData.data : Array UnicodeData ←
149+
let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
169150
/-
170151
The value of the `Decomposition_Mapping` property for a character is
171152
provided in field 5 of `UnicodeData.txt`. This is a string-valued
@@ -218,8 +199,10 @@ where
218199
some ⟨tag, cs⟩
219200
| [] => unreachable!
220201

221-
/-- Get numeric type -/
222-
getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
202+
let getDigitUnsafe (char : Char) : Fin 10 :=
203+
unsafeCast (char.val - '0'.val).toNat
204+
205+
let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
223206
/-
224207
If the character has the property value `Numeric_Type=Decimal`, then the
225208
`Numeric_Value` of that digit is represented with an integer value
@@ -263,14 +246,23 @@ where
263246
else
264247
return .decimal <| getDigitUnsafe <| s₁.front
265248

266-
/-- Get decimal digit -/
267-
@[inline]
268-
getDigitUnsafe (char : Char) : Fin 10 :=
269-
unsafeCast (char.val - '0'.val).toNat
270-
271-
/-- Parsed data from `UnicodeData.txt` -/
272-
@[init UnicodeData.init]
273-
protected def UnicodeData.data : Array UnicodeData := #[]
249+
let stream := UCDStream.ofString UnicodeData.txt
250+
let mut arr := #[]
251+
for record in stream do
252+
arr := arr.push {
253+
code := ofHexString! record[0]!
254+
name := record[1]!
255+
gc := GC.ofAbbrev! record[2]!
256+
cc := record[3]!.toNat!
257+
bidi := BidiClass.ofAbbrev! record[4]!
258+
decomp := getDecompositionMapping? record[5]!
259+
numeric := getNumericType? record[6]! record[7]! record[8]!
260+
bidiMirrored := record[9]! == "Y"
261+
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
262+
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
263+
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
264+
}
265+
return arr
274266

275267
/-- Get code point data from `UnicodeData.txt` -/
276268
partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do
@@ -370,12 +362,12 @@ structure UnicodeDataStream where
370362
default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter
371363
deriving Inhabited
372364

373-
private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
365+
def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
374366
let c := s.code
375367
let i := s.index
376368
if c > Unicode.max then
377369
none
378-
else if h : i < UnicodeData.data.size.toUSize then
370+
else if h : i.toNat < UnicodeData.data.size then
379371
let d := UnicodeData.data[i]
380372
let n := d.name
381373
if c < d.code then
@@ -412,3 +404,4 @@ instance : Std.Stream UnicodeDataStream UnicodeData where
412404
next? := UnicodeDataStream.next?
413405

414406
end Unicode
407+
end

UnicodeData/PropList.lean

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6+
module
67
import UnicodeBasic.Types
78
import UnicodeBasic.CharacterDatabase
89

10+
public section
911
namespace Unicode
1012

1113
/-- Structure containing supported character properties from `PropList.txt` -/
@@ -35,7 +37,7 @@ deriving Inhabited, Repr
3537
/-- Raw string form `PropList.txt` -/
3638
protected def PropList.txt := include_str "../data/PropList.txt"
3739

38-
private unsafe def PropList.init : IO PropList := do
40+
unsafe initialize PropList.data : PropList
3941
let stream := UCDStream.ofString PropList.txt
4042
let mut list : PropList := {}
4143
for record in stream do
@@ -66,10 +68,6 @@ private unsafe def PropList.init : IO PropList := do
6668
list := {list with deprecated := list.deprecated.push val}
6769
return list
6870

69-
/-- Parsed data from `PropList.txt` -/
70-
@[init PropList.init]
71-
protected def PropList.data : PropList := {}
72-
7371
-- TODO: stop reinventing the wheel!
7472
/-- Binary search -/
7573
private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat :=
@@ -176,3 +174,4 @@ def PropList.isDeprecated (code : UInt32) : Bool :=
176174
| (_, some top) => code <= top
177175

178176
end Unicode
177+
end

0 commit comments

Comments
 (0)