Skip to content

Commit 3da26bd

Browse files
committed
chore: merge main into nightly-testing
2 parents 4c08c70 + 603cc6e commit 3da26bd

10 files changed

Lines changed: 83 additions & 89 deletions

File tree

UnicodeBasic.lean

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
public import UnicodeBasic.Types
7+
public import UnicodeBasic.TableLookup
58

6-
import UnicodeBasic.Types
7-
import UnicodeBasic.TableLookup
9+
public section
810

911
/-!
1012
# General API #

UnicodeBasic/CharacterDatabase.lean

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
Copyright © 2023 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
7+
public section
58

69
namespace Unicode
710

@@ -61,7 +64,3 @@ protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStre
6164

6265
instance : Std.Stream UCDStream (Array String.Slice) where
6366
next? := UCDStream.next?
64-
65-
end UCDStream
66-
67-
end Unicode

UnicodeBasic/Hangul.lean

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,36 @@
22
Copyright © 2024 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
7+
public section
58

69
namespace Unicode.Hangul
710

8-
private def shortNameL : Array String :=
11+
def shortNameL : Array String :=
912
#["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
1013
"SS", "", "J", "JJ", "C", "K", "T", "P", "H"]
1114

12-
private def shortNameV : Array String :=
15+
def shortNameV : Array String :=
1316
#["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
1417
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
1518
"I"]
1619

17-
private def shortNameT : Array String :=
20+
def shortNameT : Array String :=
1821
#["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
1922
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
2023
"SS", "NG", "J", "C", "K", "T", "P", "H"]
2124

22-
private abbrev sizeL := shortNameL.size -- 19
23-
private abbrev sizeV := shortNameV.size -- 21
24-
private abbrev sizeT := shortNameT.size -- 28
25-
private abbrev sizeLV := sizeL * sizeV -- 399
26-
private abbrev sizeVT := sizeV * sizeT -- 588
27-
private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
25+
abbrev sizeL := shortNameL.size -- 19
26+
abbrev sizeV := shortNameV.size -- 21
27+
abbrev sizeT := shortNameT.size -- 28
28+
abbrev sizeLV := sizeL * sizeV -- 399
29+
abbrev sizeVT := sizeV * sizeT -- 588
30+
abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
2831

29-
private abbrev baseL : UInt32 := 0x1100
30-
private abbrev baseV : UInt32 := 0x1161
31-
private abbrev baseT : UInt32 := 0x11A7
32+
abbrev baseL : UInt32 := 0x1100
33+
abbrev baseV : UInt32 := 0x1161
34+
abbrev baseT : UInt32 := 0x11A7
3235

3336
/-- Number of Hangul syllables -/
3437
def Syllable.size := sizeLVT
@@ -112,5 +115,3 @@ def getSyllable! (code : UInt32) : Syllable :=
112115
match getSyllable? code with
113116
| some s => s
114117
| none => panic! "not a Hangul syllable"
115-
116-
end Unicode.Hangul

UnicodeBasic/TableLookup.lean

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
Copyright © 2024-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
import UnicodeBasic.CharacterDatabase
77
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
8+
public import UnicodeBasic.Types
9+
10+
public section
911

1012
namespace Unicode
1113

@@ -406,5 +408,3 @@ def lookupScriptName (s : Script) : Option String.Slice :=
406408
where
407409
str : String := include_str "../data/table/Script_Name.txt"
408410
table : Thunk <| Array (UInt32 × String.Slice) := parseTable str fun _ n => n[0]!
409-
410-
end Unicode

UnicodeBasic/Types.lean

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
public import Std.Data.HashMap
57

6-
import Std.Data.HashMap
8+
public section
79

810
/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)
911
@@ -206,6 +208,7 @@ deriving DecidableEq
206208
/-- General category (GC)
207209
208210
Unicode property: `General_Category` -/
211+
@[expose]
209212
def GC := UInt32 deriving DecidableEq, Inhabited
210213

211214
namespace GC
@@ -414,11 +417,12 @@ def toAbbrev! (x : GC) : String :=
414417
| [a] => a
415418
| _ => panic! "invalid general category"
416419

417-
open Std.Format Repr in instance : Repr GC where
418-
reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
420+
open Std.Format Repr in
421+
def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
422+
instance : Repr GC where reprPrec
419423

420-
instance : ToString GC where
421-
toString x := " | ".intercalate (reprAux x)
424+
def toString (x : GC) := " | ".intercalate (reprAux x)
425+
instance : ToString GC where toString
422426

423427
def ofAbbrev? (s : String.Slice) : Option GC :=
424428
match s.chars.take 3 |>.toList with
@@ -466,7 +470,7 @@ def ofAbbrev? (s : String.Slice) : Option GC :=
466470

467471
def ofAbbrev! (s : String.Slice) : GC :=
468472
match ofAbbrev? s with
469-
| some c => c
473+
| .some c => c
470474
| none => panic! "invalid general category"
471475

472476
def ofString? (s : String.Slice) : Option GC := do
@@ -477,7 +481,7 @@ def ofString? (s : String.Slice) : Option GC := do
477481

478482
def ofString! (s : String.Slice) : GC :=
479483
match ofString? s with
480-
| some c => c
484+
| .some c => c
481485
| none => panic! "invalid general category"
482486

483487
end GC
@@ -1072,5 +1076,3 @@ def ofAbbrev? (abbr : String.Slice) : Option Script :=
10721076
def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get!
10731077

10741078
end Script
1075-
1076-
end Unicode

UnicodeData.lean

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6-
import UnicodeData.Aliases
7-
import UnicodeData.Basic
8-
import UnicodeData.PropList
9-
import UnicodeData.Scripts
6+
module
7+
public import UnicodeData.Aliases
8+
public import UnicodeData.Basic
9+
public import UnicodeData.PropList
10+
public import UnicodeData.Scripts

UnicodeData/Aliases.lean

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
Copyright © 2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
6-
import Std.Data.HashMap
5+
module
6+
public import Std.Data.HashMap
77
import UnicodeBasic.Types
88
import UnicodeBasic.CharacterDatabase
99

10+
public section
11+
1012
namespace Unicode
1113

1214
/-- Type for list of aliases -/

UnicodeData/Basic.lean

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
import UnicodeBasic.CharacterDatabase
77
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
8+
public import UnicodeBasic.Types
9+
10+
public section
911

1012
namespace Unicode
1113

@@ -143,29 +145,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where
143145
protected def UnicodeData.txt := include_str "../data/UnicodeData.txt"
144146

145147
/-- Parse `UnicodeData.txt` -/
146-
private unsafe def UnicodeData.init : IO (Array UnicodeData) := do
147-
let stream := UCDStream.ofString UnicodeData.txt
148-
let mut arr := #[]
149-
for record in stream do
150-
arr := arr.push {
151-
code := ofHexString! record[0]!
152-
name := record[1]!
153-
gc := GC.ofAbbrev! record[2]!
154-
cc := record[3]!.toNat!
155-
bidi := BidiClass.ofAbbrev! record[4]!
156-
decomp := getDecompositionMapping? record[5]!
157-
numeric := getNumericType? record[6]! record[7]! record[8]!
158-
bidiMirrored := record[9]! == "Y"
159-
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
160-
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
161-
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
162-
}
163-
return arr
164-
165-
where
166-
167-
/-- Get decomposition mapping -/
168-
getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
148+
unsafe initialize UnicodeData.data : Array UnicodeData ←
149+
let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
169150
/-
170151
The value of the `Decomposition_Mapping` property for a character is
171152
provided in field 5 of `UnicodeData.txt`. This is a string-valued
@@ -218,8 +199,10 @@ where
218199
some ⟨tag, cs⟩
219200
| [] => unreachable!
220201

221-
/-- Get numeric type -/
222-
getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
202+
let getDigitUnsafe (char : Char) : Fin 10 :=
203+
unsafeCast (char.val - '0'.val).toNat
204+
205+
let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
223206
/-
224207
If the character has the property value `Numeric_Type=Decimal`, then the
225208
`Numeric_Value` of that digit is represented with an integer value
@@ -263,14 +246,23 @@ where
263246
else
264247
return .decimal <| getDigitUnsafe <| s₁.front
265248

266-
/-- Get decimal digit -/
267-
@[inline]
268-
getDigitUnsafe (char : Char) : Fin 10 :=
269-
unsafeCast (char.val - '0'.val).toNat
270-
271-
/-- Parsed data from `UnicodeData.txt` -/
272-
@[init UnicodeData.init]
273-
protected def UnicodeData.data : Array UnicodeData := #[]
249+
let stream := UCDStream.ofString UnicodeData.txt
250+
let mut arr := #[]
251+
for record in stream do
252+
arr := arr.push {
253+
code := ofHexString! record[0]!
254+
name := record[1]!
255+
gc := GC.ofAbbrev! record[2]!
256+
cc := record[3]!.toNat!
257+
bidi := BidiClass.ofAbbrev! record[4]!
258+
decomp := getDecompositionMapping? record[5]!
259+
numeric := getNumericType? record[6]! record[7]! record[8]!
260+
bidiMirrored := record[9]! == "Y"
261+
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
262+
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
263+
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
264+
}
265+
return arr
274266

275267
/-- Get code point data from `UnicodeData.txt` -/
276268
partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do
@@ -370,12 +362,12 @@ structure UnicodeDataStream where
370362
default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter
371363
deriving Inhabited
372364

373-
private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
365+
def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
374366
let c := s.code
375367
let i := s.index
376368
if c > Unicode.max then
377369
none
378-
else if h : i < UnicodeData.data.size.toUSize then
370+
else if h : i.toNat < UnicodeData.data.size then
379371
let d := UnicodeData.data[i]
380372
let n := d.name
381373
if c < d.code then
@@ -410,5 +402,3 @@ private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeDat
410402

411403
instance : Std.Stream UnicodeDataStream UnicodeData where
412404
next? := UnicodeDataStream.next?
413-
414-
end Unicode

UnicodeData/PropList.lean

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
import UnicodeBasic.Types
77
import UnicodeBasic.CharacterDatabase
88

9+
public section
10+
911
namespace Unicode
1012

1113
/-- Structure containing supported character properties from `PropList.txt` -/
@@ -35,7 +37,7 @@ deriving Inhabited, Repr
3537
/-- Raw string form `PropList.txt` -/
3638
protected def PropList.txt := include_str "../data/PropList.txt"
3739

38-
private unsafe def PropList.init : IO PropList := do
40+
unsafe initialize PropList.data : PropList
3941
let stream := UCDStream.ofString PropList.txt
4042
let mut list : PropList := {}
4143
for record in stream do
@@ -66,10 +68,6 @@ private unsafe def PropList.init : IO PropList := do
6668
list := {list with deprecated := list.deprecated.push val}
6769
return list
6870

69-
/-- Parsed data from `PropList.txt` -/
70-
@[init PropList.init]
71-
protected def PropList.data : PropList := {}
72-
7371
-- TODO: stop reinventing the wheel!
7472
/-- Binary search -/
7573
private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat :=
@@ -174,5 +172,3 @@ def PropList.isDeprecated (code : UInt32) : Bool :=
174172
match data[find code data 0 data.size]! with
175173
| (val, none) => code == val
176174
| (_, some top) => code <= top
177-
178-
end Unicode

UnicodeData/Scripts.lean

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
Copyright © 2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
6-
import Std.Data.HashMap
5+
module
6+
public import UnicodeData.Aliases
77
import UnicodeBasic.Types
88
import UnicodeBasic.CharacterDatabase
9-
import UnicodeData.Aliases
9+
10+
public section
1011

1112
namespace Unicode
1213

0 commit comments

Comments
 (0)