1- from pathlib import Path
2- import pprint
1+ from __future__ import annotations
32import re
4- import string
3+ import struct
54import sys
5+ from pathlib import Path
66from textwrap import dedent
77
88
# Absolute path of the directory that contains this script.
HERE = Path(__file__).parent.resolve()
# Input data file, looked up alongside this script.
DERIVEDAGES = HERE / "DerivedAge.txt"
1111
12- CYTHON_INFILE = HERE .joinpath ("unicode_age.pyx.in" )
13- CYTHON_TEMPLATE = string .Template (CYTHON_INFILE .read_text ())
1412
13+ def _write_spans (spans : list , ucd_version : tuple , outfile : Path ):
14+ span_fmt = "iibb"
15+ VersionSpan = struct .Struct (span_fmt )
1516
16- def _write_spans (spans : list , c_out : Path , cython_out : Path ):
17- c_src = dedent ("""
18- // 8 + 8 + 2 = 18 bytes per span
19- // 1283 'real' spans, 435 singleton spans as of Unicode 15.0
20- // ~31 KB of storage required (in practice the actual consumed memory is ~21 KB? not sure why that is...)
21- typedef struct {
22- int start;
23- int stop;
24- char major;
25- char minor;
26- } versionSpan_t;
17+ Nbytes = len (spans ) * VersionSpan .size
18+ buf = bytearray (Nbytes )
2719
20+ for n , s in enumerate (spans ):
21+ VersionSpan .pack_into (buf , n * VersionSpan .size , * s )
2822
29- static const versionSpan_t versionSpans[] = {
30- """ )
31-
32- c_src += "\t "
23+ py_src = dedent (f"""
24+ from __future__ import annotations
25+ import struct
3326
34- for (start , stop , major , minor ) in spans :
35- line = f"{{0x{ start :06x} , 0x{ stop :06x} , { major } , { minor } }}"
36- c_src += f"{ line } ,\n \t "
27+ UCD_VERSION = { ucd_version }
3728
38- c_src += " \n };"
29+ VersionSpan = struct.Struct( { span_fmt !r } )
3930
40- pyx_src = CYTHON_TEMPLATE .substitute ({"numSpans" : len (spans )})
31+ def iter_spans():
32+ yield from VersionSpan.iter_unpack(VERSION_SPANS)
4133
34+ VERSION_SPANS = { repr (buf )}
35+ """ )
4236
43- c_out .write_text (c_src )
44- print (f"Wrote to { c_out } " )
4537
46- cython_out .write_text (pyx_src )
47- print (f"Wrote to { cython_out } " )
38+ outfile .write_text (py_src )
39+ print (f"Wrote to { outfile } " )
4840
4941
50- def _derivedage_spans ():
42+ def _derivedage_spans (fn ):
5143 CODEPT = r"[0-9A-Fa-f]+"
5244 PATT = rf"^({ CODEPT } )(?:\.\.({ CODEPT } ))?\s*;\s*([\d.]+)\s*#.*"
5345
54- with open (DERIVEDAGES , "r" ) as f :
46+ with open (fn , "r" ) as f :
5547 for line in f :
5648 if line .strip () and line .startswith ("#" ):
5749 continue
@@ -69,14 +61,31 @@ def _derivedage_spans():
6961 yield start , stop , major , minor
7062
7163
def parse_ucdversion(fn: Path) -> tuple[int, int, int]:
    """Return the UCD version named on the first line of a DerivedAge file.

    Only the first line is read; it is searched for a file name of the form
    ``DerivedAge-<major>.<minor>.<patch>.txt``.

    Args:
        fn: Path of the file to inspect.

    Returns:
        The version as a ``(major, minor, patch)`` tuple of integers.

    Raises:
        ValueError: If the first line does not contain the expected name.
    """
    patt = r"DerivedAge-(?P<version>\d+\.\d+\.\d+)\.txt"

    # Only the header line is needed, so release the file before parsing.
    with open(fn, "r", encoding="utf-8") as f:
        header = f.readline()

    m = re.search(patt, header)
    if not m:
        # The message must be an f-string, otherwise the placeholder is
        # printed literally instead of the offending path.
        raise ValueError(f"Cannot determine UCD version of {str(fn)!r}")

    # The pattern guarantees exactly three dot-separated numeric fields.
    major, minor, patch = (int(val) for val in m.group("version").split("."))
    return major, minor, patch
7573
76- C_OUTFILE = HERE .joinpath ("src" , "unicode_age.h" )
77- CYTHON_OUTFILE = HERE .joinpath ("src" , "unicode_age.pyx" )
7874
79- _write_spans (spans , c_out = C_OUTFILE , cython_out = CYTHON_OUTFILE )
def main():
    """Rebuild the generated span database from the DerivedAge data file.

    Reads the UCD version and the version spans from ``DERIVEDAGES`` and
    writes them to ``src/unicode_age/unicode_age_db.py`` under ``HERE``,
    reporting progress on stdout.
    """
    source = DERIVEDAGES

    version = parse_ucdversion(source)
    print(f"Scanning for version spans for UCD {version}: {str(source)}")

    found = [*_derivedage_spans(source)]
    print(f"Found {len(found)} versioned spans")

    target = HERE / "src" / "unicode_age" / "unicode_age_db.py"
    _write_spans(found, ucd_version=version, outfile=target)
8089
8190
8291if __name__ == "__main__" :
0 commit comments