99 */
1010class UpdateCharData
1111{
12- /** @var string */
13- public static $ filepathSrc = 'https://www.unicode.org/Public/security/latest/confusables.txt ' ;
12+ /** @var string */
13+ public static $ filepathSrc = 'https://www.unicode.org/Public/security/latest/confusables.txt ' ;
1414
1515 /** @var array<string,charInfo> */
16- protected $ charData = array ();
17-
18- /**
19- * Update confusableData.php
20- *
21- * @return void
22- */
23- public static function update ()
24- {
25- $ filepathOut = __DIR__ . '/../src/Debug/Dump/charData.php ' ;
16+ protected $ charData = array ();
17+
18+ /**
19+ * Update charData.php
20+ *
21+ * @return void
22+ */
23+ public static function update ()
24+ {
25+ $ filepathOut = __DIR__ . '/../src/Debug/Dump/charData.php ' ;
2626 $ comment = '/**
2727 * This file is generated automatically from confusables.txt
2828 * https://www.unicode.org/Public/security/latest/confusables.txt
@@ -31,136 +31,136 @@ public static function update()
3131 *
3232 * @phpcs:disable SlevomatCodingStandard.Arrays.AlphabeticallySortedByKeys
3333 */ ' ;
34- $ php = '<?php // phpcs:ignore SlevomatCodingStandard.Files.FileLength ' . "\n\n"
34+ $ php = '<?php // phpcs:ignore SlevomatCodingStandard.Files.FileLength ' . "\n\n"
3535 . \preg_replace ('/^[ ]{12}/m ' , ' ' , $ comment ) . "\n\n"
36- . 'return ' . self ::varExportPretty (self ::build ()) . "; \n" ;
36+ . 'return ' . self ::varExportPretty (self ::build ()) . "; \n" ;
3737 $ php = \preg_replace_callback ('/[ \'"](.)[ \'"] => /u ' , static function ($ matches ) {
3838 $ char = $ matches [1 ];
3939 $ codePoint = \mb_ord ($ char );
4040 return $ codePoint < 0x80
4141 ? '" \\x ' . \dechex ($ codePoint ) . '" => '
4242 : '\'' . $ char . '\' => ' ;
4343 }, $ php );
44- \file_put_contents ($ filepathOut , $ php );
45- }
46-
47- /**
48- * Build char data
49- *
50- * @return array<string,array<string,string|bool>>
51- */
52- public static function build ()
53- {
54- $ rows = self ::getParsedRows ();
55-
56- // only interested in chars that are confusable with an ascii char
57- // not interested in ascii chars that are confusable with other ascii chars
58- $ rows = \array_filter ($ rows , static function ($ row ) {
59- $ isCharAAscii = \strlen ($ row ['charA ' ]) === 1 && \ord ($ row ['charA ' ]) < 0x80 ;
60- $ isCharBAscii = \strlen ($ row ['charB ' ]) === 1 && \ord ($ row ['charB ' ]) < 0x80 ;
61- return $ isCharAAscii === false && $ isCharBAscii ;
62- });
63-
64- \usort ($ rows , static function ($ rowA , $ rowB ) {
65- return \strcmp ($ rowA ['charA ' ], $ rowB ['charA ' ]);
66- });
67-
68- // rekey
69- $ rowsNew = require __DIR__ . '/charData.php ' ;
70- foreach ($ rows as $ row ) {
71- $ key = $ row ['charA ' ];
72- if (isset ($ rowsNew [$ key ])) {
73- continue ;
74- }
75- unset($ row ['charA ' ]);
76- $ rowsNew [$ key ] = array (
77- 'codePoint ' => $ row ['charACodePoint ' ],
78- 'desc ' => $ row ['charADesc ' ],
79- 'similarTo ' => $ row ['charB ' ],
80- );
81- }
82-
83- \ksort ($ rowsNew );
84-
85- return $ rowsNew ;
86- }
87-
88- /**
89- * Return parsed data for all confusable data
90- *
91- * @return array<string,string|bool>[]
92- */
93- private static function getParsedRows ()
94- {
95- $ rows = \file (self ::$ filepathSrc );
96- $ rows = \array_filter ($ rows , static function ($ row ) {
97- $ isEmptyOrComment = \strlen (\trim ($ row )) === 0 || $ row [0 ] === '# ' ;
98- return $ isEmptyOrComment === false ;
99- });
100-
101- return \array_map (static function ($ row ) {
102- return self ::parseRow ($ row );
103- }, $ rows );
104- }
105-
106- /**
107- * Parse confusable.txt row
108- *
109- * @param string $row non-comment row from data file
110- *
111- * @return array<string,mixed>
112- */
113- protected static function parseRow ($ row )
114- {
115- $ parts = \explode ('; ' , $ row , 3 );
116- $ parts = \array_map ('trim ' , $ parts );
117- $ parts = \array_combine (['charACodePoint ' , 'charBCodePoint ' , 'comment ' ], $ parts );
118-
119- $ parts ['charACodePoint ' ] = \implode (' ' , \array_map (static function ($ codePoint ) {
120- // remove leading 00 pairs
121- return \preg_replace ('/^(00)+/ ' , '' , $ codePoint );
122- }, \explode (' ' , $ parts ['charACodePoint ' ])));
123-
124- $ parts ['charBCodePoint ' ] = \implode (' ' , \array_map (static function ($ codePoint ) {
125- // remove leading 00 pairs
126- return \preg_replace ('/^(00)+/ ' , '' , $ codePoint );
127- }, \explode (' ' , $ parts ['charBCodePoint ' ])));
128-
129- \preg_match ('/^(?P<category>\w+)\t#(?P<notXid>\*?)\s*(?P<example>\(.*?\))\s*(?P<charADesc>.*?) → (?P<charBDesc>.*?)(\s+#.*)?$/u ' , $ parts ['comment ' ], $ matches );
130- $ parts = \array_merge ($ parts , $ matches );
131-
132- return array (
133- 'charA ' => \implode ('' , \array_map (static function ($ hex ) {
134- $ codePoint = \hexdec ($ hex );
135- return \mb_chr ($ codePoint , 'UTF-8 ' );
136- }, \explode (' ' , $ parts ['charACodePoint ' ]))),
137- 'charACodePoint ' => $ parts ['charACodePoint ' ],
138- 'charADesc ' => $ parts ['charADesc ' ],
139-
140- 'charB ' => \implode ('' , \array_map (static function ($ hex ) {
141- $ codePoint = \hexdec ($ hex );
142- return \mb_chr ($ codePoint , 'UTF-8 ' );
143- }, \explode (' ' , $ parts ['charBCodePoint ' ]))),
144- 'isXid ' => empty ($ parts ['notXid ' ]),
145- );
146- }
147-
148- /**
149- * export value as valid php
150- *
151- * @param mixed $val Value to export
152- *
153- * @return string
154- */
155- protected static function varExportPretty ($ val )
156- {
157- $ php = \var_export ($ val , true );
158- $ php = \str_replace ('array ( ' , 'array( ' , $ php );
159- $ php = \preg_replace ('/=> \n\s+array/ ' , '=> array ' , $ php );
160- $ php = \preg_replace_callback ('/^(\s*)/m ' , static function ($ matches ) {
161- return \str_repeat ($ matches [1 ], 2 );
162- }, $ php );
163- $ php = \str_replace ('\'\' . "\0" . \'\'' , '"\x00" ' , $ php );
164- return $ php ;
165- }
44+ \file_put_contents ($ filepathOut , $ php );
45+ }
46+
/**
 * Assemble the character-data map
 *
 * Starts from the array returned by ./charData.php and adds every parsed
 * confusables row whose source character is not a single ASCII byte while its
 * look-alike is.  Keys already set in the base array are left untouched.
 *
 * @return array<string,array<string,string|bool>>
 */
public static function build()
{
    $isAscii = static function ($str) {
        return \strlen($str) === 1 && \ord($str) < 0x80;
    };

    // keep non-ASCII chars that resemble an ASCII char;
    // ASCII chars that resemble other ASCII chars are of no interest
    $candidates = \array_filter(self::getParsedRows(), static function ($row) use ($isAscii) {
        return !$isAscii($row['charA']) && $isAscii($row['charB']);
    });

    \usort($candidates, static function ($left, $right) {
        return \strcmp($left['charA'], $right['charA']);
    });

    // rekey by character: the first row seen for a character wins
    $charData = require __DIR__ . '/charData.php';
    foreach ($candidates as $candidate) {
        $char = $candidate['charA'];
        if (isset($charData[$char]) === false) {
            $charData[$char] = array(
                'codePoint' => $candidate['charACodePoint'],
                'desc' => $candidate['charADesc'],
                'similarTo' => $candidate['charB'],
            );
        }
    }

    \ksort($charData);

    return $charData;
}
87+
/**
 * Read the confusables source file and parse every data row
 *
 * Blank lines and comment lines (first non-whitespace character is "#") are
 * dropped; each remaining line is handed to parseRow().  The returned array
 * keeps the original line indexes as keys.
 *
 * @return array<string,string|bool>[]
 *
 * @throws \RuntimeException when the source file cannot be read
 */
private static function getParsedRows()
{
    $rows = \file(self::$filepathSrc);
    if ($rows === false) {
        // file() signals failure with false; don't feed that to array_filter()
        throw new \RuntimeException('Unable to read ' . self::$filepathSrc);
    }

    // Guard against a leading UTF-8 byte-order-mark: it would hide the "#"
    // of a first-line comment and corrupt the first field of a data row
    if (isset($rows[0]) && \strncmp($rows[0], "\xEF\xBB\xBF", 3) === 0) {
        $rows[0] = (string) \substr($rows[0], 3);
    }

    $rows = \array_filter($rows, static function ($row) {
        // classify on the trimmed line so indented comments are recognised too
        $row = \ltrim($row);
        return $row !== '' && $row[0] !== '#';
    });

    return \array_map(static function ($row) {
        return self::parseRow($row);
    }, $rows);
}
105+
/**
 * Parse a confusables.txt data row
 *
 * The row is split on ";" into source code point(s), target code point(s)
 * and a trailing comment.  The comment is expected to look like
 * `<category><TAB>#[*] ( <example> ) <source description> → <target description>`.
 *
 * A row that does not follow that layout does not raise: missing fields are
 * treated as empty strings and `charADesc` is returned as null.
 *
 * @param string $row non-comment row from data file
 *
 * @return array<string,mixed> keys: charA, charACodePoint, charADesc, charB, isXid
 */
protected static function parseRow($row)
{
    // pad to three fields so array_combine() always gets matching counts
    $parts = \array_pad(\array_map('trim', \explode(';', $row, 3)), 3, '');
    $parts = \array_combine(array('charACodePoint', 'charBCodePoint', 'comment'), $parts);

    $parts['charACodePoint'] = self::normalizeCodePoints($parts['charACodePoint']);
    $parts['charBCodePoint'] = self::normalizeCodePoints($parts['charBCodePoint']);

    // $matches stays empty when the comment does not fit the expected layout
    $matches = array();
    \preg_match('/^(?P<category>\w+)\t#(?P<notXid>\*?)\s*(?P<example>\(.*?\))\s*(?P<charADesc>.*?) → (?P<charBDesc>.*?)(\s+#.*)?$/u', $parts['comment'], $matches);

    return array(
        'charA' => self::codePointsToString($parts['charACodePoint']),
        'charACodePoint' => $parts['charACodePoint'],
        'charADesc' => isset($matches['charADesc']) ? $matches['charADesc'] : null,

        'charB' => self::codePointsToString($parts['charBCodePoint']),
        // the "*" flag after "#" marks the row as not-XID
        'isXid' => empty($matches['notXid']),
    );
}

/**
 * Strip leading "00" pairs from each space-separated hex code point
 *
 * @param string $codePoints space-separated hex code points
 *
 * @return string
 */
private static function normalizeCodePoints($codePoints)
{
    return \implode(' ', \array_map(static function ($codePoint) {
        return \preg_replace('/^(00)+/', '', $codePoint);
    }, \explode(' ', $codePoints)));
}

/**
 * Convert space-separated hex code points to the UTF-8 string they encode
 *
 * @param string $codePoints space-separated hex code points
 *
 * @return string
 */
private static function codePointsToString($codePoints)
{
    return \implode('', \array_map(static function ($hex) {
        return \mb_chr(\hexdec($hex), 'UTF-8');
    }, \explode(' ', $codePoints)));
}
147+
/**
 * Render a value as PHP source, tidied up from var_export()
 *
 * - "array (" becomes "array("
 * - a nested array is pulled up onto the line of its "=>"
 * - leading whitespace of every line is doubled
 * - var_export's concatenated NUL-byte notation becomes "\x00"
 *
 * @param mixed $val Value to export
 *
 * @return string
 */
protected static function varExportPretty($val)
{
    $exported = \str_replace('array (', 'array(', \var_export($val, true));
    $exported = \preg_replace('/=> \n\s+array/', '=> array', $exported);

    $doubleIndent = static function ($matches) {
        return $matches[1] . $matches[1];
    };
    $exported = \preg_replace_callback('/^(\s*)/m', $doubleIndent, $exported);

    return \str_replace('\'\' . "\0" . \'\'', '"\x00"', $exported);
}
166166}
0 commit comments