3434 * '😕' === $smilies->read_token( 'Not sure :?.', 9, $bytes_skipped );
3535 * 2 === $bytes_skipped;
3636 *
37- * $php = $smilies->precomputed_php_source_table();
38- * // Value of $php .
37+ * echo $smilies->precomputed_php_source_table( ' ' );
38+ * // Output .
3939 * WP_Token_Map::from_precomputed_table(
4040 * 2,
4141 * array(),
4848class WP_Token_Map {
4949 /**
5050 * Maximum length for each key and each transformed value in the table (in bytes).
51+ *
52+ * @since 6.6.0
5153 */
5254 const MAX_LENGTH = 256 ;
5355
5456 /**
5557 * How many bytes of each key are used to form a group key for lookup.
5658 * This also determines whether a word is considered short or long.
5759 *
60+ * @since 6.6.0
61+ *
5862 * @var int
5963 */
6064 private $ key_length = 2 ;
@@ -88,6 +92,8 @@ class WP_Token_Map {
8892 * This lookup data structure is designed to optimize cache locality and
8993 * minimize indirect memory reads when matching strings in the set.
9094 *
95+ * @since 6.6.0
96+ *
9197 * @var array
9298 */
9399 private $ large_words = array ();
@@ -104,6 +110,8 @@ class WP_Token_Map {
104110 * // Stores array( 'GT', 'LT', 'gt', 'lt' ).
105111 * "GT\x00LT\x00gt\x00lt\x00"
106112 *
113+ * @since 6.6.0
114+ *
107115 * @var string
108116 */
109117 private $ small_words = '' ;
@@ -119,6 +127,8 @@ class WP_Token_Map {
119127 *
120128 * array( '>', '<', '>', '<' )
121129 *
130+ * @since 6.6.0
131+ *
122132 * @var string[]
123133 */
124134 private $ small_mappings = array ();
@@ -135,9 +145,12 @@ class WP_Token_Map {
135145 * ':?' => '😕',
136146 * ) );
137147 *
148+ * @since 6.6.0
149+ *
138150 * @param array $mappings The keys transform into the values, both are strings.
139151 * @param int $key_length Determines the group key length. Leave at the default value
140152 * of 2 unless there's an empirical reason to change it.
153+ *
141154 * @return WP_Token_Map|null Token map, unless unable to create it.
142155 */
143156 public static function from_array ( $ mappings , $ key_length = 2 ) {
@@ -197,6 +210,22 @@ public static function from_array( $mappings, $key_length = 2 ) {
197210 return $ map ;
198211 }
199212
213+ /**
214+ * Creates a token map from a pre-computed table.
215+ * This skips the initialization cost of generating the table.
216+ *
217+ * This function should only be used to load data created with
218+ * WP_Token_Map::precomputed_php_source_table().
219+ *
220+ * @since 6.6.0
221+ *
222+ * @param int $key_length Group key length.
223+ * @param array $large_words Large word groups and packed strings.
224+ * @param string $small_words Small words packed string.
225+ * @param array $small_mappings Small word mappings.
226+ *
227+ * @return WP_Token_Map Map with precomputed data loaded.
228+ */
200229 public static function from_precomputed_table ( $ key_length , $ large_words , $ small_words , $ small_mappings ) {
201230 $ map = new WP_Token_Map ();
202231
@@ -208,6 +237,19 @@ public static function from_precomputed_table( $key_length, $large_words, $small
208237 return $ map ;
209238 }
210239
240+ /**
241+ * Indicates if a given word is a lookup key in the map.
242+ *
243+ * Example:
244+ *
245+ * true === $smilies->contains( ':)' );
246+ * false === $smilies->contains( 'simile' );
247+ *
248+ * @since 6.6.0
249+ *
250+ * @param string $word Determine if this word is a lookup key in the map.
251+ * @return bool Whether there's an entry for the given word in the map.
252+ */
211253 public function contains ( $ word ) {
212254 if ( $ this ->key_length >= strlen ( $ word ) ) {
213255 $ word_at = strpos ( $ this ->small_words , str_pad ( $ word , $ this ->key_length + 1 , "\x00" ), STR_PAD_RIGHT );
@@ -246,7 +288,48 @@ public function contains( $word ) {
246288 return false ;
247289 }
248290
249- public function read_token ( $ text , $ offset = 0 , &$ skip_bytes ) {
291+ /**
292+ * If the text starting at a given offset is a lookup key in the map,
293+ * return the corresponding transformation from the map, else `false`.
294+ *
295+ * This function returns the translated string, but accepts an optional
296+ * parameter `$skip_bytes` which communicates how many bytes long the
297+ * lookup key was, if it found one. This can be used to advance a cursor
298+ * in calling code if a lookup key was found.
299+ *
300+ * Example:
301+ *
302+ * false === $smilies->read_token( 'Not sure :?.', 0, $bytes_skipped );
303+ * '😕' === $smilies->read_token( 'Not sure :?.', 9, $bytes_skipped );
304+ * 2 === $bytes_skipped;
305+ *
306+ * Example:
307+ *
308+ * while ( $at < strlen( $input ) ) {
309+ * $next_at = strpos( $input, ':', $at );
310+ * if ( false === $next_at ) {
311+ * break;
312+ * }
313+ *
314+ * $smily = $smilies->read_token( $input, $next_at, $bytes_skipped );
315+ * if ( false === $smily ) {
316+ * ++$at;
317+ * continue;
318+ * }
319+ *
320+ * $prefix = substr( $input, $at, $next_at - $at );
321+ * $at += $bytes_skipped;
322+ * $output .= "{$prefix}{$smily}";
323+ * }
324+ *
325+ * @since 6.6.0
326+ *
327+ * @param string $text String in which to search for a lookup key.
328+ * @param ?int $offset How many bytes into the string where the lookup key ought to start.
329+ * @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
330+ * @return string|false Mapped value of lookup key if found, otherwise `false`.
331+ */
332+ public function read_token ( $ text , $ offset = 0 , &$ skip_bytes = null ) {
250333 $ text_length = strlen ( $ text );
251334
252335 // Search for a long word first, if the text is long enough, and if that fails, a short one.
@@ -297,42 +380,77 @@ public function read_token( $text, $offset = 0, &$skip_bytes ) {
297380 return $ this ->small_mappings [ $ at / ( $ this ->key_length + 1 ) ];
298381 }
299382
383+ /**
384+ * Exports the token map into an associative array of key/value pairs.
385+ *
386+ * Example:
387+ *
388+ * $smilies->to_array() === array(
389+ * '8O' => '😯',
390+ * ':(' => '🙁',
391+ * ':)' => '🙂',
392+ * ':?' => '😕',
393+ * );
394+ *
395+ * @return array The lookup key/substitution values as an associative array.
396+ */
300397 public function to_array () {
301398 $ tokens = array ();
302399
303400 $ at = 0 ;
304401 $ small_mapping = 0 ;
305402 $ small_length = strlen ( $ this ->small_words );
306403 while ( $ at < $ small_length ) {
307- $ token = array ();
308-
309- $ token [] = rtrim ( substr ( $ this ->small_words , $ at , $ this ->key_length + 1 ), "\x00" );
310- $ token [] = $ this ->small_mappings [ $ small_mapping ++ ];
311- $ tokens [] = $ token ;
404+ $ key = rtrim ( substr ( $ this ->small_words , $ at , $ this ->key_length + 1 ), "\x00" );
405+ $ value = $ this ->small_mappings [ $ small_mapping ++ ];
406+ $ tokens [ $ key ] = $ value ;
312407
313408 $ at += $ this ->key_length + 1 ;
314409 }
315410
316411 foreach ( $ this ->large_words as $ prefix => $ group ) {
317- $ at = 0 ;
318- while ( $ at < strlen ( $ group ) ) {
319- $ token = array ();
320-
321- $ length = unpack ( 'C ' , $ group [ $ at ++ ] )[1 ];
322- $ token [] = $ prefix . substr ( $ group , $ at , $ length );
412+ $ group_length = strlen ( $ group );
413+ $ at = 0 ;
414+ while ( $ at < $ group_length ) {
415+ $ length = unpack ( 'C ' , $ group [ $ at ++ ] )[1 ];
416+ $ key = $ prefix . substr ( $ group , $ at , $ length );
323417
324- $ at += $ length ;
325- $ length = unpack ( 'C ' , $ group [ $ at ++ ] )[1 ];
326- $ token [] = substr ( $ group , $ at , $ length );
418+ $ at += $ length ;
419+ $ length = unpack ( 'C ' , $ group [ $ at ++ ] )[1 ];
420+ $ value = substr ( $ group , $ at , $ length );
327421
328- $ tokens [] = $ token ;
329- $ at += $ length ;
422+ $ tokens [ $ key ] = $ value ;
423+ $ at += $ length ;
330424 }
331425 }
332426
333427 return $ tokens ;
334428 }
335429
430+ /**
431+ * Export the token map for quick loading in PHP source code.
432+ *
433+ * This function has a specific purpose, to make loading of static token maps fast.
434+ * It's used to ensure that the HTML character reference lookups add a minimal cost
435+ * to initializing the PHP process.
436+ *
437+ * Example:
438+ *
439+ * echo $smilies->precomputed_php_source_table( ' ' );
440+ *
441+ * // Output.
442+ * WP_Token_Map::from_precomputed_table(
443+ * 2,
444+ * array(),
445+ * "8O\x00:)\x00:(\x00:?\x00",
446+ * array( "😯", "🙂", "🙁", "😕" )
447+ * );
448+ *
449+ * @since 6.6.0
450+ *
451+ * @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character.
452+ * @return string Value which can be pasted into a PHP source file for quick loading of table.
453+ */
336454 public function precomputed_php_source_table ( $ indent = "\t" ) {
337455 $ i1 = $ indent ;
338456 $ i2 = $ indent . $ indent ;
@@ -345,10 +463,11 @@ public function precomputed_php_source_table( $indent = "\t" ) {
345463 sort ( $ prefixes );
346464 foreach ( $ prefixes as $ prefix ) {
347465 $ group = $ this ->large_words [ $ prefix ];
466+ $ group_length = strlen ( $ group );
348467 $ comment_line = "{$ i2 }// " ;
349468 $ data_line = "{$ i2 }' {$ prefix }' => \"" ;
350469 $ at = 0 ;
351- while ( $ at < strlen ( $ group ) ) {
470+ while ( $ at < $ group_length ) {
352471 $ token_length = unpack ( 'C ' , $ group [ $ at ++ ] )[1 ];
353472 $ token = substr ( $ group , $ at , $ token_length );
354473 $ at += $ token_length ;
@@ -361,16 +480,18 @@ public function precomputed_php_source_table( $indent = "\t" ) {
361480
362481 $ mapping = preg_replace_callback (
363482 "~[ \\x00- \\x1f \\x22 \\x5c]~ " ,
364- static function ( $ match ) {
365- switch ( $ match [0 ] ) {
483+ static function ( $ match_result ) {
484+ switch ( $ match_result [0 ] ) {
366485 case '" ' :
367486 return '\\" ' ;
368487
369488 case '\\' :
370489 return '\\\\' ;
490+
491+ default :
492+ $ hex = dechex ( ord ( $ match_result [0 ] ) );
493+ return "\\x {$ hex }" ;
371494 }
372- $ hex = dechex ( ord ( $ match [0 ] ) );
373- return "\\x {$ hex }" ;
374495 },
375496 $ mapping
376497 );
@@ -408,6 +529,21 @@ static function ( $match ) {
408529 return $ output ;
409530 }
410531
532+ /**
533+ * Compares two strings, returning the longest, or whichever
534+ * is first alphabetically if they are the same length.
535+ *
536+ * This is an important sort when building the token map because
537+ * it should not form a match on a substring of a longer potential
538+ * match. For example, it should not detect `Cap` when matching
539+ * against the string `CapitalDifferentialD`.
540+ *
541+ * @since 6.6.0
542+ *
543+ * @param string $a First string to compare.
544+ * @param string $b Second string to compare.
545+ * @return int -1 if `$a` is less than `$b`; 1 if `$a` is greater than `$b`, and 0 if they are equal.
546+ */
411547 private static function longest_first_then_alphabetical ( $ a , $ b ) {
412548 if ( $ a [0 ] === $ b [0 ] ) {
413549 return 0 ;
0 commit comments