Skip to content

Commit d9ac3f7

Browse files
committed
Document token map.
1 parent 26f36ae commit d9ac3f7

2 files changed

Lines changed: 167 additions & 24 deletions

File tree

src/wp-includes/class-wp-token-map.php

Lines changed: 160 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@
3434
* '😕' === $smilies->read_token( 'Not sure :?.', 9, $bytes_skipped );
3535
* 2 === $bytes_skipped;
3636
*
37-
* $php = $smilies->precomputed_php_source_table();
38-
* // Value of $php.
37+
* echo $smilies->precomputed_php_source_table( ' ' );
38+
* // Output.
3939
* WP_Token_Map::from_precomputed_table(
4040
* 2,
4141
* array(),
@@ -48,13 +48,17 @@
4848
class WP_Token_Map {
4949
/**
5050
* Maximum length for each key and each transformed value in the table (in bytes).
51+
*
52+
* @since 6.6.0
5153
*/
5254
const MAX_LENGTH = 256;
5355

5456
/**
5557
* How many bytes of each key are used to form a group key for lookup.
5658
* This also determines whether a word is considered short or long.
5759
*
60+
* @since 6.6.0
61+
*
5862
* @var int
5963
*/
6064
private $key_length = 2;
@@ -88,6 +92,8 @@ class WP_Token_Map {
8892
* This lookup data structure is designed to optimize cache locality and
8993
* minimize indirect memory reads when matching strings in the set.
9094
*
95+
* @since 6.6.0
96+
*
9197
* @var array
9298
*/
9399
private $large_words = array();
@@ -104,6 +110,8 @@ class WP_Token_Map {
104110
* // Stores array( 'GT', 'LT', 'gt', 'lt' ).
105111
* "GT\x00LT\x00gt\x00lt\x00"
106112
*
113+
* @since 6.6.0
114+
*
107115
* @var string
108116
*/
109117
private $small_words = '';
@@ -119,6 +127,8 @@ class WP_Token_Map {
119127
*
120128
* array( '>', '<', '>', '<' )
121129
*
130+
* @since 6.6.0
131+
*
122132
* @var string[]
123133
*/
124134
private $small_mappings = array();
@@ -135,9 +145,12 @@ class WP_Token_Map {
135145
* ':?' => '😕',
136146
* ) );
137147
*
148+
* @since 6.6.0
149+
*
138150
* @param array $mappings The keys transform into the values, both are strings.
139151
* @param int $key_length Determines the group key length. Leave at the default value
140152
* of 2 unless there's an empirical reason to change it.
153+
*
141154
* @return WP_Token_Map|null Token map, unless unable to create it.
142155
*/
143156
public static function from_array( $mappings, $key_length = 2 ) {
@@ -197,6 +210,22 @@ public static function from_array( $mappings, $key_length = 2 ) {
197210
return $map;
198211
}
199212

213+
/**
214+
* Creates a token map from a pre-computed table.
215+
* This skips the initialization cost of generating the table.
216+
*
217+
* This function should only be used to load data created with
218+
* WP_Token_Map::precomputed_php_source_table().
219+
*
220+
* @since 6.6.0
221+
*
222+
* @param int $key_length Group key length.
223+
* @param array $large_words Large word groups and packed strings.
224+
* @param string $small_words Small words packed string.
225+
* @param array $small_mappings Small word mappings.
226+
*
227+
* @return WP_Token_Map Map with precomputed data loaded.
228+
*/
200229
public static function from_precomputed_table( $key_length, $large_words, $small_words, $small_mappings ) {
201230
$map = new WP_Token_Map();
202231

@@ -208,6 +237,19 @@ public static function from_precomputed_table( $key_length, $large_words, $small
208237
return $map;
209238
}
210239

240+
/**
241+
* Indicates if a given word is a lookup key in the map.
242+
*
243+
* Example:
244+
*
245+
* true === $smilies->contains( ':)' );
246+
* false === $smilies->contains( 'simile' );
247+
*
248+
* @since 6.6.0
249+
*
250+
* @param string $word Determine if this word is a lookup key in the map.
251+
* @return bool Whether there's an entry for the given word in the map.
252+
*/
211253
public function contains( $word ) {
212254
if ( $this->key_length >= strlen( $word ) ) {
213255
$word_at = strpos( $this->small_words, str_pad( $word, $this->key_length + 1, "\x00" ), STR_PAD_RIGHT );
@@ -246,7 +288,48 @@ public function contains( $word ) {
246288
return false;
247289
}
248290

249-
public function read_token( $text, $offset = 0, &$skip_bytes ) {
291+
/**
292+
* If the text starting at a given offset is a lookup key in the map,
293+
* return the corresponding transformation from the map, else `false`.
294+
*
295+
* This function returns the translated string, but accepts an optional
296+
* parameter `$skip_bytes` which communicates how many bytes long the
297+
* lookup key was, if it found one. This can be used to advance a cursor
298+
* in calling code if a lookup key was found.
299+
*
300+
* Example:
301+
*
302+
* false === $smilies->read_token( 'Not sure :?.', 0, $bytes_skipped );
303+
* '😕' === $smilies->read_token( 'Not sure :?.', 9, $bytes_skipped );
304+
* 2 === $bytes_skipped;
305+
*
306+
* Example:
307+
*
308+
* while ( $at < strlen( $input ) ) {
309+
* $next_at = strpos( $input, ':', $at );
310+
* if ( false === $next_at ) {
311+
* break;
312+
* }
313+
*
314+
* $smily = $smilies->read_token( $input, $next_at, $bytes_skipped );
315+
* if ( false === $smily ) {
316+
* ++$at;
317+
* continue;
318+
* }
319+
*
320+
* $prefix = substr( $input, $at, $next_at - $at );
321+
* $at += $bytes_skipped;
322+
* $output .= "{$prefix}{$smily}";
323+
* }
324+
*
325+
* @since 6.6.0
326+
*
327+
* @param string $text String in which to search for a lookup key.
328+
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
329+
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
330+
* @return string|false Mapped value of lookup key if found, otherwise `false`.
331+
*/
332+
public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
250333
$text_length = strlen( $text );
251334

252335
// Search for a long word first, if the text is long enough, and if that fails, a short one.
@@ -297,42 +380,77 @@ public function read_token( $text, $offset = 0, &$skip_bytes ) {
297380
return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
298381
}
299382

383+
/**
384+
* Exports the token map into an associative array of key/value pairs.
385+
*
386+
* Example:
387+
*
388+
* $smilies->to_array() === array(
389+
* '8O' => '😯',
390+
* ':(' => '🙁',
391+
* ':)' => '🙂',
392+
* ':?' => '😕',
393+
* );
394+
*
395+
* @return array The lookup key/substitution values as an associative array.
396+
*/
300397
public function to_array() {
301398
$tokens = array();
302399

303400
$at = 0;
304401
$small_mapping = 0;
305402
$small_length = strlen( $this->small_words );
306403
while ( $at < $small_length ) {
307-
$token = array();
308-
309-
$token[] = rtrim( substr( $this->small_words, $at, $this->key_length + 1 ), "\x00" );
310-
$token[] = $this->small_mappings[ $small_mapping++ ];
311-
$tokens[] = $token;
404+
$key = rtrim( substr( $this->small_words, $at, $this->key_length + 1 ), "\x00" );
405+
$value = $this->small_mappings[ $small_mapping++ ];
406+
$tokens[ $key ] = $value;
312407

313408
$at += $this->key_length + 1;
314409
}
315410

316411
foreach ( $this->large_words as $prefix => $group ) {
317-
$at = 0;
318-
while ( $at < strlen( $group ) ) {
319-
$token = array();
320-
321-
$length = unpack( 'C', $group[ $at++ ] )[1];
322-
$token[] = $prefix . substr( $group, $at, $length );
412+
$group_length = strlen( $group );
413+
$at = 0;
414+
while ( $at < $group_length ) {
415+
$length = unpack( 'C', $group[ $at++ ] )[1];
416+
$key = $prefix . substr( $group, $at, $length );
323417

324-
$at += $length;
325-
$length = unpack( 'C', $group[ $at++ ] )[1];
326-
$token[] = substr( $group, $at, $length );
418+
$at += $length;
419+
$length = unpack( 'C', $group[ $at++ ] )[1];
420+
$value = substr( $group, $at, $length );
327421

328-
$tokens[] = $token;
329-
$at += $length;
422+
$tokens[ $key ] = $value;
423+
$at += $length;
330424
}
331425
}
332426

333427
return $tokens;
334428
}
335429

430+
/**
431+
* Export the token map for quick loading in PHP source code.
432+
*
433+
* This function has a specific purpose, to make loading of static token maps fast.
434+
* It's used to ensure that the HTML character reference lookups add a minimal cost
435+
* to initializing the PHP process.
436+
*
437+
* Example:
438+
*
439+
* echo $smilies->precomputed_php_source_table( ' ' );
440+
*
441+
* // Output.
442+
* WP_Token_Map::from_precomputed_table(
443+
* 2,
444+
* array(),
445+
* "8O\x00:)\x00:(\x00:?\x00",
446+
* array( "😯", "🙂", "🙁", "😕" )
447+
* );
448+
*
449+
* @since 6.6.0
450+
*
451+
* @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character.
452+
* @return string Value which can be pasted into a PHP source file for quick loading of table.
453+
*/
336454
public function precomputed_php_source_table( $indent = "\t" ) {
337455
$i1 = $indent;
338456
$i2 = $indent . $indent;
@@ -345,10 +463,11 @@ public function precomputed_php_source_table( $indent = "\t" ) {
345463
sort( $prefixes );
346464
foreach ( $prefixes as $prefix ) {
347465
$group = $this->large_words[ $prefix ];
466+
$group_length = strlen( $group );
348467
$comment_line = "{$i2}//";
349468
$data_line = "{$i2}'{$prefix}' => \"";
350469
$at = 0;
351-
while ( $at < strlen( $group ) ) {
470+
while ( $at < $group_length ) {
352471
$token_length = unpack( 'C', $group[ $at++ ] )[1];
353472
$token = substr( $group, $at, $token_length );
354473
$at += $token_length;
@@ -361,16 +480,18 @@ public function precomputed_php_source_table( $indent = "\t" ) {
361480

362481
$mapping = preg_replace_callback(
363482
"~[\\x00-\\x1f\\x22\\x5c]~",
364-
static function ( $match ) {
365-
switch ( $match[0] ) {
483+
static function ( $match_result ) {
484+
switch ( $match_result[0] ) {
366485
case '"':
367486
return '\\"';
368487

369488
case '\\':
370489
return '\\\\';
490+
491+
default:
492+
$hex = dechex( ord( $match_result[0] ) );
493+
return "\\x{$hex}";
371494
}
372-
$hex = dechex( ord( $match[0] ) );
373-
return "\\x{$hex}";
374495
},
375496
$mapping
376497
);
@@ -408,6 +529,21 @@ static function ( $match ) {
408529
return $output;
409530
}
410531

532+
/**
533+
* Compares two strings for sorting, placing the longest first, or whichever
534+
* is first alphabetically if they are the same length.
535+
*
536+
* This is an important sort when building the token map because
537+
* it should not form a match on a substring of a longer potential
538+
* match. For example, it should not detect `Cap` when matching
539+
* against the string `CapitalDifferentialD`.
540+
*
541+
* @since 6.6.0
542+
*
543+
* @param string $a First string to compare.
544+
* @param string $b Second string to compare.
545+
* @return int -1 if `$a` is less than `$b`; 1 if `$a` is greater than `$b`, and 0 if they are equal.
546+
*/
411547
private static function longest_first_then_alphabetical( $a, $b ) {
412548
if ( $a[0] === $b[0] ) {
413549
return 0;

src/wp-includes/html-api/html5-named-character-entities.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
<?php
22

3+
/**
4+
* Auto-generated class for looking up HTML named character references.
5+
*
6+
* @package WordPress
7+
* @since 6.6.0
8+
*/
9+
310
// phpcs:disable
411

512
global $html5_named_character_entity_set;

0 commit comments

Comments
 (0)