|
43 | 43 | * array( "😯", "🙂", "🙁", "😕" ) |
44 | 44 | * ); |
45 | 45 | * |
| 46 | + * ## Determining Key Length. |
| 47 | + * |
| 48 | + * The key length should be chosen based on the data being stored in the token map. |
| 49 | + * It should divide the data as evenly as possible, but should not create so many |
| 50 | + * groups that a large fraction of the groups contain only a single token. |
| 51 | + * |
| 52 | + * For the HTML5 named character references, a key length of 2 was found to provide a |
| 53 | + * sufficient spread and should be a good default for relatively large sets of tokens. |
| 54 | + * |
| 55 | + * However, for some data sets this might be too long. For example, a list of smilies |
| 56 | + * may be too small for a key length of 2. Perhaps 1 would be more appropriate. It's |
| 57 | + * best to experiment and determine empirically which values are appropriate. |
| 58 | + * |
| 59 | + * ## Generate Pre-Computed Source Code. |
| 60 | + * |
| 61 | + * Since the `WP_Token_Map` is designed for relatively static lookups, it can be |
| 62 | + * advantageous to precompute the values and instantiate a table that has already |
| 63 | + * sorted and grouped the tokens and built the lookup strings. |
| 64 | + * |
| 65 | + * This can be done with `WP_Token_Map::precomputed_php_source_table()`. |
| 66 | + * |
| 67 | + * Note that if there is a leading character that all tokens need, such as `&` for |
| 68 | + * HTML named character references, it can be beneficial to exclude this from the |
| 69 | + * token map. Instead, find occurrences of the leading character and then use the |
| 70 | + * token map to see if the following characters complete the token. |
| 71 | + * |
| 72 | + * Example: |
| 73 | + * |
| 74 | + * $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭' ) ); |
| 75 | + * echo $map->precomputed_php_source_table(); |
| 76 | + * // Output |
| 77 | + * WP_Token_Map::from_precomputed_table( |
| 78 | + * 2, |
| 79 | + * "si\x00so\x00", |
| 80 | + * array( |
| 81 | + * // simple_smile:[🙂]. |
| 82 | + * "\x0bmple_smile:\x04🙂", |
| 83 | + * // sob:[😭]. |
| 84 | + * "\x02b:\x04😭", |
| 85 | + * ), |
| 86 | + * "", |
| 87 | + * array() |
| 88 | + * ); |
| 89 | + * |
| 90 | + * This precomputed value can be stored directly in source code and will skip the |
| 91 | + * startup cost of generating the lookup strings. See `$html5_named_character_entities`. |
| 92 | + * |
46 | 93 | * @since 6.6.0 |
47 | 94 | */ |
48 | 95 | class WP_Token_Map { |
@@ -164,7 +211,7 @@ public static function from_array( $mappings, $key_length = 2 ) { |
164 | 211 | $groups = array(); |
165 | 212 | $shorts = array(); |
166 | 213 | foreach ( $mappings as $word => $mapping ) { |
167 | | - if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) { |
| 214 | + if ( self::MAX_LENGTH <= strlen( $word ) ) { |
168 | 215 | return null; |
169 | 216 | } |
170 | 217 |
|
|
0 commit comments