add docs, remove is_string

dmsnell · dmsnell · commit a99fda866cfe · 2024-05-01T16:23:15.000-07:00
diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php
@@ -43,6 +43,53 @@
  *          array( "😯", "🙂", "🙁", "😕" )
  *      );
  *
+ * ## Determining Key Length.
+ *
+ * The key length should be chosen based on the data being stored in the token map.
+ * It should divide the tokens into groups as evenly as possible, but should not create
+ * so many groups that a large fraction of the groups contain only a single token.
+ *
+ * For the HTML5 named character references, a key length of 2 was found to provide a
+ * sufficient spread and should be a good default for relatively large sets of tokens.
+ *
+ * However, for some data sets a key length of 2 might be too long. For example, a list
+ * of smilies may be too small to benefit from it, and a key length of 1 might be more
+ * appropriate. It's best to experiment and determine empirically which value works best.
+ *
+ * ## Generating Precomputed Source Code.
+ *
+ * Since the `WP_Token_Map` is designed for relatively static lookups, it can be
+ * advantageous to precompute the values and instantiate a table that has already
+ * sorted and grouped the tokens and built the lookup strings.
+ *
+ * This can be done with `WP_Token_Map::precomputed_php_source_table()`.
+ *
+ * Note that if there is a leading character that all tokens share, such as `&` for
+ * HTML named character references, it can be beneficial to exclude this from the
+ * token map. Instead, find occurrences of the leading character and then use the
+ * token map to see if the following characters complete the token.
+ *
+ * Example:
+ *
+ *     $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭' ) );
+ *     echo $map->precomputed_php_source_table();
+ *     // Output
+ *     WP_Token_Map::from_precomputed_table(
+ *         2,
+ *         "si\x00so\x00",
+ *         array(
+ *                 // simple_smile:[🙂].
+ *                 "\x0bmple_smile:\x04🙂",
+ *                 // sob:[😭].
+ *                 "\x02b:\x04😭",
+ *         ),
+ *         "",
+ *         array()
+ *     );
+ *
+ * This precomputed value can be stored directly in source code and will skip the
+ * startup cost of generating the lookup strings. See `$html5_named_character_entities`.
+ *
  * @since 6.6.0
  */
 class WP_Token_Map {
@@ -164,7 +211,7 @@ public static function from_array( $mappings, $key_length = 2 ) {
 		$groups = array();
 		$shorts = array();
 		foreach ( $mappings as $word => $mapping ) {
-			if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) {
+			if ( self::MAX_LENGTH <= strlen( $word ) ) {
 				return null;
 			}