|
| 1 | +<?php |
| 2 | + |
/**
 * Compact set of byte-string tokens supporting membership tests and
 * longest-match lookup of a token at a given offset inside a text.
 *
 * Words no longer than the key length ("short words") are packed into one
 * string of fixed-width, NUL-padded entries. Longer words are grouped by their
 * first `key_length` bytes; each group is a string of length-prefixed
 * remainders. Because short words are NUL-padded, a short word ending in a NUL
 * byte cannot be told apart from the same word without it.
 */
class WP_Token_Set {
	/**
	 * Words must be shorter than this many bytes, so that the length of any
	 * remainder fits in the single length byte used inside a group.
	 */
	const MAX_LENGTH = 256;

	/**
	 * Number of leading bytes used as the group key for long words, and the
	 * width of every entry in the short-word string.
	 *
	 * @var int
	 */
	private $key_length = 2;

	/**
	 * Stores an optimized form of the word set, where long words are grouped
	 * by their first `key_length` bytes. Each value is a string made of
	 * repeated "one length byte + remainder of the word" records, ordered
	 * longest remainder first.
	 *
	 * @var array
	 */
	private $large_words = array();

	/**
	 * Stores an optimized row of short words, where every entry is exactly
	 * `key_length` bytes long, right-padded with NUL bytes when the word
	 * itself is shorter.
	 *
	 * @var string
	 */
	private $small_words = '';

	/**
	 * Builds a token set from a list of words.
	 *
	 * Duplicate words are stored only once.
	 *
	 * @param array $words      Words to store; each must be a string shorter
	 *                          than MAX_LENGTH bytes.
	 * @param int   $key_length Optional. Group-key width in bytes. Default 2.
	 * @return WP_Token_Set|null The new set, or null if any word is invalid.
	 */
	public static function from_array( $words, $key_length = 2 ) {
		$set             = new WP_Token_Set();
		$set->key_length = $key_length;

		// Start by grouping words.

		$groups = array();
		$shorts = array();
		foreach ( $words as $word ) {
			if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) {
				return null;
			}

			$length = strlen( $word );

			if ( $key_length >= $length ) {
				$shorts[] = $word;
			} else {
				$group = substr( $word, 0, $key_length );

				if ( ! isset( $groups[ $group ] ) ) {
					$groups[ $group ] = array();
				}

				$groups[ $group ][] = substr( $word, $key_length );
			}
		}

		// This is a set: drop duplicates, then sort longest-first, then alphabetical.

		$shorts = array_unique( $shorts );
		usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) );
		foreach ( $groups as $group_key => $group ) {
			$groups[ $group_key ] = array_unique( $group );
			usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) );
		}

		// Finally construct the optimized lookups.

		foreach ( $shorts as $word ) {
			$set->small_words .= str_pad( $word, $key_length, "\x00" );
		}

		foreach ( $groups as $group => $group_words ) {
			$group_string = '';

			foreach ( $group_words as $word ) {
				// One length byte followed by the remainder of the word.
				$group_string .= chr( strlen( $word ) ) . $word;
			}

			$set->large_words[ $group ] = $group_string;
		}

		return $set;
	}

	/**
	 * Builds a token set directly from previously computed lookup tables,
	 * such as the ones printed by precomputed_php_source_table().
	 *
	 * The tables are stored as given; they are not validated.
	 *
	 * @param int    $key_length  Group-key width in bytes.
	 * @param array  $large_words Map of group key to packed group string.
	 * @param string $small_words Packed, NUL-padded short words.
	 * @return WP_Token_Set The new set.
	 */
	public static function from_precomputed_table( $key_length, $large_words, $small_words ) {
		$set = new WP_Token_Set();

		$set->key_length  = $key_length;
		$set->large_words = $large_words;
		$set->small_words = $small_words;

		return $set;
	}

	/**
	 * Indicates whether the given word is a member of the set.
	 *
	 * @param string $word Word to look for.
	 * @return bool Whether the word is in the set.
	 */
	public function contains( $word ) {
		if ( ! is_string( $word ) ) {
			return false;
		}

		if ( $this->key_length >= strlen( $word ) ) {
			return false !== $this->small_word_offset( $word );
		}

		$group_key = substr( $word, 0, $this->key_length );
		if ( ! isset( $this->large_words[ $group_key ] ) ) {
			return false;
		}

		$group        = $this->large_words[ $group_key ];
		$group_length = strlen( $group );
		$slug         = substr( $word, $this->key_length );
		$length       = strlen( $slug );
		$at           = 0;
		while ( $at < $group_length ) {
			$token_length = ord( $group[ $at++ ] );
			if ( $token_length === $length && 0 === substr_compare( $group, $slug, $at, $token_length ) ) {
				return true;
			}

			$at += $token_length;
		}

		return false;
	}

	/**
	 * Finds the longest token of the set that starts at the given byte offset
	 * of the text.
	 *
	 * Long words are tried first (they are stored longest-first inside their
	 * group); when none matches, short words are tried from the widest
	 * candidate down to a single byte.
	 *
	 * @param string $text   Text to read from.
	 * @param int    $offset Optional. Byte offset at which the token must start;
	 *                       a negative value counts from the end. Default 0.
	 * @return string|false The matched token, or false if none matches.
	 */
	public function read_token( $text, $offset = 0 ) {
		$text_length = strlen( $text );
		if ( $offset < 0 ) {
			$offset = max( 0, $text_length + $offset );
		}

		// Only the text from the offset onwards can take part in a match.
		$remaining = $text_length - $offset;
		if ( $remaining <= 0 ) {
			return false;
		}

		// Search for a long word first, if enough text remains, and if that fails, a short one.
		if ( $this->key_length < $remaining ) {
			$group_key = substr( $text, $offset, $this->key_length );

			if ( isset( $this->large_words[ $group_key ] ) ) {
				$group        = $this->large_words[ $group_key ];
				$group_length = strlen( $group );
				$slug_at      = $offset + $this->key_length;
				$slug_room    = $text_length - $slug_at;
				$at           = 0;
				while ( $at < $group_length ) {
					$token_length = ord( $group[ $at++ ] );
					$token        = substr( $group, $at, $token_length );

					if ( $token_length <= $slug_room && 0 === substr_compare( $text, $token, $slug_at, $token_length ) ) {
						return $group_key . $token;
					}

					$at += $token_length;
				}
			}
		}

		// Perhaps a short word then: try the widest candidate first.
		for ( $length = min( $this->key_length, $remaining ); $length > 0; $length-- ) {
			$at = $this->small_word_offset( substr( $text, $offset, $length ) );
			if ( false === $at ) {
				continue;
			}

			$token = rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" );
			if ( '' !== $token ) {
				return $token;
			}
		}

		return false;
	}

	/**
	 * Lists every word of the set: short words first, then long words group
	 * by group, each in stored order.
	 *
	 * @return string[] The words of the set.
	 */
	public function to_array() {
		$tokens = array();

		// A non-positive key length would never advance through the short words.
		if ( $this->key_length > 0 ) {
			$small_length = strlen( $this->small_words );
			for ( $at = 0; $at < $small_length; $at += $this->key_length ) {
				$tokens[] = rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" );
			}
		}

		foreach ( $this->large_words as $prefix => $group ) {
			$group_length = strlen( $group );
			$at           = 0;
			while ( $at < $group_length ) {
				$length = ord( $group[ $at++ ] );
				// Remainders are stored unpadded, so they are used verbatim.
				$tokens[] = $prefix . substr( $group, $at, $length );
				$at      += $length;
			}
		}

		return $tokens;
	}

	/**
	 * Renders PHP source code that recreates this set through
	 * from_precomputed_table().
	 *
	 * Each group is preceded by a comment line listing its words. Bytes that
	 * would not survive inside a PHP string literal are written as escapes.
	 *
	 * @param string $indent Optional. One level of indentation. Default one tab.
	 * @return string PHP source of a single statement ending in ";\n".
	 */
	public function precomputed_php_source_table( $indent = "\t" ) {
		$i1 = $indent;
		$i2 = $indent . $indent;

		$output  = self::class . "::from_precomputed_table(\n";
		$output .= "{$i1}{$this->key_length},\n";
		$output .= "{$i1}array(\n";

		$prefixes = array_keys( $this->large_words );
		// Numeric-looking prefixes come back as integers; order all of them as strings.
		sort( $prefixes, SORT_STRING );
		foreach ( $prefixes as $prefix ) {
			$group        = $this->large_words[ $prefix ];
			$group_length = strlen( $group );
			$prefix       = (string) $prefix;
			$safe_prefix  = self::escape_for_double_quotes( $prefix );
			$comment_line = "{$i2}//";
			$data_line    = "{$i2}" . self::quote_array_key( $prefix ) . ' => "';
			$at           = 0;
			while ( $at < $group_length ) {
				$length = ord( $group[ $at++ ] );
				// Always two hex digits, so a following hex-looking byte is not absorbed by the escape.
				$digits = str_pad( dechex( $length ), 2, '0', STR_PAD_LEFT );
				$token  = self::escape_for_double_quotes( substr( $group, $at, $length ) );
				$at    += $length;

				$comment_line .= " &{$safe_prefix}{$token}";
				$data_line    .= "\\x{$digits}{$token}";
			}
			// A closing PHP tag inside a line comment would end PHP mode.
			$comment_line = str_replace( '?>', '?\x3e', $comment_line ) . "\n";
			$data_line   .= "\",\n";

			$output .= $comment_line;
			$output .= $data_line;
		}

		$output .= "{$i1}),\n";
		// Double quotes are required: "\x00" is not an escape inside single quotes.
		$small_text = self::escape_for_double_quotes( $this->small_words );
		$output    .= "{$i1}\"{$small_text}\"\n";
		$output    .= ");\n";

		return $output;
	}

	/**
	 * Finds the byte offset of a short word inside the packed short-word
	 * string, accepting only matches that start on an entry boundary.
	 *
	 * @param string $word Word of at most `key_length` bytes.
	 * @return int|false Offset of the matching entry, or false if absent.
	 */
	private function small_word_offset( $word ) {
		if ( $this->key_length < 1 || '' === $this->small_words ) {
			return false;
		}

		$needle = str_pad( $word, $this->key_length, "\x00" );
		$at     = strpos( $this->small_words, $needle );
		while ( false !== $at ) {
			if ( 0 === $at % $this->key_length ) {
				return $at;
			}

			// Straddles two entries: keep looking after this position.
			$at = strpos( $this->small_words, $needle, $at + 1 );
		}

		return false;
	}

	/**
	 * Escapes bytes so they can be placed between double quotes in PHP source.
	 *
	 * Backslash, double quote and dollar sign are backslash-escaped; every
	 * byte outside printable ASCII becomes a two-digit "\xHH" escape.
	 *
	 * @param string $bytes Raw bytes.
	 * @return string Escaped text, without the surrounding quotes.
	 */
	private static function escape_for_double_quotes( $bytes ) {
		$escaped = '';
		$length  = strlen( $bytes );
		for ( $i = 0; $i < $length; $i++ ) {
			$byte = $bytes[ $i ];
			$code = ord( $byte );

			if ( '\\' === $byte || '"' === $byte || '$' === $byte ) {
				$escaped .= '\\' . $byte;
			} elseif ( $code < 0x20 || $code > 0x7e ) {
				$escaped .= '\\x' . str_pad( dechex( $code ), 2, '0', STR_PAD_LEFT );
			} else {
				$escaped .= $byte;
			}
		}

		return $escaped;
	}

	/**
	 * Renders a group key as a quoted PHP string literal.
	 *
	 * Printable keys keep the single-quoted form; keys holding other bytes
	 * are written double-quoted with escapes.
	 *
	 * @param string $key Group key.
	 * @return string The literal, including its quotes.
	 */
	private static function quote_array_key( $key ) {
		if ( 1 === preg_match( '/^[\x20-\x7e]*\z/', $key ) ) {
			return "'" . addcslashes( $key, "'\\" ) . "'";
		}

		return '"' . self::escape_for_double_quotes( $key ) . '"';
	}

	/**
	 * Comparison callback ordering longer strings first and, among strings of
	 * equal length, in byte order.
	 *
	 * @param string $a First string.
	 * @param string $b Second string.
	 * @return int Negative, zero or positive, as expected by usort().
	 */
	private static function longest_first_then_alphabetical( $a, $b ) {
		if ( $a === $b ) {
			return 0;
		}

		$la = strlen( $a );
		$lb = strlen( $b );

		// Longer strings are less-than for comparison's sake.
		if ( $la !== $lb ) {
			return $lb - $la;
		}

		return strcmp( $a, $b );
	}
}
0 commit comments