Skip to content

Commit 77b3a5b

Browse files
committed
Experiment: Add optimized set lookup class
1 parent 046ce88 commit 77b3a5b

4 files changed

Lines changed: 610 additions & 2 deletions

File tree

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
<?php
2+
3+
/**
 * Compact, read-optimized set of byte-string tokens.
 *
 * Words no longer than the key length are stored side by side in one
 * fixed-width string; every other word is filed under its leading
 * `key_length` bytes, and the remainders of a group are packed into a
 * single length-prefixed string.
 *
 * Short words are padded with NUL bytes, so a short word that itself ends
 * in a NUL byte cannot be told apart from its unpadded form.
 */
class WP_Token_Set {
	/**
	 * Exclusive upper bound, in bytes, on the length of a word.
	 *
	 * The remainder of a long word is stored behind a single length byte,
	 * which cannot describe more than 255 bytes.
	 */
	const MAX_LENGTH = 256;

	/**
	 * Number of leading bytes used as the group key for long words, and
	 * the fixed width of every entry in the short-word row.
	 *
	 * @var int
	 */
	private $key_length = 2;

	/**
	 * Stores an optimized form of the word set, where words are grouped
	 * by their leading `key_length` bytes and then collapsed into a string.
	 *
	 * Each group string is a sequence of `<length byte><remainder>` records,
	 * ordered longest-first.
	 *
	 * @var array
	 */
	private $large_words = array();

	/**
	 * Stores an optimized row of short words, where every entry is
	 * `key_length` bytes long and NUL-extended if the word is shorter.
	 *
	 * @var string
	 */
	private $small_words = '';

	/**
	 * Builds a token set from a list of words.
	 *
	 * @param array $words      Words to store; each must be a string shorter
	 *                          than MAX_LENGTH bytes.
	 * @param int   $key_length Optional. Number of leading bytes used to group
	 *                          long words. Default 2.
	 * @return WP_Token_Set|null The set, or null if any word is not a string
	 *                           or is too long.
	 */
	public static function from_array( $words, $key_length = 2 ) {
		$set             = new WP_Token_Set();
		$set->key_length = $key_length;

		// Start by grouping words.

		$groups = array();
		$shorts = array();
		$seen   = array();
		foreach ( $words as $word ) {
			if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) {
				return null;
			}

			// A set holds each word once; repeated input would only bloat the tables.
			if ( isset( $seen[ $word ] ) ) {
				continue;
			}
			$seen[ $word ] = true;

			if ( $key_length >= strlen( $word ) ) {
				$shorts[] = $word;
				continue;
			}

			$group = substr( $word, 0, $key_length );
			if ( ! isset( $groups[ $group ] ) ) {
				$groups[ $group ] = array();
			}

			$groups[ $group ][] = substr( $word, $key_length );
		}

		// Sort the words by longest-first, then alphabetical.

		usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) );
		foreach ( array_keys( $groups ) as $group_key ) {
			usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) );
		}

		// Finally construct the optimized lookups.

		foreach ( $shorts as $word ) {
			$set->small_words .= str_pad( $word, $key_length, "\x00" );
		}

		foreach ( $groups as $group => $group_words ) {
			$group_string = '';

			foreach ( $group_words as $word ) {
				$group_string .= chr( strlen( $word ) ) . $word;
			}

			$set->large_words[ $group ] = $group_string;
		}

		return $set;
	}

	/**
	 * Rebuilds a token set from tables produced earlier, without re-sorting
	 * or validating them.
	 *
	 * @param int    $key_length  Group key length the tables were built with.
	 * @param array  $large_words Map of group key to packed group string.
	 * @param string $small_words Fixed-width row of NUL-padded short words.
	 * @return WP_Token_Set The set backed by the given tables.
	 */
	public static function from_precomputed_table( $key_length, $large_words, $small_words ) {
		$set = new WP_Token_Set();

		$set->key_length  = $key_length;
		$set->large_words = $large_words;
		$set->small_words = $small_words;

		return $set;
	}

	/**
	 * Reports whether the given word is a member of the set.
	 *
	 * @param string $word Word to look up.
	 * @return bool Whether the word is in the set.
	 */
	public function contains( $word ) {
		if ( $this->key_length >= strlen( $word ) ) {
			return $this->has_small_word( $word );
		}

		$group_key = substr( $word, 0, $this->key_length );
		if ( ! isset( $this->large_words[ $group_key ] ) ) {
			return false;
		}

		$group        = $this->large_words[ $group_key ];
		$group_length = strlen( $group );
		$slug         = substr( $word, $this->key_length );
		$length       = strlen( $slug );
		$at           = 0;
		while ( $at < $group_length ) {
			$token_length = ord( $group[ $at++ ] );
			if ( $token_length === $length && 0 === substr_compare( $group, $slug, $at, $token_length ) ) {
				return true;
			}

			$at += $token_length;
		}

		return false;
	}

	/**
	 * Finds the longest word in the set that starts at the given byte
	 * offset of the text.
	 *
	 * A zero-length word is never reported as a match.
	 *
	 * @param string $text   Text to read from.
	 * @param int    $offset Optional. Byte offset at which the word must start.
	 *                       Default 0.
	 * @return string|false The matched word, or false if no word starts there.
	 */
	public function read_token( $text, $offset = 0 ) {
		$text_length = strlen( $text );
		$remaining   = $text_length - $offset;

		if ( $offset < 0 || $remaining <= 0 ) {
			return false;
		}

		// Search for a long word first, if enough text remains, and if that fails, a short one.
		if ( $this->key_length < $remaining ) {
			$group_key = substr( $text, $offset, $this->key_length );

			if ( isset( $this->large_words[ $group_key ] ) ) {
				$group        = $this->large_words[ $group_key ];
				$group_length = strlen( $group );
				$suffix_at    = $offset + $this->key_length;
				$suffix_room  = $text_length - $suffix_at;
				$at           = 0;
				while ( $at < $group_length ) {
					$token_length = ord( $group[ $at++ ] );

					if ( $token_length <= $suffix_room ) {
						$token = substr( $group, $at, $token_length );

						// Groups are ordered longest-first, so the first hit is the longest.
						if ( 0 === substr_compare( $text, $token, $suffix_at, $token_length ) ) {
							return $group_key . $token;
						}
					}

					$at += $token_length;
				}
			}
		}

		// Perhaps a short word then: try the longest possible prefix first.
		for ( $length = min( $this->key_length, $remaining ); $length >= 1; $length-- ) {
			$candidate = substr( $text, $offset, $length );

			/*
			 * A candidate ending in NUL pads to the same entry as a shorter
			 * candidate, which is tried on a later pass.
			 */
			if ( "\x00" === $candidate[ $length - 1 ] ) {
				continue;
			}

			if ( $this->has_small_word( $candidate ) ) {
				return $candidate;
			}
		}

		return false;
	}

	/**
	 * Expands the set back into a list of words.
	 *
	 * Short words come first, followed by the long words of each group, in
	 * stored order.
	 *
	 * @return string[] Every word in the set.
	 */
	public function to_array() {
		$tokens = array();

		// A non-positive width would never advance the cursor.
		if ( $this->key_length > 0 ) {
			$small_length = strlen( $this->small_words );
			for ( $at = 0; $at < $small_length; $at += $this->key_length ) {
				$tokens[] = rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" );
			}
		}

		foreach ( $this->large_words as $prefix => $group ) {
			$group_length = strlen( $group );
			$at           = 0;
			while ( $at < $group_length ) {
				$length = ord( $group[ $at++ ] );

				// Long-word remainders are stored unpadded, so nothing is trimmed here.
				$tokens[] = $prefix . substr( $group, $at, $length );
				$at      += $length;
			}
		}

		return $tokens;
	}

	/**
	 * Generates PHP source for a `from_precomputed_table()` call which
	 * rebuilds this set.
	 *
	 * Each group is preceded by a comment listing the words it holds.
	 *
	 * @param string $indent Optional. One level of indentation. Default tab.
	 * @return string PHP source code, ending in `);` and a newline.
	 */
	public function precomputed_php_source_table( $indent = "\t" ) {
		$i1 = $indent;
		$i2 = $indent . $indent;

		$output  = self::class . "::from_precomputed_table(\n";
		$output .= "{$i1}{$this->key_length},\n";
		$output .= "{$i1}array(\n";

		$prefixes = array_keys( $this->large_words );
		sort( $prefixes );
		foreach ( $prefixes as $prefix ) {
			$group  = $this->large_words[ $prefix ];
			$prefix = (string) $prefix;

			// Only backslash and single quote are special inside a single-quoted literal.
			$quoted_prefix = str_replace( array( '\\', "'" ), array( '\\\\', "\\'" ), $prefix );
			$comment_line  = "{$i2}//";
			$data_line     = "{$i2}'{$quoted_prefix}' => \"";
			$group_length  = strlen( $group );
			$at            = 0;
			while ( $at < $group_length ) {
				$length = ord( $group[ $at++ ] );
				$digits = str_pad( dechex( $length ), 2, '0', STR_PAD_LEFT );
				$token  = substr( $group, $at, $length );
				$at    += $length;

				$comment_line .= ' &' . self::escape_bytes( $prefix . $token, '' );
				$data_line    .= "\\x{$digits}" . self::escape_bytes( $token, '"$\\' );
			}
			$comment_line .= "\n";
			$data_line    .= "\",\n";

			$output .= $comment_line;
			$output .= $data_line;
		}

		$output .= "{$i1}),\n";

		// Double quotes are required: `\x00` is not an escape in a single-quoted literal.
		$small_text = self::escape_bytes( $this->small_words, '"$\\' );
		$output    .= "{$i1}\"{$small_text}\"\n";
		$output    .= ");\n";

		return $output;
	}

	/**
	 * Reports whether a short word occupies one of the fixed-width slots
	 * of the short-word row.
	 *
	 * Only slot boundaries are examined, so bytes straddling two neighboring
	 * entries never count as a match.
	 *
	 * @param string $word Word of at most `key_length` bytes.
	 * @return bool Whether the word is stored as a short word.
	 */
	private function has_small_word( $word ) {
		$width = $this->key_length;
		if ( $width < 1 ) {
			return false;
		}

		$entry = str_pad( $word, $width, "\x00" );
		$end   = strlen( $this->small_words );
		for ( $at = 0; $at + $width <= $end; $at += $width ) {
			if ( 0 === substr_compare( $this->small_words, $entry, $at, $width ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Escapes bytes for embedding in generated PHP source.
	 *
	 * Control bytes become two-digit `\xNN` escapes and every byte listed
	 * in `$specials` is prefixed with a backslash; all other bytes pass
	 * through unchanged.
	 *
	 * @param string $bytes    Raw bytes to escape.
	 * @param string $specials Bytes which need a leading backslash.
	 * @return string Escaped text.
	 */
	private static function escape_bytes( $bytes, $specials ) {
		$escaped = '';
		$length  = strlen( $bytes );
		for ( $i = 0; $i < $length; $i++ ) {
			$byte = $bytes[ $i ];
			$code = ord( $byte );

			if ( $code < 0x20 || 0x7F === $code ) {
				$escaped .= sprintf( '\\x%02x', $code );
			} elseif ( false !== strpos( $specials, $byte ) ) {
				$escaped .= '\\' . $byte;
			} else {
				$escaped .= $byte;
			}
		}

		return $escaped;
	}

	/**
	 * Sort callback placing longer strings first and ordering strings of
	 * equal length by byte value.
	 *
	 * @param string $a First string.
	 * @param string $b Second string.
	 * @return int Negative, zero, or positive, as expected by `usort()`.
	 */
	private static function longest_first_then_alphabetical( $a, $b ) {
		if ( $a === $b ) {
			return 0;
		}

		$la = strlen( $a );
		$lb = strlen( $b );

		// Longer strings are less-than for comparison's sake.
		if ( $la !== $lb ) {
			return $lb - $la;
		}

		return strcmp( $a, $b );
	}
}

0 commit comments

Comments
 (0)