Skip to content

Commit 96bebe4

Browse files
committed
Add case-insensitivity and early-abort for small words when none exist.
1 parent 03adf30 commit 96bebe4

1 file changed

Lines changed: 47 additions & 21 deletions

File tree

src/wp-includes/class-wp-token-map.php

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -355,12 +355,20 @@ public static function from_precomputed_table( $key_length, $groups, $large_word
355355
*
356356
* @since 6.6.0
357357
*
358-
* @param string $word Determine if this word is a lookup key in the map.
358+
* @param string $word Determine if this word is a lookup key in the map.
359+
* @param ?string $case_sensitivity Optional. Pass 'case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
359360
* @return bool Whether there's an entry for the given word in the map.
360361
*/
361-
public function contains( $word ) {
362+
public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
363+
$ignore_case = 'case-insensitive' === $case_sensitivity;
364+
362365
if ( $this->key_length >= strlen( $word ) ) {
363-
$word_at = strpos( $this->small_words, str_pad( $word, $this->key_length + 1, "\x00" ), STR_PAD_RIGHT );
366+
if ( 0 === strlen( $this->small_words ) ) {
367+
return false;
368+
}
369+
370+
$term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT );
371+
$word_at = $ignore_case ? stripos( $this->small_words, $term ) : strpos( $this->small_words, $term );
364372
if ( false === $word_at ) {
365373
return false;
366374
}
@@ -369,7 +377,7 @@ public function contains( $word ) {
369377
}
370378

371379
$group_key = substr( $word, 0, $this->key_length );
372-
$group_at = strpos( $this->groups, $group_key );
380+
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
373381
if ( false === $group_at ) {
374382
return false;
375383
}
@@ -386,7 +394,7 @@ public function contains( $word ) {
386394
$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
387395
$mapping_at = $at;
388396

389-
if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length ) ) {
397+
if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) {
390398
return true;
391399
}
392400

@@ -432,22 +440,26 @@ public function contains( $word ) {
432440
*
433441
* @since 6.6.0
434442
*
435-
* @param string $text String in which to search for a lookup key.
436-
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
437-
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
443+
* @param string $text String in which to search for a lookup key.
444+
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
445+
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
446+
* @param ?string $case_sensitivity Optional. Pass 'case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
438447
* @return string|false Mapped value of lookup key if found, otherwise `false`.
439448
*/
440-
public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
449+
public function read_token( $text, $offset = 0, &$skip_bytes = null, $case_sensitivity = 'case-sensitive' ) {
450+
$ignore_case = 'case-insensitive' === $case_sensitivity;
441451
$text_length = strlen( $text );
442452

443453
// Search for a long word first, if the text is long enough, and if that fails, a short one.
444454
if ( $text_length > $this->key_length ) {
445455
$group_key = substr( $text, $offset, $this->key_length );
446456

447-
$group_at = strpos( $this->groups, $group_key );
457+
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
448458
if ( false === $group_at ) {
449459
// Perhaps a short word then.
450-
return $this->read_small_token( $text, $offset, $skip_bytes );
460+
return strlen( $this->small_words ) > 0
461+
? $this->read_small_token( $text, $offset, $skip_bytes, $case_sensitivity )
462+
: false;
451463
}
452464

453465
$group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ];
@@ -460,7 +472,7 @@ public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
460472
$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
461473
$mapping_at = $at;
462474

463-
if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length ) ) {
475+
if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) {
464476
$skip_bytes = $this->key_length + $token_length;
465477
return substr( $group, $mapping_at, $mapping_length );
466478
}
@@ -470,26 +482,37 @@ public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
470482
}
471483

472484
// Perhaps a short word then.
473-
return $this->read_small_token( $text, $offset, $skip_bytes );
485+
return strlen( $this->small_words ) > 0
486+
? $this->read_small_token( $text, $offset, $skip_bytes, $case_sensitivity )
487+
: false;
474488
}
475489

476490
/**
477491
* Finds a match for a short word at the index.
478492
*
479493
* @since 6.6.0.
480494
*
481-
* @param string $text String in which to search for a lookup key.
482-
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
483-
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
495+
* @param string $text String in which to search for a lookup key.
496+
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
497+
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
498+
* @param ?string $case_sensitivity Optional. Pass 'case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
484499
* @return string|false Mapped value of lookup key if found, otherwise `false`.
485500
*/
486-
private function read_small_token( $text, $offset, &$skip_bytes ) {
487-
$small_length = strlen( $this->small_words );
488-
$starting_char = $text[ $offset ];
501+
private function read_small_token( $text, $offset, &$skip_bytes, $case_sensitivity = 'case-sensitive' ) {
502+
$ignore_case = 'case-insensitive' === $case_sensitivity;
503+
$small_length = strlen( $this->small_words );
504+
$search_text = substr( $text, $offset, $this->key_length );
505+
if ( $ignore_case ) {
506+
$search_text = strtoupper( $search_text );
507+
}
508+
$starting_char = $search_text[0];
489509

490510
$at = 0;
491511
while ( $at < $small_length ) {
492-
if ( $starting_char !== $this->small_words[ $at ] ) {
512+
if (
513+
$starting_char !== $this->small_words[ $at ] &&
514+
( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char )
515+
) {
493516
$at += $this->key_length + 1;
494517
continue;
495518
}
@@ -500,7 +523,10 @@ private function read_small_token( $text, $offset, &$skip_bytes ) {
500523
return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
501524
}
502525

503-
if ( $text[ $offset + $adjust ] !== $this->small_words[ $at + $adjust ] ) {
526+
if (
527+
$search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] &&
528+
( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) )
529+
) {
504530
$at += $this->key_length + 1;
505531
continue 2;
506532
}

0 commit comments

Comments
 (0)