1010 * @since 6.6.0
1111 */
1212class WP_HTML_Decoder {
13- public static function attribute_starts_with ( $ attribute_value , $ search_text , $ case_sensitivity ) {
14- $ length = strlen ( $ search_text );
15- $ loose_case = 'case-insensitive ' === $ case_sensitivity ;
16- $ at = 0 ;
17- $ i = 0 ;
18-
19- while ( $ i < $ length && $ at < strlen ( $ attribute_value ) ) {
13+ /**
14+ * Indicates if an attribute value starts with a given raw string value.
15+ *
16+ * Use this method to determine if an attribute value starts with a given string, regardless
17+ * of how it might be encoded in HTML. For instance, `http:` could be represented as `http:`
18+ * or as `http&colon;` or as `&#x68;ttp:` or as `h&#116;tp&colon;`, or in many other ways.
19+ *
20+ * Example:
21+ *
22+ * $value = 'http&colon;//wordpress.org/';
23+ * true === WP_HTML_Decoder::attribute_starts_with( $value, 0, null, 'http:', 'case-insensitive' );
24+ * false === WP_HTML_Decoder::attribute_starts_with( $value, 0, null, 'https:', 'case-insensitive' );
25+ *
26+ * The `$value_at` and `$value_length` parameters may be used to avoid string allocations when the
27+ * attribute value is found within a larger string.
28+ *
29+ * Example:
30+ *
31+ * $html = '<a href="https://wordpress.org" title="© 2024">';
32+ * // 1 1 2 2 3 3 4 4 5 5
33+ * // 01234567890 5 0 5 0 5 0 5 0 5
34+ *
35+ * $href = array( 9, 26 ); // At, Length
36+ * $title = array( 44, 9 ); // At, Length
37+ *
38+ * true === WP_HTML_Decoder::attribute_starts_with( $html, $href[0], $href[1], 'https://', 'case-insensitive' );
39+ * true === WP_HTML_Decoder::attribute_starts_with( $html, $title[0], $title[1], '©' );
40+ *
41+ * false === WP_HTML_Decoder::attribute_starts_with( $html, $href[0], $href[1], 'http://', 'case-insensitive' );
42+ * false === WP_HTML_Decoder::attribute_starts_with( $html, $title[0], $title[1], '&copy;' );
43+ *
44+ * @param string $raw_haystack String containing the raw non-decoded attribute value.
45+ * @param int $value_at Byte offset into the haystack at which the attribute value begins.
46+ * @param ?int $value_length How many bytes the attribute value spans.
47+ * Passing `null` indicates that the value spans to the end of the string.
48+ * @param string $search_text Does the attribute value start with this plain string.
49+ * @param ?string $case_sensitivity Set to `case-insensitive` to ignore ASCII case when matching.
50+ *
51+ * @return bool Whether the attribute value starts with the given string.
52+ */
53+ public static function attribute_starts_with ( $ raw_haystack , $ value_at , $ value_length , $ search_text , $ case_sensitivity = 'case-sensitive ' ) {
54+ $ search_length = strlen ( $ search_text );
55+ $ loose_case = 'case-insensitive ' === $ case_sensitivity ;
56+ $ haystack_end = isset ( $ value_length ) ? ( $ value_at + $ value_length ) : strlen ( $ raw_haystack );
57+ $ search_at = 0 ;
58+
59+ while ( $ search_at < $ search_length && $ value_at < $ haystack_end ) {
2060 $ chars_match = $ loose_case
21- ? strtolower ( $ attribute_value [ $ at ] ) === strtolower ( $ search_text [ $ i ] )
22- : $ attribute_value [ $ at ] === $ search_text [ $ i ];
61+ ? strtolower ( $ raw_haystack [ $ value_at ] ) === strtolower ( $ search_text [ $ search_at ] )
62+ : $ raw_haystack [ $ value_at ] === $ search_text [ $ search_at ];
2363
24- $ is_introducer = '& ' === $ attribute_value [ $ at ];
64+ $ is_introducer = '& ' === $ raw_haystack [ $ value_at ];
2565 $ next_chunk = $ is_introducer
26- ? self ::read_character_reference ( $ attribute_value , $ at , false , $ skip_bytes )
66+ ? self ::read_character_reference ( $ raw_haystack , $ value_at , false , $ skip_bytes )
2767 : false ;
2868
69+ // If there's no character reference and the characters don't match, the match fails.
2970 if ( false === $ next_chunk && ! $ chars_match ) {
3071 return false ;
3172 }
3273
74+ // If there's no character reference but the characters do match, then it could still match.
3375 if ( false === $ next_chunk && $ chars_match ) {
34- ++$ at ;
35- ++$ i ;
76+ ++$ value_at ;
77+ ++$ search_at ;
3678 continue ;
3779 }
3880
39- if ( 0 !== substr_compare ( $ search_text , $ next_chunk , $ i , strlen ( $ next_chunk ), $ loose_case ) ) {
81+ // If there is a character reference, then the decoded value must exactly match what follows in the search string.
82+ if ( 0 !== substr_compare ( $ search_text , $ next_chunk , $ search_at , strlen ( $ next_chunk ), $ loose_case ) ) {
4083 return false ;
4184 }
4285
43- $ at += $ skip_bytes ;
44- $ i += strlen ( $ next_chunk );
86+ // The character reference matched, so continue checking.
87+ $ value_at += $ skip_bytes ;
88+ $ search_at += strlen ( $ next_chunk );
4589 }
4690
4791 return true ;
@@ -263,10 +307,9 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
263307
264308 $ after_name = $ name_at + $ name_length ;
265309
266- // If we have an un-ambiguous ampersand we can safely leave it in .
310+ // If the match ended with a semicolon then it should always be decoded .
267311 if ( '; ' === $ text [ $ name_at + $ name_length - 1 ] ) {
268312 $ skip_bytes = $ after_name - $ at ;
269- // @todo bring back the WP_Token_Map so we can decode these.
270313 return $ name ;
271314 }
272315
@@ -287,7 +330,6 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
287330 // It's non-ambiguous, safe to leave it in.
288331 if ( ! $ ambiguous_follower ) {
289332 $ skip_bytes = $ after_name - $ at ;
290- // @todo Bring back WP_Token_Map to replace properly.
291333 return $ name ;
292334 }
293335
0 commit comments