@@ -61,6 +61,80 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
6161 $ this ->assertSame ( "& \x00b " , $ decoded , 'Should have decoded the text without changing it. ' );
6262 }
6363
/**
 * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
 *
 * The assertions only run under a UTF-8 LC_CTYPE locale in which ctype_alnum()
 * reports the high-bit byte "\xC2" as alphanumeric; otherwise the test is skipped.
 * The LC_CTYPE locale that was active on entry is restored on every exit path.
 */
public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() {
	// Passing 0 queries the current LC_CTYPE setting without changing it.
	$previous_locale = setlocale( LC_CTYPE, 0 );

	// Candidates are tried in order; the first one the system provides is selected.
	$affected_locale = setlocale( LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8', 'de_DE.UTF-8', 'fr_FR.UTF-8' );

	if ( false === $affected_locale || ! ctype_alnum( "\xC2" ) ) {
		// Undo the locale switch before bailing out so later tests are unaffected.
		if ( false !== $previous_locale ) {
			setlocale( LC_CTYPE, $previous_locale );
		}

		$this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' );
	}

	// Legacy reference without a trailing semicolon, directly followed by U+0080 encoded as UTF-8.
	$raw_attribute = "&Aacute\xC2\x80";

	try {
		$this->assertSame(
			"\xC3\x81\xC2\x80",
			WP_HTML_Decoder::decode_attribute( $raw_attribute ),
			'Should have decoded the semicolonless legacy reference before a multibyte follower.'
		);

		$match_byte_length = null;
		$this->assertSame(
			"\xC3\x81",
			WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
			'Should have matched the semicolonless legacy reference before a multibyte follower.'
		);

		// Only the reference itself should be consumed, not the bytes that follow it.
		$this->assertSame( strlen( '&Aacute' ), $match_byte_length );
	} finally {
		if ( false !== $previous_locale ) {
			setlocale( LC_CTYPE, $previous_locale );
		}
	}
}
101+
/**
 * Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals.
 *
 * An ambiguous reference must be left untouched by the attribute decoder, must not
 * be reported as a match, and must leave the by-reference match length unmodified.
 *
 * @dataProvider data_ambiguous_ascii_attribute_followers
 *
 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
 */
public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) {
	$this->assertSame(
		$raw_attribute,
		WP_HTML_Decoder::decode_attribute( $raw_attribute ),
		'Should not have decoded an ambiguous semicolonless legacy reference.'
	);

	// A sentinel makes it detectable if the out-parameter is written to despite there being no match.
	$match_byte_length = 'sentinel';
	$this->assertNull(
		WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
		'Should not have matched an ambiguous semicolonless legacy reference.'
	);
	$this->assertSame( 'sentinel', $match_byte_length );
}
123+
/**
 * Data provider.
 *
 * Each dataset is a single raw attribute value: the legacy reference `&Aacute`
 * without its trailing semicolon, immediately followed by one ASCII character.
 *
 * @return array[]
 */
public static function data_ambiguous_ascii_attribute_followers() {
	return array(
		'ASCII digit'           => array( '&Aacute0' ),
		'ASCII uppercase alpha' => array( '&AacuteA' ),
		'ASCII lowercase alpha' => array( '&Aacutea' ),
		'equals'                => array( '&Aacute=' ),
	);
}
137+
64138 /**
65139 * Ensures proper detection of attribute prefixes ignoring ASCII case.
66140 *
0 commit comments