forked from WordPress/wordpress-develop
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwpHtmlProcessorHtml5lib.php
More file actions
314 lines (268 loc) · 10.7 KB
/
wpHtmlProcessorHtml5lib.php
File metadata and controls
314 lines (268 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
<?php
/**
* Unit tests covering HTML API functionality.
*
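* Runs the tree-construction test cases from the html5lib-tests project
* through the HTML Processor and compares the resulting tree dump with
* the expected tree recorded in each test case.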
*
* @package WordPress
* @subpackage HTML-API
*
* @since 6.5.0
*
* @group html-api
*/
class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
	/**
	 * The HTML Processor only accepts HTML in document <body>.
	 * Do not run tests that look for anything in document <head>.
	 */
	const SKIP_HEAD_TESTS = true;

	/**
	 * Skip specific tests that may not be supported.
	 *
	 * Keys are data set names as produced by the data provider
	 * ("{test file without .dat}/line{zero-padded line number}"),
	 * values are the reason reported when the test is skipped.
	 */
	const SKIP_TESTS = array(
		'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.',
		'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.',
		'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.',
		'entities02/line0100' => 'Encoded characters without semicolon termination in attribute values are not handled properly',
		'entities02/line0114' => 'Encoded characters without semicolon termination in attribute values are not handled properly',
		'entities02/line0128' => 'Encoded characters without semicolon termination in attribute values are not handled properly',
		'entities02/line0142' => 'Encoded characters without semicolon termination in attribute values are not handled properly',
		'entities02/line0156' => 'Encoded characters without semicolon termination in attribute values are not handled properly',
		'tests1/line0342'     => "Closing P tag implicitly creates opener, which we don't visit.",
		'tests15/line0001'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests15/line0022'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests20/line0497'    => "Closing P tag implicitly creates opener, which we don't visit.",
		'tests23/line0001'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests23/line0041'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests23/line0069'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests23/line0101'    => 'Unimplemented: Reconstruction of active formatting elements.',
		'tests26/line0263'    => 'BUG: An active formatting element should be created for a trailing text node.',
	);

	/**
	 * Number of spaces the html5lib tree dump uses per parent node.
	 *
	 * Kept as a number (and expanded with str_repeat()) so that the width
	 * cannot be silently altered by whitespace-collapsing tools.
	 */
	const TREE_INDENT_WIDTH = 2;

	/**
	 * Verify the parsing results of the HTML Processor against the
	 * test cases in the Html5lib tests project.
	 *
	 * @dataProvider data_external_html5lib_tests
	 *
	 * @param string $fragment_context Context element in which to parse HTML, such as BODY or SVG.
	 * @param string $html             Given test HTML.
	 * @param string $result           Tree structure of parsed HTML.
	 */
	public function test_parse( $fragment_context, $html, $result ) {
		if ( self::SKIP_HEAD_TESTS ) {
			$indent     = str_repeat( ' ', self::TREE_INDENT_WIDTH );
			$html_start = "<html>\n{$indent}<head>\n{$indent}<body>\n";

			// Anything other than an empty <head> directly followed by <body> is out of scope.
			if ( 0 !== strncmp( $result, $html_start, strlen( $html_start ) ) ) {
				$this->markTestSkipped( 'Skip test with expected content in <head> (unsupported).' );
			}
		}

		$test_name = $this->dataName();
		if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) {
			$this->markTestSkipped( self::SKIP_TESTS[ $test_name ] );
		}

		$processed_tree = self::build_html5_treelike_string( $fragment_context, $html );
		if ( null === $processed_tree ) {
			$this->markTestIncomplete( 'Test includes unsupported markup.' );
		}

		// Both values are strings: compare strictly.
		$this->assertSame( $result, $processed_tree, "HTML was not processed correctly:\n{$html}" );
	}

	/**
	 * Data provider.
	 *
	 * Tests from https://github.com/html5lib/html5lib-tests
	 *
	 * Every `.dat` file in the tree-construction directory is parsed and each
	 * test case is yielded under the name "{file}/line{NNNN}", where NNNN is
	 * the line of the test's "#data" marker.
	 *
	 * @throws RuntimeException When the test data directory cannot be opened.
	 *
	 * @return Generator Data sets of fragment context, HTML, and expected tree.
	 */
	public function data_external_html5lib_tests() {
		$test_dir = DIR_TESTDATA . '/html5lib-tests/tree-construction/';
		$handle   = opendir( $test_dir );

		// Without this guard readdir() would be called on `false`.
		if ( false === $handle ) {
			throw new RuntimeException( "Unable to open html5lib test directory: {$test_dir}" );
		}

		try {
			while ( false !== ( $entry = readdir( $handle ) ) ) {
				// Only files whose name ends in ".dat" (any letter case) hold test cases.
				$is_dat_file = strlen( $entry ) > 4 && 0 === strcasecmp( '.dat', substr( $entry, -4 ) );
				if ( ! $is_dat_file ) {
					continue;
				}

				// Strip the .dat extension from the filename.
				$test_suite = substr( $entry, 0, -4 );

				foreach ( self::parse_html5_dat_testfile( $test_dir . $entry ) as $test ) {
					$line = str_pad( strval( $test[0] ), 4, '0', STR_PAD_LEFT );

					yield "{$test_suite}/line{$line}" => array_slice( $test, 1 );
				}
			}
		} finally {
			// Runs even when the consumer stops iterating early.
			closedir( $handle );
		}
	}

	/**
	 * Generates the tree-like structure represented in the Html5lib tests.
	 *
	 * @param string $fragment_context Context element in which to parse HTML, such as BODY or SVG.
	 * @param string $html             Given test HTML.
	 * @return string|null Tree structure of parsed HTML, if supported, else null.
	 */
	public static function build_html5_treelike_string( $fragment_context, $html ) {
		$p = WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" );
		if ( null === $p ) {
			return null;
		}

		$indent = str_repeat( ' ', self::TREE_INDENT_WIDTH );
		$output = "<html>\n{$indent}<head>\n{$indent}<body>\n";

		// If we haven't set our bookmark, assume we're 2 levels deep:
		// html > body > [position]
		$indent_level = 2;

		while ( $p->next_token() ) {
			if ( null !== $p->get_last_error() ) {
				return null;
			}

			switch ( $p->get_token_type() ) {
				case '#tag':
					if ( $p->is_tag_closer() ) {
						// Nothing is printed for a closer; what follows sits one level shallower.
						--$indent_level;
						break;
					}

					/*
					 * The tag line is written one level above the breadcrumb depth;
					 * its attributes and any following text use the full depth.
					 */
					$indent_level = count( $p->get_breadcrumbs() );
					$tag_name     = strtolower( $p->get_tag() );
					$output      .= str_repeat( $indent, $indent_level - 1 ) . "<{$tag_name}>\n";

					$attribute_names = $p->get_attribute_names_with_prefix( '' );
					if ( $attribute_names ) {
						// The tree format lists attributes sorted by name.
						sort( $attribute_names, SORT_STRING );

						foreach ( $attribute_names as $attribute_name ) {
							$val = $p->get_attribute( $attribute_name );

							// Attributes with no value are `true` with the HTML API;
							// we map them to the empty string in the tree structure.
							if ( true === $val ) {
								$val = '';
							}

							$output .= str_repeat( $indent, $indent_level ) . "{$attribute_name}=\"{$val}\"\n";
						}
					}
					break;

				case '#text':
					$text    = $p->get_modifiable_text();
					$output .= str_repeat( $indent, $indent_level ) . "\"{$text}\"\n";
					break;

				case '#comment':
					// Comments must be "<" then "!-- " then the data then " -->".
					$comment = $p->get_modifiable_text();
					$output .= str_repeat( $indent, $indent_level ) . "<!-- {$comment} -->\n";
					break;

				// These token types contribute nothing to the tree dump.
				case '#cdata-section':
				case '#processing-instruction':
				case '#doctype':
				case '#presumptuous-tag':
				case '#funky-comment':
					break;
			}
		}

		if ( WP_HTML_Processor::ERROR_UNSUPPORTED === $p->get_last_error() ) {
			return null;
		}

		if ( $p->paused_at_incomplete_token() ) {
			return null;
		}

		// The expected trees carry a trailing blank line (see finalize_test_case()).
		return $output . "\n";
	}

	/**
	 * Convert a given Html5lib test file into a sequence of test cases.
	 *
	 * Each yielded item is a list of: the line number of the test's "#data"
	 * marker, the fragment context element, the test HTML, and the expected
	 * tree dump.
	 *
	 * @throws RuntimeException When the test file cannot be opened (raised on first iteration).
	 *
	 * @param string $filename Path to `.dat` file with test cases.
	 *
	 * @return Generator Test cases of line number, HTML fragment context element,
	 *                   HTML, and the DOM structure it represents.
	 */
	public static function parse_html5_dat_testfile( $filename ) {
		$handle = fopen( $filename, 'r', false );
		if ( false === $handle ) {
			throw new RuntimeException( "Unable to open html5lib test file: {$filename}" );
		}

		/**
		 * Represents which section of the test case is being parsed.
		 *
		 * @var ?string
		 */
		$state = null;

		$line_number          = 0;
		$test_html            = '';
		$test_dom             = '';
		$test_context_element = 'body';
		$test_line_number     = 0;

		try {
			while ( false !== ( $line = fgets( $handle ) ) ) {
				++$line_number;

				// A leading "#" starts a new section.
				if ( '#' === $line[0] ) {
					if ( "#data\n" === $line ) {
						// A new test begins: hand out the one collected so far, if any.
						if ( $state ) {
							yield self::finalize_test_case( $test_line_number, $test_context_element, $test_html, $test_dom );
						}

						/*
						 * Start the next test from a clean slate. The context element
						 * is reset as well: a "#document-fragment" section applies
						 * only to the test it appears in.
						 */
						$test_line_number     = $line_number;
						$test_html            = '';
						$test_dom             = '';
						$test_context_element = 'body';
					}

					$state = trim( substr( $line, 1 ) );
					continue;
				}

				switch ( $state ) {
					/*
					 * Each test must begin with a string "#data" followed by a newline (LF). All
					 * subsequent lines until a line that says "#errors" are the test data and must be
					 * passed to the system being tested unchanged, except with the final newline (on the
					 * last line) removed.
					 */
					case 'data':
						$test_html .= $line;
						break;

					/*
					 * Then there *may* be a line that says "#document-fragment", which must
					 * be followed by a newline (LF), followed by a string of characters that
					 * indicates the context element, followed by a newline (LF). If the
					 * string of characters starts with "svg ", the context element is in
					 * the SVG namespace and the substring after "svg " is the local name.
					 * If the string of characters starts with "math ", the context element
					 * is in the MathML namespace and the substring after "math " is the
					 * local name. Otherwise, the context element is in the HTML namespace
					 * and the string is the local name. If this line is present the "#data"
					 * must be parsed using the HTML fragment parsing algorithm with the
					 * context element as context.
					 *
					 * Only the first space-separated token is kept here; the line's
					 * trailing newline is removed so it does not end up in the name.
					 */
					case 'document-fragment':
						$test_context_element = explode( ' ', trim( $line ) )[0];
						break;

					/*
					 * Then there must be a line that says "#document", which must be followed by a dump of
					 * the tree of the parsed DOM. Each node must be represented by a single line. Each line
					 * must start with "| ", followed by two spaces per parent node that the node has before
					 * the root document node.
					 *
					 * - Element nodes must be represented by a "<" then the tag name string ">", and all
					 *   the attributes must be given, sorted lexicographically by UTF-16 code unit
					 *   according to their attribute name string, on subsequent lines, as if they were
					 *   children of the element node.
					 * - Attribute nodes must have the attribute name string, then an "=" sign, then the
					 *   attribute value in double quotes (").
					 * - Text nodes must be the string, in double quotes. Newlines aren't escaped.
					 * - Comments must be "<" then "!-- " then the data then " -->".
					 * - DOCTYPEs must be "<!DOCTYPE " then the name then if either of the system id or
					 *   public id is non-empty a space, public id in double-quotes, another space and the
					 *   system id in double-quotes, and then in any case ">".
					 * - Processing instructions must be "<?", then the target, then a space, then the data
					 *   and then ">". (The HTML parser cannot emit processing instructions, but scripts
					 *   can, and the WebVTT to DOM rules can emit them.)
					 * - Template contents are represented by the string "content" with the children
					 *   below it.
					 */
					case 'document':
						if ( '|' === $line[0] ) {
							// Drop the leading "| " marker, keep the indentation.
							$test_dom .= substr( $line, 2 );
						} else {
							// Continuation of a node spanning several lines (e.g. text with newlines).
							$test_dom .= $line;
						}
						break;
				}
			}
		} finally {
			// Runs even when the consumer stops iterating early.
			fclose( $handle );
		}

		/*
		 * Hand out the last test when reaching the end of the file. This has to
		 * be a `yield`: a generator's `return` value is not part of the iteration,
		 * so returning here would silently drop the final test of every file.
		 */
		if ( $state ) {
			yield self::finalize_test_case( $test_line_number, $test_context_element, $test_html, $test_dom );
		}
	}

	/**
	 * Assembles one parsed test case in the shape the data provider expects.
	 *
	 * @param int    $line_number     Line of the test's "#data" marker.
	 * @param string $context_element Fragment context element name.
	 * @param string $html            Raw "#data" section, including its final newline.
	 * @param string $dom             Raw "#document" section with "| " markers removed.
	 * @return array Line number, context element, test HTML, expected tree.
	 */
	private static function finalize_test_case( $line_number, $context_element, $html, $dom ) {
		return array(
			$line_number,
			$context_element,
			// Remove the trailing newline (cast: substr() may return false for an empty string on PHP 7).
			(string) substr( $html, 0, -1 ),
			/*
			 * build_html5_treelike_string() ends its output with one blank line,
			 * matching the blank line that separates tests in a file. The last
			 * test of a file may not be followed by such a separator, so make
			 * the ending uniform here.
			 */
			rtrim( $dom, "\n" ) . "\n\n",
		);
	}
}