Skip to content

Commit 60d1738

Browse files
committed
Clean up and refactor test document parsing
1 parent 2871f31 commit 60d1738

1 file changed

Lines changed: 37 additions & 10 deletions

File tree

tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -242,24 +242,51 @@ public static function parse_html5_dat_testfile( $filename ) {
242242
* the tree of the parsed DOM. Each node must be represented by a single line. Each line
243243
* must start with "| ", followed by two spaces per parent node that the node has before
244244
* the root document node.
245+
*
246+
* - Element nodes must be represented by a "<" then the tag name string ">", and all the attributes must be given, sorted lexicographically by UTF-16 code unit according to their attribute name string, on subsequent lines, as if they were children of the element node.
247+
* - Attribute nodes must have the attribute name string, then an "=" sign, then the attribute value in double quotes (").
248+
* - Text nodes must be the string, in double quotes. Newlines aren't escaped.
249+
* - Comments must be "<" then "!-- " then the data then " -->".
250+
* - DOCTYPEs must be "<!DOCTYPE " then the name then if either of the system id or public id is non-empty a space, public id in double-quotes, another space and the system id in double-quotes, and then in any case ">".
251+
* - Processing instructions must be "<?", then the target, then a space, then the data and then ">". (The HTML parser cannot emit processing instructions, but scripts can, and the WebVTT to DOM rules can emit them.)
252+
* - Template contents are represented by the string "content" with the children below it.
245253
*/
246254
case 'document':
247255
if ( '|' === $line[0] ) {
248256
$candidate = substr( $line, 2 );
249-
$trimmed = trim( $candidate );
250-
// Only take lines that look like tags
251-
// At least 3 chars (< + tag + >)
252-
// Tag must start with ascii alphabetic
253-
if ( strlen( $trimmed ) > 2 && '<' === $trimmed[0] && ctype_alpha( $trimmed[1] ) ) {
254-
$test_dom .= $candidate;
257+
258+
// Remove leading spaces and the trailing newline
259+
$trimmed = ltrim( substr( $candidate, 0, -1 ) );
260+
261+
// Text: "…
262+
if ( $trimmed[0] === '"' ) {
263+
// Skip for now
264+
break;
255265
}
256266

257-
if (
258-
( $trimmed[0] !== '<' || $trimmed[ strlen($trimmed) - 1 ] !== '>' ) &&
259-
$trimmed[0] !== '"'
260-
) {
267+
// Attribute: name="value"
268+
if ( $trimmed[ strlen($trimmed) - 1 ] === '"' ) {
261269
$test_dom .= $candidate;
270+
break;
271+
}
272+
273+
// Tags: <tag-name>
274+
// Comments: <!-- comment text -->
275+
// Doctypes: <!DOCTYPE … >
276+
// Processing instructions: <?target >
277+
if ( $trimmed[0] === '<' && $trimmed[ strlen($trimmed) - 1 ] === '>' ) {
278+
// Tags: <tag-name>
279+
if ( ctype_alpha( $trimmed[1] ) ) {
280+
$test_dom .= $candidate;
281+
break;
282+
}
283+
// Skip everything else for now
284+
break;
262285
}
286+
} else {
287+
// This is a text node that includes unescaped newlines.
288+
// Everything else should be single lines starting with "| ".
289+
// @todo Skip for now, add to $test_dom when we handle text nodes.
263290
}
264291
break;
265292
}

0 commit comments

Comments
 (0)