2727 * - Unwrapping a tag by removing its parent.
2828 * - Inserting and removing nodes.
2929 * - Reading and changing inner content.
30+ * - Navigating up or around HTML structure.
3031 *
3132 * ## Usage
3233 *
6667 * `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )`
6768 * matches all IMG elements directly inside a P element. To ensure that no
6869 * partial matches erroneously match, it's possible to specify in a query
69- * the full breadcrumb match.
70+ * the full breadcrumb match all the way down from the root HTML element.
7071 *
7172 * Example:
7273 *
7677 *
7778 * $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>';
7879 * // ---- Matches here.
79- * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG', ' FIGCAPTION', 'EM' ) ) );
80+ * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) );
8081 *
8182 * $html = '<div><img></div><img>';
8283 * // ----- Matches here, because IMG must be a direct child of the implicit BODY.
100101 *
101102 * - Links: A.
102103 * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
103- * - Containers: DIV, FIGCAPTION, FIGURE.
104+ * - Containers: DIV, FIGCAPTION, FIGURE, SPAN.
105+ * - Form elements: BUTTON.
104106 * - Paragraph: P.
105107 * - Void elements: IMG.
106108 *
116118 * - Markup involving only those tags listed above.
117119 * - Fully-balanced and non-overlapping tags.
118120 * - HTML with unexpected tag closers.
121+ * - Some unbalanced or overlapping tags.
119122 * - P tags after unclosed P tags.
123+ * - BUTTON tags after unclosed BUTTON tags.
120124 * - A tags after unclosed A tags that don't involve any active formatting elements.
121125 *
122126 * @since 6.4.0
126130 */
127131class WP_HTML_Processor extends WP_HTML_Tag_Processor {
128132 /**
129- * HTML processing requires more bookmarks than basic tag processing.
133+ * The maximum number of bookmarks allowed to exist at any given time.
134+ *
135+ * HTML processing requires more bookmarks than basic tag processing,
136+ * so this class constant from the Tag Processor is overridden.
130137 *
131138 * @since 6.4.0
132139 *
@@ -137,6 +144,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
137144 /**
138145 * Static query for instructing the Tag Processor to visit every token.
139146 *
147+ * @access private
148+ *
140149 * @since 6.4.0
141150 *
142151 * @var array
@@ -158,8 +167,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
158167 /**
159168 * Used to create unique bookmark names.
160169 *
170+ * This class sets a bookmark for every tag in the HTML document that it encounters.
171+ * The bookmark name is auto-generated and increments, starting with `1`. These are
172+ * internal bookmarks and are automatically released when the referring WP_HTML_Token
173+ * goes out of scope and is garbage-collected.
174+ *
161175 * @since 6.4.0
162176 *
177+ * @see WP_HTML_Processor::$release_internal_bookmark_on_destruct
178+ *
163179 * @var int
164180 */
165181 private $ bookmark_counter = 0 ;
@@ -186,7 +202,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
186202 *
187203 * @var closure
188204 */
189- private $ release_internal_bookmark = null ;
205+ private $ release_internal_bookmark_on_destruct = null ;
190206
191207 /*
192208 * Public Interface Functions
@@ -258,10 +274,14 @@ public static function createFragment( $html, $context = '<body>', $encoding = '
258274 /**
259275 * Constructor.
260276 *
277+ * Do not use this method. Use the static creator methods instead.
278+ *
261279 * @access private
262280 *
263281 * @since 6.4.0
264282 *
283+ * @see WP_HTML_Processor::createFragment()
284+ *
265285 * @param string $html HTML to process.
266286 * @param string|null $use_the_static_create_methods_instead This constructor should not be called manually.
267287 */
@@ -287,7 +307,7 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
287307 * a private method into WP_HTML_Token classes without
288308 * exposing it to any public API.
289309 */
290- $ this ->release_internal_bookmark = function ( $ name ) {
310+ $ this ->release_internal_bookmark_on_destruct = function ( $ name ) {
291311 parent ::release_bookmark ( $ name );
292312 };
293313 }
@@ -298,13 +318,15 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
298318 * Various situations lead to parsing failure but this class will
299319 * return `false` in all those cases. To determine why something
300320 * failed it's possible to request the last error. This can be
301- * helpful to know if it's possible to fix something or to give up.
321+ * helpful to distinguish whether a given tag couldn't
322+ * be found or whether content in the document caused the
323+ * processor to give up and abort processing.
302324 *
303325 * Example
304326 *
305- * $p = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
306- * false === $p ->next_tag();
307- * WP_HTML_Processor::ERROR_UNSUPPORTED === $p ->get_last_error();
327+ * $processor = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
328+ * false === $processor ->next_tag();
329+ * WP_HTML_Processor::ERROR_UNSUPPORTED === $processor ->get_last_error();
308330 *
309331 * @since 6.4.0
310332 *
@@ -429,6 +451,20 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
429451 }
430452
431453 if ( self ::PROCESS_NEXT_NODE === $ node_to_process ) {
454+ /*
455+ * Void elements still hop onto the stack of open elements even though
456+ * there's no corresponding closing tag. This is important for managing
457+ * stack-based operations such as "navigate to parent node" or checking
458+ * on an element's breadcrumbs.
459+ *
460+ * When moving on to the next node, therefore, if the bottom-most element
461+ * on the stack is a void element, it must be closed.
462+ *
463+ * @TODO: Once self-closing foreign elements and BGSOUND are supported,
464+ * they must also be implicitly closed here. BGSOUND is
465+ * special since it's only self-closing if the self-closing flag
466+ * is provided in the opening tag, otherwise it expects a tag closer.
467+ */
432468 $ top_node = $ this ->state ->stack_of_open_elements ->current_node ();
433469 if ( $ top_node && self ::is_void ( $ top_node ->node_name ) ) {
434470 $ this ->state ->stack_of_open_elements ->pop ();
@@ -446,7 +482,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
446482 $ this ->bookmark_tag (),
447483 $ this ->get_tag (),
448484 $ this ->is_tag_closer (),
449- $ this ->release_internal_bookmark
485+ $ this ->release_internal_bookmark_on_destruct
450486 );
451487
452488 try {
@@ -456,7 +492,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
456492
457493 default :
458494 $ this ->last_error = self ::ERROR_UNSUPPORTED ;
459- throw new WP_HTML_Unsupported_Exception ( ' Cannot parse outside of the IN BODY insertion mode. ' );
495+ throw new WP_HTML_Unsupported_Exception ( " No support for parsing in the ' { $ this -> state -> insertion_mode } ' state. " );
460496 }
461497 } catch ( WP_HTML_Unsupported_Exception $ e ) {
462498 /*
@@ -470,17 +506,22 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
470506 /**
471507 * Computes the HTML breadcrumbs for the currently-matched node, if matched.
472508 *
473- * Breadcrumbs start at the outer-most parent and descend toward the matched element.
509+ * Breadcrumbs start at the outermost parent and descend toward the matched element.
510+ * They always include the entire path from the root HTML node to the matched element.
511+ *
512+ * @TODO: It could be more efficient to expose a generator-based version of this function
513+ * to avoid creating the array copy on tag iteration. If this is done, it would likely
514+ * be more useful to walk up the stack when yielding instead of starting at the top.
474515 *
475516 * Example
476517 *
477- * $p = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
478- * $p ->next_tag( 'IMG' );
479- * $p ->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
518+ * $processor = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
519+ * $processor ->next_tag( 'IMG' );
520+ * $processor ->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
480521 *
481522 * @since 6.4.0
482523 *
483- * @return string[]|null Array of tag- names representing path to matched node, if matched, otherwise null .
524+ * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
484525 */
485526 public function get_breadcrumbs () {
486527 if ( ! $ this ->get_tag () ) {
@@ -499,16 +540,14 @@ public function get_breadcrumbs() {
499540 * Parses next element in the 'in body' insertion mode.
500541 *
501542 * This internal function performs the 'in body' insertion mode
502- * logic for the generalized `self::step()` function.
503- *
504- * @access private
543+ * logic for the generalized WP_HTML_Processor::step() function.
505544 *
506545 * @since 6.4.0
507546 *
508547 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
509548 *
510549 * @see https://html.spec.whatwg.org/#parsing-main-inbody
511- * @see self ::step
550+ * @see WP_HTML_Processor ::step
512551 *
513552 * @return bool Whether an element was found.
514553 */
@@ -698,7 +737,7 @@ private function step_in_body() {
698737 */
699738
700739 /**
701- * Creates a new bookmark for the currently-matched tag and returns generated name.
740+ * Creates a new bookmark for the currently-matched tag and returns the generated name.
702741 *
703742 * @since 6.4.0
704743 *
@@ -726,14 +765,19 @@ private function bookmark_tag() {
726765 /**
727766 * Returns the uppercase name of the matched tag.
728767 *
768+ * The semantic rules for HTML specify that certain tags be reprocessed
769+ * with a different tag name. Because of this, the tag name presented
770+ * by the HTML Processor may differ from the one reported by the HTML
771+ * Tag Processor, which doesn't apply these semantic rules.
772+ *
729773 * Example:
730774 *
731- * $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
732- * $p ->next_tag() === true;
733- * $p ->get_tag() === 'DIV';
775+ * $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
776+ * $processor ->next_tag() === true;
777+ * $processor ->get_tag() === 'DIV';
734778 *
735- * $p ->next_tag() === false;
736- * $p ->get_tag() === null;
779+ * $processor ->next_tag() === false;
780+ * $processor ->get_tag() === null;
737781 *
738782 * @since 6.4.0
739783 *
@@ -775,11 +819,13 @@ public function release_bookmark( $bookmark_name ) {
775819 }
776820
777821 /**
778- * Moves the internal cursor in the Tag Processor to a given bookmark's location.
822+ * Moves the internal cursor in the HTML Processor to a given bookmark's location.
779823 *
780824 * In order to prevent accidental infinite loops, there's a
781825 * maximum limit on the number of times seek() can be called.
782826 *
827+ * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
828+ *
783829 * @since 6.4.0
784830 *
785831 * @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
@@ -1348,6 +1394,8 @@ public static function is_void( $tag_name ) {
13481394 *
13491395 * This unlock code is used to ensure that anyone calling the constructor is
13501396 * doing so with a full understanding that it's intended to be a private API.
1397+ *
1398+ * @access private
13511399 */
13521400 const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::createFragment instead of calling the class constructor directly. ' ;
13531401}
0 commit comments