Skip to content

Commit de410cd

Browse files
committed
HTML API: Add functions to read inner and outer HTML.
1 parent 0c23a14 commit de410cd

4 files changed

Lines changed: 484 additions & 9 deletions

File tree

src/wp-includes/html-api/class-wp-html-open-elements.php

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@ class WP_HTML_Open_Elements {
3737
*/
3838
public $stack = array();
3939

40+
/**
41+
* Holds functions added to be called after popping an element off the stack.
42+
*
43+
* Listeners are passed the WP_HTML_Token for the item that was removed.
44+
*
45+
* @since 6.4.0
46+
*
47+
* @var callable[]
48+
*/
49+
private $after_pop_listeners = array();
50+
4051
/**
4152
* Whether a P element is in button scope currently.
4253
*
@@ -428,5 +439,39 @@ public function after_element_pop( $item ) {
428439
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
429440
break;
430441
}
442+
443+
// Call any listeners that are registered.
444+
foreach ( $this->after_pop_listeners as $listener ) {
445+
call_user_func( $listener, $item );
446+
}
447+
}
448+
449+
/**
450+
* Creates a context in which a given listener is called after
451+
* popping an element off of the stack of open elements.
452+
*
453+
* It's unlikely that you will need this function. It exists
454+
* to aid an optimization in the `WP_HTML_Processor` and the
455+
* strange form of calling a generator inside a `foreach`
456+
* loop ensures that proper cleanup of the listener occurs.
457+
*
458+
* Example:
459+
*
460+
* $did_close = false;
461+
* $closed_a_p = function ( $item ) use ( &$did_close ) { $did_close = 'P' === $item->node_name; };
462+
* foreach ( $stack_of_open_elements->with_pop_listener( $closed_a_p ) ) {
463+
* while ( ! $did_close && $processor->next_tag() ) {
464+
* // This loop executes until _any_ P element is closed.
465+
* }
466+
* }
467+
*
468+
* @since 6.4.0
469+
*
470+
* @param callable $listener Called with the WP_HTML_Token for the item that was popped off of the stack.
471+
*/
472+
public function with_pop_listener( $listener ) {
	// Register the listener for as long as the caller's `foreach` body is running.
	$this->after_pop_listeners[] = $listener;

	try {
		// Hand control to the caller's loop body; execution resumes here when it finishes.
		yield;
	} finally {
		/*
		 * Unregister the most recently added listener.
		 *
		 * This lives in a `finally` block so that cleanup also happens when the
		 * caller leaves the loop early through `break`, `return`, or a thrown
		 * exception. In those cases the generator is never resumed past the
		 * `yield`, but PHP still runs pending `finally` blocks when a suspended
		 * generator is destroyed. Without this the listener would remain
		 * registered and keep being called for every later pop.
		 *
		 * Because the last entry is the one removed, nested uses are expected
		 * to finish in the reverse of the order in which they started.
		 */
		array_pop( $this->after_pop_listeners );
	}
}
432477
}

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 185 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
309309
* failed it's possible to request the last error. This can be
310310
* helpful to know if it's possible to fix something or to give up.
311311
*
312-
* Example
312+
* Example:
313313
*
314314
* $p = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
315315
* false === $p->next_tag();
@@ -418,6 +418,93 @@ public function next_tag( $query = null ) {
418418
return false;
419419
}
420420

421+
/**
422+
* Returns the raw HTML content inside a matched tag.
423+
*
424+
* "Markup" differs from inner HTML in that it returns the raw HTML inside the matched tag.
425+
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
426+
* serialized differently than a DOM API would return.
427+
*
428+
* Example:
429+
*
430+
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
431+
* $processor->next_tag( 'P' );
432+
* 'Inside <em>P</em> <i>tags' === $processor->get_inner_markup();
433+
*
434+
* @since 6.4.0
435+
*
436+
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
437+
*
438+
* @return string|null The inner markup if available, else NULL.
439+
*/
440+
public function get_inner_markup() {
	// There is nothing to extract unless the processor is stopped on a tag.
	if ( null === $this->get_tag() ) {
		return null;
	}

	/*
	 * Everything below reads the "start" bookmark and finally seeks back to it.
	 * If it could not be created, give up before moving the cursor rather than
	 * reading a bookmark entry that does not exist.
	 */
	if ( false === $this->set_bookmark( 'start' ) ) {
		return null;
	}

	// Advance until the starting tag has been closed, or the document runs out.
	$found_tag = $this->step_until_tag_is_closed();
	$has_end   = false !== $this->set_bookmark( 'end' );

	if ( ! $found_tag ) {
		// If there's no closing tag then the inner markup continues to the end of the document.
		$inner_markup = $this->substr_bookmark( 'after', 'start' );
	} elseif ( $has_end ) {
		// The tag where stepping stopped marks the end of the content and is excluded.
		$inner_markup = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
	} else {
		// The end of the content could not be bookmarked, so no range can be reported.
		$inner_markup = null;
	}

	// Restore the cursor to the tag it started on and drop the temporary bookmarks.
	$this->seek( 'start' );
	$this->release_bookmark( 'start' );
	$this->release_bookmark( 'end' );

	return $inner_markup;
}
462+
463+
/**
464+
* Returns the raw HTML content around a matched tag, including the tag itself.
465+
*
466+
* "Markup" differs from outer HTML in that it returns the raw HTML of the matched tag and its contents.
467+
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
468+
* serialized differently than a DOM API would return.
469+
*
470+
* Example:
471+
*
472+
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
473+
* $processor->next_tag( 'P' );
474+
* '<p>Inside <em>P</em> <i>tags' === $processor->get_outer_markup();
475+
*
476+
* @since 6.4.0
477+
*
478+
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
479+
*
480+
* @return string|null The outer markup if available, else NULL.
481+
*/
482+
public function get_outer_markup() {
	// There is nothing to extract unless the processor is stopped on a tag.
	if ( null === $this->get_tag() ) {
		return null;
	}

	/*
	 * Everything below reads the "start" bookmark and finally seeks back to it.
	 * If it could not be created, give up before moving the cursor rather than
	 * reading a bookmark entry that does not exist.
	 */
	if ( false === $this->set_bookmark( 'start' ) ) {
		return null;
	}

	// Remember the starting tag's name so its own closer can be recognized afterwards.
	$start_tag = $this->current_token->node_name;

	// Advance until the starting tag has been closed, or the document runs out.
	$found_tag = $this->step_until_tag_is_closed();
	$has_end   = false !== $this->set_bookmark( 'end' );

	if ( ! $found_tag ) {
		// If there's no closing tag then the outer markup continues to the end of the document.
		$outer_markup = $this->substr_bookmark( 'before', 'start' );
	} elseif ( $has_end ) {
		/*
		 * Include the tag where stepping stopped only when it is the closing
		 * tag of the starting element. Any other tag belongs to the
		 * surrounding markup and is left out.
		 */
		$did_close    = $this->get_tag() === $start_tag && $this->is_tag_closer();
		$end_position = $did_close ? 'after' : 'before';
		$outer_markup = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );
	} else {
		// The end of the element could not be bookmarked, so no range can be reported.
		$outer_markup = null;
	}

	// Restore the cursor to the tag it started on and drop the temporary bookmarks.
	$this->seek( 'start' );
	$this->release_bookmark( 'start' );
	$this->release_bookmark( 'end' );

	return $outer_markup;
}
507+
421508
/**
422509
* Steps through the HTML document and stops at the next tag, if any.
423510
*
@@ -438,12 +525,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
438525
$this->state->stack_of_open_elements->pop();
439526
}
440527

441-
parent::next_tag( self::VISIT_EVERYTHING );
442-
}
443-
444-
// Finish stepping when there are no more tokens in the document.
445-
if ( null === $this->get_tag() ) {
446-
return false;
528+
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
529+
return false;
530+
}
447531
}
448532

449533
$this->current_token = new WP_HTML_Token(
@@ -474,9 +558,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
474558
/**
475559
* Computes the HTML breadcrumbs for the currently-matched node, if matched.
476560
*
477-
* Breadcrumbs start at the outer-most parent and descend toward the matched element.
561+
* Breadcrumbs start at the outermost parent and descend toward the matched element.
478562
*
479-
* Example
563+
* Example:
480564
*
481565
* $p = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
482566
* $p->next_tag( 'IMG' );
@@ -723,6 +807,98 @@ private function bookmark_tag() {
723807
return "{$this->bookmark_counter}";
724808
}
725809

810+
/**
811+
* Steps through the HTML document until the current open tag is closed.
812+
*
813+
* @since 6.4.0
814+
*
815+
* @throws Exception When unable to allocate bookmark for internal tracking.
816+
*
817+
* @return bool|null true if a closing tag was found, false if not, and null if not starting at a matched tag.
818+
*/
819+
private function step_until_tag_is_closed() {
	// Without a matched tag there is nothing whose closing could be awaited.
	if ( null === $this->get_tag() ) {
		return null;
	}

	/** @var WP_HTML_Token $opening_token Token for the tag matched when this function was called. */
	$opening_token = $this->current_token;

	/** @var bool $is_still_open Whether that token has not yet been popped off the stack of open elements. */
	$is_still_open = true;

	/**
	 * Pop listener: clears the flag once the token that was popped is
	 * the very token this search started from.
	 *
	 * @since 6.4.0
	 *
	 * @see WP_HTML_Open_Elements::with_pop_listener()
	 *
	 * @param WP_HTML_Token $popped Node that was popped.
	 */
	$on_pop = function ( $popped ) use ( &$is_still_open, $opening_token ) {
		if ( $popped === $opening_token ) {
			$is_still_open = false;
		}
	};

	/*
	 * Normally, when stepping into each new element, it would be required to walk up
	 * the stack of open elements to see whether the starting tag is still on it.
	 * Listening for popped elements instead reduces that check to reading a single
	 * boolean, because the listener is called for each element that is popped.
	 *
	 * The `foreach` creates a context in which the listener is registered; it is
	 * removed again when the loop finishes, without any manual cleanup here.
	 */
	$open_elements = $this->state->stack_of_open_elements;
	foreach ( $open_elements->with_pop_listener( $on_pop ) as $_ ) {
		// Always take at least one step, then keep stepping while tags remain and the start is still open.
		$found_tag = $this->step();
		while ( $found_tag && $is_still_open ) {
			$found_tag = $this->step();
		}
	}

	return $found_tag;
}
865+
866+
/**
867+
* Returns a substring of the input HTML document from a bookmark until the end.
868+
*
869+
* @since 6.4.0
870+
*
871+
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
872+
* @param string $start Bookmark name at which to start clipping.
873+
* @return string Clipped substring of input HTML document.
874+
*/
875+
private function substr_bookmark( $start_position, $start ) {
	// Bookmarks are looked up under the given name prefixed with an underscore.
	$bookmark = $this->bookmarks[ '_' . $start ];

	// "before" clips at the bookmark's `start` offset; anything else clips one past its `end` offset.
	if ( 'before' === $start_position ) {
		$from = $bookmark->start;
	} else {
		$from = $bookmark->end + 1;
	}

	// No length is given, so the result runs through the end of the document.
	return substr( $this->html, $from );
}
881+
882+
/**
883+
* Returns a substring of the input HTML document delimited by bookmarks.
884+
*
885+
* @since 6.4.0
886+
*
887+
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
888+
* @param string $start Bookmark name at which to start clipping.
889+
* @param string $end_position "before" to clip before bookmark, "after" to clip after.
890+
* @param string $end Bookmark name at which to end clipping.
891+
* @return string Clipped substring of input HTML document.
892+
*/
893+
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
	// Bookmarks are looked up under the given names prefixed with an underscore.
	$opening = $this->bookmarks[ '_' . $start ];
	$closing = $this->bookmarks[ '_' . $end ];

	// For either edge, "before" uses the bookmark's `start` offset; anything else uses one past its `end` offset.
	if ( 'before' === $start_position ) {
		$from = $opening->start;
	} else {
		$from = $opening->end + 1;
	}

	if ( 'before' === $end_position ) {
		$until = $closing->start;
	} else {
		$until = $closing->end + 1;
	}

	// `substr()` takes a length rather than an end offset.
	$length = $until - $from;

	return substr( $this->html, $from, $length );
}
901+
726902
/*
727903
* HTML semantic overrides for Tag Processor
728904
*/

0 commit comments

Comments
 (0)