Skip to content

Commit 5732aee

Browse files
committed
HTML API: Add functions to read inner and outer HTML.
1 parent ddb67a0 commit 5732aee

5 files changed

Lines changed: 483 additions & 8 deletions

File tree

src/wp-includes/html-api/class-wp-html-open-elements.php

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@ class WP_HTML_Open_Elements {
3737
*/
3838
public $stack = array();
3939

40+
/**
41+
* Holds functions added to be called after popping an element off the stack.
42+
*
43+
* Listeners are passed the WP_HTML_Token for the item that was removed.
44+
*
45+
* @since 6.4.0
46+
*
47+
* @var callable[]
48+
*/
49+
private $after_pop_listeners = array();
50+
4051
/**
4152
* Whether a P element is in button scope currently.
4253
*
@@ -428,5 +439,39 @@ public function after_element_pop( $item ) {
428439
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
429440
break;
430441
}
442+
443+
// Call any listeners that are registered.
444+
foreach ( $this->after_pop_listeners as $listener ) {
445+
call_user_func( $listener, $item );
446+
}
447+
}
448+
449+
/**
 * Creates a context in which a given listener is called after
 * popping an element off of the stack of open elements.
 *
 * It's unlikely that you will need this function. It exists
 * to aid an optimization in the `WP_HTML_Processor` and the
 * strange form of calling a generator inside a `foreach`
 * loop ensures that proper cleanup of the listener occurs.
 *
 * The listener is registered when the loop starts and removed when the
 * loop finishes, including when the loop body exits early through
 * `break`, `return`, or a thrown exception.
 *
 * Cleanup removes the most-recently added listener, so contexts created
 * by this function must be strictly nested inside one another.
 *
 * Example:
 *
 *     $did_close  = false;
 *     $closed_a_p = function ( $item ) use ( &$did_close ) { $did_close = $did_close || 'P' === $item->node_name; };
 *     foreach ( $stack_of_open_elements->with_pop_listener( $closed_a_p ) as $_ ) {
 *         while ( ! $did_close && $processor->next_tag() ) {
 *             // This loop executes until _any_ P element is closed.
 *         }
 *     }
 *
 * @since 6.4.0
 *
 * @param callable $listener Called with the WP_HTML_Token for the item that was popped off of the stack.
 * @return Generator Yields exactly once, while the listener is registered.
 */
public function with_pop_listener( $listener ) {
	$this->after_pop_listeners[] = $listener;

	/*
	 * Code placed after a bare `yield` only runs if the generator is resumed.
	 * When the consuming loop is abandoned early the generator is destroyed
	 * instead, and only a `finally` block is guaranteed to run in that case.
	 */
	try {
		yield;
	} finally {
		array_pop( $this->after_pop_listeners );
	}
}
432477
}

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 184 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
322322
* be found or if content in the document caused the processor
323323
* to give up and abort processing.
324324
*
325-
* Example
325+
* Example:
326326
*
327327
* $processor = WP_HTML_Processor::create_fragment( '<template><strong><button><em><p><em>' );
328328
* false === $processor->next_tag();
@@ -473,6 +473,93 @@ public function matches_breadcrumbs( $breadcrumbs ) {
473473
return false;
474474
}
475475

476+
/**
 * Returns the raw HTML content inside a matched tag.
 *
 * "Markup" differs from inner HTML in that it returns the raw HTML inside the matched tag.
 * This means that it's possible this returns HTML without matching tags, or with HTML attributes
 * serialized differently than a DOM API would return.
 *
 * After reading the markup the processor seeks back to the tag it was
 * matched on when this function was called.
 *
 * NOTE(review): this uses the fixed bookmark names "start" and "end" and releases
 * both before returning. If calling code can hold bookmarks with the same names
 * they would be replaced and released here. Confirm whether that can happen.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
 *     $processor->next_tag( 'P' );
 *     'Inside <em>P</em> <i>tags' === $processor->get_raw_inner_markup();
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
 *
 * @return string|null The inner markup if available, else NULL.
 */
public function get_raw_inner_markup() {
	if ( null === $this->get_tag() ) {
		return null;
	}

	/*
	 * The starting bookmark anchors both the clipped substring and the seek
	 * back to the original tag. If it can't be set, report that no markup is
	 * available rather than reading a bookmark that doesn't exist.
	 */
	if ( ! $this->set_bookmark( 'start' ) ) {
		return null;
	}

	$found_tag = $this->step_until_tag_is_closed();
	$has_end   = $this->set_bookmark( 'end' );

	if ( ! $found_tag ) {
		// If there's no closing tag then the inner markup continues to the end of the document.
		$inner_markup = $this->substr_bookmark( 'after', 'start' );
	} elseif ( $has_end ) {
		$inner_markup = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
	} else {
		// The closing point was found but couldn't be bookmarked, so it can't be clipped reliably.
		$inner_markup = null;
	}

	// Return to where the caller was and drop the temporary bookmarks.
	$this->seek( 'start' );
	$this->release_bookmark( 'start' );
	$this->release_bookmark( 'end' );

	return $inner_markup;
}
517+
518+
/**
519+
* Returns the raw HTML content around a matched tag, including the tag itself.
520+
*
521+
* "Markup" differs from outer HTML in that it returns the raw HTML inside the matched tag.
522+
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
523+
* serialized differently than a DOM API would return.
524+
*
525+
* Example:
526+
*
527+
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
528+
* $processor->next_tag( 'P' );
529+
* '<p>Inside <em>P</em> <i>tags' === $processor->get_raw_inner_markup();
530+
*
531+
* @since 6.4.0
532+
*
533+
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
534+
*
535+
* @return string|null The outer markup if available, else NULL.
536+
*/
537+
public function get_raw_outer_markup() {
538+
if ( null === $this->get_tag() ) {
539+
return null;
540+
}
541+
542+
$this->set_bookmark( 'start' );
543+
$start_tag = $this->current_token->node_name;
544+
$found_tag = $this->step_until_tag_is_closed();
545+
$this->set_bookmark( 'end' );
546+
547+
if ( $found_tag ) {
548+
$did_close = $this->get_tag() === $start_tag && $this->is_tag_closer();
549+
$end_position = $did_close ? 'after' : 'before';
550+
$outer_markup = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );
551+
} else {
552+
// If there's no closing tag then the outer markup continues to the end of the document.
553+
$outer_markup = $this->substr_bookmark( 'before', 'start' );
554+
}
555+
556+
$this->seek( 'start' );
557+
$this->release_bookmark( 'start' );
558+
$this->release_bookmark( 'end' );
559+
560+
return $outer_markup;
561+
}
562+
476563
/**
477564
* Steps through the HTML document and stops at the next tag, if any.
478565
*
@@ -512,12 +599,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
512599
$this->state->stack_of_open_elements->pop();
513600
}
514601

515-
parent::next_tag( self::VISIT_EVERYTHING );
516-
}
517-
518-
// Finish stepping when there are no more tokens in the document.
519-
if ( null === $this->get_tag() ) {
520-
return false;
602+
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
603+
return false;
604+
}
521605
}
522606

523607
$this->state->current_token = new WP_HTML_Token(
@@ -555,7 +639,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
555639
* to avoid creating the array copy on tag iteration. If this is done, it would likely
556640
* be more useful to walk up the stack when yielding instead of starting at the top.
557641
*
558-
* Example
642+
* Example:
559643
*
560644
* $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
561645
* $processor->next_tag( 'IMG' );
@@ -800,6 +884,98 @@ private function bookmark_tag() {
800884
return "{$this->bookmark_counter}";
801885
}
802886

887+
/**
 * Steps through the HTML document until the current open tag is closed.
 *
 * The processor is left wherever stepping stopped: on the tag whose processing
 * removed the starting element from the stack of open elements, or at the end
 * of the document if that never happened.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate bookmark for internal tracking.
 *
 * @return bool|null true if a closing tag was found, false if not, and null if not starting at a matched tag.
 */
private function step_until_tag_is_closed() {
	if ( null === $this->get_tag() ) {
		return null;
	}

	/**
	 * Reference to the opening tag when calling this function. This is read from
	 * the processor state, which is where `step()` stores the current token.
	 *
	 * @var WP_HTML_Token $start
	 */
	$start = $this->state->current_token;

	/** @var bool $keep_searching Whether to continue scanning for a point where the opening tag is closed. */
	$keep_searching = true;

	/** @var bool $found_tag Whether the most recent step stopped at a tag. */
	$found_tag = false;

	/**
	 * Sets a flag indicating that the starting tag has been closed once
	 * it's popped from the stack of open elements. This is a listener function.
	 *
	 * The comparison is by identity: only the exact token that was current
	 * when stepping began counts, not another element with the same name.
	 *
	 * @see WP_HTML_Open_Elements::with_pop_listener()
	 *
	 * @param WP_HTML_Token $node Node that was popped.
	 */
	$tag_is_closed = function ( $node ) use ( &$keep_searching, $start ) {
		if ( $node === $start ) {
			$keep_searching = false;
		}
	};

	/*
	 * Normally, when stepping into each new element, it would be required to walk up the
	 * stack of open elements and look to see if the starting tag is still open, if it's
	 * on the stack. By listening for elements that are popped from the stack, however, it's
	 * possible to know if the starting tag has been closed without anything more than a
	 * constant boolean access, as the listener is called for each tag that's closed.
	 *
	 * The use of the `foreach` here creates a context which ensures that the listener is
	 * properly removed and cleaned up without having to manually remove it.
	 */
	foreach ( $this->state->stack_of_open_elements->with_pop_listener( $tag_is_closed ) as $_ ) {
		// Find where the tag is closed by stepping forward until it's no longer on the stack of open elements.
		do {
			$found_tag = $this->step();
		} while ( $found_tag && $keep_searching );
	}

	return $found_tag;
}
942+
943+
/**
 * Clips the input HTML document from a bookmark through to the end of the document.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to start clipping at the start of the bookmark;
 *                               any other value starts clipping just past its end.
 * @param string $start          Bookmark name at which to start clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmark( $start_position, $start ) {
	// Bookmarks are stored under their name with a leading underscore.
	$span = $this->bookmarks[ "_{$start}" ];

	if ( 'before' === $start_position ) {
		return substr( $this->html, $span->start );
	}

	// Start one byte past the bookmark's recorded end offset.
	return substr( $this->html, $span->end + 1 );
}
958+
959+
/**
 * Clips the region of the input HTML document that lies between two bookmarks.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to start clipping at the start of the first bookmark;
 *                               any other value starts clipping just past its end.
 * @param string $start          Bookmark name at which to start clipping.
 * @param string $end_position   "before" to stop clipping at the start of the second bookmark;
 *                               any other value stops clipping just past its end.
 * @param string $end            Bookmark name at which to end clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
	// Bookmarks are stored under their name with a leading underscore.
	$from_span = $this->bookmarks[ "_{$start}" ];
	$to_span   = $this->bookmarks[ "_{$end}" ];

	$from = $from_span->start;
	if ( 'before' !== $start_position ) {
		$from = $from_span->end + 1;
	}

	$to = $to_span->start;
	if ( 'before' !== $end_position ) {
		$to = $to_span->end + 1;
	}

	$length = $to - $from;

	return substr( $this->html, $from, $length );
}
978+
803979
/*
804980
* HTML semantic overrides for Tag Processor
805981
*/

0 commit comments

Comments
 (0)