Skip to content

Commit 851b3c7

Browse files
committed
HTML API: Add functions to read inner and outer HTML.
1 parent e0f5297 commit 851b3c7

3 files changed

Lines changed: 335 additions & 6 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 145 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,89 @@ public function next_tag( $query = null ) {
417417
return false;
418418
}
419419

420+
/**
421+
* Returns the raw HTML content inside a matched tag.
422+
*
423+
* "Markup" differs from inner HTML in that it returns the raw HTML inside the matched tag.
424+
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
425+
* serialized differently than a DOM API would return.
426+
*
427+
* Example
428+
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
429+
* $processor->next_tag( 'P' );
430+
* 'Inside <em>P</em> <i>tags' === $processor->get_inner_markup();
431+
*
432+
* @since 6.4.0
433+
*
434+
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
435+
*
436+
* @return string|null The inner markup if available, else NULL.
437+
*/
438+
public function get_inner_markup() {
439+
if ( null === $this->get_tag() ) {
440+
return null;
441+
}
442+
443+
parent::set_bookmark( 'start' );
444+
$found_tag = $this->step_until_tag_is_closed();
445+
parent::set_bookmark( 'end' );
446+
447+
if ( $found_tag ) {
448+
$inner_markup = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
449+
} else {
450+
// If there's no closing tag then the inner markup continues to the end of the document.
451+
$inner_markup = $this->substr_bookmark( 'after', 'start' );
452+
}
453+
454+
parent::release_bookmark( 'start' );
455+
parent::release_bookmark( 'end' );
456+
457+
return $inner_markup;
458+
}
459+
460+
/**
461+
* Returns the raw HTML content around a matched tag, including the tag itself.
462+
*
463+
* "Markup" differs from outer HTML in that it returns the raw HTML of the matched tag and its contents.
464+
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
465+
* serialized differently than a DOM API would return.
466+
*
467+
* Example
468+
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
469+
* $processor->next_tag( 'P' );
470+
* '<p>Inside <em>P</em> <i>tags' === $processor->get_outer_markup();
471+
*
472+
* @since 6.4.0
473+
*
474+
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
475+
*
476+
* @return string|null The outer markup if available, else NULL.
477+
*/
478+
public function get_outer_markup() {
479+
if ( null === $this->get_tag() ) {
480+
return null;
481+
}
482+
483+
parent::set_bookmark( 'start' );
484+
$start_tag = $this->current_token->node_name;
485+
$found_tag = $this->step_until_tag_is_closed();
486+
parent::set_bookmark( 'end' );
487+
488+
if ( $found_tag ) {
489+
$did_close = $this->get_tag() === $start_tag && $this->is_tag_closer();
490+
$end_position = $did_close ? 'after' : 'before';
491+
$outer_markup = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );
492+
} else {
493+
// If there's no closing tag then the outer markup continues to the end of the document.
494+
$outer_markup = $this->substr_bookmark( 'before', 'start' );
495+
}
496+
497+
parent::release_bookmark( 'start' );
498+
parent::release_bookmark( 'end' );
499+
500+
return $outer_markup;
501+
}
502+
420503
/**
421504
* Steps through the HTML document and stops at the next tag, if any.
422505
*
@@ -437,12 +520,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
437520
$this->state->stack_of_open_elements->pop();
438521
}
439522

440-
parent::next_tag( self::VISIT_EVERYTHING );
441-
}
442-
443-
// Finish stepping when there are no more tokens in the document.
444-
if ( null === $this->get_tag() ) {
445-
return false;
523+
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
524+
return false;
525+
}
446526
}
447527

448528
$this->current_token = new WP_HTML_Token(
@@ -722,6 +802,65 @@ private function bookmark_tag() {
722802
return "{$this->bookmark_counter}";
723803
}
724804

805+
/**
806+
* Steps through the HTML document until the current open tag is closed.
807+
*
808+
* @since 6.4.0
809+
*
810+
* @throws Exception When unable to allocate bookmark for internal tracking.
811+
*
812+
* @return bool|null true if a closing tag was found, false if not, and null if not starting at a matched tag.
813+
*/
814+
private function step_until_tag_is_closed() {
815+
if ( null === $this->get_tag() ) {
816+
return null;
817+
}
818+
819+
$start = $this->current_token;
820+
// @TODO: add after-pop hook to turn this into a constant boolean check.
821+
do {
822+
$found_tag = $this->step();
823+
} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );
824+
825+
return $found_tag;
826+
}
827+
828+
/**
829+
* Returns a substring of the input HTML document from a bookmark until the end.
830+
*
831+
* @since 6.4.0
832+
*
833+
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
834+
* @param string $start Bookmark name at which to start clipping.
835+
* @return string Clipped substring of input HTML document.
836+
*/
837+
private function substr_bookmark( $start_position, $start ) {
838+
$start_bookmark = $this->bookmarks[ $start ];
839+
$start_offset = 'before' === $start_position ? $start_bookmark->start : $start_bookmark->end + 1;
840+
841+
return substr( $this->html, $start_offset );
842+
}
843+
844+
/**
845+
* Returns a substring of the input HTML document delimited by bookmarks.
846+
*
847+
* @since 6.4.0
848+
*
849+
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
850+
* @param string $start Bookmark name at which to start clipping.
851+
* @param string $end_position "before" to clip before bookmark, "after" to clip after.
852+
* @param string $end Bookmark name at which to end clipping.
853+
* @return string Clipped substring of input HTML document.
854+
*/
855+
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
856+
$start_bookmark = $this->bookmarks[ $start ];
857+
$end_bookmark = $this->bookmarks[ $end ];
858+
$start_offset = 'before' === $start_position ? $start_bookmark->start : $start_bookmark->end + 1;
859+
$end_offset = 'before' === $end_position ? $end_bookmark->start : $end_bookmark->end + 1;
860+
861+
return substr( $this->html, $start_offset, $end_offset - $start_offset );
862+
}
863+
725864
/*
726865
* HTML semantic overrides for Tag Processor
727866
*/
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
<?php
2+
/**
3+
* Unit tests covering WP_HTML_Processor::get_inner_markup()
4+
*
5+
* @package WordPress
6+
* @subpackage HTML-API
7+
*
8+
* @since 6.4.0
9+
*
10+
* @group html-api
11+
*
12+
* @coversDefaultClass WP_HTML_Processor
13+
*/
14+
class Tests_HtmlApi_WpHtmlProcessorGetInnerMarkup extends WP_UnitTestCase {
15+
/**
16+
* @ticket {TICKET_NUMBER}
17+
*
18+
* @covers WP_HTML_Processor::get_inner_markup
19+
*
20+
* @since 6.4.0
21+
*/
22+
public function test_returns_null_when_not_on_a_matching_tag() {
23+
$p = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );
24+
25+
$this->assertNull( $p->get_inner_markup() );
26+
27+
$this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
28+
$this->assertNull( $p->get_inner_markup() );
29+
}
30+
31+
/**
32+
* @ticket {TICKET_NUMBER}
33+
*
34+
* @covers WP_HTML_Processor::get_inner_markup
35+
*
36+
* @dataProvider data_html_with_inner_markup
37+
*
38+
* @since 6.4.0
39+
*
40+
* @param string $html_with_target_node HTML containing a node with the `target` attribute set.
41+
* @param string $expected_inner_markup Inner markup of target node.
42+
*/
43+
public function test_returns_appropriate_inner_markup( $html_with_target_node, $expected_inner_markup ) {
44+
$p = WP_HTML_Processor::createFragment( $html_with_target_node );
45+
46+
while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) {
47+
continue;
48+
}
49+
50+
$this->assertSame( $expected_inner_markup, $p->get_inner_markup(), 'Failed to return appropriate inner markup.' );
51+
}
52+
53+
/**
54+
* Data provider.
55+
*
56+
* @return array[]
57+
*/
58+
public function data_html_with_inner_markup() {
59+
$data = array(
60+
'Empty elements' => array( '<div target></div>', '' ),
61+
'Element containing only text' => array( '<div target>inside</div>', 'inside' ),
62+
'Element with nested tags' => array( '<div target>inside <span>the</span> div</div>', 'inside <span>the</span> div' ),
63+
'Unclosed element' => array( '<div target>This is <em>all</em> inside the DIV', 'This is <em>all</em> inside the DIV' ),
64+
'Unclosed elements' => array( '<div><div target>Inside <em>P</em> <i>tags</div>', 'Inside <em>P</em> <i>tags' ),
65+
'Partially-closed element' => array( '<div target>This is <em>all</em> inside the DIV</div', 'This is <em>all</em> inside the DIV</div' ),
66+
'Implicitly-closed element' => array( '<div><p target>Inside the P</div>Outside the P</p>', 'Inside the P' ),
67+
);
68+
69+
$inner_html = <<<HTML
70+
<p>This is inside the <strong>Match</strong></p>
71+
<p><img></p>
72+
<div>
73+
<figure>
74+
<img>
75+
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
76+
</figure>
77+
</div>
78+
HTML;
79+
80+
$html = <<<HTML
81+
<div>
82+
<p>This is not in the match.
83+
<p>This is another paragraph not <a href="#">in</a> the match.
84+
</div>
85+
<div target>{$inner_html}</div>
86+
<div>
87+
<p>This is also note in the match.</p>
88+
</div>
89+
HTML;
90+
$data['Complicated inner nesting'] = array( $html, $inner_html );
91+
92+
return $data;
93+
}
94+
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
<?php
2+
/**
3+
* Unit tests covering WP_HTML_Processor::get_outer_markup()
4+
*
5+
* @package WordPress
6+
* @subpackage HTML-API
7+
*
8+
* @since 6.4.0
9+
*
10+
* @group html-api
11+
*
12+
* @coversDefaultClass WP_HTML_Processor
13+
*/
14+
class Tests_HtmlApi_WpHtmlProcessorGetOuterMarkup extends WP_UnitTestCase {
15+
/**
16+
* Ensures that it's not possible to get outer markup when not stopped at a tag in the HTML.
17+
*
18+
* @ticket {TICKET_NUMBER}
19+
*
20+
* @covers WP_HTML_Processor::get_outer_markup
21+
*
22+
* @since 6.4.0
23+
*/
24+
public function test_returns_null_when_not_on_a_matching_tag() {
25+
$p = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );
26+
27+
$this->assertNull( $p->get_outer_markup() );
28+
29+
$this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
30+
$this->assertNull( $p->get_outer_markup() );
31+
}
32+
33+
/**
34+
* @ticket {TICKET_NUMBER}
35+
*
36+
* @covers WP_HTML_Processor::get_outer_markup
37+
*
38+
* @dataProvider data_html_with_outer_markup
39+
*
40+
* @since 6.4.0
41+
*
42+
* @param string $html_with_target_node HTML containing a node with the `target` attribute set.
43+
* @param string $expected_outer_markup Outer markup of target node.
44+
*/
45+
public function test_returns_appropriate_outer_markup( $html_with_target_node, $expected_outer_markup ) {
46+
$p = WP_HTML_Processor::createFragment( $html_with_target_node );
47+
48+
while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) {
49+
continue;
50+
}
51+
52+
$this->assertSame( $expected_outer_markup, $p->get_outer_markup(), 'Failed to return appropriate inner markup.' );
53+
}
54+
55+
/**
56+
* Data provider.
57+
*
58+
* @return array[]
59+
*/
60+
public function data_html_with_outer_markup() {
61+
$data = array(
62+
'Empty elements' => array( '<div target></div>', '<div target></div>' ),
63+
'Element containing only text' => array( '<div target>inside</div>', '<div target>inside</div>' ),
64+
'Element with nested tags' => array( '<div target>inside <span>the</span> div</div>', '<div target>inside <span>the</span> div</div>' ),
65+
'Unclosed element' => array( '<div target>This is <em>all</em> inside the DIV', '<div target>This is <em>all</em> inside the DIV' ),
66+
'Unclosed elements' => array( '<div><p target>Inside <em>P</em> <i>tags</div>', '<p target>Inside <em>P</em> <i>tags' ),
67+
'Partially-closed element' => array( '<div target>This is <em>all</em> inside the DIV</div', '<div target>This is <em>all</em> inside the DIV</div' ),
68+
'Implicitly-closed element' => array( '<div><p target>Inside the P</div>Outside the P</p>', '<p target>Inside the P' ),
69+
);
70+
71+
$inner_html = <<<HTML
72+
<p>This is inside the <strong>Match</strong></p>
73+
<p><img></p>
74+
<div>
75+
<figure>
76+
<img>
77+
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
78+
</figure>
79+
</div>
80+
HTML;
81+
82+
$html = <<<HTML
83+
<div>
84+
<p>This is not in the match.
85+
<p>This is another paragraph not <a href="#">in</a> the match.
86+
</div>
87+
<div target>{$inner_html}</div>
88+
<div>
89+
<p>This is also note in the match.</p>
90+
</div>
91+
HTML;
92+
$data['Complicated inner nesting'] = array( $html, "<div target>{$inner_html}</div>" );
93+
94+
return $data;
95+
}
96+
}

0 commit comments

Comments
 (0)