Skip to content

Commit e2a7146

Browse files
committed
HTML API: Add functions to read inner and outer HTML.
1 parent e0f5297 commit e2a7146

3 files changed

Lines changed: 305 additions & 6 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 117 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,84 @@ public function next_tag( $query = null ) {
417417
return false;
418418
}
419419

420+
/**
 * Returns the raw HTML content inside a matched tag.
 *
 * Finding the end of the element requires stepping through the document, so
 * calling this method moves the processor forward: it keeps stepping until the
 * matched element is no longer in the stack of open elements, or until no
 * further tag is found.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
 *
 * @return string|null The inner HTML if available, else NULL.
 */
public function get_inner_markup() {
	// Without a matched tag there is nothing whose contents could be returned.
	if ( null === $this->get_tag() ) {
		return null;
	}

	$start = $this->current_token;

	/*
	 * The bookmark is required to locate the opening tag afterwards; continuing
	 * without it would read a bookmark that doesn't exist.
	 */
	if ( ! parent::set_bookmark( 'start' ) ) {
		throw new Exception( 'Unable to allocate a bookmark to track the opening tag.' );
	}

	// @TODO: add after-pop hook to turn this into a constant boolean check.
	do {
		$found_tag = $this->step();
	} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );

	/*
	 * If there's no tag to bookmark then it means the opened tag has no closing
	 * and the rest of the document is contained within the inner HTML.
	 */
	if ( ! $found_tag ) {
		$inner_html = $this->substr_bookmark( 'after', 'start' );
		parent::release_bookmark( 'start' );

		return $inner_html;
	}

	// Release the first bookmark before bailing so that it isn't leaked.
	if ( ! parent::set_bookmark( 'end' ) ) {
		parent::release_bookmark( 'start' );
		throw new Exception( 'Unable to allocate a bookmark to track the end of the element.' );
	}

	// The inner markup spans from just after the opening tag to just before the tag that ended the element.
	$inner_html = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
	parent::release_bookmark( 'start' );
	parent::release_bookmark( 'end' );

	return $inner_html;
}
457+
458+
/**
 * Returns the raw HTML content of a matched tag, including the tag itself.
 *
 * The returned markup starts at the opening tag. When the tag that ends the
 * element is a closing tag of the same name, that closing tag is included as
 * well; otherwise the markup stops right before the tag that ended the element.
 *
 * Finding the end of the element requires stepping through the document, so
 * calling this method moves the processor forward: it keeps stepping until the
 * matched element is no longer in the stack of open elements, or until no
 * further tag is found.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
 *
 * @return string|null The outer HTML if available, else NULL.
 */
public function get_outer_markup() {
	// Without a matched tag there is nothing whose markup could be returned.
	if ( null === $this->get_tag() ) {
		return null;
	}

	$start = $this->current_token;

	/*
	 * The bookmark is required to locate the opening tag afterwards; continuing
	 * without it would read a bookmark that doesn't exist.
	 */
	if ( ! parent::set_bookmark( 'start' ) ) {
		throw new Exception( 'Unable to allocate a bookmark to track the opening tag.' );
	}

	// @TODO: add after-pop hook to turn this into a constant boolean check.
	do {
		$found_tag = $this->step();
	} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );

	/*
	 * If there's no tag to bookmark then it means the opened tag has no closing
	 * and the rest of the document is contained within the outer HTML.
	 */
	if ( ! $found_tag ) {
		$outer_html = $this->substr_bookmark( 'before', 'start' );
		parent::release_bookmark( 'start' );

		return $outer_html;
	}

	// Release the first bookmark before bailing so that it isn't leaked.
	if ( ! parent::set_bookmark( 'end' ) ) {
		parent::release_bookmark( 'start' );
		throw new Exception( 'Unable to allocate a bookmark to track the end of the element.' );
	}

	/*
	 * Only a closing tag matching the opening tag belongs to the element.
	 * Any other tag ended the element implicitly and stays outside of it.
	 */
	$did_close    = $this->get_tag() === $start->node_name && $this->is_tag_closer();
	$end_position = $did_close ? 'after' : 'before';
	$outer_html   = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );

	parent::release_bookmark( 'start' );
	parent::release_bookmark( 'end' );

	return $outer_html;
}
497+
420498
/**
421499
* Steps through the HTML document and stop at the next tag, if any.
422500
*
@@ -437,12 +515,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
437515
$this->state->stack_of_open_elements->pop();
438516
}
439517

440-
parent::next_tag( self::VISIT_EVERYTHING );
441-
}
442-
443-
// Finish stepping when there are no more tokens in the document.
444-
if ( null === $this->get_tag() ) {
445-
return false;
518+
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
519+
return false;
520+
}
446521
}
447522

448523
$this->current_token = new WP_HTML_Token(
@@ -722,6 +797,42 @@ private function bookmark_tag() {
722797
return "{$this->bookmark_counter}";
723798
}
724799

800+
/**
 * Clips the input HTML document from a bookmark through to the end of the document.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
 * @param string $start          Bookmark name at which to start clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmark( $start_position, $start ) {
	$span = $this->bookmarks[ $start ];

	if ( 'before' === $start_position ) {
		$from = $span->start;
	} else {
		// Any position other than "before" starts one past the bookmark's `end` offset.
		$from = $span->end + 1;
	}

	return substr( $this->html, $from );
}
815+
816+
/**
 * Clips the portion of the input HTML document lying between two bookmarks.
 *
 * @since 6.4.0
 *
 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
 * @param string $start          Bookmark name at which to start clipping.
 * @param string $end_position   "before" to clip before bookmark, "after" to clip after.
 * @param string $end            Bookmark name at which to end clipping.
 * @return string Clipped substring of input HTML document.
 */
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
	$opening_span = $this->bookmarks[ $start ];
	$closing_span = $this->bookmarks[ $end ];

	// Any position other than "before" resolves to one past the bookmark's `end` offset.
	if ( 'before' === $start_position ) {
		$from = $opening_span->start;
	} else {
		$from = $opening_span->end + 1;
	}

	if ( 'before' === $end_position ) {
		$until = $closing_span->start;
	} else {
		$until = $closing_span->end + 1;
	}

	$length = $until - $from;

	return substr( $this->html, $from, $length );
}
835+
725836
/*
726837
* HTML semantic overrides for Tag Processor
727838
*/
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
<?php
2+
/**
3+
* Unit tests covering WP_HTML_Processor::get_inner_markup()
4+
*
5+
* @package WordPress
6+
* @subpackage HTML-API
7+
*
8+
* @since 6.4.0
9+
*
10+
* @group html-api
11+
*
12+
* @coversDefaultClass WP_HTML_Processor
13+
*/
14+
class Tests_HtmlApi_WpHtmlProcessorGetInnerMarkup extends WP_UnitTestCase {
	/**
	 * Verifies that no inner markup is reported unless the processor is stopped on a tag.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_inner_markup
	 *
	 * @since 6.4.0
	 */
	public function test_returns_null_when_not_on_a_matching_tag() {
		$processor = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );

		// No tag has been requested yet.
		$this->assertNull( $processor->get_inner_markup() );

		// The fragment contains no BUTTON, so the search is expected to fail.
		$found_button = $processor->next_tag( 'BUTTON' );
		$this->assertFalse( $found_button, "Should not have found a BUTTON tag but stopped at {$processor->get_tag()}." );
		$this->assertNull( $processor->get_inner_markup() );
	}

	/**
	 * Verifies the inner markup returned for the node carrying the `target` attribute.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_inner_markup
	 *
	 * @dataProvider data_html_with_inner_markup
	 *
	 * @since 6.4.0
	 *
	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
	 * @param string $expected_inner_markup Inner markup of target node.
	 */
	public function test_returns_appropriate_inner_markup( $html_with_target_node, $expected_inner_markup ) {
		$processor = WP_HTML_Processor::createFragment( $html_with_target_node );

		// Advance tag by tag, stopping at the first one that has the `target` attribute.
		while ( $processor->next_tag() ) {
			if ( null !== $processor->get_attribute( 'target' ) ) {
				break;
			}
		}

		$actual_inner_markup = $processor->get_inner_markup();

		$this->assertSame( $expected_inner_markup, $actual_inner_markup, 'Failed to return appropriate inner markup.' );
	}

	/**
	 * Data provider.
	 *
	 * Each dataset holds the input HTML followed by the inner markup expected
	 * for the node that has the `target` attribute.
	 *
	 * @return array[]
	 */
	public function data_html_with_inner_markup() {
		$inner_html = <<<HTML
<p>This is inside the <strong>Match</strong></p>
<p><img></p>
<div>
<figure>
<img>
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
</figure>
</div>
HTML;

		$html = <<<HTML
<div>
<p>This is not in the match.
<p>This is another paragraph not <a href="#">in</a> the match.
</div>
<div target>{$inner_html}</div>
<div>
<p>This is also note in the match.</p>
</div>
HTML;

		return array(
			'Empty elements'               => array( '<div target></div>', '' ),
			'Element containing only text' => array( '<div target>inside</div>', 'inside' ),
			'Element with nested tags'     => array( '<div target>inside <span>the</span> div</div>', 'inside <span>the</span> div' ),
			'Unclosed element'             => array( '<div target>This is <em>all</em> inside the DIV', 'This is <em>all</em> inside the DIV' ),
			'Partially-closed element'     => array( '<div target>This is <em>all</em> inside the DIV</div', 'This is <em>all</em> inside the DIV</div' ),
			'Implicitly-closed element'    => array( '<div><p target>Inside the P</div>Outside the P</p>', 'Inside the P' ),
			'Complicated inner nesting'    => array( $html, $inner_html ),
		);
	}
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
<?php
2+
/**
3+
* Unit tests covering WP_HTML_Processor::get_outer_markup()
4+
*
5+
* @package WordPress
6+
* @subpackage HTML-API
7+
*
8+
* @since 6.4.0
9+
*
10+
* @group html-api
11+
*
12+
* @coversDefaultClass WP_HTML_Processor
13+
*/
14+
class Tests_HtmlApi_WpHtmlProcessorGetOuterMarkup extends WP_UnitTestCase {
	/**
	 * Ensures that it's not possible to get outer markup when not stopped at a tag in the HTML.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_outer_markup
	 *
	 * @since 6.4.0
	 */
	public function test_returns_null_when_not_on_a_matching_tag() {
		$p = WP_HTML_Processor::createFragment( '<p><div><span></span></div>' );

		// No tag has been requested yet.
		$this->assertNull( $p->get_outer_markup() );

		// The fragment contains no BUTTON, so the search is expected to fail.
		$this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
		$this->assertNull( $p->get_outer_markup() );
	}

	/**
	 * Ensures that the outer markup of the node carrying the `target` attribute is returned.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers WP_HTML_Processor::get_outer_markup
	 *
	 * @dataProvider data_html_with_outer_markup
	 *
	 * @since 6.4.0
	 *
	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
	 * @param string $expected_outer_markup Outer markup of target node.
	 */
	public function test_returns_appropriate_outer_markup( $html_with_target_node, $expected_outer_markup ) {
		$p = WP_HTML_Processor::createFragment( $html_with_target_node );

		// Advance tag by tag, stopping at the first one that has the `target` attribute.
		while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) {
			continue;
		}

		// The failure message names the outer markup, which is what this test checks.
		$this->assertSame( $expected_outer_markup, $p->get_outer_markup(), 'Failed to return appropriate outer markup.' );
	}

	/**
	 * Data provider.
	 *
	 * Each dataset holds the input HTML followed by the outer markup expected
	 * for the node that has the `target` attribute.
	 *
	 * @return array[]
	 */
	public function data_html_with_outer_markup() {
		$data = array(
			'Empty elements'               => array( '<div target></div>', '<div target></div>' ),
			'Element containing only text' => array( '<div target>inside</div>', '<div target>inside</div>' ),
			'Element with nested tags'     => array( '<div target>inside <span>the</span> div</div>', '<div target>inside <span>the</span> div</div>' ),
			'Unclosed element'             => array( '<div target>This is <em>all</em> inside the DIV', '<div target>This is <em>all</em> inside the DIV' ),
			'Partially-closed element'     => array( '<div target>This is <em>all</em> inside the DIV</div', '<div target>This is <em>all</em> inside the DIV</div' ),
			'Implicitly-closed element'    => array( '<div><p target>Inside the P</div>Outside the P</p>', '<p target>Inside the P' ),
		);

		$inner_html = <<<HTML
<p>This is inside the <strong>Match</strong></p>
<p><img></p>
<div>
<figure>
<img>
<figcaption>Look at the <strike>picture</strike> photograph.</figcaption>
</figure>
</div>
HTML;

		$html = <<<HTML
<div>
<p>This is not in the match.
<p>This is another paragraph not <a href="#">in</a> the match.
</div>
<div target>{$inner_html}</div>
<div>
<p>This is also note in the match.</p>
</div>
HTML;

		// The expected outer markup wraps the shared inner markup in the target DIV itself.
		$data['Complicated inner nesting'] = array( $html, "<div target>{$inner_html}</div>" );

		return $data;
	}
}

0 commit comments

Comments
 (0)