Skip to content

Commit 7d10ed6

Browse files
committed
Merge branch 'html-utils' into html-api/normalize-html
2 parents 18b5005 + edb865d commit 7d10ed6

15 files changed

Lines changed: 1098 additions & 50 deletions

highlight-html.php

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
<?php
2+
3+
require_once __DIR__ . '/src/wp-load.php';
4+
5+
/*
 * ANSI escape sequences used to colorize the output. `\e[m` resets all
 * attributes; every other sequence selects a 24-bit foreground color in the
 * form `\e[38;2;<red>;<green>;<blue>m`.
 */
const C_TEXT    = "\e[m";                  // Plain text (attribute reset).
const C_REF     = "\e[38;2;154;110;58m";   // Character references inside text.
const C_COMMENT = "\e[38;2;112;128;144m";  // Comments and raw element contents.
const C_SYNTAX  = "\e[38;2;153;153;153m";  // Punctuation such as `<`, `>`, `="`.
const C_TAGNAME = "\e[38;2;153;0;85m";     // Tag names.
const C_ANAME   = "\e[38;2;102;153;0m";    // Attribute names.
const C_AVALUE  = "\e[38;2;0;119;170m";    // Attribute values.
12+
13+
/*
 * Command-line options:
 *
 *   -f        Insert line breaks and indentation before selected tags.
 *   -u <uri>  Fetch the HTML from this address instead of reading stdin.
 */
$options   = getopt( 'fu:' );
$do_format = isset( $options['f'] );

$uri = 'php://stdin';
if ( isset( $options['u'] ) ) {
	// getopt() returns an array when an option is repeated; use the last value.
	$uri = is_array( $options['u'] ) ? end( $options['u'] ) : $options['u'];

	// Accept bare hosts such as `example.com` by assuming HTTPS.
	if ( ! preg_match( '~^https?://~', $uri ) ) {
		$uri = "https://{$uri}";
	}
}

$html = file_get_contents( $uri );
if ( false === $html ) {
	// Without this check a failed read would be parsed as if it were a document.
	fwrite( STDERR, "Unable to read HTML from {$uri}\n" );
	exit( 1 );
}
25+
26+
/*
 * Subclass of the HTML Processor that can report the raw source text of the
 * token it is currently paused on. An instance is created only so that the
 * anonymous class can be addressed statically by the factory call below.
 */
$p = new class( $html ) extends WP_HTML_Processor {
	/**
	 * Returns the slice of the input document covered by the current token.
	 *
	 * @return string Raw, undecoded source text of the current token.
	 */
	public function get_raw_token() {
		$this->set_bookmark( 'here' );

		// NOTE(review): the bookmark is read back under `_here`; presumably
		// WP_HTML_Processor::set_bookmark() prefixes names with `_` — confirm.
		$here = $this->bookmarks['_here'];

		return substr( $this->html, $here->start, $here->length );
	}
};

$p = $p::create_full_parser( $html );
if ( ! $p instanceof WP_HTML_Processor ) {
	// Stop here instead of hitting a fatal error on the first next_token() call.
	fwrite( STDERR, "Unable to create an HTML parser for the given input.\n" );
	exit( 1 );
}
35+
36+
// Visit every token in the document and print it in its highlighted form.
while ( $p->next_token() ) {
	$token_type = $p->get_token_type();

	if ( '#comment' === $token_type ) {
		echo C_COMMENT . '<!--' . $p->get_modifiable_text() . '-->';
	} elseif ( '#doctype' === $token_type ) {
		echo C_SYNTAX . '<!DOCTYPE' . $p->get_modifiable_text() . '>';
	} elseif ( '#tag' === $token_type ) {
		print_tag( $p );
	} elseif ( '#text' === $token_type ) {
		print_text( $p );
	} else {
		// Any other token type has no rendering here; abort with its name.
		die( "Unsupported syntax: {$p->get_token_type()}" );
	}
}

// Reset terminal attributes and finish the output with a newline.
echo "\e[m\n";
60+
61+
/**
 * Prints a text token, coloring every character reference it contains.
 *
 * The raw source of the token is scanned for `&`. Each ampersand that
 * WP_HTML_Decoder::read_character_reference() recognizes as the start of a
 * character reference is printed in C_REF; everything else is printed in
 * C_TEXT. The text is echoed undecoded, exactly as it appears in the input.
 *
 * @param object $p Processor paused on the token; must provide
 *                  get_token_name(), get_modifiable_text() and get_raw_token().
 */
function print_text( $p ) {
	$token_name = $p->get_token_name();
	if ( in_array( $token_name, [ 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ], true ) ) {
		/*
		 * The caller ignores this function's return value, so the text has to
		 * be echoed here rather than returned, otherwise it would be lost.
		 *
		 * NOTE(review): text tokens presumably report `#text` as their name,
		 * which would make this branch unreachable — confirm.
		 */
		echo C_TEXT . $p->get_modifiable_text();
		return;
	}

	$raw_token = $p->get_raw_token();
	$end       = strlen( $raw_token );
	$text      = C_TEXT;
	$at        = 0; // Where the next search for `&` starts.
	$was_at    = 0; // Start of the span not yet copied into $text.

	while ( $at < $end ) {
		$next_at = strpos( $raw_token, '&', $at );
		if ( false === $next_at ) {
			break;
		}

		$skip_bytes  = null;
		$replacement = WP_HTML_Decoder::read_character_reference( 'data', $raw_token, $next_at, $skip_bytes );
		if ( isset( $replacement ) ) {
			$text  .= substr( $raw_token, $was_at, $next_at - $was_at ) . C_REF . substr( $raw_token, $next_at, $skip_bytes ) . C_TEXT;
			$at     = $next_at + $skip_bytes;
			$was_at = $at;
			continue;
		}

		/*
		 * A plain ampersand: continue right after it. Advancing a single byte
		 * from the old search position would find this same ampersand again on
		 * every step until the cursor finally passed it.
		 */
		$at = $next_at + 1;
	}

	// Copy whatever follows the last character reference.
	if ( $was_at < $end ) {
		$text .= substr( $raw_token, $was_at );
	}

	echo C_TEXT . $text;
}
92+
93+
/**
 * Prints a tag token with syntax highlighting.
 *
 * Echoes the tag with its name lower-cased and every attribute value wrapped
 * in double quotes. When the token carries text of its own (as reported by
 * get_modifiable_text()), that text and a closing tag are printed as well.
 * With the global `$do_format` enabled, a line break plus indentation is
 * inserted before selected tags; the indentation depth is kept across calls
 * in a static counter.
 *
 * @param object $p Processor paused on a tag token; must provide get_tag(),
 *                  is_tag_closer(), get_attribute_names_with_prefix(),
 *                  get_attribute() and get_modifiable_text().
 */
function print_tag( $p ) {
	global $do_format;

	static $depth = 0;

	// Tags whose opener increases, and whose closer decreases, the indentation.
	$nesting_tags = [ 'HEAD', 'BODY', 'OL', 'UL', 'DIV' ];

	/*
	 * Elements whose contents and closing tag are consumed as part of the
	 * opening token; this is the same set print_text() checks for. PRE is
	 * deliberately absent: it is a regular element whose closing tag arrives
	 * as its own token, so printing a closer right after `<pre>` (or after
	 * `</pre>` itself) would duplicate it.
	 */
	$atomic_tags = [ 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ];

	$break_before_openers = [
		'DIV', 'P', 'UL', 'OL', 'DETAILS', 'SVG', 'PATH', 'G',
		'LINK', 'META', 'HTML', 'HEAD', 'BODY', 'TITLE', 'TEXTAREA',
		'PRE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP',
		'PICTURE', 'SOURCE', 'FIGURE', 'FORM', 'TABLE', 'TR',
		'FIGCAPTION', 'BLOCKQUOTE', 'OBJECT', 'EMBED', 'IFRAME',
		'SCRIPT', 'STYLE', 'NOSCRIPT', 'NAV', 'LI',
	];

	$break_before_closers = [ 'HEAD', 'HTML', 'BODY', 'PICTURE', 'FIGURE', 'TABLE' ];

	$tag_name    = $p->get_tag();
	$lower_name  = strtolower( $tag_name );
	$is_closer   = $p->is_tag_closer();
	$closer      = $is_closer ? '/' : '';
	$closing_tag = C_SYNTAX . '</' . C_TAGNAME . $lower_name . C_SYNTAX . '>';

	// Dedent before printing so that a closer lines up with its opener.
	if ( $is_closer && in_array( $tag_name, $nesting_tags, true ) ) {
		$depth--;
	}

	$break_before = $is_closer ? $break_before_closers : $break_before_openers;
	if ( $do_format && in_array( $tag_name, $break_before, true ) ) {
		echo "\n" . str_pad( '', $depth * 2, ' ' );
	}

	echo C_SYNTAX . '<' . $closer;
	echo C_TAGNAME . $lower_name;

	// The name lookup yields null when there are no attributes to list.
	$attributes = $p->get_attribute_names_with_prefix( '' ) ?? array();

	foreach ( $attributes as $name ) {
		$value = $p->get_attribute( $name );

		echo ' ' . C_ANAME . $name;

		// Boolean attributes are printed as a bare name.
		if ( true === $value ) {
			continue;
		}

		echo C_SYNTAX . '="';
		echo C_AVALUE . str_replace( '"', '&quot;', $value );
		echo C_SYNTAX . '"';
	}
	echo C_SYNTAX . '>';

	$text = $p->get_modifiable_text();

	// Compare against '' explicitly: empty() would also discard the text "0".
	if ( '' !== $text ) {
		echo 'TITLE' === $tag_name ? C_TEXT : C_COMMENT;

		$add_newlines = (
			$do_format &&
			'' !== trim( $text ) &&
			in_array( $tag_name, [ 'SCRIPT', 'STYLE', 'TEXTAREA', 'PRE' ], true )
		);

		echo $add_newlines ? "\n" . trim( $text, "\n" ) . "\n" : $text;
		echo $closing_tag;
	} elseif ( ! $is_closer && in_array( $tag_name, $atomic_tags, true ) ) {
		// An atomic element with no contents still needs its closing tag.
		echo $closing_tag;
	}

	if ( ! $is_closer && in_array( $tag_name, $nesting_tags, true ) ) {
		$depth++;
	}
}

html-grep.php

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
<?php
2+
3+
require_once( __DIR__ . '/src/wp-load.php' );
4+
5+
/**
 * Command-line entry point: parses the options and starts the search.
 *
 * Options:
 *   -p <pattern>  Required. Passed unchanged to preg_match(), so it must be a
 *                 complete PCRE pattern including its delimiters.
 *   -i <file>     Input file. A bare `-` argument selects stdin instead.
 *   -B <n>        Lines of context to show before a match (default 0).
 *   -A <n>        Lines of context to show after a match (default 0).
 *   -m <n>        Stop after this many matches (default 1).
 */
function main() {
	global $argv;

	$opts      = getopt( 'A:B:p:i:m:', [] );
	$use_stdin = in_array( '-', $argv, true );

	if ( ! isset( $opts['p'] ) ) {
		// The example carries delimiters: without them `[`…`]` would be taken
		// as the delimiters and the trailing `+` rejected as an unknown modifier.
		die( 'Please supply a search pattern with -p, e.g. `-p "~[a-f0-9]+~"`' );
	}

	if ( ! $use_stdin && ! isset( $opts['i'] ) ) {
		die( 'Please specify input filename with -i or use stdin with -, e.g. `-i file.html`' );
	}

	// Reads a non-negative integer option; anything else (including the array
	// getopt() produces for a repeated option) falls back to the default.
	$count = static function ( $value, $default ) {
		return is_string( $value ) && ctype_digit( $value ) ? (int) $value : $default;
	};

	$lines_before = $count( $opts['B'] ?? null, 0 );
	$lines_after  = $count( $opts['A'] ?? null, 0 );

	// A limit of zero is not useful; fall back to a single match.
	$max = max( 1, $count( $opts['m'] ?? null, 1 ) );

	$input = $use_stdin ? 'php://stdin' : $opts['i'];
	Grepper::scan( $input, $opts['p'], $lines_before, $lines_after, $max );
}
28+
29+
/**
 * Tag Processor that can be fed its input piece by piece and that remembers
 * the location of the most recently visited token.
 */
class Debugger extends WP_HTML_Tag_Processor {
	/**
	 * Advances to the next token and bookmarks it under the name `here`.
	 *
	 * @return mixed Whatever the parent implementation returned.
	 */
	public function next_token() {
		$found = parent::next_token();
		$this->set_bookmark( 'here' );

		return $found;
	}

	/**
	 * Returns the bookmark stored by the last call to next_token().
	 */
	public function at() {
		return $this->bookmarks['here'];
	}

	/**
	 * Returns the whole document accumulated so far.
	 */
	public function h() {
		return $this->html;
	}

	/**
	 * Appends more input to the document and, if the parser had stopped
	 * because it ran out of input, marks it ready to continue.
	 *
	 * @param string $line Text to append.
	 */
	public function extend( $line ) {
		$this->html .= $line;

		$stopped_states = array( self::STATE_COMPLETE, self::STATE_INCOMPLETE_INPUT );
		if ( in_array( $this->parser_state, $stopped_states, true ) ) {
			$this->parser_state = self::STATE_READY;
		}
	}
}
55+
56+
/**
 * Searches the text content of an HTML document for a regular expression.
 */
class Grepper {
	/**
	 * Reads the input line by line, matches the pattern against the decoded
	 * text of the document and prints every match with surrounding context.
	 *
	 * Text is collected from text tokens only. Runs of whitespace are collapsed
	 * to a single space unless the text sits inside a PRE element. Only the
	 * last 100 bytes of collected text are carried from one token to the next.
	 *
	 * For each match two blocks are printed: a header with the number of lines
	 * read so far, the name of the current token and the matched text, followed
	 * by the raw source around the token.
	 *
	 * @param string $input   File name or stream URI to read from.
	 * @param string $pattern Complete PCRE pattern, including delimiters.
	 * @param int    $before  Lines of source context before the token.
	 * @param int    $after   Lines of source context after the token.
	 * @param int    $max     Return once this many matches were printed.
	 */
	public static function scan( $input, $pattern, $before, $after, $max ) {
		// Reject a broken pattern once instead of warning on every token.
		if ( false === @preg_match( $pattern, '' ) ) {
			die( "Invalid search pattern: {$pattern}" );
		}

		$f = fopen( $input, 'r' );
		if ( false === $f ) {
			die( "Unable to open input: {$input}" );
		}

		$c         = 0; // Matches printed so far.
		$n         = 0; // Lines read so far.
		$o         = static function ( $s ) { return WP_HTML_Decoder::decode_text_node( $s ); };
		$ws        = static function ( $s ) { return preg_replace( '~[ \r\f\t\n]+~', ' ', $s ); };
		$pre_depth = 0;
		$p         = new Debugger( '' );
		$t         = ''; // Decoded text collected since the last match.

		while ( false !== ( $line = fgets( $f ) ) ) {
			$n++;

			$p->extend( $line );
			while ( $p->next_token() ) {
				$at   = $p->at();
				$type = $p->get_token_type();

				if ( '#tag' !== $type && '#text' !== $type ) {
					continue;
				}

				switch ( $p->get_token_name() ) {
					case 'PRE':
						// Clamp at zero so that a stray `</pre>` cannot cancel
						// out a later `<pre>`.
						$pre_depth = max( 0, $pre_depth + ( $p->is_tag_closer() ? -1 : 1 ) );
						break;

					case '#text':
						$node_text = $o( $p->get_modifiable_text() );
						$t        .= $pre_depth > 0 ? $node_text : $ws( $node_text );
				}

				if ( preg_match( $pattern, $t, $match, PREG_OFFSET_CAPTURE ) ) {
					$h = (
						"\e[32m" .
						ltrim( mb_strcut( $t, 0, $match[0][1] ) ) .
						"\e[33m" .
						$match[0][0] .
						"\e[32m" .
						rtrim( mb_strcut( $t, $match[0][1] + strlen( $match[0][0] ) ) ) .
						"\e[90m"
					);

					// Remember the line count before reading ahead, and keep
					// counting so that later matches report correct numbers.
					$match_line = $n;
					for ( $i = 0; $i < $after; $i++ ) {
						$line = fgets( $f );
						if ( false === $line ) {
							break;
						}

						$p->extend( $line );
						$n++;
					}

					$html = $p->h();
					$cb   = mb_strcut( $html, 0, $at->start );
					$cc   = mb_strcut( $html, $at->start, $at->length );
					$ca   = mb_strcut( $html, $at->start + $at->length );

					/*
					 * Limit the leading context to $before lines of at most 80
					 * bytes each. Zero is handled separately because a negative
					 * offset of -0 selects everything instead of nothing.
					 */
					if ( $before > 0 ) {
						$cb = implode( "\n", array_slice( explode( "\n", $cb ), -$before ) );
						$cb = mb_strcut( $cb, -$before * 80 );
					} else {
						$cb = '';
					}

					// Limit the trailing context in the same way.
					$ca = implode( "\n", array_slice( explode( "\n", $ca ), 0, $after ) );
					$ca = mb_strcut( $ca, 0, $after * 80 );

					// If the current token alone contains the match, highlight
					// it inside the token instead of showing the raw source.
					$tt = $p->get_modifiable_text();
					if ( preg_match( $pattern, $tt, $mm, PREG_OFFSET_CAPTURE ) ) {
						$cc = (
							"\e[90m" .
							mb_strcut( $tt, 0, $mm[0][1] ) .
							"\e[33m" .
							$mm[0][0] .
							"\e[90m" .
							mb_strcut( $tt, $mm[0][1] + strlen( $mm[0][0] ) )
						);
					}

					echo "\n\e[32m{$match_line}\e[90m: \e[31m{$p->get_token_name()} \e[90m{$h}\e[m\n";
					echo "\e[90m{$cb}\e[33m{$cc}\e[90m{$ca}\e[m";

					if ( ++$c >= $max ) {
						fclose( $f );
						return;
					}

					$t = '';
				}

				// Keep only the tail so that matches spanning adjacent tokens
				// can still be found without the buffer growing forever.
				$t = mb_strcut( $t, -100 );
			}
		}

		fclose( $f );
	}

	/**
	 * Prefixes every line of the given text with a single space.
	 *
	 * @param string $lines Text that may contain several lines.
	 * @return string The same text with each line indented.
	 */
	public static function indent( $lines ) {
		return implode( "\n", array_map(
			static function ( $line ) { return ' ' . $line; },
			explode( "\n", $lines )
		) );
	}
}
161+
162+
// Run the command-line entry point.
main();
163+
164+
/**
 * Tells whether a tag name is on the list of line-breaking elements.
 *
 * The comparison is strict and case-sensitive: only the exact upper-case
 * strings on the list match. A loose comparison would also accept values such
 * as boolean true.
 *
 * @param mixed $tag_name Upper-case tag name, e.g. `DIV`.
 * @return bool True if the name is on the list, false otherwise.
 */
function is_line_breaker( $tag_name ) {
	static $line_breakers = [
		'BLOCKQUOTE',
		'BR',
		'DD',
		'DIV',
		'DL',
		'DT',
		'H1',
		'H2',
		'H3',
		'H4',
		'H5',
		'H6',
		'HR',
		'LI',
		'OL',
		'P',
		'UL',
	];

	return in_array( $tag_name, $line_breakers, true );
}

0 commit comments

Comments
 (0)