Skip to content

Commit f21b043

Browse files
committed
Use strscan in RDoc::Markup::InlineParser token scan
1 parent bb771be commit f21b043

2 files changed

Lines changed: 58 additions & 82 deletions

File tree

lib/rdoc/markup/inline_parser.rb

Lines changed: 55 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# frozen_string_literal: true
22

33
require 'set'
4+
require 'strscan'
45

56
# Parses inline markup in RDoc text.
67
# This parser handles em, bold, strike, tt, hard break, and tidylink.
@@ -31,11 +32,13 @@ class RDoc::Markup::InlineParser
3132

3233
STANDALONE_TAGS = { 'br' => :HARD_BREAK } # :nodoc:
3334

35+
CODEBLOCK_TAGS = %w[tt code] # :nodoc:
36+
3437
TOKENS = {
3538
**WORD_PAIRS.transform_values { [:word_pair, nil] },
3639
**TAGS.keys.to_h {|tag| ["<#{tag}>", [:open_tag, tag]] },
3740
**TAGS.keys.to_h {|tag| ["</#{tag}>", [:close_tag, tag]] },
38-
**%w[tt code].to_h {|tag| ["<#{tag}>", [:code_start, tag]] },
41+
**CODEBLOCK_TAGS.to_h {|tag| ["<#{tag}>", [:code_start, tag]] },
3942
**STANDALONE_TAGS.keys.to_h {|tag| ["<#{tag}>", [:standalone_tag, tag]] },
4043
'{' => [:tidylink_start, nil],
4144
'}' => [:tidylink_mid, nil],
@@ -47,7 +50,7 @@ class RDoc::Markup::InlineParser
4750
token_starts_regexp = TOKENS.keys.map {|s| s[0] }.uniq.map {|s| Regexp.escape(s) }.join
4851

4952
SCANNER_REGEXP =
50-
/\G(?:
53+
/(?:
5154
#{multi_char_tokens_regexp}
5255
|[^#{token_starts_regexp}\sa-zA-Z0-9\.]+ # chunk of normal text
5356
|\s+|[a-zA-Z0-9\.]+|.
@@ -56,10 +59,30 @@ class RDoc::Markup::InlineParser
5659
# Characters that can be escaped with backslash.
5760
ESCAPING_CHARS = '\\*_+`{}[]<>' # :nodoc:
5861

62+
# Pattern to match code block content until </tt> or </code>.
63+
CODEBLOCK_REGEXPS = CODEBLOCK_TAGS.to_h {|name| [name, /((?:\\.|[^\\])*?)<\/#{name}>/] } # :nodoc:
64+
65+
# Word contains alphanumeric and <tt>_./:[]-</tt> characters.
66+
# Word may start with <tt>#</tt> and may end with any non-space character. (e.g. <tt>#eql?</tt>).
67+
# Underscore delimiters have special rules.
68+
WORD_REGEXPS = {
69+
# Words including _, longest match.
70+
# Example: `_::A_` `_-42_` `_A::B::C.foo_bar[baz]_` `_kwarg:_`
71+
# Content must not include _ followed by non-alphanumeric character
72+
# Example: `_host_:_port_` will be `_host_` + `:` + `_port_`
73+
'_' => /#?([a-zA-Z0-9.\/:\[\]-]|_+[a-zA-Z0-9])+[^\s]?_(?=[^a-zA-Z0-9_]|\z)/,
74+
# Words allowing _ but not allowing __
75+
'__' => /#?[a-zA-Z0-9.\/:\[\]-]*(_[a-zA-Z0-9.\/:\[\]-]+)*[^\s]?__(?=[^a-zA-Z0-9]|\z)/,
76+
**%w[* ** + ++ ` ``].to_h do |s|
77+
# normal words that can be used within +word+ or *word*
78+
[s, /#?[a-zA-Z0-9_.\/:\[\]-]+[^\s]?#{Regexp.escape(s)}(?=[^a-zA-Z0-9]|\z)/]
79+
end
80+
} # :nodoc:
81+
5982
def initialize(string)
60-
@string = string
61-
@pos = 0
62-
@scan_failure_cache = Set.new
83+
@scanner = StringScanner.new(string)
84+
@last_match = nil
85+
@scanner_negative_cache = Set.new
6386
@stack = []
6487
@delimiters = {}
6588
end
@@ -188,63 +211,49 @@ def compact_string(nodes)
188211
end
189212
end
190213

191-
# Scan from the current position with a regexp that starts with \G.
192-
193-
def scan_string(pattern)
194-
if (res = @string.match(pattern, @pos))
195-
@pos = res.end(0)
196-
res[0]
197-
end
198-
end
199-
200-
# Read +len+ characters from the current position.
214+
# Scan from StringScanner with +pattern+
215+
# If +negative_cache+ is true, caches a scan failure result. <tt>strscan(pattern, negative_cache: true)</tt> returns nil when it is called again after a failure.
216+
# Be careful: the failure cache is keyed by pattern only (not position), so use +negative_cache+ only with patterns that, once failed, cannot match at any later position.
201217

202-
def read(len)
203-
s = @string[@pos, len]
204-
return if s.nil? || s.empty?
218+
def strscan(pattern, negative_cache: false)
219+
return if negative_cache && @scanner_negative_cache.include?(pattern)
205220

206-
@pos += len
207-
s
208-
end
209-
210-
# Match +pattern+ from the current position.
211-
# Returns nil if not found, and caches the failure.
212-
# Be careful to use a pair of pattern and position that is cache-safe.
213-
214-
def failure_cached_match(pattern)
215-
# Cache notfound information to avoid O(N^2) search of missing closing tags
216-
return if @scan_failure_cache.include?(pattern)
217-
218-
match = @string.match(pattern, @pos)
219-
@scan_failure_cache << pattern unless match
220-
match
221+
string = @scanner.scan(pattern)
222+
@last_match = string if string
223+
@scanner_negative_cache << pattern if !string && negative_cache
224+
string
221225
end
222226

223227
# Scan and return the next token for parsing.
224228
# Returns <tt>[token_type, token_string_or_nil, extra_info]</tt>
225229

226230
def scan_token
227-
token = scan_string(SCANNER_REGEXP)
231+
last_match = @last_match
232+
token = strscan(SCANNER_REGEXP)
228233
type, name = TOKENS[token]
234+
229235
case type
230236
when :word_pair
231-
pair = read_word_pair(token)
232-
pair ? [:node, nil, { type: WORD_PAIRS[token], children: [pair]}] : [:text, token]
237+
# If the character before word pair delimiter is alphanumeric, do not treat as word pair.
238+
word_pair = strscan(WORD_REGEXPS[token]) unless /[a-zA-Z0-9]\z/.match?(last_match)
239+
240+
if word_pair.nil?
241+
[:text, token, nil]
242+
elsif token == '__' && word_pair.match?(/\A[a-zA-Z]+__\z/)
243+
# Special exception: __FILE__, __LINE__, __send__ should be treated as normal text.
244+
[:text, "#{token}#{word_pair}", nil]
245+
else
246+
[:node, nil, { type: WORD_PAIRS[token], children: [word_pair.delete_suffix(token)] }]
247+
end
233248
when :open_tag
234249
[:open, token, name]
235250
when :close_tag
236251
[:close, token, name]
237252
when :code_start
238-
if name == 'tt'
239-
close_pattern = /\G((?:\\.|[^\\])*?)<\/tt>/
240-
else
241-
close_pattern = /\G((?:\\.|[^\\])*?)<\/code>/
242-
end
243-
if (match = failure_cached_match(close_pattern))
244-
@pos = match.end(0)
253+
if (codeblock = strscan(CODEBLOCK_REGEXPS[name], negative_cache: true))
245254
# Need to unescape `\\` and `\<`.
246255
# RDoc also unescapes backslash + word separators, but this is not really necessary.
247-
content = match[1].gsub(/\\(.)/) { '\\<*+_`'.include?($1) ? $1 : $& }
256+
content = codeblock.delete_suffix("</#{name}>").gsub(/\\(.)/) { '\\<*+_`'.include?($1) ? $1 : $& }
248257
[:node, nil, { type: :TT, children: content.empty? ? [] : [content] }]
249258
else
250259
[:text, token, nil]
@@ -266,7 +275,7 @@ def scan_token
266275
[:text, token, nil]
267276
end
268277
when :escape
269-
next_char = read(1)
278+
next_char = strscan(/./)
270279
if next_char.nil?
271280
# backslash at end of string
272281
[:text, '\\', nil]
@@ -296,43 +305,7 @@ def scan_token
296305
# Example: <tt>[http://example.com/?q=\[\]]</tt> represents <tt>http://example.com/?q=[]</tt>.
297306

298307
def read_tidylink_url
299-
bracketed_url = scan_string(/\G\[([^\s\[\]\\]|\\[\[\]\\])+\]/)
308+
bracketed_url = strscan(/\[([^\s\[\]\\]|\\[\[\]\\])+\]/)
300309
bracketed_url[1...-1].gsub(/\\(.)/, '\1') if bracketed_url
301310
end
302-
303-
# Word contains alphanumeric and <tt>_./:[]-</tt> characters.
304-
# Word may start with <tt>#</tt> and may end with any non-space character. (e.g. <tt>#eql?</tt>).
305-
# Underscore delimiters have special rules.
306-
307-
WORD_REGEXPS = {
308-
# Words including _, longest match.
309-
# Example: `_::A_` `_-42_` `_A::B::C.foo_bar[baz]_` `_kwarg:_`
310-
# Content must not include _ followed by non-alphanumeric character
311-
# Example: `_host_:_port_` will be `_host_` + `:` + `_port_`
312-
'_' => /\G#?([a-zA-Z0-9.\/:\[\]-]|_+[a-zA-Z0-9])+[^\s]?(?=_[^a-zA-Z0-9_]|_\z)/,
313-
# Words allowing _ but not allowing __
314-
'__' => /\G#?[a-zA-Z0-9.\/:\[\]-]*(_[a-zA-Z0-9.\/:\[\]-]+)*[^\s]?(?=__)/,
315-
**%w[* ** + ++ ` ``].to_h do |s|
316-
# normal words that can be used within +word+ or *word*
317-
[s, /\G#?[a-zA-Z0-9_.\/:\[\]-]+[^\s]?(?=#{Regexp.escape(s)})/]
318-
end
319-
} # :nodoc:
320-
321-
# Read a word surrounded by +delimiter+ from the current position.
322-
323-
def read_word_pair(delimiter)
324-
invalid_adjascent_char_pattern = /[a-zA-Z0-9]/
325-
return if @pos != delimiter.size && invalid_adjascent_char_pattern.match?(@string[@pos - delimiter.size - 1])
326-
return unless (m = @string.match(WORD_REGEXPS[delimiter], @pos))
327-
328-
word = m[0]
329-
# Special exception: __FILE__, __LINE__, __send__ should not be treated as emphasis
330-
return if delimiter == '__' && word.match?(/\A[a-zA-Z]+\z/)
331-
332-
pos = m.end(0)
333-
unless invalid_adjascent_char_pattern.match?(@string[pos + delimiter.size])
334-
@pos = pos + delimiter.size
335-
word
336-
end
337-
end
338311
end

test/rdoc/markup/inline_parser_test.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def test_escape
5555
def test_bold
5656
assert_equal([bold_node()], parse('<b></b>'))
5757
assert_equal(['*a b*'], parse('*a b*'))
58+
assert_equal(['x*a* *b*x'], parse('x*a* *b*x'))
5859
assert_equal([bold_word('bold')], parse('*bold*'))
5960
assert_equal([bold_word('bold')], parse('**bold**'))
6061
assert_equal([bold_node('bo ld')], parse('<b>bo ld</b>'))
@@ -68,6 +69,7 @@ def test_bold
6869
def test_em
6970
assert_equal([em_node()], parse('<em></em>'))
7071
assert_equal(['_a b_'], parse('_a b_'))
72+
assert_equal(['x_a_ _b_x'], parse('x_a_ _b_x'))
7173
assert_equal([em_word('em')], parse('_em_'))
7274
assert_equal([em_word('F1LE')], parse('__F1LE__'))
7375
assert_equal(['_foo_bar_baz'], parse('_foo_bar_baz'))
@@ -112,6 +114,7 @@ def test_method_like_words
112114
def test_tt
113115
assert_equal([tt_node()], parse('<tt></tt>'))
114116
assert_equal(['`a b`'], parse('`a b`'))
117+
assert_equal(['x`a` `b`x'], parse('x`a` `b`x'))
115118
assert_equal([tt_node('code')], parse('`code`'))
116119
assert_equal([tt_node('code')], parse('+code+'))
117120
assert_equal([tt_node('code')], parse('++code++'))

0 commit comments

Comments
 (0)