11# frozen_string_literal: true
22
33require 'set'
4+ require 'strscan'
45
56# Parses inline markup in RDoc text.
67# This parser handles em, bold, strike, tt, hard break, and tidylink.
@@ -31,11 +32,13 @@ class RDoc::Markup::InlineParser
3132
3233 STANDALONE_TAGS = { 'br' => :HARD_BREAK } # :nodoc:
3334
35+ CODEBLOCK_TAGS = %w[ tt code ] # :nodoc:
36+
3437 TOKENS = {
3538 **WORD_PAIRS . transform_values { [ :word_pair , nil ] } ,
3639 **TAGS . keys . to_h { |tag | [ "<#{ tag } >" , [ :open_tag , tag ] ] } ,
3740 **TAGS . keys . to_h { |tag | [ "</#{ tag } >" , [ :close_tag , tag ] ] } ,
38- **%w[ tt code ] . to_h { |tag | [ "<#{ tag } >" , [ :code_start , tag ] ] } ,
41+ **CODEBLOCK_TAGS . to_h { |tag | [ "<#{ tag } >" , [ :code_start , tag ] ] } ,
3942 **STANDALONE_TAGS . keys . to_h { |tag | [ "<#{ tag } >" , [ :standalone_tag , tag ] ] } ,
4043 '{' => [ :tidylink_start , nil ] ,
4144 '}' => [ :tidylink_mid , nil ] ,
@@ -47,7 +50,7 @@ class RDoc::Markup::InlineParser
4750 token_starts_regexp = TOKENS . keys . map { |s | s [ 0 ] } . uniq . map { |s | Regexp . escape ( s ) } . join
4851
4952 SCANNER_REGEXP =
50- /\G (?:
53+ /(?:
5154 #{ multi_char_tokens_regexp }
5255 |[^#{ token_starts_regexp } \s a-zA-Z0-9\. ]+ # chunk of normal text
5356 |\s +|[a-zA-Z0-9\. ]+|.
@@ -56,10 +59,30 @@ class RDoc::Markup::InlineParser
5659 # Characters that can be escaped with backslash.
5760 ESCAPING_CHARS = '\\*_+`{}[]<>' # :nodoc:
5861
62+ # Pattern to match code block content until <code></tt></code> or <tt></code></tt>.
63+ CODEBLOCK_REGEXPS = CODEBLOCK_TAGS . to_h { |name | [ name , /((?:\\ .|[^\\ ])*?)<\/ #{ name } >/ ] } # :nodoc:
64+
65+ # Word contains alphanumeric and <tt>_./:[]-</tt> characters.
66+ # Word may start with <tt>#</tt> and may end with any non-space character. (e.g. <tt>#eql?</tt>).
67+ # Underscore delimiters have special rules.
68+ WORD_REGEXPS = {
69+ # Words including _, longest match.
70+ # Example: `_::A_` `_-42_` `_A::B::C.foo_bar[baz]_` `_kwarg:_`
71+ # Content must not include _ followed by non-alphanumeric character
72+ # Example: `_host_:_port_` will be `_host_` + `:` + `_port_`
73+ '_' => /#?([a-zA-Z0-9.\/ :\[ \] -]|_+[a-zA-Z0-9])+[^\s ]?_(?=[^a-zA-Z0-9_]|\z )/ ,
74+ # Words allowing _ but not allowing __
75+ '__' => /#?[a-zA-Z0-9.\/ :\[ \] -]*(_[a-zA-Z0-9.\/ :\[ \] -]+)*[^\s ]?__(?=[^a-zA-Z0-9]|\z )/ ,
76+ **%w[ * ** + ++ ` `` ] . to_h do |s |
77+ # normal words that can be used within +word+ or *word*
78+ [ s , /#?[a-zA-Z0-9_.\/ :\[ \] -]+[^\s ]?#{ Regexp . escape ( s ) } (?=[^a-zA-Z0-9]|\z )/ ]
79+ end
80+ } # :nodoc:
81+
5982 def initialize ( string )
60- @string = string
61- @pos = 0
62- @scan_failure_cache = Set . new
83+ @scanner = StringScanner . new ( string )
84+ @last_match = nil
85+ @scanner_negative_cache = Set . new
6386 @stack = [ ]
6487 @delimiters = { }
6588 end
@@ -188,63 +211,49 @@ def compact_string(nodes)
188211 end
189212 end
190213
191- # Scan from the current position with a regexp that starts with \G.
192-
193- def scan_string ( pattern )
194- if ( res = @string . match ( pattern , @pos ) )
195- @pos = res . end ( 0 )
196- res [ 0 ]
197- end
198- end
199-
200- # Read +len+ characters from the current position.
214+ # Scan from the StringScanner's current position with +pattern+.
215+ # If +negative_cache+ is true, a scan failure is cached: <tt>strscan(pattern, negative_cache: true)</tt> returns nil without scanning when it is called again after a failure.
216+ # The cache is keyed by +pattern+ only, so use +negative_cache+ only with a pattern that, once it has failed, cannot match at any later position either.
201217
202- def read ( len )
203- s = @string [ @pos , len ]
204- return if s . nil? || s . empty?
218+ def strscan ( pattern , negative_cache : false )
219+ return if negative_cache && @scanner_negative_cache . include? ( pattern )
205220
206- @pos += len
207- s
208- end
209-
210- # Match +pattern+ from the current position.
211- # Returns nil if not found, and caches the failure.
212- # Be careful to use a pair of pattern and position that is cache-safe.
213-
214- def failure_cached_match ( pattern )
215- # Cache notfound information to avoid O(N^2) search of missing closing tags
216- return if @scan_failure_cache . include? ( pattern )
217-
218- match = @string . match ( pattern , @pos )
219- @scan_failure_cache << pattern unless match
220- match
221+ string = @scanner . scan ( pattern )
222+ @last_match = string if string
223+ @scanner_negative_cache << pattern if !string && negative_cache
224+ string
221225 end
222226
223227 # Scan and return the next token for parsing.
224228 # Returns <tt>[token_type, token_string_or_nil, extra_info]</tt>
225229
226230 def scan_token
227- token = scan_string ( SCANNER_REGEXP )
231+ last_match = @last_match
232+ token = strscan ( SCANNER_REGEXP )
228233 type , name = TOKENS [ token ]
234+
229235 case type
230236 when :word_pair
231- pair = read_word_pair ( token )
232- pair ? [ :node , nil , { type : WORD_PAIRS [ token ] , children : [ pair ] } ] : [ :text , token ]
237+ # If the character before word pair delimiter is alphanumeric, do not treat as word pair.
238+ word_pair = strscan ( WORD_REGEXPS [ token ] ) unless /[a-zA-Z0-9]\z / . match? ( last_match )
239+
240+ if word_pair . nil?
241+ [ :text , token , nil ]
242+ elsif token == '__' && word_pair . match? ( /\A [a-zA-Z]+__\z / )
243+ # Special exception: __FILE__, __LINE__, __send__ should be treated as normal text.
244+ [ :text , "#{ token } #{ word_pair } " , nil ]
245+ else
246+ [ :node , nil , { type : WORD_PAIRS [ token ] , children : [ word_pair . delete_suffix ( token ) ] } ]
247+ end
233248 when :open_tag
234249 [ :open , token , name ]
235250 when :close_tag
236251 [ :close , token , name ]
237252 when :code_start
238- if name == 'tt'
239- close_pattern = /\G ((?:\\ .|[^\\ ])*?)<\/ tt>/
240- else
241- close_pattern = /\G ((?:\\ .|[^\\ ])*?)<\/ code>/
242- end
243- if ( match = failure_cached_match ( close_pattern ) )
244- @pos = match . end ( 0 )
253+ if ( codeblock = strscan ( CODEBLOCK_REGEXPS [ name ] , negative_cache : true ) )
245254 # Need to unescape `\\` and `\<`.
246255 # RDoc also unescapes backslash + word separators, but this is not really necessary.
247- content = match [ 1 ] . gsub ( /\\ (.)/ ) { '\\<*+_`' . include? ( $1) ? $1 : $& }
256+ content = codeblock . delete_suffix ( "</ #{ name } >" ) . gsub ( /\\ (.)/ ) { '\\<*+_`' . include? ( $1) ? $1 : $& }
248257 [ :node , nil , { type : :TT , children : content . empty? ? [ ] : [ content ] } ]
249258 else
250259 [ :text , token , nil ]
@@ -266,7 +275,7 @@ def scan_token
266275 [ :text , token , nil ]
267276 end
268277 when :escape
269- next_char = read ( 1 )
278+ next_char = strscan ( /./ )
270279 if next_char . nil?
271280 # backslash at end of string
272281 [ :text , '\\' , nil ]
@@ -296,43 +305,7 @@ def scan_token
296305 # Example: <tt>[http://example.com/?q=\[\]]</tt> represents <tt>http://example.com/?q=[]</tt>.
297306
298307 def read_tidylink_url
299- bracketed_url = scan_string ( / \G \[ ([^\s \[ \] \\ ]|\\ [\[ \] \\ ])+\] /)
308+ bracketed_url = strscan ( / \[ ([^\s \[ \] \\ ]|\\ [\[ \] \\ ])+\] /)
300309 bracketed_url [ 1 ...-1 ] . gsub ( /\\ (.)/ , '\1' ) if bracketed_url
301310 end
302-
303- # Word contains alphanumeric and <tt>_./:[]-</tt> characters.
304- # Word may start with <tt>#</tt> and may end with any non-space character. (e.g. <tt>#eql?</tt>).
305- # Underscore delimiter have special rules.
306-
307- WORD_REGEXPS = {
308- # Words including _, longest match.
309- # Example: `_::A_` `_-42_` `_A::B::C.foo_bar[baz]_` `_kwarg:_`
310- # Content must not include _ followed by non-alphanumeric character
311- # Example: `_host_:_port_` will be `_host_` + `:` + `_port_`
312- '_' => /\G #?([a-zA-Z0-9.\/ :\[ \] -]|_+[a-zA-Z0-9])+[^\s ]?(?=_[^a-zA-Z0-9_]|_\z )/ ,
313- # Words allowing _ but not allowing __
314- '__' => /\G #?[a-zA-Z0-9.\/ :\[ \] -]*(_[a-zA-Z0-9.\/ :\[ \] -]+)*[^\s ]?(?=__)/ ,
315- **%w[ * ** + ++ ` `` ] . to_h do |s |
316- # normal words that can be used within +word+ or *word*
317- [ s , /\G #?[a-zA-Z0-9_.\/ :\[ \] -]+[^\s ]?(?=#{ Regexp . escape ( s ) } )/ ]
318- end
319- } # :nodoc:
320-
321- # Read a word surrounded by +delimiter+ from the current position.
322-
323- def read_word_pair ( delimiter )
324- invalid_adjascent_char_pattern = /[a-zA-Z0-9]/
325- return if @pos != delimiter . size && invalid_adjascent_char_pattern . match? ( @string [ @pos - delimiter . size - 1 ] )
326- return unless ( m = @string . match ( WORD_REGEXPS [ delimiter ] , @pos ) )
327-
328- word = m [ 0 ]
329- # Special exception: __FILE__, __LINE__, __send__ should not be treated as emphasis
330- return if delimiter == '__' && word . match? ( /\A [a-zA-Z]+\z / )
331-
332- pos = m . end ( 0 )
333- unless invalid_adjascent_char_pattern . match? ( @string [ pos + delimiter . size ] )
334- @pos = pos + delimiter . size
335- word
336- end
337- end
338311end
0 commit comments