From 78ea670d1bf97aa1a6ed3d3efd736c9f75d6e6cc Mon Sep 17 00:00:00 2001 From: tompng Date: Wed, 26 Nov 2025 01:55:35 +0900 Subject: [PATCH] Syntax highlighting using Prism.parse_lex Completely drops Ripper dependency --- lib/rdoc/generator/markup.rb | 3 - .../generator/template/aliki/css/rdoc.css | 2 - .../generator/template/darkfish/css/rdoc.css | 1 - lib/rdoc/markup/to_html.rb | 34 +- lib/rdoc/parser/ripper_state_lex.rb | 302 ------------------ lib/rdoc/parser/ruby.rb | 33 +- lib/rdoc/parser/ruby_colorizer.rb | 249 +++++++++++++++ lib/rdoc/token_stream.rb | 33 +- test/rdoc/markup/to_html_test.rb | 44 +-- test/rdoc/parser/ruby_colorizer_test.rb | 202 ++++++++++++ test/rdoc/parser/ruby_test.rb | 17 + test/rdoc/rdoc_token_stream_test.rb | 26 +- 12 files changed, 544 insertions(+), 402 deletions(-) delete mode 100644 lib/rdoc/parser/ripper_state_lex.rb create mode 100644 lib/rdoc/parser/ruby_colorizer.rb create mode 100644 test/rdoc/parser/ruby_colorizer_test.rb diff --git a/lib/rdoc/generator/markup.rb b/lib/rdoc/generator/markup.rb index 54158c29ba..5bc0c5849a 100644 --- a/lib/rdoc/generator/markup.rb +++ b/lib/rdoc/generator/markup.rb @@ -125,9 +125,6 @@ def markup_code src = RDoc::TokenStream.to_html @token_stream - # add initial whitespace so that the indent gets calculated correctly - src.prepend(' ' * @token_stream.first[:char_no]) if source_language == 'ruby' && @token_stream.first - # dedent the source common_indent = src.length src.scan(/^ *(?=\S)/) do |whitespace| diff --git a/lib/rdoc/generator/template/aliki/css/rdoc.css b/lib/rdoc/generator/template/aliki/css/rdoc.css index e3e0aec650..50e810c6ec 100644 --- a/lib/rdoc/generator/template/aliki/css/rdoc.css +++ b/lib/rdoc/generator/template/aliki/css/rdoc.css @@ -1021,7 +1021,6 @@ main h6 a:hover { .ruby-ivar { color: var(--code-orange); } .ruby-operator { color: var(--code-green); } .ruby-identifier { color: var(--code-blue); } -.ruby-node { color: var(--code-purple); } .ruby-comment { color: var(--color-neutral-500); @@ -1037,7 +1036,6 @@ main h6 a:hover { [data-theme="dark"] .ruby-ivar { color: var(--code-orange); } [data-theme="dark"] .ruby-operator { color: var(--code-green); } [data-theme="dark"] .ruby-identifier { color: var(--code-blue); } -[data-theme="dark"] .ruby-node { color: var(--code-purple); } [data-theme="dark"] .ruby-comment { color: var(--color-neutral-400); diff --git a/lib/rdoc/generator/template/darkfish/css/rdoc.css b/lib/rdoc/generator/template/darkfish/css/rdoc.css index 87bf24c36e..6f1163affa 100644 --- a/lib/rdoc/generator/template/darkfish/css/rdoc.css +++ b/lib/rdoc/generator/template/darkfish/css/rdoc.css @@ -449,7 +449,6 @@ main h6 { .ruby-ivar { color: #B57614; } /* Brown */ .ruby-operator { color: #427B58; } /* Dark Teal */ .ruby-identifier { color: #076678; } /* Deep Teal */ -.ruby-node { color: #8F3F71; } /* Plum */ .ruby-comment { color: #928374; font-style: italic; } /* Gray */ .ruby-regexp { color: #8F3F71; } /* Plum */ .ruby-value { color: #AF3A03; } /* Dark Orange */ diff --git a/lib/rdoc/markup/to_html.rb b/lib/rdoc/markup/to_html.rb index 0da0081e70..9499118120 100644 --- a/lib/rdoc/markup/to_html.rb +++ b/lib/rdoc/markup/to_html.rb @@ -2,6 +2,7 @@ require 'cgi/escape' require 'cgi/util' unless defined?(CGI::EscapeExt) require 'prism' +require 'rdoc/parser/ruby_colorizer' ## # Outputs RDoc markup as HTML. @@ -321,6 +322,15 @@ def accept_paragraph(paragraph) @res << "

\n" end + # Generate syntax highlighted html for ruby-like text. + + def parsable_text_to_html(text) + tokens = RDoc::Parser::RubyColorizer.colorize(text) + result = RDoc::TokenStream.to_html tokens + result = result + "\n" unless result.end_with?("\n") + result + end + ## # Adds +verbatim+ to the output @@ -328,27 +338,17 @@ def accept_verbatim(verbatim) text = verbatim.text.rstrip format = verbatim.format - klass = nil - # Apply Ruby syntax highlighting if # - explicitly marked as Ruby (via ruby? which accepts :ruby or :rb) # - no format specified but the text is parseable as Ruby # Otherwise, add language class when applicable and skip Ruby highlighting - content = if verbatim.ruby? || (format.nil? && parseable?(text)) - begin - tokens = RDoc::Parser::RipperStateLex.parse text - klass = ' class="ruby"' - - result = RDoc::TokenStream.to_html tokens - result = result + "\n" unless "\n" == result[-1] - result - rescue - CGI.escapeHTML text - end - else - klass = " class=\"#{format}\"" if format - CGI.escapeHTML text - end + if verbatim.ruby? || (format.nil? && parseable?(text)) + content = parsable_text_to_html(text) + klass = ' class="ruby"' + else + content = CGI.escapeHTML text + klass = " class=\"#{format}\"" if format + end if @pipe @res << "\n
#{CGI.escapeHTML text}\n
\n" diff --git a/lib/rdoc/parser/ripper_state_lex.rb b/lib/rdoc/parser/ripper_state_lex.rb deleted file mode 100644 index 2212906bbd..0000000000 --- a/lib/rdoc/parser/ripper_state_lex.rb +++ /dev/null @@ -1,302 +0,0 @@ -# frozen_string_literal: true -require 'ripper' - -## -# Wrapper for Ripper lex states - -class RDoc::Parser::RipperStateLex - # :stopdoc: - - Token = Struct.new(:line_no, :char_no, :kind, :text, :state) - - EXPR_END = Ripper::EXPR_END - EXPR_ENDFN = Ripper::EXPR_ENDFN - EXPR_ARG = Ripper::EXPR_ARG - EXPR_FNAME = Ripper::EXPR_FNAME - - class InnerStateLex < Ripper::Filter - def initialize(code) - super(code) - end - - def on_default(event, tok, data) - data << Token.new(lineno, column, event, tok, state) - end - end - - def get_squashed_tk - if @buf.empty? - tk = @tokens.shift - else - tk = @buf.shift - end - return nil if tk.nil? - case tk[:kind] - when :on_symbeg then - tk = get_symbol_tk(tk) - when :on_tstring_beg then - tk = get_string_tk(tk) - when :on_backtick then - if (tk[:state] & (EXPR_FNAME | EXPR_ENDFN)) != 0 - tk[:kind] = :on_ident - tk[:state] = Ripper::Lexer::State.new(EXPR_ARG) - else - tk = get_string_tk(tk) - end - when :on_regexp_beg then - tk = get_regexp_tk(tk) - when :on_embdoc_beg then - tk = get_embdoc_tk(tk) - when :on_heredoc_beg then - @heredoc_queue << retrieve_heredoc_info(tk) - when :on_nl, :on_ignored_nl, :on_comment, :on_heredoc_end then - if !@heredoc_queue.empty? - get_heredoc_tk(*@heredoc_queue.shift) - elsif tk[:text].nil? # :on_ignored_nl sometimes gives nil - tk[:text] = '' - end - when :on_words_beg then - tk = get_words_tk(tk) - when :on_qwords_beg then - tk = get_words_tk(tk) - when :on_symbols_beg then - tk = get_words_tk(tk) - when :on_qsymbols_beg then - tk = get_words_tk(tk) - when :on_op then - if '&.' == tk[:text] - tk[:kind] = :on_period - else - tk = get_op_tk(tk) - end - end - tk - end - - private def get_symbol_tk(tk) - is_symbol = true - symbol_tk = Token.new(tk.line_no, tk.char_no, :on_symbol) - if ":'" == tk[:text] or ':"' == tk[:text] or tk[:text].start_with?('%s') - tk1 = get_string_tk(tk) - symbol_tk[:text] = tk1[:text] - symbol_tk[:state] = tk1[:state] - else - case (tk1 = get_squashed_tk)[:kind] - when :on_ident - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_tstring_content - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = get_squashed_tk[:state] # skip :on_tstring_end - when :on_tstring_end - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_op - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_ivar - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_cvar - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_gvar - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_const - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - when :on_kw - symbol_tk[:text] = ":#{tk1[:text]}" - symbol_tk[:state] = tk1[:state] - else - is_symbol = false - tk = tk1 - end - end - if is_symbol - tk = symbol_tk - end - tk - end - - private def get_string_tk(tk) - string = tk[:text] - state = nil - kind = :on_tstring - loop do - inner_str_tk = get_squashed_tk - if inner_str_tk.nil? - break - elsif :on_tstring_end == inner_str_tk[:kind] - string = string + inner_str_tk[:text] - state = inner_str_tk[:state] - break - elsif :on_label_end == inner_str_tk[:kind] - string = string + inner_str_tk[:text] - state = inner_str_tk[:state] - kind = :on_symbol - break - else - string = string + inner_str_tk[:text] - if :on_embexpr_beg == inner_str_tk[:kind] then - kind = :on_dstring if :on_tstring == kind - end - end - end - Token.new(tk.line_no, tk.char_no, kind, string, state) - end - - private def get_regexp_tk(tk) - string = tk[:text] - state = nil - loop do - inner_str_tk = get_squashed_tk - if inner_str_tk.nil? - break - elsif :on_regexp_end == inner_str_tk[:kind] - string = string + inner_str_tk[:text] - state = inner_str_tk[:state] - break - else - string = string + inner_str_tk[:text] - end - end - Token.new(tk.line_no, tk.char_no, :on_regexp, string, state) - end - - private def get_embdoc_tk(tk) - string = tk[:text] - until :on_embdoc_end == (embdoc_tk = get_squashed_tk)[:kind] do - string = string + embdoc_tk[:text] - end - string = string + embdoc_tk[:text] - Token.new(tk.line_no, tk.char_no, :on_embdoc, string, embdoc_tk.state) - end - - private def get_heredoc_tk(heredoc_name, indent) - string = '' - start_tk = nil - prev_tk = nil - until heredoc_end?(heredoc_name, indent, tk = @tokens.shift) do - start_tk = tk unless start_tk - if (prev_tk.nil? or "\n" == prev_tk[:text][-1]) and 0 != tk[:char_no] - string = string + (' ' * tk[:char_no]) - end - string = string + tk[:text] - prev_tk = tk - end - start_tk = tk unless start_tk - prev_tk = tk unless prev_tk - @buf.unshift tk # closing heredoc - heredoc_tk = Token.new(start_tk.line_no, start_tk.char_no, :on_heredoc, string, prev_tk.state) - @buf.unshift heredoc_tk - end - - private def retrieve_heredoc_info(tk) - name = tk[:text].gsub(/\A<<[-~]?(['"`]?)(.+)\1\z/, '\2') - indent = tk[:text] =~ /\A<<[-~]/ - [name, indent] - end - - private def heredoc_end?(name, indent, tk) - result = false - if :on_heredoc_end == tk[:kind] then - tk_name = tk[:text].chomp - tk_name.lstrip! if indent - if name == tk_name - result = true - end - end - result - end - - private def get_words_tk(tk) - string = '' - start_token = tk[:text] - start_quote = tk[:text].rstrip[-1] - line_no = tk[:line_no] - char_no = tk[:char_no] - state = tk[:state] - end_quote = - case start_quote - when ?( then ?) - when ?[ then ?] - when ?{ then ?} - when ?< then ?> - else start_quote - end - end_token = nil - loop do - tk = get_squashed_tk - if tk.nil? - end_token = end_quote - break - elsif :on_tstring_content == tk[:kind] then - string += tk[:text] - elsif :on_words_sep == tk[:kind] or :on_tstring_end == tk[:kind] then - if end_quote == tk[:text].strip then - end_token = tk[:text] - break - else - string += tk[:text] - end - else - string += tk[:text] - end - end - text = "#{start_token}#{string}#{end_token}" - Token.new(line_no, char_no, :on_dstring, text, state) - end - - private def get_op_tk(tk) - redefinable_operators = %w[! != !~ % & * ** + +@ - -@ / < << <= <=> == === =~ > >= >> [] []= ^ ` | ~] - if redefinable_operators.include?(tk[:text]) and tk[:state] == EXPR_ARG then - tk[:state] = Ripper::Lexer::State.new(EXPR_ARG) - tk[:kind] = :on_ident - elsif tk[:text] =~ /^[-+]$/ then - tk_ahead = get_squashed_tk - case tk_ahead[:kind] - when :on_int, :on_float, :on_rational, :on_imaginary then - tk[:text] += tk_ahead[:text] - tk[:kind] = tk_ahead[:kind] - tk[:state] = tk_ahead[:state] - when :on_heredoc_beg, :on_tstring, :on_dstring # frozen/non-frozen string literal - tk[:text] += tk_ahead[:text] - tk[:kind] = tk_ahead[:kind] - tk[:state] = tk_ahead[:state] - else - @buf.unshift tk_ahead - end - end - tk - end - - # :startdoc: - - # New lexer for +code+. - def initialize(code) - @buf = [] - @heredoc_queue = [] - @inner_lex = InnerStateLex.new(code) - @tokens = @inner_lex.parse([]) - end - - # Returns tokens parsed from +code+. - def self.parse(code) - lex = self.new(code) - tokens = [] - begin - while tk = lex.get_squashed_tk - tokens.push tk - end - rescue StopIteration - end - tokens - end - - # Returns +true+ if lex state will be +END+ after +token+. - def self.end?(token) - (token[:state] & EXPR_END) - end -end diff --git a/lib/rdoc/parser/ruby.rb b/lib/rdoc/parser/ruby.rb index 0a0f690bac..0100b04b45 100644 --- a/lib/rdoc/parser/ruby.rb +++ b/lib/rdoc/parser/ruby.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true require 'prism' -require_relative 'ripper_state_lex' +require_relative 'ruby_colorizer' # Parse and collect document from Ruby source code. @@ -198,10 +198,12 @@ def record_location(container) # :nodoc: # Scans this Ruby file for Ruby constructs def scan - @tokens = RDoc::Parser::RipperStateLex.parse(@content) @lines = @content.lines - result = Prism.parse(@content) - @program_node = result.value + result = Prism.parse_lex(@content) + @program_node, unordered_tokens = result.value + # Heredoc tokens are not in start_offset order. + # Need to sort them to use bsearch for finding tokens from location. + @prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset } @line_nodes = {} prepare_line_nodes(@program_node) prepare_comments(result.comments) @@ -314,7 +316,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line) meth.start_collecting_tokens(:ruby) node = @line_nodes[line_no] - tokens = node ? visible_tokens_from_location(node.location) : [] + tokens = node ? syntax_highlighted_tokens(node) : [] tokens.each { |token| meth.token_stream << token } container.add_method meth @@ -382,7 +384,7 @@ def handle_meta_method_comment(comment, directives, node) elsif line_no || node method_name ||= call_node_name_arguments(node).first if is_call_node if node - tokens = visible_tokens_from_location(node.location) + tokens = syntax_highlighted_tokens(node) line_no = node.location.start_line else tokens = [] @@ -490,21 +492,10 @@ def extract_section_comment(comment_text, prefix_line_count) # :nodoc: comment_text end - def slice_tokens(start_pos, end_pos) # :nodoc: - start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 } - end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 } - tokens = @tokens[start_index...end_index] - tokens.pop if tokens.last&.kind == :on_nl - tokens - end - - # Returns tokens from the given location + # Returns syntax highlighted tokens of the given node - def visible_tokens_from_location(location) - slice_tokens( - [location.start_line, location.start_character_column], - [location.end_line, location.end_character_column] - ) + def syntax_highlighted_tokens(node) + RDoc::Parser::RubyColorizer.partial_colorize(@content, node, @prism_tokens) end # Handles `public :foo, :bar` `private :foo, :bar` and `protected :foo, :bar` @@ -1018,7 +1009,7 @@ def visit_def_node(node) end name = node.name.to_s params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node) - tokens = @scanner.visible_tokens_from_location(node.location) + tokens = @scanner.syntax_highlighted_tokens(node) @scanner.add_method( name, diff --git a/lib/rdoc/parser/ruby_colorizer.rb b/lib/rdoc/parser/ruby_colorizer.rb new file mode 100644 index 0000000000..4765e79c3d --- /dev/null +++ b/lib/rdoc/parser/ruby_colorizer.rb @@ -0,0 +1,249 @@ +# frozen_string_literal: true + +require 'prism' +require 'set' + +# Ruby code syntax highlighter. +# Colorize result is an array of +RDoc::Parser::RubyColorizer::ColoredToken+ +# Actual color for each token kind is determined elsewhere (e.g., HTML generator) +module RDoc::Parser::RubyColorizer + + ColoredToken = Struct.new(:kind, :text) + + # Prism operator token types except assignment '=' + OP_TOKENS = %i[ + AMPERSAND AMPERSAND_AMPERSAND + BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON + EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE + GREATER GREATER_GREATER + LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS + MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS + QUESTION_MARK SLASH STAR STAR_STAR TILDE + UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR + ].to_set + + # Prism token type to ColoredToken kind map + TOKEN_TYPE_MAP = { + IDENTIFIER: :identifier, + METHOD_NAME: :identifier, + INSTANCE_VARIABLE: :ivar, + CLASS_VARIABLE: :identifier, + GLOBAL_VARIABLE: :identifier, + BACK_REFERENCE: :identifier, + NUMBERED_REFERENCE: :identifier, + CONSTANT: :constant, + LABEL: :value, + INTEGER: :value, + FLOAT: :value, + RATIONAL: :value, + IMAGINARY: :value, + COMMENT: :comment, + EMBDOC_BEGIN: :comment, + EMBDOC_LINE: :comment, + EMBDOC_END: :comment + } + + class << self + + # Colorize the entire +code+ and returns colored token stream. + def colorize(code) + result = Prism.parse_lex(code) + program_node, unordered_tokens = result.value + prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset } + partial_colorize(code, program_node, prism_tokens, 0, code.bytesize) + end + + # Colorize partial +node+ in +whole_code+ and returns colored token stream. + def partial_colorize(whole_code, node, prism_tokens, start_offset = nil, end_offset = nil) + start_offset ||= node.location.start_offset + end_offset ||= node.location.end_offset + visitor = NodeColorizeVisitor.new + node.accept(visitor) + prior_tokens = visitor.tokens.sort_by {|_, start_offset, _| start_offset } + normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset)) + colored_tokens = unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset) + colored_tokens.unshift(ColoredToken.new(:plain, ' ' * node.location.start_column)) if node.location.start_column > 0 + colored_tokens + end + + private + + def slice_by_location(items, start_offset, end_offset) + start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size + end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size + items[start_index...end_index] + end + + # Unify prior tokens and normal tokens into a single token stream. + # Prior tokens have higher priority than normal tokens. + # Also adds missing text (spaces, newlines, etc.) as :plain tokens + # so that the entire range is covered. + def unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset) + tokens = [] + offset = start_offset + + # Add missing text such as spaces and newlines as a separate :plain token + flush = -> next_offset { + return if offset == next_offset + + whole_code.byteslice(offset...next_offset).scan(/\n|\s+|[^\s]+/) do |text| + tokens << ColoredToken.new(:plain, text) + end + } + + until prior_tokens.empty? && normal_tokens.empty? + ptok = prior_tokens.first + ntok = normal_tokens.first + if ntok && (!ptok || ntok[2] <= ptok[1]) + token = normal_tokens.shift + else + token = prior_tokens.shift + end + kind, start_pos, end_pos = token + next if start_pos < offset + + flush.call(start_pos) + tokens << ColoredToken.new(kind, whole_code.byteslice(start_pos...end_pos)) + offset = end_pos + end + flush.call(end_offset) + tokens + end + + # Convert normal Prism tokens to [kind, start_offset, end_offset] + def normal_tokens(tokens) + tokens.map do |token,| + kind = + if token.type.start_with?('KEYWORD_') + :keyword + elsif OP_TOKENS.include?(token.type.to_sym) + :operator + else + TOKEN_TYPE_MAP[token.type] || :plain + end + [kind, token.location.start_offset, token.location.end_offset] + end + end + end + + # Visitor to determine node colorizing which can't be determined by tokens. + # STRING_CONTENT/EMBEXPR_BEGIN/EMBEXPR_END in string/regexp/symbol have different colorizing + class NodeColorizeVisitor < Prism::Visitor # :nodoc: + attr_reader :tokens + + def initialize + @tokens = [] + end + + def visit_symbol_node(node) + # SymbolNode#location may contain heredoc content and closing + # e.g., `<; end` + push_location(:identifier, node.name_loc) + super + end + + private + + def push_location(kind, location) + # Only push tokens that have a non-zero length + if location && location.start_offset < location.end_offset + @tokens << [kind, location.start_offset, location.end_offset] + end + end + + def handle_interpolated_parts(kind, parts) + # StringNode, EmbeddedStatementsNode brackets, and EmbeddedVariableNode hash in + # interpolated regexp/symbol/string parts should be colored as regexp/symbol/string respectively. + parts.each do |part| + case part + when Prism::StringNode + # InterpolatedStringNode#parts may have its own opening/closing. e.g., `'a' "b"` + push_location(kind, part.opening_loc) + push_location(kind, part.content_loc) + push_location(kind, part.closing_loc) + when Prism::InterpolatedStringNode + # InterpolatedStringNode#parts may contain InterpolatedStringNode. e.g., `'a' "#{}"` + part.accept(self) + when Prism::EmbeddedStatementsNode + push_location(kind, part.opening_loc) + push_location(kind, part.closing_loc) + part.accept(self) + when Prism::EmbeddedVariableNode + push_location(kind, part.operator_loc) + end + end + end + end + + private_constant :NodeColorizeVisitor +end diff --git a/lib/rdoc/token_stream.rb b/lib/rdoc/token_stream.rb index cc89397c60..71bd4a7078 100644 --- a/lib/rdoc/token_stream.rb +++ b/lib/rdoc/token_stream.rb @@ -19,28 +19,17 @@ def self.to_html(token_stream) next unless t style = case t[:kind] - when :on_const then 'ruby-constant' - when :on_kw then 'ruby-keyword' - when :on_ivar then 'ruby-ivar' - when :on_cvar then 'ruby-identifier' - when :on_gvar then 'ruby-identifier' - when '=' != t[:text] && :on_op - then 'ruby-operator' - when :on_tlambda then 'ruby-operator' - when :on_ident then 'ruby-identifier' - when :on_label then 'ruby-value' - when :on_backref, :on_dstring - then 'ruby-node' - when :on_comment then 'ruby-comment' - when :on_embdoc then 'ruby-comment' - when :on_regexp then 'ruby-regexp' - when :on_tstring then 'ruby-string' - when :on_int, :on_float, - :on_rational, :on_imaginary, - :on_heredoc, - :on_symbol, :on_CHAR then 'ruby-value' - when :on_heredoc_beg, :on_heredoc_end - then 'ruby-identifier' + when :operator then 'ruby-operator' + when :keyword then 'ruby-keyword' + when :constant then 'ruby-constant' + when :ivar then 'ruby-ivar' + when :comment then 'ruby-comment' + when :value then 'ruby-value' + when :string then 'ruby-string' + when :symbol then 'ruby-value' + when :x_string then 'ruby-string' + when :regexp then 'ruby-regexp' + when :identifier then 'ruby-identifier' end text = t[:text] diff --git a/test/rdoc/markup/to_html_test.rb b/test/rdoc/markup/to_html_test.rb index 820c8c2597..aa9ef04aa8 100644 --- a/test/rdoc/markup/to_html_test.rb +++ b/test/rdoc/markup/to_html_test.rb @@ -545,17 +545,17 @@ def foo "'", "\'\"\`", "\#", - "\#{}", + "\#{1}", "#", - "#{}", + "#{1}", /'"/, /\'\"/, /\//, /\\/, /\#/, - /\#{}/, + /\#{1}/, /#/, - /#{}/ + /#{1}/ ] end def bar @@ -570,22 +570,22 @@ def bar
def foo
   [
-    '\\',
-    '\'',
-    "'",
-    "\'\"\`",
-    "\#",
-    "\#{}",
-    "#",
-    "#{}",
+    '\\',
+    '\'',
+    "'",
+    "\'\"\`",
+    "\#",
+    "\#{1}",
+    "#",
+    "#{1}",
     /'"/,
     /\'\"/,
     /\//,
     /\\/,
     /\#/,
-    /\#{}/,
+    /\#{1}/,
     /#/,
-    /#{}/
+    /#{1}/
   ]
 end
 def bar
@@ -603,9 +603,9 @@ def foo
     `\\`,
     `\'\"\``,
     `\#`,
-    `\#{}`,
+    `\#{1}`,
     `#`,
-    `#{}`
+    `#{1}`
   ]
 end
 def bar
@@ -620,12 +620,12 @@ def bar
 
 
def foo
   [
-    `\\`,
-    `\'\"\``,
-    `\#`,
-    `\#{}`,
-    `#`,
-    `#{}`
+    `\\`,
+    `\'\"\``,
+    `\#`,
+    `\#{1}`,
+    `#`,
+    `#{1}`
   ]
 end
 def bar
diff --git a/test/rdoc/parser/ruby_colorizer_test.rb b/test/rdoc/parser/ruby_colorizer_test.rb
new file mode 100644
index 0000000000..b3fd1b71de
--- /dev/null
+++ b/test/rdoc/parser/ruby_colorizer_test.rb
@@ -0,0 +1,202 @@
+# frozen_string_literal: true
+require_relative '../helper'
+require 'rdoc/parser/ruby_colorizer'
+
+class RDocParserRubyColorizerTest < RDoc::TestCase
+  def token(kind, text)
+    RDoc::Parser::RubyColorizer::ColoredToken.new(kind, text)
+  end
+
+  def test_partial_colorize
+    code = <<~RUBY
+      class A
+        def m
+          # comment
+          42
+        end
+      end
+    RUBY
+    parse_result = Prism.parse_lex(code)
+    program_node, unordered_tokens = parse_result.value
+    prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+    def_node = program_node.statements.body[0].body.body[0]
+    tokens = RDoc::Parser::RubyColorizer.partial_colorize(code, def_node, prism_tokens)
+    expected = ['  ', 'def', ' ', 'm', "\n", '    ', "# comment\n", '    ', '42', "\n", '  ', 'end']
+    assert_equal(expected, tokens.map(&:text))
+  end
+
+  def test_comment
+    code = <<~RUBY
+      # comment1
+      class A
+      =begin
+      comment2
+      =end
+        def m
+          42 # comment3
+        end
+      end
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+    assert_include(tokens, token(:comment, "# comment1\n"))
+    assert_include(tokens, token(:comment, "=begin\n"))
+    assert_include(tokens, token(:comment, "comment2\n"))
+    assert_include(tokens, token(:comment, "=end\n"))
+    assert_include(tokens, token(:comment, "# comment3\n"))
+  end
+
+  def test_interpolated_node
+    code = <<~'RUBY'
+      def m
+        "string#{interpolation1}example#@embvar"
+        /regexp#{interpolation2}example#$embvar/
+        `xstring#{interpolation3}example#@embvar`
+        :"symbol#{interpolation4}example#$embvar"
+      end
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+
+    assert_include(tokens, token(:string, '"'))
+    assert_include(tokens, token(:string, 'string'))
+    assert_include(tokens, token(:string, '#{'))
+    assert_include(tokens, token(:identifier, 'interpolation1'))
+    assert_include(tokens, token(:string, '}'))
+    assert_include(tokens, token(:string, 'example'))
+    assert_include(tokens, token(:string, '#'))
+
+    assert_include(tokens, token(:regexp, '/'))
+    assert_include(tokens, token(:regexp, 'regexp'))
+    assert_include(tokens, token(:regexp, '#{'))
+    assert_include(tokens, token(:identifier, 'interpolation2'))
+    assert_include(tokens, token(:regexp, '}'))
+    assert_include(tokens, token(:regexp, 'example'))
+    assert_include(tokens, token(:regexp, '#'))
+
+    assert_include(tokens, token(:x_string, '`'))
+    assert_include(tokens, token(:x_string, 'xstring'))
+    assert_include(tokens, token(:x_string, '#{'))
+    assert_include(tokens, token(:identifier, 'interpolation3'))
+    assert_include(tokens, token(:x_string, '}'))
+    assert_include(tokens, token(:x_string, 'example'))
+    assert_include(tokens, token(:x_string, '#'))
+
+    assert_include(tokens, token(:symbol, ':"'))
+    assert_include(tokens, token(:symbol, 'symbol'))
+    assert_include(tokens, token(:symbol, '#{'))
+    assert_include(tokens, token(:identifier, 'interpolation4'))
+    assert_include(tokens, token(:symbol, '}'))
+    assert_include(tokens, token(:symbol, 'example'))
+    assert_include(tokens, token(:symbol, '#'))
+    assert_include(tokens, token(:symbol, '"'))
+  end
+
+  def test_percent_literal_arrays
+    code = <<~'RUBY'
+      def m
+        %w[1 2 3]
+        %W[one #{two} three]
+        %i[4 5 6]
+        %I[four #{five} six]
+      end
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+    assert_include(tokens, token(:string, '%w['))
+    assert_include(tokens, token(:string, '%W['))
+    assert_include(tokens, token(:string, ']'))
+    assert_include(tokens, token(:string, '1'))
+    assert_include(tokens, token(:string, 'one'))
+    assert_include(tokens, token(:string, '#{'))
+    assert_include(tokens, token(:identifier, 'two'))
+    assert_include(tokens, token(:string, '}'))
+    assert_include(tokens, token(:symbol, '%i['))
+    assert_include(tokens, token(:symbol, '%I['))
+    assert_include(tokens, token(:symbol, ']'))
+    assert_include(tokens, token(:symbol, '4'))
+    assert_include(tokens, token(:symbol, 'four'))
+    assert_include(tokens, token(:symbol, '#{'))
+    assert_include(tokens, token(:identifier, 'five'))
+    assert_include(tokens, token(:symbol, '}'))
+  end
+
+  def test_multibyte
+    code = <<~RUBY
+      def f(s = '💎')
+        # comment 💎
+        puts '💎' + s
+      end
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+  end
+
+  def test_string
+    code = <<~'RUBY'
+      # string without closing
+      ?S
+      # interpolated string node may not have opening/closing
+      # parts may have opening/closing
+      %[s3] 's4'\
+      "s5#{[?s]}s6"
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+    string_token_texts = tokens.select { |t| t[:kind] == :string }.map(&:text)
+    expected_string_token_texts = %w[? S %[ s3 ] ' s4 ' " s5  #{ ? s } s6 "]
+    assert_equal(expected_string_token_texts, string_token_texts)
+  end
+
+  def test_symbol
+    code = <<~'RUBY'
+      # symbol without closing
+      :sym1
+      # symbol with opening/closing
+      :"sym2"
+      %s[sym3]
+      # opening and content has gap
+      <<~A; :\
+      A
+      sym4
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+    symbol_token_texts = tokens.select { |t| t[:kind] == :symbol }.map(&:text)
+    expected_symbol_token_texts = %w[: sym1 :" sym2 " %s[ sym3 ] : sym4]
+    assert_equal(expected_symbol_token_texts, symbol_token_texts)
+  end
+
+  def test_heredoc
+    code = <<~'RUBY'
+      def f
+        str1 = <<~AA
+          single-line-heredoc
+        AA
+        str2 = <<~`BB` # comment
+          x-string-heredoc
+        BB
+        str3 = <<~CC.itself
+          multi-line
+          #{embed}
+          heredoc
+        CC
+      end
+    RUBY
+    tokens = RDoc::Parser::RubyColorizer.colorize(code)
+    assert_equal(code, tokens.map(&:text).join)
+    assert_include(tokens, token(:string, '<<~AA'))
+    assert_include(tokens, token(:x_string, '<<~`BB`'))
+    assert_include(tokens, token(:string, '<<~CC'))
+    assert_include(tokens, token(:string, "  AA\n"))
+    assert_include(tokens, token(:x_string, "  BB\n"))
+    assert_include(tokens, token(:string, "  CC\n"))
+    assert_include(tokens, token(:string, "    single-line-heredoc\n"))
+    assert_include(tokens, token(:x_string, "    x-string-heredoc\n"))
+    assert_include(tokens, token(:string, "    multi-line\n"))
+    assert_include(tokens, token(:string, '#{'))
+    assert_include(tokens, token(:identifier, 'embed'))
+    assert_include(tokens, token(:string, '}'))
+    assert_include(tokens, token(:string, "    heredoc\n"))
+  end
+end
diff --git a/test/rdoc/parser/ruby_test.rb b/test/rdoc/parser/ruby_test.rb
index b3cef3a341..a5c79071ba 100644
--- a/test/rdoc/parser/ruby_test.rb
+++ b/test/rdoc/parser/ruby_test.rb
@@ -2406,6 +2406,23 @@ def test_read_directive_linear_performance
     end
   end
 
+  def test_code_object_token_stream
+    util_parser <<~RUBY
+      class Foo
+        def foo
+          42
+        end
+
+        private def bar
+          baz
+        end
+      end
+    RUBY
+
+    foo, bar = @top_level.classes.first.method_list
+    assert_equal(['  ', 'def', ' ', 'foo', "\n", '    ', '42', "\n", '  ', 'end'], foo.token_stream.map(&:text))
+    assert_equal(['          ', 'def', ' ', 'bar', "\n", '    ', 'baz', "\n", '  ', 'end'], bar.token_stream.map(&:text))
+  end
 
   def test_markup_first_comment
     util_parser <<~RUBY
diff --git a/test/rdoc/rdoc_token_stream_test.rb b/test/rdoc/rdoc_token_stream_test.rb
index ed5e124cc6..254811c72a 100644
--- a/test/rdoc/rdoc_token_stream_test.rb
+++ b/test/rdoc/rdoc_token_stream_test.rb
@@ -5,17 +5,18 @@ class RDocTokenStreamTest < RDoc::TestCase
 
   def test_class_to_html
     tokens = [
-      { :line_no => 0, :char_no => 0, :kind => :on_const, :text => 'CONSTANT' },
-      { :line_no => 0, :char_no => 0, :kind => :on_kw, :text => 'KW' },
-      { :line_no => 0, :char_no => 0, :kind => :on_ivar, :text => 'IVAR' },
-      { :line_no => 0, :char_no => 0, :kind => :on_op, :text => 'Op' },
-      { :line_no => 0, :char_no => 0, :kind => :on_ident, :text => 'Id' },
-      { :line_no => 0, :char_no => 0, :kind => :on_backref, :text => 'Node' },
-      { :line_no => 0, :char_no => 0, :kind => :on_comment, :text => 'COMMENT' },
-      { :line_no => 0, :char_no => 0, :kind => :on_regexp, :text => 'REGEXP' },
-      { :line_no => 0, :char_no => 0, :kind => :on_tstring, :text => 'STRING' },
-      { :line_no => 0, :char_no => 0, :kind => :on_int, :text => 'Val' },
-      { :line_no => 0, :char_no => 0, :kind => :on_unknown, :text => '\\' }
+      { kind: :constant, text: 'CONSTANT' },
+      { kind: :keyword, text: 'KW' },
+      { kind: :ivar, text: 'IVAR' },
+      { kind: :operator, text: 'Op' },
+      { kind: :identifier, text: 'Id' },
+      { kind: :symbol, text: 'Symbol' },
+      { kind: :x_string, text: 'XString' },
+      { kind: :comment, text: 'COMMENT' },
+      { kind: :regexp, text: 'REGEXP' },
+      { kind: :string, text: 'STRING' },
+      { kind: :value, text: 'Val' },
+      { kind: :plain, text: '\\' }
     ]
 
     expected = [
@@ -24,7 +25,8 @@ def test_class_to_html
       'IVAR',
       'Op',
       'Id',
-      'Node',
+      'Symbol',
+      'XString',
       'COMMENT',
       'REGEXP',
       'STRING',