Skip to content

Commit 4d6b725

Browse files
authored
Move HTML character conversion mechanism to regexp-handling (#1570)
`Text#to_html_characters` was a post-processing step that converted ASCII quotes and marks in the generated HTML to multibyte characters. Post-processing HTML for this is not a good idea; converting plain-text nodes through regexp-handling is better.
1 parent 4806de9 commit 4d6b725

7 files changed

Lines changed: 154 additions & 232 deletions

File tree

lib/rdoc/markup/to_html.rb

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,86 @@ class RDoc::Markup::ToHtml < RDoc::Markup::Formatter
4242

4343
# :section:
4444

45+
# Per-encoding cache of the typographic characters used in HTML output.
#
# Looking up an Encoding that has not been requested before builds, stores
# and returns a Hash that maps a symbolic name (for example +:ellipsis+) to
# the result of encode_fallback for that character, the requested encoding
# and an ASCII substitute.
#
# Note that the +:trademark+ entry holds the registered sign.
#
# See also encode_fallback.

TO_HTML_CHARACTERS = Hash.new do |cache, encoding|
  cache[encoding] = {
    close_dquote: ['”', '"'],
    close_squote: ['’', '\''],
    copyright:    ['©', '(c)'],
    ellipsis:     ['…', '...'],
    dot_ellipsis: ['.…', '....'],
    em_dash:      ['—', '---'],
    en_dash:      ['–', '--'],
    open_dquote:  ['“', '"'],
    open_squote:  ['‘', '\''],
    trademark:    ['®', '(r)'],
  }.transform_values do |(character, fallback)|
    encode_fallback(character, encoding, fallback)
  end
end
64+
65+
# Maps the ASCII sequences recognised in plain text to the name of their
# replacement in TO_HTML_CHARACTERS.
#
# Key order is significant: the keys are combined with Regexp.union, whose
# alternatives are tried left to right, so a longer sequence must be listed
# before any shorter sequence that is its prefix (<tt>---</tt> before
# <tt>--</tt>, <tt>....</tt> before <tt>...</tt>).
#
# The Hash is frozen so the table cannot be modified after load.

HTML_CHARACTER_ALIASES = {
  '(c)'  => :copyright,
  '(C)'  => :copyright,
  '(r)'  => :trademark,
  '(R)'  => :trademark,
  '---'  => :em_dash,
  '--'   => :en_dash,
  '....' => :dot_ellipsis,
  '...'  => :ellipsis,
  '``'   => :open_dquote,
  "''"   => :close_dquote,
}.freeze
77+
78+
# Transcodes +character+ to +encoding+, returning +fallback+ (transcoded to
# +encoding+) instead when +character+ cannot be represented there.
#
# +character+ may consist of more than one character. The substitution is
# all-or-nothing: if any part of +character+ has no counterpart in
# +encoding+, the whole result is +fallback+, so a multi-character value such
# as <tt>'.…'</tt> never ends up as a mix of original and substitute text.

def self.encode_fallback(character, encoding, fallback)
  character.encode(encoding)
rescue Encoding::UndefinedConversionError
  fallback.encode(encoding)
end
88+
89+
# Turns ASCII quote characters into typographic quotes.
#
# An instance remembers whether a double-quoted and/or a single-quoted span
# is currently open, so the quotes of one piece of text are expected to be
# passed to the same instance in order.
class QuoteConverter
  def initialize
    @in_dquote = false
    @in_squote = false
  end

  # Returns the replacement for +quote+ (<tt>"</tt>, <tt>'</tt> or
  # <tt>`</tt>), taken from TO_HTML_CHARACTERS for the encoding of +quote+,
  # or +nil+ when no replacement applies.
  #
  # +after_word+ is truthy when the caller found +quote+ right after a word,
  # as in <tt>Mary's</tt>.
  def convert(quote, after_word:)
    name =
      case quote
      when '"' then dquote_name
      when "'" then squote_name(after_word)
      when '`' then backtick_name(after_word)
      end

    name && TO_HTML_CHARACTERS[quote.encoding][name]
  end

  private

  # Double quotes simply alternate between opening and closing.
  def dquote_name
    opening = !@in_dquote
    @in_dquote = opening
    opening ? :open_dquote : :close_dquote
  end

  # A single quote closes an open span, acts as an apostrophe after a word,
  # and opens a new span otherwise.
  def squote_name(after_word)
    if @in_squote
      @in_squote = false
      :close_squote
    elsif after_word
      # Mary's dog, my parents' house: do not start paired quotes
      :close_squote
    else
      @in_squote = true
      :open_squote
    end
  end

  # Opening quote of <tt>`quoted sentence'</tt>.
  # This will conflict with code blocks <tt>`puts('hello')`</tt> in the future.
  def backtick_name(after_word)
    return if @in_squote || after_word

    @in_squote = true
    :open_squote
  end
end
124+
45125
##
46126
# Creates a new formatter that will output HTML
47127

@@ -55,6 +135,7 @@ def initialize(pipe: false, output_decoration: true)
55135
@in_list_entry = nil
56136
@list = nil
57137
@th = nil
138+
@quote_converter = nil
58139
@in_tidylink_label = false
59140
@hard_break = "<br>\n"
60141

@@ -79,6 +160,11 @@ def init_regexp_handlings
79160
# suppress crossref: \#method \::method \ClassName \method_with_underscores
80161
@markup.add_regexp_handling(/\\(?:[#:A-Z]|[a-z]+_[a-z0-9])/, :SUPPRESSED_CROSSREF)
81162

163+
@markup.add_regexp_handling(Regexp.union(HTML_CHARACTER_ALIASES.keys), :HTML_CHARACTERS)
164+
165+
@markup.add_regexp_handling(/\b['"`]/, :QUOTE_AFTER_WORD)
166+
@markup.add_regexp_handling(/\B['"`]/, :QUOTE_NOT_AFTER_WORD)
167+
82168
init_link_notation_regexp_handlings
83169
end
84170

@@ -231,12 +317,28 @@ def handle_TIDYLINK(label_part, url)
231317

232318
# Runs the inherited inline handling with a fresh output buffer and a fresh
# QuoteConverter, then returns the buffer and clears both instance variables.
def handle_inline(text) # :nodoc:
  @inline_output = +''
  @quote_converter = QuoteConverter.new
  super
  @inline_output.tap do
    @inline_output = nil
    @quote_converter = nil
  end
end
239327

328+
# Replaces +text+, one of the ASCII sequences listed in
# HTML_CHARACTER_ALIASES (<tt>(c), (C), (r), (R), --, ---, ..., ...., ``,
# ''</tt>), with the matching entry of TO_HTML_CHARACTERS for the encoding of
# +text+. Returns +nil+ for any other +text+.
def handle_regexp_HTML_CHARACTERS(text)
  character_name = HTML_CHARACTER_ALIASES[text]
  return unless character_name

  TO_HTML_CHARACTERS[text.encoding][character_name]
end
333+
334+
# Handles a quote character registered as QUOTE_NOT_AFTER_WORD: asks the
# current quote converter for a replacement and falls back to
# convert_string when it returns none.
def handle_regexp_QUOTE_NOT_AFTER_WORD(text)
  replacement = @quote_converter.convert(text, after_word: false)
  replacement || convert_string(text)
end
337+
338+
# Handles a quote character registered as QUOTE_AFTER_WORD: asks the current
# quote converter for a replacement and falls back to convert_string when it
# returns none.
def handle_regexp_QUOTE_AFTER_WORD(text)
  replacement = @quote_converter.convert(text, after_word: true)
  replacement || convert_string(text)
end
341+
240342
# Converts suppressed cross-reference +text+ to HTML by removing the leading backslash.
241343

242344
def handle_regexp_SUPPRESSED_CROSSREF(text)
@@ -576,9 +678,6 @@ def parseable?(text)
576678
# Converts +item+ to HTML using RDoc::Text#to_html
577679

578680
def to_html(item)
579-
# Ideally, we should convert html characters at handle_PLAIN_TEXT or somewhere else,
580-
# but we need to convert it here for now because to_html_characters converts pair of backticks to ’‘ and pair of double backticks to ”“.
581-
# Known bugs: `...` in `<code>def f(...); end</code>` and `(c) in `<a href="(c)">` will be wrongly converted.
582-
to_html_characters(handle_inline(item))
681+
handle_inline(item)
583682
end
584683
end

lib/rdoc/markup/to_html_snippet.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def accept_verbatim(verbatim)
109109
input = verbatim.text.rstrip
110110
text = truncate(input, @character_limit - @characters)
111111
@characters += input.length
112-
text << ' ...' unless text == input
112+
text << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" unless text == input
113113

114114
super RDoc::Markup::Verbatim.new text
115115

@@ -262,14 +262,14 @@ def handle_inline(text)
262262
return ['', 0] if limit <= 0
263263
@inline_character_limit = limit
264264
res = super
265-
res << ' ...' if @inline_character_limit <= 0
265+
res << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" if @inline_character_limit <= 0
266266
@characters += limit - @inline_character_limit
267267
res
268268
end
269269

270270
def to_html(item)
271271
throw :done if @characters >= @character_limit
272-
to_html_characters(handle_inline(item))
272+
handle_inline(item)
273273
end
274274

275275
##

lib/rdoc/text.rb

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -29,34 +29,6 @@ module RDoc::Text
2929

3030
MARKUP_FORMAT.default = RDoc::Markup
3131

32-
##
33-
# Maps an encoding to a Hash of characters properly transcoded for that
34-
# encoding.
35-
#
36-
# See also encode_fallback.
37-
38-
TO_HTML_CHARACTERS = Hash.new do |h, encoding|
39-
h[encoding] = {
40-
:close_dquote => encode_fallback('”', encoding, '"'),
41-
:close_squote => encode_fallback('’', encoding, '\''),
42-
:copyright => encode_fallback('©', encoding, '(c)'),
43-
:ellipsis => encode_fallback('…', encoding, '...'),
44-
:em_dash => encode_fallback('—', encoding, '---'),
45-
:en_dash => encode_fallback('–', encoding, '--'),
46-
:open_dquote => encode_fallback('“', encoding, '"'),
47-
:open_squote => encode_fallback('‘', encoding, '\''),
48-
:trademark => encode_fallback('®', encoding, '(r)'),
49-
}
50-
end
51-
52-
##
53-
# Transcodes +character+ to +encoding+ with a +fallback+ character.
54-
55-
def self.encode_fallback(character, encoding, fallback)
56-
character.encode(encoding, :fallback => { character => fallback },
57-
:undef => :replace, :replace => fallback)
58-
end
59-
6032
##
6133
# Expands tab characters in +text+ to eight spaces
6234

@@ -193,95 +165,6 @@ def strip_stars(text)
193165
text.gsub(/^\s+$/, empty)
194166
end
195167

196-
def to_html(text)
197-
to_html_characters(text)
198-
end
199-
200-
##
201-
# Converts ampersand, dashes, ellipsis, quotes, copyright and registered
202-
# trademark symbols in +text+ to properly encoded characters.
203-
204-
def to_html_characters(text)
205-
html = (''.encode text.encoding).dup
206-
207-
encoded = RDoc::Text::TO_HTML_CHARACTERS[text.encoding]
208-
209-
s = StringScanner.new text
210-
insquotes = false
211-
indquotes = false
212-
after_word = nil
213-
214-
until s.eos? do
215-
case
216-
when s.scan(/<(tt|code)>.*?<\/\1>/) then # skip contents of tt
217-
html << s.matched
218-
when s.scan(/<(tt|code)>.*?/) then
219-
warn "mismatched <#{s[1]}> tag" # TODO signal file/line
220-
html << s.matched
221-
when s.scan(/<[^>]+\/?s*>/) then # skip HTML tags
222-
html << s.matched
223-
when s.scan(/\.\.\.(\.?)/) then
224-
html << s[1] << encoded[:ellipsis]
225-
after_word = nil
226-
when s.scan(/\(c\)/i) then
227-
html << encoded[:copyright]
228-
after_word = nil
229-
when s.scan(/\(r\)/i) then
230-
html << encoded[:trademark]
231-
after_word = nil
232-
when s.scan(/---/) then
233-
html << encoded[:em_dash]
234-
after_word = nil
235-
when s.scan(/--/) then
236-
html << encoded[:en_dash]
237-
after_word = nil
238-
when s.scan(/&quot;|"/) then
239-
html << encoded[indquotes ? :close_dquote : :open_dquote]
240-
indquotes = !indquotes
241-
after_word = nil
242-
when s.scan(/``/) then # backtick double quote
243-
html << encoded[:open_dquote]
244-
after_word = nil
245-
when s.scan(/(?:&#39;|'){2}/) then # tick double quote
246-
html << encoded[:close_dquote]
247-
after_word = nil
248-
when s.scan(/`/) then # backtick
249-
if insquotes or after_word
250-
html << '`'
251-
after_word = false
252-
else
253-
html << encoded[:open_squote]
254-
insquotes = true
255-
end
256-
when s.scan(/&#39;|'/) then # single quote
257-
if insquotes
258-
html << encoded[:close_squote]
259-
insquotes = false
260-
elsif after_word
261-
# Mary's dog, my parents' house: do not start paired quotes
262-
html << encoded[:close_squote]
263-
else
264-
html << encoded[:open_squote]
265-
insquotes = true
266-
end
267-
268-
after_word = nil
269-
else # advance to the next potentially significant character
270-
match = s.scan(/.+?(?=[<\\.("'`&-])/) #"
271-
272-
if match then
273-
html << match
274-
after_word = match =~ /\w$/
275-
else
276-
html << s.rest
277-
break
278-
end
279-
end
280-
end
281-
282-
html
283-
end
284-
285168
##
286169
# Wraps +txt+ to +line_len+
287170

test/rdoc/markup/to_html_crossref_test.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_convert_CROSSREF_section_with_spaces
118118

119119
def test_convert_CROSSREF_legacy_label
120120
result = @to.convert 'C1@What-27s+Here'
121-
assert_equal para("<a href=\"C1.html#class-c1-whats-here\">What\u2019s Here at <code>C1</code></a>"), result
121+
assert_equal para("<a href=\"C1.html#class-c1-whats-here\">What's Here at <code>C1</code></a>"), result
122122
end
123123

124124
def test_convert_CROSSREF_legacy_label_colon
@@ -130,7 +130,7 @@ def test_convert_CROSSREF_legacy_section
130130
@c1.add_section "What's Here"
131131

132132
result = @to.convert "C1@What-27s+Here"
133-
assert_equal para("<a href=\"C1.html#whats-here\">What\u2019s Here at <code>C1</code></a>"), result
133+
assert_equal para("<a href=\"C1.html#whats-here\">What's Here at <code>C1</code></a>"), result
134134
end
135135

136136
def test_convert_CROSSREF_constant

test/rdoc/markup/to_html_snippet_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ def test_convert_limit_verbatim
543543
<p>Hello There
544544
<p>This is some text, it <strong>will</strong> be cut off after 100 characters
545545
546-
<pre>This one is cut off in this verbatim ...</pre>
546+
<pre>This one is cut off in this verbatim </pre>
547547
EXPECTED
548548

549549
actual = @to.convert rdoc

test/rdoc/markup/to_html_test.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,51 @@ def test_convert_string
718718
assert_equal '&lt;&gt;', @to.convert_string('<>')
719719
end
720720

721+
# encode_fallback keeps the character when the target encoding has it and
# returns the ASCII substitute otherwise.
def test_self_converter_encode_fallback
  in_utf8 = RDoc::Markup::ToHtml.encode_fallback('…', Encoding::UTF_8, '...')
  in_ascii = RDoc::Markup::ToHtml.encode_fallback('…', Encoding::US_ASCII, '...')

  assert_equal '…', in_utf8
  assert_equal '...', in_ascii
end
727+
728+
# The ASCII aliases are converted in bold text and in link labels, and are
# left alone inside <tt> and inside link URLs.
def test_convert_HTML_CHARACTER
  {
    "<b>(c)(r)(C)(R)...--....---``''</b>" =>
      "\n<p><strong>©®©®…–.…—“”</strong></p>\n",
    "<tt>(c)(r)(C)(R)...--....---``''</tt>" =>
      "\n<p><code>(c)(r)(C)(R)...--....---``&#39;&#39;</code></p>\n",
    "{(c)(r)(C)(R)...--....---``''}[url]" =>
      "\n<p><a href=\"url\">©®©®…–.…—“”</a></p>\n",
    "{link}[http://example.com/?q=(c)(r)(C)(R)...--....---``'']" =>
      "\n<p><a href=\"http://example.com/?q=(c)(r)(C)(R)...--....---``&#39;&#39;\">link</a></p>\n",
  }.each do |markup, expected|
    assert_equal expected, @to.convert(markup)
  end
end
741+
742+
# Output keeps the encoding of the input; the expected text is the
# Shift_JIS transcoding of an ellipsis followed by a literal "(c)".
def test_convert_HTML_CHARACTER_encoding
  sjis = Encoding::Shift_JIS
  html = @to.convert('...(c)'.encode(sjis))

  assert_equal sjis, html.encoding

  converted = '…(c)'.encode(sjis)
  assert_equal "\n<p>#{converted}</p>\n", html
end
750+
751+
# Double quotes pair up into opening and closing quotes, even when the
# quoted text contains inline markup.
def test_convert_QUOTE_dquote
  markup = '"This is a +quoted+ string." and "another"'
  expected = "\n<p>“This is a <code>quoted</code> string.” and “another”</p>\n"

  assert_equal expected, @to.convert(markup)
end
755+
756+
# Single quotes pair up, while an apostrophe inside a word becomes a
# closing quote without opening a pair.
def test_convert_QUOTE_squote
  markup = "'quote' '1+2'. I'm 'RDoc'"
  expected = "\n<p>‘quote’ ‘1+2’. I’m ‘RDoc’</p>\n"

  assert_equal expected, @to.convert(markup)
end
760+
761+
# A backtick paired with a single quote becomes a quoted span; a backtick
# pair is rendered as code.
def test_convert_QUOTE_backtick
  markup = "This is `quote' and this is `code`"
  expected = "\n<p>This is ‘quote’ and this is <code>code</code></p>\n"

  assert_equal expected, @to.convert(markup)
end
765+
721766
def test_convert_HYPERLINK_irc
722767
result = @to.convert 'irc://irc.freenode.net/#ruby-lang'
723768

0 commit comments

Comments
 (0)