Skip to content

Commit 6eb654a

Browse files
committed
Move HTML character conversion mechanism to regexp-handling
`Text#to_html_characters` was a postprocessing step that converted ASCII quotes/marks to multibyte characters. Postprocessing HTML to do that is not a good idea; converting plain text nodes is better.
1 parent 9456e79 commit 6eb654a

6 files changed

Lines changed: 148 additions & 230 deletions

File tree

lib/rdoc/markup/to_html.rb

Lines changed: 99 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,82 @@ class RDoc::Markup::ToHtml < RDoc::Markup::Formatter
4040

4141
# :section:
4242

43+
# Maps an encoding to a Hash of characters properly transcoded for that
44+
# encoding.
45+
#
46+
# See also encode_fallback.
47+
48+
TO_HTML_CHARACTERS = Hash.new do |h, encoding|
49+
h[encoding] = {
50+
:close_dquote => encode_fallback('”', encoding, '"'),
51+
:close_squote => encode_fallback('’', encoding, '\''),
52+
:copyright => encode_fallback('©', encoding, '(c)'),
53+
:ellipsis => encode_fallback('…', encoding, '...'),
54+
:dot_ellipsis => encode_fallback('.…', encoding, '....'),
55+
:em_dash => encode_fallback('—', encoding, '---'),
56+
:en_dash => encode_fallback('–', encoding, '--'),
57+
:open_dquote => encode_fallback('“', encoding, '"'),
58+
:open_squote => encode_fallback('‘', encoding, '\''),
59+
:trademark => encode_fallback('®', encoding, '(r)'),
60+
}
61+
end
62+
63+
HTML_CHARACTER_ALIASES = {
64+
'(c)' => :copyright,
65+
'(C)' => :copyright,
66+
'(r)' => :trademark,
67+
'(R)' => :trademark,
68+
'---' => :em_dash,
69+
'--' => :en_dash,
70+
'....' => :dot_ellipsis,
71+
'...' => :ellipsis,
72+
'``' => :open_dquote,
73+
"''" => :close_dquote,
74+
}
75+
76+
# Transcodes +character+ to +encoding+ with a +fallback+ character.
77+
78+
def self.encode_fallback(character, encoding, fallback)
79+
character.encode(encoding, :fallback => { character => fallback },
80+
:undef => :replace, :replace => fallback)
81+
end
82+
83+
# Converts ascii quote pairs to multibyte quote characters
84+
class QuoteConverter
  # Starts with no open double-quote or single-quote pair.
  def initialize
    @in_dquote = false
    @in_squote = false
  end

  # Returns the typographic replacement for +quote+ (one of <tt>"</tt>,
  # <tt>'</tt> or <tt>`</tt>), looked up in TO_HTML_CHARACTERS for the
  # encoding of +quote+. Returns +nil+ when no conversion applies, so the
  # caller can emit the original character instead.
  #
  # +after_word+ tells whether the quote directly follows a word character.
  # It is used to tell apostrophes apart from opening quotes.
  #
  # The open/closed state of double and single quote pairs is remembered
  # between calls on the same instance.
  def convert(quote, after_word:)
    case quote
    when '"'
      # Double quotes simply alternate between opening and closing.
      type = @in_dquote ? :close_dquote : :open_dquote
      @in_dquote = !@in_dquote
    when "'"
      if @in_squote
        type = :close_squote
        @in_squote = false
      elsif after_word
        # Mary's dog, my parents' house: do not start paired quotes
        type = :close_squote
      else
        type = :open_squote
        @in_squote = true
      end
    when '`'
      # Opening quote of <tt>`quoted sentence'</tt>.
      # This will conflict with code blocks <tt>`puts('hello')`</tt> in the future.
      if !@in_squote && !after_word
        type = :open_squote
        @in_squote = true
      end
    end

    # type stays nil for an unconverted backtick or an unknown character.
    TO_HTML_CHARACTERS[quote.encoding][type] if type
  end
end
118+
43119
##
44120
# Creates a new formatter that will output HTML
45121

@@ -51,6 +127,7 @@ def initialize(options, markup = nil)
51127
@in_list_entry = nil
52128
@list = nil
53129
@th = nil
130+
@quote_converter = nil
54131
@in_tidylink_label = false
55132
@hard_break = "<br>\n"
56133

@@ -75,6 +152,11 @@ def init_regexp_handlings
75152
# suppress crossref: \#method \::method \ClassName \method_with_underscores
76153
@markup.add_regexp_handling(/\\(?:[#:A-Z]|[a-z]+_[a-z0-9])/, :SUPPRESSED_CROSSREF)
77154

155+
@markup.add_regexp_handling(Regexp.union(HTML_CHARACTER_ALIASES.keys), :HTML_CHARACTERS)
156+
157+
@markup.add_regexp_handling(/\b['"`]/, :QUOTE_AFTER_WORD)
158+
@markup.add_regexp_handling(/\B['"`]/, :QUOTE_NOT_AFTER_WORD)
159+
78160
init_link_notation_regexp_handlings
79161
end
80162

@@ -227,12 +309,28 @@ def handle_TIDYLINK(label_part, url)
227309

228310
def handle_inline(text) # :nodoc:
229311
@inline_output = +''
312+
@quote_converter = QuoteConverter.new
230313
super
231314
out = @inline_output
232315
@inline_output = nil
316+
@quote_converter = nil
233317
out
234318
end
235319

320+
# Converts <tt>(c), (r), --, ---, ..., ...., ``, ''</tt> to HTML characters.
321+
def handle_regexp_HTML_CHARACTERS(text)
322+
name = HTML_CHARACTER_ALIASES[text]
323+
TO_HTML_CHARACTERS[text.encoding][name] if name
324+
end
325+
326+
def handle_regexp_QUOTE_NOT_AFTER_WORD(text)
327+
@quote_converter.convert(text, after_word: false) || convert_string(text)
328+
end
329+
330+
def handle_regexp_QUOTE_AFTER_WORD(text)
331+
@quote_converter.convert(text, after_word: true) || convert_string(text)
332+
end
333+
236334
# Converts suppressed cross-reference +text+ to HTML by removing the leading backslash.
237335

238336
def handle_regexp_SUPPRESSED_CROSSREF(text)
@@ -565,10 +663,7 @@ def parseable?(text)
565663
# Converts +item+ to HTML using handle_inline
566664

567665
def to_html(item)
568-
# Ideally, we should convert html characters at handle_PLAIN_TEXT or somewhere else,
569-
# but we need to convert it here for now because to_html_characters converts pair of backticks to ’‘ and pair of double backticks to ”“.
570-
# Known bugs: `...` in `<code>def f(...); end</code>` and `(c) in `<a href="(c)">` will be wrongly converted.
571-
to_html_characters(handle_inline(item))
666+
handle_inline(item)
572667
end
573668
end
574669

lib/rdoc/markup/to_html_snippet.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def accept_verbatim(verbatim)
109109
input = verbatim.text.rstrip
110110
text = truncate(input, @character_limit - @characters)
111111
@characters += input.length
112-
text << ' ...' unless text == input
112+
text << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" unless text == input
113113

114114
super RDoc::Markup::Verbatim.new text
115115

@@ -262,14 +262,14 @@ def handle_inline(text)
262262
return ['', 0] if limit <= 0
263263
@inline_character_limit = limit
264264
res = super
265-
res << ' ...' if @inline_character_limit <= 0
265+
res << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" if @inline_character_limit <= 0
266266
@characters += limit - @inline_character_limit
267267
res
268268
end
269269

270270
def to_html(item)
271271
throw :done if @characters >= @character_limit
272-
to_html_characters(handle_inline(item))
272+
handle_inline(item)
273273
end
274274

275275
##

lib/rdoc/text.rb

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -29,34 +29,6 @@ module RDoc::Text
2929

3030
MARKUP_FORMAT.default = RDoc::Markup
3131

32-
##
33-
# Maps an encoding to a Hash of characters properly transcoded for that
34-
# encoding.
35-
#
36-
# See also encode_fallback.
37-
38-
TO_HTML_CHARACTERS = Hash.new do |h, encoding|
39-
h[encoding] = {
40-
:close_dquote => encode_fallback('”', encoding, '"'),
41-
:close_squote => encode_fallback('’', encoding, '\''),
42-
:copyright => encode_fallback('©', encoding, '(c)'),
43-
:ellipsis => encode_fallback('…', encoding, '...'),
44-
:em_dash => encode_fallback('—', encoding, '---'),
45-
:en_dash => encode_fallback('–', encoding, '--'),
46-
:open_dquote => encode_fallback('“', encoding, '"'),
47-
:open_squote => encode_fallback('‘', encoding, '\''),
48-
:trademark => encode_fallback('®', encoding, '(r)'),
49-
}
50-
end
51-
52-
##
53-
# Transcodes +character+ to +encoding+ with a +fallback+ character.
54-
55-
def self.encode_fallback(character, encoding, fallback)
56-
character.encode(encoding, :fallback => { character => fallback },
57-
:undef => :replace, :replace => fallback)
58-
end
59-
6032
##
6133
# Expands tab characters in +text+ to eight spaces
6234

@@ -193,95 +165,6 @@ def strip_stars(text)
193165
text.gsub(/^\s+$/, empty)
194166
end
195167

196-
def to_html(text)
197-
to_html_characters(text)
198-
end
199-
200-
##
201-
# Converts ampersand, dashes, ellipsis, quotes, copyright and registered
202-
# trademark symbols in +text+ to properly encoded characters.
203-
204-
def to_html_characters(text)
205-
html = (''.encode text.encoding).dup
206-
207-
encoded = RDoc::Text::TO_HTML_CHARACTERS[text.encoding]
208-
209-
s = StringScanner.new text
210-
insquotes = false
211-
indquotes = false
212-
after_word = nil
213-
214-
until s.eos? do
215-
case
216-
when s.scan(/<(tt|code)>.*?<\/\1>/) then # skip contents of tt
217-
html << s.matched
218-
when s.scan(/<(tt|code)>.*?/) then
219-
warn "mismatched <#{s[1]}> tag" # TODO signal file/line
220-
html << s.matched
221-
when s.scan(/<[^>]+\/?s*>/) then # skip HTML tags
222-
html << s.matched
223-
when s.scan(/\.\.\.(\.?)/) then
224-
html << s[1] << encoded[:ellipsis]
225-
after_word = nil
226-
when s.scan(/\(c\)/i) then
227-
html << encoded[:copyright]
228-
after_word = nil
229-
when s.scan(/\(r\)/i) then
230-
html << encoded[:trademark]
231-
after_word = nil
232-
when s.scan(/---/) then
233-
html << encoded[:em_dash]
234-
after_word = nil
235-
when s.scan(/--/) then
236-
html << encoded[:en_dash]
237-
after_word = nil
238-
when s.scan(/&quot;|"/) then
239-
html << encoded[indquotes ? :close_dquote : :open_dquote]
240-
indquotes = !indquotes
241-
after_word = nil
242-
when s.scan(/``/) then # backtick double quote
243-
html << encoded[:open_dquote]
244-
after_word = nil
245-
when s.scan(/(?:&#39;|'){2}/) then # tick double quote
246-
html << encoded[:close_dquote]
247-
after_word = nil
248-
when s.scan(/`/) then # backtick
249-
if insquotes or after_word
250-
html << '`'
251-
after_word = false
252-
else
253-
html << encoded[:open_squote]
254-
insquotes = true
255-
end
256-
when s.scan(/&#39;|'/) then # single quote
257-
if insquotes
258-
html << encoded[:close_squote]
259-
insquotes = false
260-
elsif after_word
261-
# Mary's dog, my parents' house: do not start paired quotes
262-
html << encoded[:close_squote]
263-
else
264-
html << encoded[:open_squote]
265-
insquotes = true
266-
end
267-
268-
after_word = nil
269-
else # advance to the next potentially significant character
270-
match = s.scan(/.+?(?=[<\\.("'`&-])/) #"
271-
272-
if match then
273-
html << match
274-
after_word = match =~ /\w$/
275-
else
276-
html << s.rest
277-
break
278-
end
279-
end
280-
end
281-
282-
html
283-
end
284-
285168
##
286169
# Wraps +txt+ to +line_len+
287170

test/rdoc/markup/to_html_snippet_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ def test_convert_limit_verbatim
543543
<p>Hello There
544544
<p>This is some text, it <strong>will</strong> be cut off after 100 characters
545545
546-
<pre>This one is cut off in this verbatim ...</pre>
546+
<pre>This one is cut off in this verbatim </pre>
547547
EXPECTED
548548

549549
actual = @to.convert rdoc

test/rdoc/markup/to_html_test.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,51 @@ def test_convert_string
669669
assert_equal '&lt;&gt;', @to.convert_string('<>')
670670
end
671671

672+
def test_self_converter_encode_fallback
673+
assert_equal '…',
674+
RDoc::Markup::ToHtml::encode_fallback('…', Encoding::UTF_8, '...')
675+
assert_equal '...',
676+
RDoc::Markup::ToHtml::encode_fallback('…', Encoding::US_ASCII, '...')
677+
end
678+
679+
def test_convert_HTML_CHARACTER
680+
result = @to.convert "<b>(c)(r)(C)(R)...--....---``''</b>"
681+
assert_equal "\n<p><strong>©®©®…–.…—“”</strong></p>\n", result
682+
683+
result = @to.convert "<tt>(c)(r)(C)(R)...--....---``''</tt>"
684+
assert_equal "\n<p><code>(c)(r)(C)(R)...--....---``&#39;&#39;</code></p>\n", result
685+
686+
result = @to.convert "{(c)(r)(C)(R)...--....---``''}[url]"
687+
assert_equal "\n<p><a href=\"url\">©®©®…–.…—“”</a></p>\n", result
688+
689+
result = @to.convert "{link}[http://example.com/?q=(c)(r)(C)(R)...--....---``'']"
690+
assert_equal "\n<p><a href=\"http://example.com/?q=(c)(r)(C)(R)...--....---``&#39;&#39;\">link</a></p>\n", result
691+
end
692+
693+
def test_convert_HTML_CHARACTER_encoding
694+
s = '...(c)'.encode Encoding::Shift_JIS
695+
result = @to.convert s
696+
assert_equal Encoding::Shift_JIS, result.encoding
697+
698+
expected = '…(c)'.encode Encoding::Shift_JIS
699+
assert_equal "\n<p>#{expected}</p>\n", result
700+
end
701+
702+
def test_convert_QUOTE_dquote
703+
result = @to.convert '"This is a +quoted+ string." and "another"'
704+
assert_equal "\n<p>“This is a <code>quoted</code> string.” and “another”</p>\n", result
705+
end
706+
707+
def test_convert_QUOTE_squote
708+
result = @to.convert "'quote' '1+2'. I'm 'RDoc'"
709+
assert_equal "\n<p>‘quote’ ‘1+2’. I’m ‘RDoc’</p>\n", result
710+
end
711+
712+
def test_convert_QUOTE_backtick
713+
result = @to.convert "This is `quote' and this is `code`"
714+
assert_equal "\n<p>This is ‘quote’ and this is <code>code</code></p>\n", result
715+
end
716+
672717
def test_convert_HYPERLINK_irc
673718
result = @to.convert 'irc://irc.freenode.net/#ruby-lang'
674719

0 commit comments

Comments
 (0)