5555import com .google .common .base .CharMatcher ;
5656import com .google .common .collect .ImmutableList ;
5757import com .google .common .collect .PeekingIterator ;
58- import com .google .googlejavaformat .java .javadoc .Token .Type ;
5958import java .util .ArrayDeque ;
6059import java .util .ArrayList ;
6160import java .util .Deque ;
6564/** Lexer for the Javadoc formatter. */
6665final class JavadocLexer {
6766 /** Takes a Javadoc comment, including ∕✱✱ and ✱∕, and returns tokens, including ∕✱✱ and ✱∕. */
68- static ImmutableList <Token > lex (String input ) throws LexException {
69- /*
70- * TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their
71- * original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre> tag,
72- * so we'll probably never bother.
73- */
74- input = stripJavadocBeginAndEnd (input );
67+ static ImmutableList <Token > lex (String input , boolean classicJavadoc ) throws LexException {
68+ MarkdownPositions markdownPositions ;
69+ if (classicJavadoc ) {
70+ /*
71+ * TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their
72+ * original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre>
73+ * tag, so we'll probably never bother.
74+ */
75+ input = stripJavadocBeginAndEnd (input );
76+ markdownPositions = MarkdownPositions .EMPTY ;
77+ } else {
78+ checkArgument (input .startsWith ("///" ));
79+ input = input .substring ("///" .length ());
80+ markdownPositions = MarkdownPositions .parse (input );
81+ }
7582 input = normalizeLineEndings (input );
76- return new JavadocLexer (new CharStream (input )).generateTokens ();
83+ return new JavadocLexer (new CharStream (input ), markdownPositions , classicJavadoc )
84+ .generateTokens ();
7785 }
7886
7987 /** The lexer crashes on windows line endings, so for now just normalize to `\n`. */
@@ -95,56 +103,65 @@ private static String stripJavadocBeginAndEnd(String input) {
95103 }
96104
97105 private final CharStream input ;
106+ private final boolean classicJavadoc ;
107+ private final MarkdownPositions markdownPositions ;
98108 private final NestingStack braceStack = new NestingStack ();
99109 private final NestingStack preStack = new NestingStack ();
100110 private final NestingStack codeStack = new NestingStack ();
101111 private final NestingStack tableStack = new NestingStack ();
102112 private boolean outerInlineTagIsSnippet ;
103113 private boolean somethingSinceNewline ;
104114
/**
 * Creates a lexer over the given character stream.
 *
 * @param input the comment text to lex; must not be null
 * @param markdownPositions the Markdown tokens to emit, keyed by offset into {@code input}; must
 *     not be null (callers pass {@code MarkdownPositions.EMPTY} for classic comments)
 * @param classicJavadoc whether the comment is a classic block comment rather than a Markdown
 *     one
 */
private JavadocLexer(
    CharStream input, MarkdownPositions markdownPositions, boolean classicJavadoc) {
  this.input = checkNotNull(input);
  // Fail fast here, like for input, rather than with an NPE in the middle of generateTokens().
  this.markdownPositions = checkNotNull(markdownPositions);
  this.classicJavadoc = classicJavadoc;
}
108121
109122 private ImmutableList <Token > generateTokens () throws LexException {
110123 ImmutableList .Builder <Token > tokens = ImmutableList .builder ();
111124
112- Token token = new Token (BEGIN_JAVADOC , "/**" );
125+ Token token = new Token (BEGIN_JAVADOC , classicJavadoc ? "/**" : "/// " );
113126 tokens .add (token );
114127
115128 while (!input .isExhausted ()) {
129+ tokens .addAll (markdownPositions .tokensAt (input .position ()));
116130 token = readToken ();
117131 tokens .add (token );
118132 }
119133
120134 checkMatchingTags ();
121135
122- token = new Token (END_JAVADOC , "*/" );
136+ token = new Token (END_JAVADOC , classicJavadoc ? "*/" : " " );
123137 tokens .add (token );
124138
125139 ImmutableList <Token > result = tokens .build ();
126140 result = joinAdjacentLiteralsAndAdjacentWhitespace (result );
127- result = inferParagraphTags (result );
141+ if (classicJavadoc ) {
142+ result = inferParagraphTags (result );
143+ }
128144 result = optionalizeSpacesAfterLinks (result );
129145 result = deindentPreCodeBlocks (result );
130146 return result ;
131147 }
132148
133149 private Token readToken () throws LexException {
134- Type type = consumeToken ();
150+ Token . Type type = consumeToken ();
135151 String value = input .readAndResetRecorded ();
136152 return new Token (type , value );
137153 }
138154
139- private Type consumeToken () throws LexException {
155+ private Token . Type consumeToken () throws LexException {
140156 boolean preserveExistingFormatting = preserveExistingFormatting ();
141157
142- if (input .tryConsumeRegex (NEWLINE_PATTERN )) {
158+ Pattern newlinePattern = classicJavadoc ? CLASSIC_NEWLINE_PATTERN : MARKDOWN_NEWLINE_PATTERN ;
159+ if (input .tryConsumeRegex (newlinePattern )) {
143160 somethingSinceNewline = false ;
144161 return preserveExistingFormatting ? FORCED_NEWLINE : WHITESPACE ;
145162 } else if (input .tryConsume (" " ) || input .tryConsume ("\t " )) {
146163 // TODO(cpovirk): How about weird whitespace chars? Ideally we'd distinguish breaking vs. not.
147- // Returning LITERAL here prevent us from breaking a <pre> line. For more info, see LITERAL.
164+ // Returning LITERAL here prevents us from breaking a <pre> line. For more info, see LITERAL.
148165 return preserveExistingFormatting ? LITERAL : WHITESPACE ;
149166 }
150167
@@ -187,7 +204,7 @@ private Type consumeToken() throws LexException {
187204
188205 // Inside an inline tag, don't do any HTML interpretation.
189206 if (!braceStack .isEmpty ()) {
190- verify (input .tryConsumeRegex (LITERAL_PATTERN ));
207+ verify (input .tryConsumeRegex (literalPattern () ));
191208 return LITERAL ;
192209 }
193210
@@ -216,7 +233,7 @@ private Type consumeToken() throws LexException {
216233 }
217234
218235 if (preserveExistingFormatting ) {
219- verify (input .tryConsumeRegex (LITERAL_PATTERN ));
236+ verify (input .tryConsumeRegex (literalPattern () ));
220237 return LITERAL ;
221238 }
222239
@@ -248,7 +265,7 @@ private Type consumeToken() throws LexException {
248265 return MOE_END_STRIP_COMMENT ;
249266 } else if (input .tryConsumeRegex (HTML_COMMENT_PATTERN )) {
250267 return HTML_COMMENT ;
251- } else if (input .tryConsumeRegex (LITERAL_PATTERN )) {
268+ } else if (input .tryConsumeRegex (literalPattern () )) {
252269 return LITERAL ;
253270 }
254271 throw new AssertionError ();
@@ -274,7 +291,7 @@ private void checkMatchingTags() throws LexException {
274291 * Join together adjacent literal tokens, and join together adjacent whitespace tokens.
275292 *
276293 * <p>For literal tokens, this means something like {@code ["<b>", "foo", "</b>"] =>
277- * ["<b>foo</b>"]}. See {@link #LITERAL_PATTERN } for discussion of why those tokens are separate
294+ * ["<b>foo</b>"]}. See {@link #literalPattern() } for discussion of why those tokens are separate
278295 * to begin with.
279296 *
280297 * <p>Whitespace tokens are treated analogously. We don't really "want" to join whitespace tokens,
@@ -514,7 +531,8 @@ private static boolean hasMultipleNewlines(String s) {
514531 * We'd remove the trailing whitespace later on (in JavaCommentsHelper.rewrite), but I feel safer
515532 * stripping it now: It otherwise might confuse our line-length count, which we use for wrapping.
516533 */
517- private static final Pattern NEWLINE_PATTERN = compile ("[ \t ]*\n [ \t ]*[*]?[ \t ]?" );
534+ private static final Pattern CLASSIC_NEWLINE_PATTERN = compile ("[ \t ]*\n [ \t ]*[*]?[ \t ]?" );
535+ private static final Pattern MARKDOWN_NEWLINE_PATTERN = compile ("[ \t ]*\n [ \t ]*" );
518536
519537 // We ensure elsewhere that we match this only at the beginning of a line.
520538 // Only match tags that start with a lowercase letter, to avoid false matches on unescaped
@@ -545,17 +563,29 @@ private static boolean hasMultipleNewlines(String s) {
545563 private static final Pattern BR_PATTERN = openTagPattern ("br" );
546564 private static final Pattern SNIPPET_TAG_OPEN_PATTERN = compile ("[{]@snippet\\ b" );
547565 private static final Pattern INLINE_TAG_OPEN_PATTERN = compile ("[{]@\\ w*" );
566+
548567 /*
549568 * We exclude < so that we don't swallow following HTML tags. This lets us fix up "foo<p>" (~400
550- * hits in Google-internal code). We will join unnecessarily split "words" (like "foo<b>bar</b>")
551- * in a later step. There's a similar story for braces. I'm not sure I actually need to exclude @
552- * or *. TODO(cpovirk): Try removing them.
569+ * hits in Google-internal code).
553570 *
554- * Thanks to the "rejoin" step in joinAdjacentLiteralsAndAdjacentWhitespace(), we could get away
555- * with matching only one character here. That would eliminate the need for the regex entirely.
556- * That might be faster or slower than what we do now.
571+ * TODO(cpovirk): might not need to exclude @ or *.
557572 */
558- private static final Pattern LITERAL_PATTERN = compile (".[^ \t \n @<{}*]*" , DOTALL );
573+ private static final Pattern CLASSIC_LITERAL_PATTERN = compile (".[^ \t \n @<{}*]*" , DOTALL );
574+
575+ /*
576+ * Many characters have special meaning in Markdown. Rather than list them all, we'll just match
577+ * a sequence of alphabetic characters. Even digits can have special meaning, for numbered lists.
578+ */
579+ private static final Pattern MARKDOWN_LITERAL_PATTERN = compile (".\\ p{IsAlphabetic}*" , DOTALL );
580+
581+ /**
582+ * The pattern used for "literals", things that do not have any special formatting meaning. This
583+ * doesn't have to be a maximal sequence of literal characters, since adjacent literals will be
584+ * joined together in a later step.
585+ */
586+ private Pattern literalPattern () {
587+ return classicJavadoc ? CLASSIC_LITERAL_PATTERN : MARKDOWN_LITERAL_PATTERN ;
588+ }
559589
560590 private static Pattern openTagPattern (String namePattern ) {
561591 return compile (format ("<(?:%s)\\ b[^>]*>" , namePattern ), CASE_INSENSITIVE );
0 commit comments