Skip to content

Commit 53e2a76

Browse files
eamonnmcmanus and google-java-format Team
authored and committed
Initial support for Markdown Javadoc.
This is work in progress and *may mangle some Markdown Javadoc comments*. Later changes will address that.

Currently supported Markdown constructs: newline-separated paragraphs; `# Headings`; and `- Bullet lists`. `*Emphasis*` and `**Strong emphasis**` should work too because they don't need any special treatment. Other constructs probably don't work, notably code blocks.

The approach here is to identify where the Markdown constructs are and effectively insert that information into the existing logic for HTML constructs and Javadoc-specific constructs. The reason is that HTML and Javadoc-specific constructs are still valid in Markdown Javadoc comments, so it makes sense to augment the existing logic rather than trying to make a complete parallel formatter.

PiperOrigin-RevId: 893593782
1 parent 142c392 commit 53e2a76

File tree

10 files changed

+470
-84
lines changed

10 files changed

+470
-84
lines changed

core/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939
<groupId>com.google.guava</groupId>
4040
<artifactId>guava</artifactId>
4141
</dependency>
42+
<dependency>
43+
<groupId>org.commonmark</groupId>
44+
<artifactId>commonmark</artifactId>
45+
<version>0.28.0</version>
46+
</dependency>
4247

4348
<!-- Compile-time dependencies -->
4449
<dependency>

core/src/main/java/com/google/googlejavaformat/java/JavaCommentsHelper.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@ public String rewrite(Tok tok, int maxWidth, int column0) {
5050
}
5151
String text = tok.getOriginalText();
5252
if (tok.isJavadocComment() && options.formatJavadoc()) {
53-
text = JavadocFormatter.formatJavadoc(text, column0);
53+
if (text.startsWith("///")) {
54+
if (markdownJavadocPositions.contains(tok.getPosition())) {
55+
return JavadocFormatter.formatJavadoc(text, column0);
56+
}
57+
} else {
58+
text = JavadocFormatter.formatJavadoc(text, column0);
59+
}
5460
}
5561
List<String> lines = new ArrayList<>();
5662
Iterator<String> it = Newlines.lineIterator(text);

core/src/main/java/com/google/googlejavaformat/java/JavaInput.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,13 @@ public boolean isSlashStarComment() {
160160

161161
@Override
162162
public boolean isJavadocComment() {
163-
// comments like `/***` are also javadoc, but their formatting probably won't be improved
164-
// by the javadoc formatter
165-
return text.startsWith("/**") && text.charAt("/**".length()) != '*' && text.length() > 4;
163+
// comments like `/***` or `////` are also javadoc, but their formatting probably won't be
164+
// improved by the javadoc formatter
165+
return ((text.startsWith("/**") && !text.startsWith("/***"))
166+
|| (Runtime.version().feature() >= 23
167+
&& text.startsWith("///")
168+
&& !text.startsWith("////")))
169+
&& text.length() > 4;
166170
}
167171

168172
@Override

core/src/main/java/com/google/googlejavaformat/java/javadoc/CharStream.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,18 @@
2727
*/
2828
final class CharStream {
2929
private final String input;
30-
private int start;
30+
private int position;
3131
private int tokenEnd = -1; // Negative value means no token, and will cause an exception if used.
3232

3333
CharStream(String input) {
3434
this.input = checkNotNull(input);
3535
}
3636

3737
boolean tryConsume(String expected) {
38-
if (!input.startsWith(expected, start)) {
38+
if (!input.startsWith(expected, position)) {
3939
return false;
4040
}
41-
tokenEnd = start + expected.length();
41+
tokenEnd = position + expected.length();
4242
return true;
4343
}
4444

@@ -48,7 +48,7 @@ boolean tryConsume(String expected) {
4848
* @param pattern the pattern to search for, which must be anchored to match only at position 0
4949
*/
5050
boolean tryConsumeRegex(Pattern pattern) {
51-
Matcher matcher = pattern.matcher(input).region(start, input.length());
51+
Matcher matcher = pattern.matcher(input).region(position, input.length());
5252
if (!matcher.lookingAt()) {
5353
return false;
5454
}
@@ -57,13 +57,17 @@ boolean tryConsumeRegex(Pattern pattern) {
5757
}
5858

5959
String readAndResetRecorded() {
60-
String result = input.substring(start, tokenEnd);
61-
start = tokenEnd;
60+
String result = input.substring(position, tokenEnd);
61+
position = tokenEnd;
6262
tokenEnd = -1;
6363
return result;
6464
}
6565

6666
boolean isExhausted() {
67-
return start == input.length();
67+
return position == input.length();
68+
}
69+
70+
int position() {
71+
return position;
6872
}
6973
}

core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocFormatter.java

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,15 @@
1414

1515
package com.google.googlejavaformat.java.javadoc;
1616

17+
import static com.google.common.base.Preconditions.checkState;
1718
import static com.google.googlejavaformat.java.javadoc.JavadocLexer.lex;
1819
import static com.google.googlejavaformat.java.javadoc.Token.Type.BR_TAG;
1920
import static com.google.googlejavaformat.java.javadoc.Token.Type.PARAGRAPH_OPEN_TAG;
2021
import static java.util.regex.Pattern.CASE_INSENSITIVE;
2122
import static java.util.regex.Pattern.compile;
23+
import static java.util.stream.Collectors.joining;
2224

25+
import com.google.common.base.CharMatcher;
2326
import com.google.common.collect.ImmutableList;
2427
import com.google.googlejavaformat.java.javadoc.JavadocLexer.LexException;
2528
import java.util.List;
@@ -39,22 +42,37 @@ public final class JavadocFormatter {
3942
static final int MAX_LINE_LENGTH = 100;
4043

4144
/**
42-
* Formats the given Javadoc comment, which must start with ∕✱✱ and end with ✱∕. The output will
43-
* start and end with the same characters.
45+
* Formats the given Javadoc comment. A classic Javadoc comment must start with ∕✱✱ and end with
46+
* ✱∕, and the output will start and end with the same characters. A Markdown Javadoc comment
47+
* consists of lines each of which starts with ///, and the output will also consist of such
48+
* lines.
4449
*/
4550
public static String formatJavadoc(String input, int blockIndent) {
51+
boolean classicJavadoc =
52+
switch (input) {
53+
case String s when s.startsWith("/**") -> true;
54+
case String s when s.startsWith("///") -> false;
55+
default ->
56+
throw new IllegalArgumentException("Input does not start with /** or ///: " + input);
57+
};
58+
if (!classicJavadoc) {
59+
input = "///" + markdownCommentText(input);
60+
}
4661
ImmutableList<Token> tokens;
4762
try {
48-
tokens = lex(input);
63+
tokens = lex(input, classicJavadoc);
4964
} catch (LexException e) {
5065
return input;
5166
}
52-
String result = render(tokens, blockIndent);
53-
return makeSingleLineIfPossible(blockIndent, result);
67+
String result = render(tokens, blockIndent, classicJavadoc);
68+
if (classicJavadoc) {
69+
result = makeSingleLineIfPossible(blockIndent, result);
70+
}
71+
return result;
5472
}
5573

56-
private static String render(List<Token> input, int blockIndent) {
57-
JavadocWriter output = new JavadocWriter(blockIndent);
74+
private static String render(List<Token> input, int blockIndent, boolean classicJavadoc) {
75+
JavadocWriter output = new JavadocWriter(blockIndent, classicJavadoc);
5876
for (Token token : input) {
5977
switch (token.type()) {
6078
case BEGIN_JAVADOC -> output.writeBeginJavadoc();
@@ -137,12 +155,36 @@ private static boolean oneLineJavadoc(String line, int blockIndent) {
137155
return false;
138156
}
139157
// If the javadoc contains only a tag, use multiple lines to encourage writing a summary
140-
// fragment, unless it's /* @hide */.
158+
// fragment, unless it's /** @hide */.
141159
if (line.startsWith("@") && !line.equals("@hide")) {
142160
return false;
143161
}
144162
return true;
145163
}
146164

165+
private static final CharMatcher NOT_SPACE_OR_TAB = CharMatcher.noneOf(" \t");
166+
167+
/**
168+
* Returns the given string with the leading /// and any common leading whitespace removed from
169+
* each line. The resultant string can then be fed to a standard Markdown parser.
170+
*/
171+
private static String markdownCommentText(String input) {
172+
List<String> lines =
173+
input
174+
.lines()
175+
.peek(line -> checkState(line.contains("///"), "Line does not contain ///: %s", line))
176+
.map(line -> line.substring(line.indexOf("///") + 3))
177+
.toList();
178+
int leadingSpace =
179+
lines.stream()
180+
.filter(line -> NOT_SPACE_OR_TAB.matchesAnyOf(line))
181+
.mapToInt(NOT_SPACE_OR_TAB::indexIn)
182+
.min()
183+
.orElse(0);
184+
return lines.stream()
185+
.map(line -> line.length() < leadingSpace ? "" : line.substring(leadingSpace))
186+
.collect(joining("\n"));
187+
}
188+
147189
private JavadocFormatter() {}
148190
}

core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocLexer.java

Lines changed: 59 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
import com.google.common.base.CharMatcher;
5656
import com.google.common.collect.ImmutableList;
5757
import com.google.common.collect.PeekingIterator;
58-
import com.google.googlejavaformat.java.javadoc.Token.Type;
5958
import java.util.ArrayDeque;
6059
import java.util.ArrayList;
6160
import java.util.Deque;
@@ -65,15 +64,24 @@
6564
/** Lexer for the Javadoc formatter. */
6665
final class JavadocLexer {
6766
/** Takes a Javadoc comment (classic ∕✱✱ … ✱∕, or Markdown starting with ///) and returns its tokens, including the begin and end markers. */
68-
static ImmutableList<Token> lex(String input) throws LexException {
69-
/*
70-
* TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their
71-
* original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre> tag,
72-
* so we'll probably never bother.
73-
*/
74-
input = stripJavadocBeginAndEnd(input);
67+
static ImmutableList<Token> lex(String input, boolean classicJavadoc) throws LexException {
68+
MarkdownPositions markdownPositions;
69+
if (classicJavadoc) {
70+
/*
71+
* TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their
72+
* original form). This would mean everything from an encoded ∕✱✱ to an encoded <pre>
73+
* tag, so we'll probably never bother.
74+
*/
75+
input = stripJavadocBeginAndEnd(input);
76+
markdownPositions = MarkdownPositions.EMPTY;
77+
} else {
78+
checkArgument(input.startsWith("///"));
79+
input = input.substring("///".length());
80+
markdownPositions = MarkdownPositions.parse(input);
81+
}
7582
input = normalizeLineEndings(input);
76-
return new JavadocLexer(new CharStream(input)).generateTokens();
83+
return new JavadocLexer(new CharStream(input), markdownPositions, classicJavadoc)
84+
.generateTokens();
7785
}
7886

7987
/** The lexer crashes on windows line endings, so for now just normalize to `\n`. */
@@ -95,56 +103,65 @@ private static String stripJavadocBeginAndEnd(String input) {
95103
}
96104

97105
private final CharStream input;
106+
private final boolean classicJavadoc;
107+
private final MarkdownPositions markdownPositions;
98108
private final NestingStack braceStack = new NestingStack();
99109
private final NestingStack preStack = new NestingStack();
100110
private final NestingStack codeStack = new NestingStack();
101111
private final NestingStack tableStack = new NestingStack();
102112
private boolean outerInlineTagIsSnippet;
103113
private boolean somethingSinceNewline;
104114

105-
private JavadocLexer(CharStream input) {
115+
private JavadocLexer(
116+
CharStream input, MarkdownPositions markdownPositions, boolean classicJavadoc) {
106117
this.input = checkNotNull(input);
118+
this.markdownPositions = markdownPositions;
119+
this.classicJavadoc = classicJavadoc;
107120
}
108121

109122
private ImmutableList<Token> generateTokens() throws LexException {
110123
ImmutableList.Builder<Token> tokens = ImmutableList.builder();
111124

112-
Token token = new Token(BEGIN_JAVADOC, "/**");
125+
Token token = new Token(BEGIN_JAVADOC, classicJavadoc ? "/**" : "///");
113126
tokens.add(token);
114127

115128
while (!input.isExhausted()) {
129+
tokens.addAll(markdownPositions.tokensAt(input.position()));
116130
token = readToken();
117131
tokens.add(token);
118132
}
119133

120134
checkMatchingTags();
121135

122-
token = new Token(END_JAVADOC, "*/");
136+
token = new Token(END_JAVADOC, classicJavadoc ? "*/" : "");
123137
tokens.add(token);
124138

125139
ImmutableList<Token> result = tokens.build();
126140
result = joinAdjacentLiteralsAndAdjacentWhitespace(result);
127-
result = inferParagraphTags(result);
141+
if (classicJavadoc) {
142+
result = inferParagraphTags(result);
143+
}
128144
result = optionalizeSpacesAfterLinks(result);
129145
result = deindentPreCodeBlocks(result);
130146
return result;
131147
}
132148

133149
private Token readToken() throws LexException {
134-
Type type = consumeToken();
150+
Token.Type type = consumeToken();
135151
String value = input.readAndResetRecorded();
136152
return new Token(type, value);
137153
}
138154

139-
private Type consumeToken() throws LexException {
155+
private Token.Type consumeToken() throws LexException {
140156
boolean preserveExistingFormatting = preserveExistingFormatting();
141157

142-
if (input.tryConsumeRegex(NEWLINE_PATTERN)) {
158+
Pattern newlinePattern = classicJavadoc ? CLASSIC_NEWLINE_PATTERN : MARKDOWN_NEWLINE_PATTERN;
159+
if (input.tryConsumeRegex(newlinePattern)) {
143160
somethingSinceNewline = false;
144161
return preserveExistingFormatting ? FORCED_NEWLINE : WHITESPACE;
145162
} else if (input.tryConsume(" ") || input.tryConsume("\t")) {
146163
// TODO(cpovirk): How about weird whitespace chars? Ideally we'd distinguish breaking vs. not.
147-
// Returning LITERAL here prevent us from breaking a <pre> line. For more info, see LITERAL.
164+
// Returning LITERAL here prevents us from breaking a <pre> line. For more info, see LITERAL.
148165
return preserveExistingFormatting ? LITERAL : WHITESPACE;
149166
}
150167

@@ -187,7 +204,7 @@ private Type consumeToken() throws LexException {
187204

188205
// Inside an inline tag, don't do any HTML interpretation.
189206
if (!braceStack.isEmpty()) {
190-
verify(input.tryConsumeRegex(LITERAL_PATTERN));
207+
verify(input.tryConsumeRegex(literalPattern()));
191208
return LITERAL;
192209
}
193210

@@ -216,7 +233,7 @@ private Type consumeToken() throws LexException {
216233
}
217234

218235
if (preserveExistingFormatting) {
219-
verify(input.tryConsumeRegex(LITERAL_PATTERN));
236+
verify(input.tryConsumeRegex(literalPattern()));
220237
return LITERAL;
221238
}
222239

@@ -248,7 +265,7 @@ private Type consumeToken() throws LexException {
248265
return MOE_END_STRIP_COMMENT;
249266
} else if (input.tryConsumeRegex(HTML_COMMENT_PATTERN)) {
250267
return HTML_COMMENT;
251-
} else if (input.tryConsumeRegex(LITERAL_PATTERN)) {
268+
} else if (input.tryConsumeRegex(literalPattern())) {
252269
return LITERAL;
253270
}
254271
throw new AssertionError();
@@ -274,7 +291,7 @@ private void checkMatchingTags() throws LexException {
274291
* Join together adjacent literal tokens, and join together adjacent whitespace tokens.
275292
*
276293
* <p>For literal tokens, this means something like {@code ["<b>", "foo", "</b>"] =>
277-
* ["<b>foo</b>"]}. See {@link #LITERAL_PATTERN} for discussion of why those tokens are separate
294+
* ["<b>foo</b>"]}. See {@link #literalPattern()} for discussion of why those tokens are separate
278295
* to begin with.
279296
*
280297
* <p>Whitespace tokens are treated analogously. We don't really "want" to join whitespace tokens,
@@ -514,7 +531,8 @@ private static boolean hasMultipleNewlines(String s) {
514531
* We'd remove the trailing whitespace later on (in JavaCommentsHelper.rewrite), but I feel safer
515532
* stripping it now: It otherwise might confuse our line-length count, which we use for wrapping.
516533
*/
517-
private static final Pattern NEWLINE_PATTERN = compile("[ \t]*\n[ \t]*[*]?[ \t]?");
534+
private static final Pattern CLASSIC_NEWLINE_PATTERN = compile("[ \t]*\n[ \t]*[*]?[ \t]?");
535+
private static final Pattern MARKDOWN_NEWLINE_PATTERN = compile("[ \t]*\n[ \t]*");
518536

519537
// We ensure elsewhere that we match this only at the beginning of a line.
520538
// Only match tags that start with a lowercase letter, to avoid false matches on unescaped
@@ -545,17 +563,29 @@ private static boolean hasMultipleNewlines(String s) {
545563
private static final Pattern BR_PATTERN = openTagPattern("br");
546564
private static final Pattern SNIPPET_TAG_OPEN_PATTERN = compile("[{]@snippet\\b");
547565
private static final Pattern INLINE_TAG_OPEN_PATTERN = compile("[{]@\\w*");
566+
548567
/*
549568
* We exclude < so that we don't swallow following HTML tags. This lets us fix up "foo<p>" (~400
550-
* hits in Google-internal code). We will join unnecessarily split "words" (like "foo<b>bar</b>")
551-
* in a later step. There's a similar story for braces. I'm not sure I actually need to exclude @
552-
* or *. TODO(cpovirk): Try removing them.
569+
* hits in Google-internal code).
553570
*
554-
* Thanks to the "rejoin" step in joinAdjacentLiteralsAndAdjacentWhitespace(), we could get away
555-
* with matching only one character here. That would eliminate the need for the regex entirely.
556-
* That might be faster or slower than what we do now.
571+
* TODO(cpovirk): might not need to exclude @ or *.
557572
*/
558-
private static final Pattern LITERAL_PATTERN = compile(".[^ \t\n@<{}*]*", DOTALL);
573+
private static final Pattern CLASSIC_LITERAL_PATTERN = compile(".[^ \t\n@<{}*]*", DOTALL);
574+
575+
/*
576+
* Many characters have special meaning in Markdown. Rather than list them all, we'll just match
577+
* a sequence of alphabetic characters. Even digits can have special meaning, for numbered lists.
578+
*/
579+
private static final Pattern MARKDOWN_LITERAL_PATTERN = compile(".\\p{IsAlphabetic}*", DOTALL);
580+
581+
/**
582+
* The pattern used for "literals", things that do not have any special formatting meaning. This
583+
* doesn't have to be a maximal sequence of literal characters, since adjacent literals will be
584+
* joined together in a later step.
585+
*/
586+
private Pattern literalPattern() {
587+
return classicJavadoc ? CLASSIC_LITERAL_PATTERN : MARKDOWN_LITERAL_PATTERN;
588+
}
559589

560590
private static Pattern openTagPattern(String namePattern) {
561591
return compile(format("<(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE);

0 commit comments

Comments
 (0)