Skip to content

Commit 23acd23

Browse files
Add support for C-style comments (apache#2034)
1 parent e4c5500 commit 23acd23

File tree

4 files changed

+154
-3
lines changed

4 files changed

+154
-3
lines changed

src/dialect/generic.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ impl Dialect for GenericDialect {
177177
true
178178
}
179179

180+
fn supports_multiline_comment_hints(&self) -> bool {
181+
true
182+
}
183+
180184
fn supports_user_host_grantee(&self) -> bool {
181185
true
182186
}

src/dialect/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,12 @@ pub trait Dialect: Debug + Any {
10991099
false
11001100
}
11011101

1102+
/// Returns true if the dialect supports optimizer hints in multiline comments
1103+
/// e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`
1104+
fn supports_multiline_comment_hints(&self) -> bool {
1105+
false
1106+
}
1107+
11021108
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
11031109
/// as an alias assignment operator, rather than a boolean expression.
11041110
/// For example: the following statements are equivalent for such a dialect:

src/dialect/mysql.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ impl Dialect for MySqlDialect {
8989
true
9090
}
9191

92+
/// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
93+
fn supports_multiline_comment_hints(&self) -> bool {
94+
true
95+
}
96+
9297
fn parse_infix(
9398
&self,
9499
parser: &mut crate::parser::Parser,

src/tokenizer.rs

Lines changed: 139 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -945,10 +945,65 @@ impl<'a> Tokenizer<'a> {
945945
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
946946
let span = location.span_to(state.location());
947947

948-
buf.push(TokenWithSpan { token, span });
948+
// Check if this is a multiline comment hint that should be expanded
949+
match &token {
950+
Token::Whitespace(Whitespace::MultiLineComment(comment))
951+
if self.dialect.supports_multiline_comment_hints()
952+
&& comment.starts_with('!') =>
953+
{
954+
// Re-tokenize the hints and add them to the buffer
955+
self.tokenize_comment_hints(comment, span, buf)?;
956+
}
957+
_ => {
958+
buf.push(TokenWithSpan { token, span });
959+
}
960+
}
961+
962+
location = state.location();
963+
}
964+
Ok(())
965+
}
966+
967+
/// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
968+
/// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
969+
fn tokenize_comment_hints(
970+
&self,
971+
comment: &str,
972+
span: Span,
973+
buf: &mut Vec<TokenWithSpan>,
974+
) -> Result<(), TokenizerError> {
975+
// Strip the leading '!' and any version digits (e.g., "50110")
976+
let hint_content = comment
977+
.strip_prefix('!')
978+
.unwrap_or(comment)
979+
.trim_start_matches(|c: char| c.is_ascii_digit());
980+
981+
// If there's no content after stripping, nothing to tokenize
982+
if hint_content.is_empty() {
983+
return Ok(());
984+
}
985+
986+
// Create a new tokenizer for the hint content
987+
let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);
988+
989+
// Create a state for tracking position within the hint
990+
let mut state = State {
991+
peekable: hint_content.chars().peekable(),
992+
line: span.start.line,
993+
col: span.start.column,
994+
};
949995

996+
// Tokenize the hint content and add tokens to the buffer
997+
let mut location = state.location();
998+
while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
999+
let token_span = location.span_to(state.location());
1000+
buf.push(TokenWithSpan {
1001+
token,
1002+
span: token_span,
1003+
});
9501004
location = state.location();
9511005
}
1006+
9521007
Ok(())
9531008
}
9541009

@@ -2233,7 +2288,6 @@ impl<'a> Tokenizer<'a> {
22332288
let mut s = String::new();
22342289
let mut nested = 1;
22352290
let supports_nested_comments = self.dialect.supports_nested_comments();
2236-
22372291
loop {
22382292
match chars.next() {
22392293
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -4218,6 +4272,88 @@ mod tests {
42184272
Token::Whitespace(Whitespace::Space),
42194273
Token::make_word("y", None),
42204274
],
4221-
)
4275+
);
4276+
}
4277+
4278+
#[test]
fn tokenize_multiline_comment_with_comment_hint() {
    // Under MySQL, a `/*! ... */` comment is replaced by the tokens of its body.
    let space = || Token::Whitespace(Whitespace::Space);
    let number = |digits: &str| Token::Number(digits.to_string(), false);
    let word = Token::Word(Word {
        value: "word".to_string(),
        quote_style: None,
        keyword: Keyword::NoKeyword,
    });

    let dialect = MySqlDialect {};
    let actual = Tokenizer::new(&dialect, "0/*! word */1")
        .tokenize()
        .unwrap();

    compare(
        vec![number("0"), space(), word, space(), number("1")],
        actual,
    );
}
4297+
4298+
#[test]
fn tokenize_multiline_comment_with_comment_hint_and_version() {
    let dialect = MySqlDialect {};
    let space = || Token::Whitespace(Whitespace::Space);
    let number = |digits: &str| Token::Number(digits.to_string(), false);

    // Each case pairs an input with the exact token stream it must produce.
    // The two spaces around the comment are always present; the comment itself
    // contributes only the tokens of its body after `!` and the version digits.
    let cases = vec![
        // Versioned hint with real content.
        (
            "0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1",
            vec![
                number("0"),
                space(),
                space(),
                Token::Word(Word {
                    value: "KEY_BLOCK_SIZE".to_string(),
                    quote_style: None,
                    keyword: Keyword::KEY_BLOCK_SIZE,
                }),
                space(),
                Token::Eq,
                space(),
                number("1024"),
                space(),
                number("1"),
            ],
        ),
        // Versioned hint whose body is a single space.
        (
            "0 /*!50110 */ 1",
            vec![number("0"), space(), space(), space(), number("1")],
        ),
        // Empty hint: the comment contributes no tokens at all.
        (
            "0 /*!*/ 1",
            vec![number("0"), space(), space(), number("1")],
        ),
        // Unversioned hint whose body is three spaces: one whitespace token per
        // space, giving five whitespace tokens in total.
        (
            "0 /*!   */ 1",
            vec![
                number("0"),
                space(),
                space(),
                space(),
                space(),
                space(),
                number("1"),
            ],
        ),
    ];

    for (sql, expected) in cases {
        let actual = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(expected, actual);
    }
}
42234359
}

0 commit comments

Comments
 (0)