Add support for C-style comments (apache#2034)

altmannmarcelo · web-flow · commit 23acd2376698 · 2026-02-06T15:40:42.000Z
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
@@ -177,6 +177,10 @@ impl Dialect for GenericDialect {
         true
     }
 
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true
+    }
+
     fn supports_user_host_grantee(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
@@ -1099,6 +1099,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports optimizer hints in multiline comments
+    /// e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`
+    fn supports_multiline_comment_hints(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
@@ -89,6 +89,11 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -945,10 +945,65 @@ impl<'a> Tokenizer<'a> {
         while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
             let span = location.span_to(state.location());
 
-            buf.push(TokenWithSpan { token, span });
+            // Check if this is a multiline comment hint that should be expanded
+            match &token {
+                Token::Whitespace(Whitespace::MultiLineComment(comment))
+                    if self.dialect.supports_multiline_comment_hints()
+                        && comment.starts_with('!') =>
+                {
+                    // Re-tokenize the hints and add them to the buffer
+                    self.tokenize_comment_hints(comment, span, buf)?;
+                }
+                _ => {
+                    buf.push(TokenWithSpan { token, span });
+                }
+            }
+
+            location = state.location();
+        }
+        Ok(())
+    }
+
+    /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
+    /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
+    fn tokenize_comment_hints(
+        &self,
+        comment: &str,
+        span: Span,
+        buf: &mut Vec<TokenWithSpan>,
+    ) -> Result<(), TokenizerError> {
+        // Strip the leading '!' and any version digits (e.g., "50110")
+        let hint_content = comment
+            .strip_prefix('!')
+            .unwrap_or(comment)
+            .trim_start_matches(|c: char| c.is_ascii_digit());
+
+        // If there's no content after stripping, nothing to tokenize
+        if hint_content.is_empty() {
+            return Ok(());
+        }
+
+        // Create a new tokenizer for the hint content
+        let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);
+
+        // Create a state for tracking position within the hint
+        let mut state = State {
+            peekable: hint_content.chars().peekable(),
+            line: span.start.line,
+            col: span.start.column,
+        };
 
+        // Tokenize the hint content and add tokens to the buffer
+        let mut location = state.location();
+        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
+            let token_span = location.span_to(state.location());
+            buf.push(TokenWithSpan {
+                token,
+                span: token_span,
+            });
             location = state.location();
         }
+
         Ok(())
     }
 
@@ -2233,7 +2288,6 @@ impl<'a> Tokenizer<'a> {
         let mut s = String::new();
         let mut nested = 1;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -4218,6 +4272,88 @@ mod tests {
                 Token::Whitespace(Whitespace::Space),
                 Token::make_word("y", None),
             ],
-        )
+        );
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_comment_hint() {
+        let sql = String::from("0/*! word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_comment_hint_and_version() {
+        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "KEY_BLOCK_SIZE".to_string(),
+                quote_style: None,
+                keyword: Keyword::KEY_BLOCK_SIZE,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Eq,
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1024".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+
+        let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1")
+            .tokenize()
+            .unwrap();
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
+
+        let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap();
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
+        let tokens = Tokenizer::new(&dialect, "0 /*!   */ 1").tokenize().unwrap();
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -177,6 +177,10 @@ impl Dialect for GenericDialect {`
`177`	`177`	`true`
`178`	`178`	`}`
`179`	`179`
	`180`	`+ fn supports_multiline_comment_hints(&self) -> bool {`
	`181`	`+ true`
	`182`	`+ }`
	`183`	`+`
`180`	`184`	`fn supports_user_host_grantee(&self) -> bool {`
`181`	`185`	`true`
`182`	`186`	`}`