From 896877bcee6c719a8c95c2145a17b7c25ac456e1 Mon Sep 17 00:00:00 2001 From: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:37:19 -0400 Subject: [PATCH 1/5] Flatten the scanner --- src/scanner.c | 316 ++++++++++++++++++++++++++- src/tree-sitter-typescript/scanner.h | 304 -------------------------- 2 files changed, 307 insertions(+), 313 deletions(-) delete mode 100644 src/tree-sitter-typescript/scanner.h diff --git a/src/scanner.c b/src/scanner.c index 8c995b5..789d5bb 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -2,18 +2,15 @@ // // // -// Scanners are not extensible, so we have to copy them, -// -// To Update: -// - delete everything under the comment block -// - find and replace "tree_sitter_javascript" with "tree_sitter_glimmer_typescript" -// - there should be 5 methods updated -// - %s/tree_sitter_javascript/tree_sitter_glimmer_typescript/g +// Scanners are not extensible, so we have to copy them // +// rename external_scanner_scan => tree_sitter_glimmer_typescript_external_scanner_scan // // // ---------------------------------------------------------- -#include "./tree-sitter-typescript/scanner.h" +#include "tree_sitter/parser.h" + +#include void *tree_sitter_glimmer_typescript_external_scanner_create() { return NULL; } @@ -23,6 +20,307 @@ unsigned tree_sitter_glimmer_typescript_external_scanner_serialize(void *payload void tree_sitter_glimmer_typescript_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {} +// bool tree_sitter_glimmer_typescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { +// return external_scanner_scan(payload, lexer, valid_symbols); +// } + +/*********************************** + * + * from tree-sitter-typescript v0.21.0 + * + * ***********************************/ +enum TokenType { + AUTOMATIC_SEMICOLON, + TEMPLATE_CHARS, + TERNARY_QMARK, + HTML_COMMENT, + LOGICAL_OR, + ESCAPE_SEQUENCE, + REGEX_PATTERN, + FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON, + ERROR_RECOVERY, +}; + +static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static bool scan_template_chars(TSLexer *lexer) { + lexer->result_symbol = TEMPLATE_CHARS; + for (bool has_content = false;; has_content = true) { + lexer->mark_end(lexer); + switch (lexer->lookahead) { + case '`': + return has_content; + case '\0': + return false; + case '$': + advance(lexer); + if (lexer->lookahead == '{') { + return has_content; + } + break; + case '\\': + return has_content; + default: + advance(lexer); + } + } +} + +static bool scan_whitespace_and_comments(TSLexer *lexer, bool *scanned_comment) { + for (;;) { + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + + if (lexer->lookahead == '/') { + skip(lexer); + + if (lexer->lookahead == '/') { + skip(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\n') { + skip(lexer); + } + *scanned_comment = true; + } else if (lexer->lookahead == '*') { + skip(lexer); + while (lexer->lookahead != 0) { + if (lexer->lookahead == '*') { + skip(lexer); + if (lexer->lookahead == '/') { + skip(lexer); + break; + } + } else { + skip(lexer); + } + } + } else { + return false; + } + } else { + return true; + } + } +} + +static bool scan_automatic_semicolon(TSLexer *lexer, const bool *valid_symbols, bool *scanned_comment) { + lexer->result_symbol = AUTOMATIC_SEMICOLON; + lexer->mark_end(lexer); + + for (;;) { + if (lexer->lookahead == 0) { + return true; + } + if (lexer->lookahead == '}') { + // Automatic semicolon insertion breaks detection of object patterns + // in a typed context: + // type F = ({a}: {a: number}) => number; + // Therefore, disable automatic semicolons when followed by typing + do { + skip(lexer); + } while (iswspace(lexer->lookahead)); + if (lexer->lookahead == ':') { + return valid_symbols[LOGICAL_OR]; // Don't return false if we're in a ternary by checking if || is valid + } + return true; + } + if (!iswspace(lexer->lookahead)) { + return false; + } + if (lexer->lookahead == '\n') { + break; + } + skip(lexer); + } + + skip(lexer); + + if (!scan_whitespace_and_comments(lexer, scanned_comment)) { + return false; + } + + switch (lexer->lookahead) { + case ',': + case '.': + case ';': + case '*': + case '%': + case '>': + case '<': + case '=': + case '?': + case '^': + case '|': + case '&': + case '/': + case ':': + return false; + + case '{': + if (valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { + return false; + } + break; + + // Don't insert a semicolon before a '[' or '(', unless we're parsing + // a type. Detect whether we're parsing a type or an expression using + // the validity of a binary operator token. + case '(': + case '[': + if (valid_symbols[LOGICAL_OR]) { + return false; + } + break; + + // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`. + case '+': + skip(lexer); + return lexer->lookahead == '+'; + case '-': + skip(lexer); + return lexer->lookahead == '-'; + + // Don't insert a semicolon before `!=`, but do insert one before a unary `!`. + case '!': + skip(lexer); + return lexer->lookahead != '='; + + // Don't insert a semicolon before `in` or `instanceof`, but do insert one + // before an identifier. + case 'i': + skip(lexer); + + if (lexer->lookahead != 'n') { + return true; + } + skip(lexer); + + if (!iswalpha(lexer->lookahead)) { + return false; + } + + for (unsigned i = 0; i < 8; i++) { + if (lexer->lookahead != "stanceof"[i]) { + return true; + } + skip(lexer); + } + + if (!iswalpha(lexer->lookahead)) { + return false; + } + break; + } + + return true; +} + +static bool scan_ternary_qmark(TSLexer *lexer) { + for (;;) { + if (!iswspace(lexer->lookahead)) { + break; + } + skip(lexer); + } + + if (lexer->lookahead == '?') { + advance(lexer); + + /* Optional chaining. */ + if (lexer->lookahead == '?' || lexer->lookahead == '.') { + return false; + } + + lexer->mark_end(lexer); + lexer->result_symbol = TERNARY_QMARK; + + /* TypeScript optional arguments contain the ?: sequence, possibly + with whitespace. */ + for (;;) { + if (!iswspace(lexer->lookahead)) { + break; + } + advance(lexer); + } + + if (lexer->lookahead == ':' || lexer->lookahead == ')' || lexer->lookahead == ',') { + return false; + } + + if (lexer->lookahead == '.') { + advance(lexer); + if (iswdigit(lexer->lookahead)) { + return true; + } + return false; + } + return true; + } + return false; +} + +static bool scan_closing_comment(TSLexer *lexer) { + while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) { + skip(lexer); + } + + const char *comment_start = ""; + + if (lexer->lookahead == '<') { + for (unsigned i = 0; i < 4; i++) { + if (lexer->lookahead != comment_start[i]) { + return false; + } + advance(lexer); + } + } else if (lexer->lookahead == '-') { + for (unsigned i = 0; i < 3; i++) { + if (lexer->lookahead != comment_end[i]) { + return false; + } + advance(lexer); + } + } else { + return false; + } + + while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 && + lexer->lookahead != 0x2029) { + advance(lexer); + } + + lexer->result_symbol = HTML_COMMENT; + lexer->mark_end(lexer); + + return true; +} + bool tree_sitter_glimmer_typescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - return external_scanner_scan(payload, lexer, valid_symbols); + if (valid_symbols[TEMPLATE_CHARS]) { + if (valid_symbols[AUTOMATIC_SEMICOLON]) { + return false; + } + return scan_template_chars(lexer); + } + if (valid_symbols[AUTOMATIC_SEMICOLON] || valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { + bool scanned_comment = false; + bool ret = scan_automatic_semicolon(lexer, valid_symbols, &scanned_comment); + if (!ret && !scanned_comment && valid_symbols[TERNARY_QMARK] && lexer->lookahead == '?') { + return scan_ternary_qmark(lexer); + } + return ret; + } + if (valid_symbols[TERNARY_QMARK]) { + return scan_ternary_qmark(lexer); + } + + if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE] && + !valid_symbols[REGEX_PATTERN]) { + return scan_closing_comment(lexer); + } + + return false; } diff --git a/src/tree-sitter-typescript/scanner.h b/src/tree-sitter-typescript/scanner.h deleted file mode 100644 index d7d36ae..0000000 --- a/src/tree-sitter-typescript/scanner.h +++ /dev/null @@ -1,304 +0,0 @@ -/*********************************** - * - * from tree-sitter-typescript v0.21.0 - * - * ***********************************/ -#include "tree_sitter/parser.h" - -#include - -enum TokenType { - AUTOMATIC_SEMICOLON, - TEMPLATE_CHARS, - TERNARY_QMARK, - HTML_COMMENT, - LOGICAL_OR, - ESCAPE_SEQUENCE, - REGEX_PATTERN, - FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON, - ERROR_RECOVERY, -}; - -static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } - -static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } - -static bool scan_template_chars(TSLexer *lexer) { - lexer->result_symbol = TEMPLATE_CHARS; - for (bool has_content = false;; has_content = true) { - lexer->mark_end(lexer); - switch (lexer->lookahead) { - case '`': - return has_content; - case '\0': - return false; - case '$': - advance(lexer); - if (lexer->lookahead == '{') { - return has_content; - } - break; - case '\\': - return has_content; - default: - advance(lexer); - } - } -} - -static bool scan_whitespace_and_comments(TSLexer *lexer, bool *scanned_comment) { - for (;;) { - while (iswspace(lexer->lookahead)) { - skip(lexer); - } - - if (lexer->lookahead == '/') { - skip(lexer); - - if (lexer->lookahead == '/') { - skip(lexer); - while (lexer->lookahead != 0 && lexer->lookahead != '\n') { - skip(lexer); - } - *scanned_comment = true; - } else if (lexer->lookahead == '*') { - skip(lexer); - while (lexer->lookahead != 0) { - if (lexer->lookahead == '*') { - skip(lexer); - if (lexer->lookahead == '/') { - skip(lexer); - break; - } - } else { - skip(lexer); - } - } - } else { - return false; - } - } else { - return true; - } - } -} - -static bool scan_automatic_semicolon(TSLexer *lexer, const bool *valid_symbols, bool *scanned_comment) { - lexer->result_symbol = AUTOMATIC_SEMICOLON; - lexer->mark_end(lexer); - - for (;;) { - if (lexer->lookahead == 0) { - return true; - } - if (lexer->lookahead == '}') { - // Automatic semicolon insertion breaks detection of object patterns - // in a typed context: - // type F = ({a}: {a: number}) => number; - // Therefore, disable automatic semicolons when followed by typing - do { - skip(lexer); - } while (iswspace(lexer->lookahead)); - if (lexer->lookahead == ':') { - return valid_symbols[LOGICAL_OR]; // Don't return false if we're in a ternary by checking if || is valid - } - return true; - } - if (!iswspace(lexer->lookahead)) { - return false; - } - if (lexer->lookahead == '\n') { - break; - } - skip(lexer); - } - - skip(lexer); - - if (!scan_whitespace_and_comments(lexer, scanned_comment)) { - return false; - } - - switch (lexer->lookahead) { - case ',': - case '.': - case ';': - case '*': - case '%': - case '>': - case '<': - case '=': - case '?': - case '^': - case '|': - case '&': - case '/': - case ':': - return false; - - case '{': - if (valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { - return false; - } - break; - - // Don't insert a semicolon before a '[' or '(', unless we're parsing - // a type. Detect whether we're parsing a type or an expression using - // the validity of a binary operator token. - case '(': - case '[': - if (valid_symbols[LOGICAL_OR]) { - return false; - } - break; - - // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`. - case '+': - skip(lexer); - return lexer->lookahead == '+'; - case '-': - skip(lexer); - return lexer->lookahead == '-'; - - // Don't insert a semicolon before `!=`, but do insert one before a unary `!`. - case '!': - skip(lexer); - return lexer->lookahead != '='; - - // Don't insert a semicolon before `in` or `instanceof`, but do insert one - // before an identifier. - case 'i': - skip(lexer); - - if (lexer->lookahead != 'n') { - return true; - } - skip(lexer); - - if (!iswalpha(lexer->lookahead)) { - return false; - } - - for (unsigned i = 0; i < 8; i++) { - if (lexer->lookahead != "stanceof"[i]) { - return true; - } - skip(lexer); - } - - if (!iswalpha(lexer->lookahead)) { - return false; - } - break; - } - - return true; -} - -static bool scan_ternary_qmark(TSLexer *lexer) { - for (;;) { - if (!iswspace(lexer->lookahead)) { - break; - } - skip(lexer); - } - - if (lexer->lookahead == '?') { - advance(lexer); - - /* Optional chaining. */ - if (lexer->lookahead == '?' || lexer->lookahead == '.') { - return false; - } - - lexer->mark_end(lexer); - lexer->result_symbol = TERNARY_QMARK; - - /* TypeScript optional arguments contain the ?: sequence, possibly - with whitespace. */ - for (;;) { - if (!iswspace(lexer->lookahead)) { - break; - } - advance(lexer); - } - - if (lexer->lookahead == ':' || lexer->lookahead == ')' || lexer->lookahead == ',') { - return false; - } - - if (lexer->lookahead == '.') { - advance(lexer); - if (iswdigit(lexer->lookahead)) { - return true; - } - return false; - } - return true; - } - return false; -} - -static bool scan_closing_comment(TSLexer *lexer) { - while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) { - skip(lexer); - } - - const char *comment_start = ""; - - if (lexer->lookahead == '<') { - for (unsigned i = 0; i < 4; i++) { - if (lexer->lookahead != comment_start[i]) { - return false; - } - advance(lexer); - } - } else if (lexer->lookahead == '-') { - for (unsigned i = 0; i < 3; i++) { - if (lexer->lookahead != comment_end[i]) { - return false; - } - advance(lexer); - } - } else { - return false; - } - - while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 && - lexer->lookahead != 0x2029) { - advance(lexer); - } - - lexer->result_symbol = HTML_COMMENT; - lexer->mark_end(lexer); - - return true; -} - -static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - if (valid_symbols[TEMPLATE_CHARS]) { - if (valid_symbols[AUTOMATIC_SEMICOLON]) { - return false; - } - return scan_template_chars(lexer); - } - if (valid_symbols[AUTOMATIC_SEMICOLON] || valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { - bool scanned_comment = false; - bool ret = scan_automatic_semicolon(lexer, valid_symbols, &scanned_comment); - if (!ret && !scanned_comment && valid_symbols[TERNARY_QMARK] && lexer->lookahead == '?') { - return scan_ternary_qmark(lexer); - } - return ret; - } - if (valid_symbols[TERNARY_QMARK]) { - return scan_ternary_qmark(lexer); - } - - if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE] && - !valid_symbols[REGEX_PATTERN]) { - return scan_closing_comment(lexer); - } - - return false; -} From 5cbc67914cec478134b8393e107a19773c84d7e9 Mon Sep 17 00:00:00 2001 From: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:42:53 -0400 Subject: [PATCH 2/5] Revert "Flatten the scanner" This reverts commit 896877bcee6c719a8c95c2145a17b7c25ac456e1. --- src/scanner.c | 316 +-------------------------- src/tree-sitter-typescript/scanner.h | 304 ++++++++++++++++++++++++++ 2 files changed, 313 insertions(+), 307 deletions(-) create mode 100644 src/tree-sitter-typescript/scanner.h diff --git a/src/scanner.c b/src/scanner.c index 789d5bb..8c995b5 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -2,15 +2,18 @@ // // // -// Scanners are not extensible, so we have to copy them +// Scanners are not extensible, so we have to copy them, +// +// To Update: +// - delete everything under the comment block +// - find and replace "tree_sitter_javascript" with "tree_sitter_glimmer_typescript" +// - there should be 5 methods updated +// - %s/tree_sitter_javascript/tree_sitter_glimmer_typescript/g // -// rename external_scanner_scan => tree_sitter_glimmer_typescript_external_scanner_scan // // // ---------------------------------------------------------- -#include "tree_sitter/parser.h" - -#include +#include "./tree-sitter-typescript/scanner.h" void *tree_sitter_glimmer_typescript_external_scanner_create() { return NULL; } @@ -20,307 +23,6 @@ unsigned tree_sitter_glimmer_typescript_external_scanner_serialize(void *payload void tree_sitter_glimmer_typescript_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {} -// bool tree_sitter_glimmer_typescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { -// return external_scanner_scan(payload, lexer, valid_symbols); -// } - -/*********************************** - * - * from tree-sitter-typescript v0.21.0 - * - * ***********************************/ -enum TokenType { - AUTOMATIC_SEMICOLON, - TEMPLATE_CHARS, - TERNARY_QMARK, - HTML_COMMENT, - LOGICAL_OR, - ESCAPE_SEQUENCE, - REGEX_PATTERN, - FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON, - ERROR_RECOVERY, -}; - -static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } - -static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } - -static bool scan_template_chars(TSLexer *lexer) { - lexer->result_symbol = TEMPLATE_CHARS; - for (bool has_content = false;; has_content = true) { - lexer->mark_end(lexer); - switch (lexer->lookahead) { - case '`': - return has_content; - case '\0': - return false; - case '$': - advance(lexer); - if (lexer->lookahead == '{') { - return has_content; - } - break; - case '\\': - return has_content; - default: - advance(lexer); - } - } -} - -static bool scan_whitespace_and_comments(TSLexer *lexer, bool *scanned_comment) { - for (;;) { - while (iswspace(lexer->lookahead)) { - skip(lexer); - } - - if (lexer->lookahead == '/') { - skip(lexer); - - if (lexer->lookahead == '/') { - skip(lexer); - while (lexer->lookahead != 0 && lexer->lookahead != '\n') { - skip(lexer); - } - *scanned_comment = true; - } else if (lexer->lookahead == '*') { - skip(lexer); - while (lexer->lookahead != 0) { - if (lexer->lookahead == '*') { - skip(lexer); - if (lexer->lookahead == '/') { - skip(lexer); - break; - } - } else { - skip(lexer); - } - } - } else { - return false; - } - } else { - return true; - } - } -} - -static bool scan_automatic_semicolon(TSLexer *lexer, const bool *valid_symbols, bool *scanned_comment) { - lexer->result_symbol = AUTOMATIC_SEMICOLON; - lexer->mark_end(lexer); - - for (;;) { - if (lexer->lookahead == 0) { - return true; - } - if (lexer->lookahead == '}') { - // Automatic semicolon insertion breaks detection of object patterns - // in a typed context: - // type F = ({a}: {a: number}) => number; - // Therefore, disable automatic semicolons when followed by typing - do { - skip(lexer); - } while (iswspace(lexer->lookahead)); - if (lexer->lookahead == ':') { - return valid_symbols[LOGICAL_OR]; // Don't return false if we're in a ternary by checking if || is valid - } - return true; - } - if (!iswspace(lexer->lookahead)) { - return false; - } - if (lexer->lookahead == '\n') { - break; - } - skip(lexer); - } - - skip(lexer); - - if (!scan_whitespace_and_comments(lexer, scanned_comment)) { - return false; - } - - switch (lexer->lookahead) { - case ',': - case '.': - case ';': - case '*': - case '%': - case '>': - case '<': - case '=': - case '?': - case '^': - case '|': - case '&': - case '/': - case ':': - return false; - - case '{': - if (valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { - return false; - } - break; - - // Don't insert a semicolon before a '[' or '(', unless we're parsing - // a type. Detect whether we're parsing a type or an expression using - // the validity of a binary operator token. - case '(': - case '[': - if (valid_symbols[LOGICAL_OR]) { - return false; - } - break; - - // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`. - case '+': - skip(lexer); - return lexer->lookahead == '+'; - case '-': - skip(lexer); - return lexer->lookahead == '-'; - - // Don't insert a semicolon before `!=`, but do insert one before a unary `!`. - case '!': - skip(lexer); - return lexer->lookahead != '='; - - // Don't insert a semicolon before `in` or `instanceof`, but do insert one - // before an identifier. - case 'i': - skip(lexer); - - if (lexer->lookahead != 'n') { - return true; - } - skip(lexer); - - if (!iswalpha(lexer->lookahead)) { - return false; - } - - for (unsigned i = 0; i < 8; i++) { - if (lexer->lookahead != "stanceof"[i]) { - return true; - } - skip(lexer); - } - - if (!iswalpha(lexer->lookahead)) { - return false; - } - break; - } - - return true; -} - -static bool scan_ternary_qmark(TSLexer *lexer) { - for (;;) { - if (!iswspace(lexer->lookahead)) { - break; - } - skip(lexer); - } - - if (lexer->lookahead == '?') { - advance(lexer); - - /* Optional chaining. */ - if (lexer->lookahead == '?' || lexer->lookahead == '.') { - return false; - } - - lexer->mark_end(lexer); - lexer->result_symbol = TERNARY_QMARK; - - /* TypeScript optional arguments contain the ?: sequence, possibly - with whitespace. */ - for (;;) { - if (!iswspace(lexer->lookahead)) { - break; - } - advance(lexer); - } - - if (lexer->lookahead == ':' || lexer->lookahead == ')' || lexer->lookahead == ',') { - return false; - } - - if (lexer->lookahead == '.') { - advance(lexer); - if (iswdigit(lexer->lookahead)) { - return true; - } - return false; - } - return true; - } - return false; -} - -static bool scan_closing_comment(TSLexer *lexer) { - while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) { - skip(lexer); - } - - const char *comment_start = ""; - - if (lexer->lookahead == '<') { - for (unsigned i = 0; i < 4; i++) { - if (lexer->lookahead != comment_start[i]) { - return false; - } - advance(lexer); - } - } else if (lexer->lookahead == '-') { - for (unsigned i = 0; i < 3; i++) { - if (lexer->lookahead != comment_end[i]) { - return false; - } - advance(lexer); - } - } else { - return false; - } - - while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 && - lexer->lookahead != 0x2029) { - advance(lexer); - } - - lexer->result_symbol = HTML_COMMENT; - lexer->mark_end(lexer); - - return true; -} - bool tree_sitter_glimmer_typescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - if (valid_symbols[TEMPLATE_CHARS]) { - if (valid_symbols[AUTOMATIC_SEMICOLON]) { - return false; - } - return scan_template_chars(lexer); - } - if (valid_symbols[AUTOMATIC_SEMICOLON] || valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { - bool scanned_comment = false; - bool ret = scan_automatic_semicolon(lexer, valid_symbols, &scanned_comment); - if (!ret && !scanned_comment && valid_symbols[TERNARY_QMARK] && lexer->lookahead == '?') { - return scan_ternary_qmark(lexer); - } - return ret; - } - if (valid_symbols[TERNARY_QMARK]) { - return scan_ternary_qmark(lexer); - } - - if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE] && - !valid_symbols[REGEX_PATTERN]) { - return scan_closing_comment(lexer); - } - - return false; + return external_scanner_scan(payload, lexer, valid_symbols); } diff --git a/src/tree-sitter-typescript/scanner.h b/src/tree-sitter-typescript/scanner.h new file mode 100644 index 0000000..d7d36ae --- /dev/null +++ b/src/tree-sitter-typescript/scanner.h @@ -0,0 +1,304 @@ +/*********************************** + * + * from tree-sitter-typescript v0.21.0 + * + * ***********************************/ +#include "tree_sitter/parser.h" + +#include + +enum TokenType { + AUTOMATIC_SEMICOLON, + TEMPLATE_CHARS, + TERNARY_QMARK, + HTML_COMMENT, + LOGICAL_OR, + ESCAPE_SEQUENCE, + REGEX_PATTERN, + FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON, + ERROR_RECOVERY, +}; + +static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static bool scan_template_chars(TSLexer *lexer) { + lexer->result_symbol = TEMPLATE_CHARS; + for (bool has_content = false;; has_content = true) { + lexer->mark_end(lexer); + switch (lexer->lookahead) { + case '`': + return has_content; + case '\0': + return false; + case '$': + advance(lexer); + if (lexer->lookahead == '{') { + return has_content; + } + break; + case '\\': + return has_content; + default: + advance(lexer); + } + } +} + +static bool scan_whitespace_and_comments(TSLexer *lexer, bool *scanned_comment) { + for (;;) { + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + + if (lexer->lookahead == '/') { + skip(lexer); + + if (lexer->lookahead == '/') { + skip(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\n') { + skip(lexer); + } + *scanned_comment = true; + } else if (lexer->lookahead == '*') { + skip(lexer); + while (lexer->lookahead != 0) { + if (lexer->lookahead == '*') { + skip(lexer); + if (lexer->lookahead == '/') { + skip(lexer); + break; + } + } else { + skip(lexer); + } + } + } else { + return false; + } + } else { + return true; + } + } +} + +static bool scan_automatic_semicolon(TSLexer *lexer, const bool *valid_symbols, bool *scanned_comment) { + lexer->result_symbol = AUTOMATIC_SEMICOLON; + lexer->mark_end(lexer); + + for (;;) { + if (lexer->lookahead == 0) { + return true; + } + if (lexer->lookahead == '}') { + // Automatic semicolon insertion breaks detection of object patterns + // in a typed context: + // type F = ({a}: {a: number}) => number; + // Therefore, disable automatic semicolons when followed by typing + do { + skip(lexer); + } while (iswspace(lexer->lookahead)); + if (lexer->lookahead == ':') { + return valid_symbols[LOGICAL_OR]; // Don't return false if we're in a ternary by checking if || is valid + } + return true; + } + if (!iswspace(lexer->lookahead)) { + return false; + } + if (lexer->lookahead == '\n') { + break; + } + skip(lexer); + } + + skip(lexer); + + if (!scan_whitespace_and_comments(lexer, scanned_comment)) { + return false; + } + + switch (lexer->lookahead) { + case ',': + case '.': + case ';': + case '*': + case '%': + case '>': + case '<': + case '=': + case '?': + case '^': + case '|': + case '&': + case '/': + case ':': + return false; + + case '{': + if (valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { + return false; + } + break; + + // Don't insert a semicolon before a '[' or '(', unless we're parsing + // a type. Detect whether we're parsing a type or an expression using + // the validity of a binary operator token. + case '(': + case '[': + if (valid_symbols[LOGICAL_OR]) { + return false; + } + break; + + // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`. + case '+': + skip(lexer); + return lexer->lookahead == '+'; + case '-': + skip(lexer); + return lexer->lookahead == '-'; + + // Don't insert a semicolon before `!=`, but do insert one before a unary `!`. + case '!': + skip(lexer); + return lexer->lookahead != '='; + + // Don't insert a semicolon before `in` or `instanceof`, but do insert one + // before an identifier. + case 'i': + skip(lexer); + + if (lexer->lookahead != 'n') { + return true; + } + skip(lexer); + + if (!iswalpha(lexer->lookahead)) { + return false; + } + + for (unsigned i = 0; i < 8; i++) { + if (lexer->lookahead != "stanceof"[i]) { + return true; + } + skip(lexer); + } + + if (!iswalpha(lexer->lookahead)) { + return false; + } + break; + } + + return true; +} + +static bool scan_ternary_qmark(TSLexer *lexer) { + for (;;) { + if (!iswspace(lexer->lookahead)) { + break; + } + skip(lexer); + } + + if (lexer->lookahead == '?') { + advance(lexer); + + /* Optional chaining. */ + if (lexer->lookahead == '?' || lexer->lookahead == '.') { + return false; + } + + lexer->mark_end(lexer); + lexer->result_symbol = TERNARY_QMARK; + + /* TypeScript optional arguments contain the ?: sequence, possibly + with whitespace. */ + for (;;) { + if (!iswspace(lexer->lookahead)) { + break; + } + advance(lexer); + } + + if (lexer->lookahead == ':' || lexer->lookahead == ')' || lexer->lookahead == ',') { + return false; + } + + if (lexer->lookahead == '.') { + advance(lexer); + if (iswdigit(lexer->lookahead)) { + return true; + } + return false; + } + return true; + } + return false; +} + +static bool scan_closing_comment(TSLexer *lexer) { + while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) { + skip(lexer); + } + + const char *comment_start = ""; + + if (lexer->lookahead == '<') { + for (unsigned i = 0; i < 4; i++) { + if (lexer->lookahead != comment_start[i]) { + return false; + } + advance(lexer); + } + } else if (lexer->lookahead == '-') { + for (unsigned i = 0; i < 3; i++) { + if (lexer->lookahead != comment_end[i]) { + return false; + } + advance(lexer); + } + } else { + return false; + } + + while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 && + lexer->lookahead != 0x2029) { + advance(lexer); + } + + lexer->result_symbol = HTML_COMMENT; + lexer->mark_end(lexer); + + return true; +} + +static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + if (valid_symbols[TEMPLATE_CHARS]) { + if (valid_symbols[AUTOMATIC_SEMICOLON]) { + return false; + } + return scan_template_chars(lexer); + } + if (valid_symbols[AUTOMATIC_SEMICOLON] || valid_symbols[FUNCTION_SIGNATURE_AUTOMATIC_SEMICOLON]) { + bool scanned_comment = false; + bool ret = scan_automatic_semicolon(lexer, valid_symbols, &scanned_comment); + if (!ret && !scanned_comment && valid_symbols[TERNARY_QMARK] && lexer->lookahead == '?') { + return scan_ternary_qmark(lexer); + } + return ret; + } + if (valid_symbols[TERNARY_QMARK]) { + return scan_ternary_qmark(lexer); + } + + if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE] && + !valid_symbols[REGEX_PATTERN]) { + return scan_closing_comment(lexer); + } + + return false; +} From 1e484ea8a60832df0b1b359cae05216e098bde98 Mon Sep 17 00:00:00 2001 From: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:49:17 -0400 Subject: [PATCH 3/5] Adjust the rust build --- bindings/rust/build.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bindings/rust/build.rs b/bindings/rust/build.rs index 5449d52..4c50a27 100644 --- a/bindings/rust/build.rs +++ b/bindings/rust/build.rs @@ -1,5 +1,7 @@ fn main() { + let root_dir = std::path::Path::new("."); let src_dir = std::path::Path::new("src"); + let typescript_dir = root_dir.join("tree-sitter-typescript").join("src"); let mut c_config = cc::Build::new(); c_config.include(&src_dir); @@ -18,6 +20,11 @@ fn main() { let scanner_path = src_dir.join("scanner.c"); c_config.file(&scanner_path); + + println!( + "cargo:rerun-if-changed={}", + typescript_dir.join("scanner.h").to_str().unwrap() + ); println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap()); c_config.compile("parser"); From f1ed184196aa7cef2799f64a6fb4df2fc8212e43 Mon Sep 17 00:00:00 2001 From: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:52:02 -0400 Subject: [PATCH 4/5] Move the fuzz ci checks to their own workflow --- .github/workflows/ci.yml | 22 ---------------------- .github/workflows/fuzz.yml | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/fuzz.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d15ae6..f8f6073 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,25 +50,3 @@ jobs: uses: tree-sitter/parser-test-action@v2 with: test-rust: ${{runner.os == 'Linux'}} - fuzz: - name: Fuzz scanner - runs-on: ubuntu-latest - if: >- - !github.event.repository.is_template && - github.event.head_commit.message != 'Initial commit' - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Check for scanner changes - id: scanner-check - shell: sh - run: |- - { - test -f src/scanner.c && ! git diff --quiet HEAD^ -- "$_" && - printf 'changed=true\n' || printf 'changed=false\n' - } >> "$GITHUB_OUTPUT" - - uses: actions/setup-node@v4 - - run: npm install - - name: Run fuzzer - uses: tree-sitter/fuzz-action@v4 - if: steps.scanner-check.outputs.changed == 'true' diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 0000000..9d527d8 --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,19 @@ +name: Fuzz Parser + +on: + push: + branches: [master] + paths: + - src/scanner.c + - src/tree-sitter-typescript/scanner.h + pull_request: + paths: + - src/scanner.c + - src/tree-sitter-typescript/scanner.h + +jobs: + fuzz: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: tree-sitter/fuzz-action@v4 From 8d87b222d047cb393416a5f8633a40e692d529ae Mon Sep 17 00:00:00 2001 From: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:55:17 -0400 Subject: [PATCH 5/5] Format queries --- queries/glimmer_typescript/indents.scm | 1 + 1 file changed, 1 insertion(+) diff --git a/queries/glimmer_typescript/indents.scm b/queries/glimmer_typescript/indents.scm index b15e374..237254d 100644 --- a/queries/glimmer_typescript/indents.scm +++ b/queries/glimmer_typescript/indents.scm @@ -1,2 +1,3 @@ (glimmer_opening_tag) @indent.begin + (glimmer_closing_tag) @indent.end