Skip to content

Commit 7a186fe

Browse files
authored
Update tree-sitter-markdown (#25)
1 parent a15ab30 commit 7a186fe

8 files changed

Lines changed: 36658 additions & 31939 deletions

File tree

Sources/TreeSitterMarkdown/src/parser.c

Lines changed: 4185 additions & 4195 deletions
Large diffs are not rendered by default.

Sources/TreeSitterMarkdown/src/scanner.c

Lines changed: 1491 additions & 0 deletions
Large diffs are not rendered by default.

Sources/TreeSitterMarkdown/src/scanner.cc

Lines changed: 0 additions & 1458 deletions
This file was deleted.

Sources/TreeSitterMarkdownInline/src/parser.c

Lines changed: 30596 additions & 25928 deletions
Large diffs are not rendered by default.
Lines changed: 384 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,384 @@
1+
#include <tree_sitter/parser.h>
2+
3+
// External token kinds handled by this scanner. The meaning of each token is
// explained in grammar.js.
// NOTE(review): the numeric order presumably has to match the order of the
// external tokens declared in grammar.js -- confirm before reordering.
typedef enum {
    ERROR = 0,                 // emitted by `error()` to stop an invalid parse branch
    TRIGGER_ERROR,             // when valid, `scan()` immediately emits ERROR
    CODE_SPAN_START,           // opening run of backticks
    CODE_SPAN_CLOSE,           // closing run of backticks
    EMPHASIS_OPEN_STAR,        // `*` acting as an opening delimiter
    EMPHASIS_OPEN_UNDERSCORE,  // `_` acting as an opening delimiter
    EMPHASIS_CLOSE_STAR,       // `*` acting as a closing delimiter
    EMPHASIS_CLOSE_UNDERSCORE, // `_` acting as a closing delimiter
    LAST_TOKEN_WHITESPACE,     // never emitted here; only read from `valid_symbols`
    LAST_TOKEN_PUNCTUATION,    // never emitted here; only read from `valid_symbols`
    STRIKETHROUGH_OPEN,        // `~` acting as an opening delimiter
    STRIKETHROUGH_CLOSE,       // `~` acting as a closing delimiter
    LATEX_SPAN_START,          // opening run of `$`
    LATEX_SPAN_CLOSE,          // closing run of `$`
    UNCLOSED_SPAN              // opening run for which no matching closing run exists
} TokenType;
21+
22+
// Determines if a character is ASCII punctuation as defined by the markdown spec.
//
// The argument is a full code point (callers pass `lexer->lookahead`, which is an
// `int32_t`). Taking it as `int32_t` instead of `char` avoids truncating non-ASCII
// code points to their low byte, which would make e.g. U+0121 look like '!'.
//
// Returns true only for the four ASCII punctuation ranges
// `!`..`/`, `:`..`@`, `[`..`` ` `` and `{`..`~`.
static bool is_punctuation(int32_t c) {
    return (c >= '!' && c <= '/') ||
           (c >= ':' && c <= '@') ||
           (c >= '[' && c <= '`') ||
           (c >= '{' && c <= '~');
}
30+
31+
// Determines if a character is ASCII whitespace as defined by the markdown spec
// (space, tab, line feed or carriage return).
//
// The argument is a full code point so that values such as `lexer->lookahead`
// (an `int32_t`) can be passed without being truncated to their low byte, which
// would make e.g. U+0120 look like a space.
static bool is_whitespace(int32_t c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
35+
36+
// State bitflags used with `Scanner.state`.
//
// Declared `static` so the constants have internal linkage: in C (unlike C++) a
// file-scope `const` object has external linkage and would clash at link time
// with an identically named definition in another translation unit.

// Mask covering the two lowest bits of the state byte.
// NOTE(review): not referenced in the visible part of this file; the name
// suggests it holds a delimiter run length modulo 3 -- confirm.
static const uint8_t STATE_EMPHASIS_DELIMITER_MOD_3 = 0x3;
// Current delimiter run is opening
static const uint8_t STATE_EMPHASIS_DELIMITER_IS_OPEN = 0x1 << 2;
42+
43+
// Convenience function to emit the error token. This is done to stop invalid parse branches.
44+
// Specifically:
45+
// 1. When encountering a newline after a line break that ended a paragraph, and no new block
46+
// has been opened.
47+
// 2. When encountering a new block after a soft line break.
48+
// 3. When a `$._trigger_error` token is valid, which is used to stop parse branches through
49+
// normal tree-sitter grammar rules.
50+
//
51+
// See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in grammar.js
52+
static bool error(TSLexer *lexer) {
53+
lexer->result_symbol = ERROR;
54+
return true;
55+
}
56+
57+
// Persistent state of the inline scanner. Every field is a single byte so the
// whole struct can be written to / restored from the serialization buffer
// field by field.
typedef struct {
    // Parser state flags (see the `STATE_*` bitflags)
    uint8_t state;
    // Length of the backtick run that opened the current code span (0 if none)
    uint8_t code_span_delimiter_length;
    // Length of the `$` run that opened the current latex span (0 if none)
    uint8_t latex_span_delimiter_length;
    // The number of characters remaining in the current emphasis delimiter run.
    uint8_t num_emphasis_delimiters_left;
} Scanner;

// Number of bytes written by `serialize` and required by `deserialize`.
enum { SCANNER_SERIALIZED_SIZE = 4 };

// Write the whole state of a Scanner to a byte buffer.
//
// `buffer` must have room for at least SCANNER_SERIALIZED_SIZE bytes.
// Returns the number of bytes written.
static unsigned serialize(Scanner *s, char *buffer) {
    unsigned i = 0;
    buffer[i++] = (char)s->state;
    buffer[i++] = (char)s->code_span_delimiter_length;
    buffer[i++] = (char)s->latex_span_delimiter_length;
    buffer[i++] = (char)s->num_emphasis_delimiters_left;
    return i;
}

// Read the whole state of a Scanner from a byte buffer.
// `serialize` and `deserialize` should be fully symmetric.
//
// The scanner is always reset to the all-zero state first. The buffer is only
// read when it is non-NULL and holds every serialized field; a shorter buffer
// is treated like an empty one instead of being read past its end.
static void deserialize(Scanner *s, const char *buffer, unsigned length) {
    s->state = 0;
    s->code_span_delimiter_length = 0;
    s->latex_span_delimiter_length = 0;
    s->num_emphasis_delimiters_left = 0;
    if (buffer != NULL && length >= SCANNER_SERIALIZED_SIZE) {
        unsigned i = 0;
        s->state = (uint8_t)buffer[i++];
        s->code_span_delimiter_length = (uint8_t)buffer[i++];
        s->latex_span_delimiter_length = (uint8_t)buffer[i++];
        s->num_emphasis_delimiters_left = (uint8_t)buffer[i++];
    }
}
93+
94+
// Scans a run of `delimiter` characters that opens or closes a leaf span
// (shared by code spans and latex spans).
//
// lexer            - lexer positioned on the first delimiter character
// delimiter_length - in/out: length of the run that opened the current span;
//                    set to 0 when the span is closed and to the run length
//                    when a span is opened
// valid_symbols    - tokens the parser currently accepts
// delimiter        - the delimiter character
// open_token       - token emitted when the run opens a span
// close_token      - token emitted when the run closes a span
//
// The token always ends right after the delimiter run (`mark_end`); any further
// advancing is look-ahead only. Returns true if a token was emitted
// (`close_token`, `open_token` or `UNCLOSED_SPAN`), false otherwise.
static bool parse_leaf_delimiter(TSLexer *lexer, uint8_t* delimiter_length, const bool *valid_symbols,
                                 const char delimiter, const TokenType open_token, const TokenType close_token) {
    // Consume the whole delimiter run and remember its length.
    size_t run_length = 0;
    for (; lexer->lookahead == delimiter; run_length++) {
        lexer->advance(lexer, false);
    }
    lexer->mark_end(lexer);

    // A run with the same length as the opening run closes the span.
    if (run_length == *delimiter_length && valid_symbols[close_token]) {
        *delimiter_length = 0;
        lexer->result_symbol = close_token;
        return true;
    }

    if (!valid_symbols[open_token]) {
        return false;
    }

    // Look ahead until the end of input for a run of exactly the same length.
    size_t candidate_length = 0;
    while (!lexer->eof(lexer)) {
        if (lexer->lookahead == delimiter) {
            candidate_length++;
        } else if (candidate_length == run_length) {
            // Found a matching delimiter
            break;
        } else {
            candidate_length = 0;
        }
        lexer->advance(lexer, false);
    }

    if (candidate_length == run_length) {
        *delimiter_length = run_length;
        lexer->result_symbol = open_token;
        return true;
    }
    if (valid_symbols[UNCLOSED_SPAN]) {
        lexer->result_symbol = UNCLOSED_SPAN;
        return true;
    }
    return false;
}
133+
134+
// Handles a run of backticks: delegates to `parse_leaf_delimiter` using the
// code span tokens and the scanner's stored code span delimiter length.
static bool parse_backtick(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    uint8_t *open_run_length = &s->code_span_delimiter_length;
    return parse_leaf_delimiter(lexer, open_run_length, valid_symbols, '`',
                                CODE_SPAN_START, CODE_SPAN_CLOSE);
}
138+
139+
// Handles a run of `$`: delegates to `parse_leaf_delimiter` using the latex
// span tokens and the scanner's stored latex span delimiter length.
static bool parse_dollar(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    uint8_t *open_run_length = &s->latex_span_delimiter_length;
    return parse_leaf_delimiter(lexer, open_run_length, valid_symbols, '$',
                                LATEX_SPAN_START, LATEX_SPAN_CLOSE);
}
143+
144+
// Scans a single `*` and decides whether it is an opening or a closing
// emphasis delimiter.
//
// Exactly one star is consumed per call. When the star starts a new delimiter
// run, the rest of the run is only looked at (the token end is marked after the
// first star) and the number of remaining stars is stored in
// `s->num_emphasis_delimiters_left`, so that following calls can reuse the
// decision taken for the first one.
//
// Returns true if `EMPHASIS_OPEN_STAR` or `EMPHASIS_CLOSE_STAR` was emitted,
// false otherwise.
static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    const bool may_open = valid_symbols[EMPHASIS_OPEN_STAR];
    const bool may_close = valid_symbols[EMPHASIS_CLOSE_STAR];

    lexer->advance(lexer, false);

    // Continuation of a delimiter run that was already classified: the
    // `STATE_EMPHASIS_DELIMITER_IS_OPEN` flag says whether it was an opening run.
    if (s->num_emphasis_delimiters_left > 0) {
        if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && may_open) {
            // The flag is cleared once it has been consumed here.
            s->state &= (~STATE_EMPHASIS_DELIMITER_IS_OPEN);
            lexer->result_symbol = EMPHASIS_OPEN_STAR;
            s->num_emphasis_delimiters_left--;
            return true;
        }
        if (may_close) {
            lexer->result_symbol = EMPHASIS_CLOSE_STAR;
            s->num_emphasis_delimiters_left--;
            return true;
        }
    }

    // The token is just the first star; everything below is look-ahead.
    lexer->mark_end(lexer);

    // Measure the delimiter run (the star consumed above included).
    size_t run_length = 1;
    for (; lexer->lookahead == '*'; lexer->advance(lexer, false)) {
        run_length++;
    }
    const bool at_line_end =
        lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer);

    if (!may_open && !may_close) {
        return false;
    }

    // The decision taken for the first star holds for the rest of the run.
    s->num_emphasis_delimiters_left = run_length - 1;

    // Classify the character after the run. What preceded the run is reported
    // through `valid_symbols` (see grammar.js for how these tokens are used).
    const bool next_is_whitespace =
        at_line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t';
    const bool next_is_punctuation = is_punctuation(lexer->lookahead);
    const bool prev_is_whitespace = valid_symbols[LAST_TOKEN_WHITESPACE];
    const bool prev_is_punctuation = valid_symbols[LAST_TOKEN_PUNCTUATION];

    const bool right_flanking =
        !prev_is_whitespace &&
        (!prev_is_punctuation || next_is_punctuation || next_is_whitespace);
    const bool left_flanking =
        !next_is_whitespace &&
        (!next_is_punctuation || prev_is_punctuation || prev_is_whitespace);

    // Closing delimiters take precedence
    if (may_close && right_flanking) {
        s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = EMPHASIS_CLOSE_STAR;
        return true;
    }
    if (left_flanking) {
        s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = EMPHASIS_OPEN_STAR;
        return true;
    }
    return false;
}
206+
207+
// Scans a single `~` and decides whether it is an opening or a closing
// strikethrough delimiter.
//
// Works like the emphasis delimiters and shares their bookkeeping: exactly one
// tilde is consumed per call, the rest of a new run is only looked at, and the
// number of remaining tildes is stored in `s->num_emphasis_delimiters_left`
// together with the `STATE_EMPHASIS_DELIMITER_IS_OPEN` flag.
//
// Returns true if `STRIKETHROUGH_OPEN` or `STRIKETHROUGH_CLOSE` was emitted,
// false otherwise.
static bool parse_tilde(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    const bool may_open = valid_symbols[STRIKETHROUGH_OPEN];
    const bool may_close = valid_symbols[STRIKETHROUGH_CLOSE];

    lexer->advance(lexer, false);

    // Continuation of a delimiter run that was already classified: the
    // `STATE_EMPHASIS_DELIMITER_IS_OPEN` flag says whether it was an opening run.
    if (s->num_emphasis_delimiters_left > 0) {
        if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && may_open) {
            // The flag is cleared once it has been consumed here.
            s->state &= (~STATE_EMPHASIS_DELIMITER_IS_OPEN);
            lexer->result_symbol = STRIKETHROUGH_OPEN;
            s->num_emphasis_delimiters_left--;
            return true;
        }
        if (may_close) {
            lexer->result_symbol = STRIKETHROUGH_CLOSE;
            s->num_emphasis_delimiters_left--;
            return true;
        }
    }

    // The token is just the first tilde; everything below is look-ahead.
    lexer->mark_end(lexer);

    // Measure the delimiter run (the tilde consumed above included).
    size_t run_length = 1;
    for (; lexer->lookahead == '~'; lexer->advance(lexer, false)) {
        run_length++;
    }
    const bool at_line_end =
        lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer);

    if (!may_open && !may_close) {
        return false;
    }

    // The decision taken for the first tilde holds for the rest of the run.
    s->num_emphasis_delimiters_left = run_length - 1;

    // Classify the character after the run. What preceded the run is reported
    // through `valid_symbols` (see grammar.js for how these tokens are used).
    const bool next_is_whitespace =
        at_line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t';
    const bool next_is_punctuation = is_punctuation(lexer->lookahead);
    const bool prev_is_whitespace = valid_symbols[LAST_TOKEN_WHITESPACE];
    const bool prev_is_punctuation = valid_symbols[LAST_TOKEN_PUNCTUATION];

    const bool right_flanking =
        !prev_is_whitespace &&
        (!prev_is_punctuation || next_is_punctuation || next_is_whitespace);
    const bool left_flanking =
        !next_is_whitespace &&
        (!next_is_punctuation || prev_is_punctuation || prev_is_whitespace);

    // Closing delimiters take precedence
    if (may_close && right_flanking) {
        s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = STRIKETHROUGH_CLOSE;
        return true;
    }
    if (left_flanking) {
        s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = STRIKETHROUGH_OPEN;
        return true;
    }
    return false;
}
269+
270+
// Scans a single `_` and decides whether it is an opening or a closing
// emphasis delimiter.
//
// Exactly one underscore is consumed per call. When it starts a new delimiter
// run, the rest of the run is only looked at and the number of remaining
// underscores is stored in `s->num_emphasis_delimiters_left`.
//
// Compared to `*`, the conditions are stricter: a closing underscore must be
// right-flanking and either not left-flanking or followed by punctuation; an
// opening underscore must be left-flanking and either not right-flanking or
// preceded by punctuation.
//
// NOTE(review): unlike `parse_star` and `parse_tilde`, the continuation branch
// below does not clear `STATE_EMPHASIS_DELIMITER_IS_OPEN` -- confirm that this
// difference is intended.
//
// Returns true if `EMPHASIS_OPEN_UNDERSCORE` or `EMPHASIS_CLOSE_UNDERSCORE`
// was emitted, false otherwise.
static bool parse_underscore(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    const bool may_open = valid_symbols[EMPHASIS_OPEN_UNDERSCORE];
    const bool may_close = valid_symbols[EMPHASIS_CLOSE_UNDERSCORE];

    lexer->advance(lexer, false);

    // Continuation of a delimiter run that was already classified: the
    // `STATE_EMPHASIS_DELIMITER_IS_OPEN` flag says whether it was an opening run.
    if (s->num_emphasis_delimiters_left > 0) {
        if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && may_open) {
            lexer->result_symbol = EMPHASIS_OPEN_UNDERSCORE;
            s->num_emphasis_delimiters_left--;
            return true;
        }
        if (may_close) {
            lexer->result_symbol = EMPHASIS_CLOSE_UNDERSCORE;
            s->num_emphasis_delimiters_left--;
            return true;
        }
    }

    // The token is just the first underscore; everything below is look-ahead.
    lexer->mark_end(lexer);

    // Measure the delimiter run (the underscore consumed above included).
    size_t run_length = 1;
    for (; lexer->lookahead == '_'; lexer->advance(lexer, false)) {
        run_length++;
    }
    const bool at_line_end =
        lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer);

    if (!may_open && !may_close) {
        return false;
    }

    s->num_emphasis_delimiters_left = run_length - 1;

    // Classify the character after the run. What preceded the run is reported
    // through `valid_symbols`.
    const bool next_is_whitespace =
        at_line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t';
    const bool next_is_punctuation = is_punctuation(lexer->lookahead);
    const bool prev_is_whitespace = valid_symbols[LAST_TOKEN_WHITESPACE];
    const bool prev_is_punctuation = valid_symbols[LAST_TOKEN_PUNCTUATION];

    const bool right_flanking =
        !prev_is_whitespace &&
        (!prev_is_punctuation || next_is_punctuation || next_is_whitespace);
    const bool left_flanking =
        !next_is_whitespace &&
        (!next_is_punctuation || prev_is_punctuation || prev_is_whitespace);

    if (may_close && right_flanking && (!left_flanking || next_is_punctuation)) {
        s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = EMPHASIS_CLOSE_UNDERSCORE;
        return true;
    }
    if (left_flanking && (!right_flanking || prev_is_punctuation)) {
        s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN;
        lexer->result_symbol = EMPHASIS_OPEN_UNDERSCORE;
        return true;
    }
    return false;
}
315+
316+
// Main dispatch of the scanner: picks the token handler from the current
// lookahead character.
//
// Returns true if a token was emitted (`lexer->result_symbol` is set), false
// if the lookahead is not one of the handled delimiter characters or the
// handler did not produce a token.
static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    // A normal tree-sitter rule decided that the current branch is invalid and
    // now "requests" an error to stop the branch.
    if (valid_symbols[TRIGGER_ERROR]) {
        return error(lexer);
    }

    switch (lexer->lookahead) {
        case '`':
            // Beginning or end of a code span.
            return parse_backtick(s, lexer, valid_symbols);
        case '$':
            // Beginning or end of a latex span.
            return parse_dollar(s, lexer, valid_symbols);
        case '*':
            // Beginning or end of emphasis; handled much like '_'.
            return parse_star(s, lexer, valid_symbols);
        case '_':
            return parse_underscore(s, lexer, valid_symbols);
        case '~':
            // Beginning or end of strikethrough.
            return parse_tilde(s, lexer, valid_symbols);
        default:
            return false;
    }
}
348+
349+
void *tree_sitter_markdown_inline_external_scanner_create() {
350+
Scanner *s = (Scanner *)malloc(sizeof(Scanner));
351+
deserialize(s, NULL, 0);
352+
return s;
353+
}
354+
355+
bool tree_sitter_markdown_inline_external_scanner_scan(
356+
void *payload,
357+
TSLexer *lexer,
358+
const bool *valid_symbols
359+
) {
360+
Scanner *scanner = (Scanner *)payload;
361+
return scan(scanner, lexer, valid_symbols);
362+
}
363+
364+
unsigned tree_sitter_markdown_inline_external_scanner_serialize(
365+
void *payload,
366+
char* buffer
367+
) {
368+
Scanner *scanner = (Scanner *)payload;
369+
return serialize(scanner, buffer);
370+
}
371+
372+
void tree_sitter_markdown_inline_external_scanner_deserialize(
373+
void *payload,
374+
char* buffer,
375+
unsigned length
376+
) {
377+
Scanner *scanner = (Scanner *)payload;
378+
deserialize(scanner, buffer, length);
379+
}
380+
381+
// External scanner entry point: releases the scanner that was allocated by
// the `create` function. The scanner owns no other resources.
void tree_sitter_markdown_inline_external_scanner_destroy(void *payload) {
    free(payload);
}

0 commit comments

Comments
 (0)