mod common;
mod data;
mod ddl;
mod dml;
pub use common::source;

use pgt_lexer::{SyntaxKind, Token, WHITESPACE_TOKENS};
use pgt_text_size::{TextRange, TextSize};

use crate::diagnostics::SplitDiagnostic;

/// Main parser that exposes the `cstree` API and collects errors and statements.
/// It is modelled after a Pratt parser. For a gentle introduction to Pratt
/// parsing, see <https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html>.
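///
/// A sketch of a typical round trip. The lexer entry point (`pgt_lexer::lex`)
/// and the `source(&mut Parser)` signature are assumptions here, which is why
/// the example is marked `ignore`:
///
/// ```ignore
/// let tokens = pgt_lexer::lex("select 1;\n\nselect 2;");
/// let mut parser = Parser::new(tokens);
/// source(&mut parser); // drive the grammar across all statements
/// let parse = parser.finish();
/// assert_eq!(parse.ranges.len(), 2);
/// ```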
pub struct Parser {
/// The ranges of the statements
ranges: Vec<(usize, usize)>,
/// The syntax errors accumulated during parsing
errors: Vec<SplitDiagnostic>,
/// The start of the current statement, if any
current_stmt_start: Option<usize>,
/// The tokens to parse
pub tokens: Vec<Token>,
eof_token: Token,
next_pos: usize,
}

/// The result of parsing, produced by [`Parser::finish`].
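///
/// The statement ranges index into the original source text. A sketch of
/// slicing the input with them, assuming `pgt_text_size` re-exports the
/// `text-size` indexing impl for `str` (an assumption, hence `ignore`):
///
/// ```ignore
/// let parse = parser.finish();
/// for range in &parse.ranges {
///     let stmt: &str = &sql[*range]; // trailing irrelevant tokens are excluded
///     // hand `stmt` to the per-statement parser
/// }
/// ```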
#[derive(Debug)]
pub struct Parse {
    /// The ranges of the statements
pub ranges: Vec<TextRange>,
/// The syntax errors accumulated during parsing
pub errors: Vec<SplitDiagnostic>,
}

impl Parser {
pub fn new(tokens: Vec<Token>) -> Self {
let eof_token = Token::eof(usize::from(
tokens
.last()
.map(|t| t.span.start())
.unwrap_or(TextSize::from(0)),
));
        // next_pos should already be initialised with the position of the first relevant token
let mut next_pos = 0;
loop {
let token = tokens.get(next_pos).unwrap_or(&eof_token);
if is_irrelevant_token(token) {
next_pos += 1;
} else {
break;
}
}
Self {
ranges: Vec::new(),
eof_token,
errors: Vec::new(),
current_stmt_start: None,
tokens,
next_pos,
}
    }

    pub fn finish(self) -> Parse {
Parse {
ranges: self
.ranges
.iter()
.map(|(start, end)| {
let from = self.tokens.get(*start);
let to = self.tokens.get(*end).unwrap_or(&self.eof_token);
TextRange::new(from.unwrap().span.start(), to.span.end())
})
.collect(),
errors: self.errors,
}
}

    /// Starts a new statement at the current position.
    ///
    /// Panics if a statement is already open.
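    ///
    /// A hypothetical grammar rule pairs this with [`Parser::close_stmt`]; the
    /// `SyntaxKind` variant name below is a placeholder:
    ///
    /// ```ignore
    /// parser.start_stmt();
    /// parser.expect(SyntaxKind::Select);
    /// // ... consume the rest of the statement ...
    /// parser.close_stmt();
    /// ```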
pub fn start_stmt(&mut self) {
assert!(
self.current_stmt_start.is_none(),
"cannot start statement within statement at {:?}",
self.tokens.get(self.current_stmt_start.unwrap())
);
self.current_stmt_start = Some(self.next_pos);
}

    /// Closes the current statement, excluding trailing irrelevant tokens from
    /// its range.
    ///
    /// Panics if no statement is open.
pub fn close_stmt(&mut self) {
assert!(self.next_pos > 0);
        // walk back from the last consumed position until we find the statement's last relevant token
let mut end_token_pos = self.next_pos - 1;
loop {
let token = self.tokens.get(end_token_pos);
if end_token_pos == 0 || token.is_none() {
break;
}
if !is_irrelevant_token(token.unwrap()) {
break;
}
end_token_pos -= 1;
}
self.ranges.push((
self.current_stmt_start.expect("Expected active statement"),
end_token_pos,
));
self.current_stmt_start = None;
}

    fn advance(&mut self) -> &Token {
let mut first_relevant_token = None;
loop {
let token = self.tokens.get(self.next_pos).unwrap_or(&self.eof_token);
            // once the first relevant token is found, keep advancing next_pos
            // until it points at the next relevant token
if !is_irrelevant_token(token) {
if let Some(t) = first_relevant_token {
return t;
}
first_relevant_token = Some(token);
}
self.next_pos += 1;
}
}

    /// Returns the token at `next_pos` (the current relevant token), or the
    /// EOF token if out of bounds. Does not advance.
    fn peek(&self) -> &Token {
match self.tokens.get(self.next_pos) {
Some(token) => token,
None => &self.eof_token,
}
}

    /// Looks ahead to the next relevant token.
    /// Returns `None` if we are already at the last relevant token.
fn look_ahead(&self) -> Option<&Token> {
// we need to look ahead to the next relevant token
let mut look_ahead_pos = self.next_pos + 1;
loop {
let token = self.tokens.get(look_ahead_pos)?;
if !is_irrelevant_token(token) {
return Some(token);
}
look_ahead_pos += 1;
}
}

    /// Looks back to the previous relevant token.
    /// Returns `None` if there are no previous relevant tokens.
    fn look_back(&self) -> Option<&Token> {
        // checked_sub guards against an underflow panic when next_pos is 0
        let mut look_back_pos = self.next_pos.checked_sub(1)?;
        loop {
            let token = self.tokens.get(look_back_pos)?;
            if !is_irrelevant_token(token) {
                return Some(token);
            }
            if look_back_pos == 0 {
                return None;
            }
            look_back_pos -= 1;
        }
    }

    /// Checks if the current token is of `kind` and advances over it if so.
    /// Returns `true` if the token matched.
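    ///
    /// A sketch of use inside a grammar rule; the `SyntaxKind` variant name is
    /// a placeholder, hence `ignore`:
    ///
    /// ```ignore
    /// if parser.eat(SyntaxKind::Comma) {
    ///     // a comma was consumed; parse the next list element
    /// }
    /// ```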
pub fn eat(&mut self, kind: SyntaxKind) -> bool {
if self.peek().kind == kind {
self.advance();
true
} else {
false
}
}

    /// Like [`Parser::eat`], but records a syntax error at the current token's
    /// span if it is not of `kind`.
    pub fn expect(&mut self, kind: SyntaxKind) {
if self.eat(kind) {
return;
}
self.errors.push(SplitDiagnostic::new(
format!("Expected {:#?}", kind),
self.peek().span,
));
}
}

/// Returns true if the token is irrelevant for the parsing process: whitespace,
/// unless it is a newline token holding more than one newline (a candidate
/// statement separator).
///
/// On Windows, a single newline is represented by `\r\n`, which is two characters.
#[cfg(windows)]
fn is_irrelevant_token(t: &Token) -> bool {
    WHITESPACE_TOKENS.contains(&t.kind)
        && (t.kind != SyntaxKind::Newline || t.text == "\r\n" || t.text.chars().count() == 1)
}
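
// The rule both cfg variants implement, sketched in comments (constructing
// `Token` values is elided here, since its constructors are not part of this
// file):
//
//   spaces / tabs                  -> irrelevant
//   one newline ("\n" or "\r\n")   -> irrelevant: an ordinary line break
//   several newlines ("\n\n")      -> relevant: a candidate statement separator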

/// Returns true if the token is irrelevant for the parsing process: whitespace,
/// unless it is a newline token holding more than one newline (a candidate
/// statement separator).
#[cfg(not(windows))]
fn is_irrelevant_token(t: &Token) -> bool {
    WHITESPACE_TOKENS.contains(&t.kind)
        && (t.kind != SyntaxKind::Newline || t.text.chars().count() == 1)
}