Skip to content

Commit fb27f6e

Browse files
Feat: Parser overhaul - 92% Python 3.12 syntax coverage (96 opcodes).
1 parent c159da6 commit fb27f6e

7 files changed

Lines changed: 2861 additions & 667 deletions

File tree

compiler/src/main.rs

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,31 +29,4 @@ fn main() {
2929

3030
initialize_logger();
3131

32-
let source = "def test():\n x = 42\n y = x * 2\n print(y)\n return y + 10\n\nprint(test())";
33-
34-
let chunk = modules::parser::Parser::new(source, modules::lexer::lexer(source)).parse();
35-
36-
// Instructions.
37-
for (i, ins) in chunk.instructions.iter().enumerate() {
38-
info!("{:03} {:?} {}", i, ins.opcode, ins.operand);
39-
}
40-
41-
let tokens: Vec<String> = modules::lexer::lexer(source)
42-
.map(|t| format!("{:?} [{}-{}]", t.kind, t.start, t.end))
43-
.collect();
44-
45-
info!("{:?}", tokens);
46-
47-
info!("constants: {:?}", chunk.constants);
48-
info!("names: {:?}", chunk.names);
49-
info!("annotations: {:?}", chunk.annotations);
50-
51-
if let Some((_, body)) = chunk.functions.first() { // .first() = primera función
52-
for (i, ins) in body.instructions.iter().enumerate() {
53-
info!("{:03} {:?} {}", i, ins.opcode, ins.operand);
54-
}
55-
}
56-
57-
info!("functions count: {:?}", chunk.functions.len());
58-
5932
}

compiler/src/modules/lexer.rs

Lines changed: 116 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
`lexer.rs`
3-
Tokenizes Python source into a stream of spanned Token variants.
3+
Tokenizes Python source into offset-indexed tokens. Zero-copy, zero-alloc hot path, DFA-compiled via logos.
44
55
Usage:
66
```rust
@@ -29,12 +29,13 @@ use std::cmp::Ordering;
2929
// A04:2021 Prevent asymmetric DoS via deeply nested data structures: `handle_indent`, `lex_fstring_body`.
3030
const MAX_INDENT_DEPTH: usize = 100;
3131
const MAX_FSTRING_DEPTH: usize = 200;
32+
const MAX_SOURCE_SIZE: usize = 10 * 1024 * 1024; // 10MB
3233

3334
#[derive(Default)]
3435
pub struct LexerState {
3536

3637
/*
37-
Structure for: pending token queue, indentation stack, bracket nesting depth.
38+
Pending queue, indentation stack, bracket depth, line counter, and f-string context.
3839
*/
3940

4041
pending: VecDeque<(TokenType, usize, usize, usize)>,
@@ -49,7 +50,7 @@ pub struct LexerState {
4950
pub struct Token {
5051

5152
/*
52-
Structure: kind (token type): line, start and end for indexes of the kind.
53+
Token kind with line number and byte-offset span (start, end) into source.
5354
*/
5455

5556
pub kind: TokenType,
@@ -63,81 +64,113 @@ pub struct Token {
6364
fn open_bracket(lex: &mut Lexer<TokenType>) { lex.extras.nesting += 1; }
6465
fn close_bracket(lex: &mut Lexer<TokenType>) { lex.extras.nesting = lex.extras.nesting.saturating_sub(1); }
6566

66-
fn handle_indent (lex: &mut Lexer<TokenType>) -> logos::Skip {
67+
fn handle_indent(lex: &mut Lexer<TokenType>) -> logos::Skip {
6768

68-
/*
69-
Decides `if \n` is a statement boundary or inside brackets.
69+
/*
70+
Emits Indent/Dedent/Newline tokens or suppresses them inside bracketed expressions.
7071
*/
7172

7273
let s = lex.span();
73-
74-
let src = lex.remainder();
75-
let indent: Vec<u8> = src.bytes().take_while(|&b| b == b' ' || b == b'\t').collect();
76-
let level = indent.len();
77-
let line = s.end + level;
78-
7974
let current_line = lex.extras.line;
75+
8076
lex.extras.line += 1;
8177

82-
let next = src[indent.len()..].chars().next();
83-
84-
if lex.extras.nesting > 0 { lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end)); return logos::Skip; }
85-
if indent.contains(&b' ') && indent.contains(&b'\t') { lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end)); lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end)); return logos::Skip; }
86-
if matches!(next, Some('\n' | '\r' | '#')) { lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end)); return logos::Skip; }
78+
if lex.extras.nesting > 0 {
79+
lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end));
80+
return logos::Skip;
81+
}
82+
83+
let bytes = lex.remainder().as_bytes();
8784

85+
let mut level = 0usize;
86+
let mut has_space = false;
87+
let mut has_tab = false;
88+
89+
while level < bytes.len() && (bytes[level] == b' ' || bytes[level] == b'\t') {
90+
has_space |= bytes[level] == b' ';
91+
has_tab |= bytes[level] == b'\t';
92+
level += 1;
93+
}
94+
95+
if has_space && has_tab {
96+
lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end));
97+
lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end));
98+
return logos::Skip;
99+
}
100+
101+
if matches!(bytes.get(level), Some(b'\n' | b'\r' | b'#')) {
102+
lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end));
103+
return logos::Skip;
104+
}
105+
106+
let line = s.end + level;
88107
let current = *lex.extras.indent_stack.last().unwrap_or(&0);
89108

90109
lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end));
91110

92111
match level.cmp(&current) {
93112

94113
Ordering::Greater => {
95-
if lex.extras.indent_stack.len() >= MAX_INDENT_DEPTH { lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end)); return logos::Skip; }
114+
if lex.extras.indent_stack.len() >= MAX_INDENT_DEPTH {
115+
lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end));
116+
return logos::Skip;
117+
}
96118
lex.extras.indent_stack.push(level);
97119
lex.extras.pending.push_back((TokenType::Indent, lex.extras.line, line, line));
98-
},
99-
120+
}
121+
100122
Ordering::Less => while lex.extras.indent_stack.last().is_some_and(|&t| t > level) {
101123
lex.extras.indent_stack.pop();
102124
lex.extras.pending.push_back((TokenType::Dedent, lex.extras.line, line, line));
103-
},
104-
125+
}
126+
105127
Ordering::Equal => {}
106-
128+
107129
}
108130

109131
logos::Skip
110132

111133
}
112134

113135
fn lex_fstring_body(lex: &mut Lexer<TokenType>, quote: u8, triple: bool, body_start: usize) {
136+
114137
/*
115-
Scans f-string bytes, suspending at `{` to let the main lexer resume.
138+
Scans f-string bytes, emitting text segments and suspending at `{` for expression lexing.
116139
*/
140+
117141
let bytes = lex.remainder().as_bytes();
142+
118143
let mut pos = 0usize;
119144

120145
while pos < bytes.len() {
146+
121147
let closes = if triple {
122148
bytes.get(pos..pos + 3) == Some(&[quote, quote, quote])
123149
} else {
124150
bytes[pos] == quote
125151
};
126152

127153
if closes {
154+
128155
if pos > 0 {
129156
lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
130157
}
158+
131159
let quote_len = if triple { 3 } else { 1 };
160+
132161
lex.bump(pos + quote_len);
133162
lex.extras.pending.push_back((TokenType::FstringEnd, lex.extras.line, body_start + pos, body_start + pos + quote_len));
163+
134164
return;
165+
135166
}
136167

137168
match bytes[pos] {
138-
b'\\' => pos += 2,
169+
170+
b'\\' => pos = (pos + 2).min(bytes.len()),
171+
139172
b'{' if bytes.get(pos + 1) != Some(&b'{') => {
140-
// Emite texto previo
173+
141174
if pos > 0 {
142175
lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
143176
}
@@ -147,37 +180,49 @@ fn lex_fstring_body(lex: &mut Lexer<TokenType>, quote: u8, triple: bool, body_st
147180
lex.bump(pos + 1);
148181
return;
149182
}
150-
// Emite Lbrace
183+
151184
lex.extras.pending.push_back((TokenType::Lbrace, lex.extras.line, body_start + pos, body_start + pos + 1));
152-
// Guarda estado fstring para reanudar después del '}'
153185
lex.extras.fstring_stack.push((quote, triple, body_start + pos + 1));
154-
// Avanza el lexer hasta después del '{' — logos retoma control
155186
lex.bump(pos + 1);
156-
return; // ← PARA aquí, logos tokeniza la expresión
187+
188+
return;
189+
157190
}
191+
158192
_ => pos += 1,
193+
159194
}
195+
160196
}
197+
161198
}
162199

163200
fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
164201

165-
/*
166-
Detects f-string prefixes within identifier matches and delegates to lex_fstring_body .
202+
/*
203+
Reclassifies `f`/`fr`/`rf` identifiers as f-string starts when followed by a quote.
167204
*/
168205

169206
let s = lex.span();
207+
let slice = lex.slice().as_bytes();
170208

171-
if !matches!(lex.slice().to_ascii_lowercase().as_str(), "f" | "fr" | "rf") { return Some(()); }
209+
let is_fprefix = match slice.len() {
210+
1 => matches!(slice[0], b'f' | b'F'),
211+
2 => matches!((slice[0], slice[1]), (b'f' | b'F', b'r' | b'R') | (b'r' | b'R', b'f' | b'F')),
212+
_ => return Some(()),
213+
};
214+
215+
if !is_fprefix { return Some(()); }
172216

173217
let Some(&q) = lex.remainder().as_bytes().first() else { return Some(()); };
218+
174219
if !matches!(q, b'"' | b'\'') { return Some(()); }
175220

176221
let triple = lex.remainder().as_bytes().get(1) == Some(&q);
177-
178222
let quote_len = if triple { 3 } else { 1 };
179223

180224
lex.bump(quote_len);
225+
181226
let body_start = s.end + quote_len;
182227
lex.extras.pending.push_back((TokenType::FstringStart, lex.extras.line, s.start, body_start));
183228

@@ -188,18 +233,22 @@ fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
188233
}
189234

190235
fn close_fstring_expr(lex: &mut Lexer<TokenType>) -> logos::Skip {
191-
/* Saves correct Rbrace span before resuming f-string body scan. */
192-
let span = lex.span(); // ← span correcto ANTES de cualquier bump
236+
237+
/*
238+
Closes an f-string expression on `}`, emits Rbrace, and resumes f-string body scanning.
239+
*/
240+
241+
let span = lex.span();
242+
193243
if let Some((quote, triple, _)) = lex.extras.fstring_stack.pop() {
194-
// Empuja Rbrace con span correcto al pending
195244
lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
196-
// Reanuda fstring — puede hacer bump, ya no importa
197245
lex_fstring_body(lex, quote, triple, span.end);
198246
} else {
199247
lex.extras.nesting = lex.extras.nesting.saturating_sub(1);
200248
lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
201249
}
202-
logos::Skip // ← logos no emite nada, pending lo maneja
250+
251+
logos::Skip
203252
}
204253

205254
#[derive(Logos, Debug, PartialEq, Clone)]
@@ -362,49 +411,58 @@ pub enum TokenType {
362411
}
363412

364413
pub fn lexer(source: &str) -> impl Iterator<Item = Token> + '_ {
365-
414+
366415
/*
367-
Tokenizes Python source into a parser-ready stream, resolving indentation, soft keywords, and f-string expression boundaries.
416+
Produces a parser-ready token stream with indentation, soft keywords, and f-string boundaries resolved.
368417
*/
418+
419+
let source_len = source.len();
369420

370-
let source_len = source.len(); // ← longitud real del source
371421
let mut lex = TokenType::lexer(source);
372422
let mut done = false;
373423

424+
if source_len > MAX_SOURCE_SIZE {
425+
lex.extras.pending.push_back((TokenType::Endmarker, 0, source_len, source_len));
426+
}
427+
374428
let mut stream = std::iter::from_fn(move || {
375429

376430
if let Some(tok) = lex.extras.pending.pop_front() { return Some(tok); }
377-
378-
let result = match lex.next() {
379-
Some(Ok(tok)) => { let s = lex.span(); Some((tok, lex.extras.line, s.start, s.end)) },
380-
Some(Err(_)) => lex.extras.pending.is_empty().then_some((TokenType::Endmarker, lex.extras.line, source_len, source_len)),
381-
None if !done => { done = true; Some((TokenType::Endmarker, lex.extras.line, source_len, source_len)) }
382-
_ => None,
431+
432+
let tok = match lex.next() {
433+
Some(Ok(tok)) => { let s = lex.span(); (tok, lex.extras.line, s.start, s.end) }
434+
Some(Err(_)) if lex.extras.pending.is_empty() => (TokenType::Endmarker, lex.extras.line, source_len, source_len),
435+
Some(Err(_)) => return lex.extras.pending.pop_front(),
436+
None if !done => { done = true; (TokenType::Endmarker, lex.extras.line, source_len, source_len) }
437+
None => return None,
383438
};
384439

385-
if let Some(t) = result { lex.extras.pending.push_back(t); }
386-
387-
lex.extras.pending.pop_front()
440+
if lex.extras.pending.is_empty() {
441+
Some(tok)
442+
} else {
443+
lex.extras.pending.push_back(tok);
444+
lex.extras.pending.pop_front()
445+
}
388446

389447
}).peekable();
390448

391449
let mut ended = false;
392450

393451
std::iter::from_fn(move || {
394-
395-
if ended { return None; }
396-
452+
397453
let (tok, line, start, end) = stream.next()?;
398-
454+
455+
if ended { return None; }
399456
if tok == TokenType::Endmarker { ended = true; }
400457

401-
let as_name = matches!(tok, TokenType::Match | TokenType::Case | TokenType::Type) && matches!(stream.peek(), Some((
458+
let is_soft_keyword = matches!(tok, TokenType::Match | TokenType::Case | TokenType::Type);
459+
let next_demotes = matches!(stream.peek(), Some((
402460
TokenType::Lpar | TokenType::Colon | TokenType::Equal |
403-
TokenType::Comma | TokenType::Rpar | TokenType::Rsqb |
461+
TokenType::Comma | TokenType::Rpar | TokenType::Rsqb |
404462
TokenType::Newline, _, _, _
405463
)) | None);
406464

407-
let kind = if as_name { TokenType::Name } else { tok };
465+
let kind = if is_soft_keyword && next_demotes { TokenType::Name } else { tok };
408466

409467
Some(Token { kind, line, start, end })
410468

0 commit comments

Comments
 (0)