Skip to content

Commit 42bba9f

Browse files
Feat: Extend parser builtins, fix fstring lexer span and expression tokenization.
1 parent a1f1182 commit 42bba9f

5 files changed

Lines changed: 111 additions & 122 deletions

File tree

compiler/src/main.rs

Lines changed: 1 addition & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -29,26 +29,7 @@ fn main() {
2929

3030
initialize_logger();
3131

32-
let source = r#"
33-
euler: float = 2.71828
34-
pi: float = 3.14159
35-
36-
print(f"Euler + 5 = {euler + 5}")
37-
print(f"Pi * 2 = {pi * 2}")
38-
39-
if euler > 2:
40-
print("Euler is greater than 2")
41-
else:
42-
print("Error")
43-
44-
counter: int = 3
45-
while counter > 0:
46-
print(f"Counting: {counter}")
47-
counter = counter - 1
48-
49-
print("Length of 'hello':", len("hello"))
50-
print("Absolute of -42:", abs(-42))
51-
"#;
32+
let source = "name: str = 'Dylan'\nage: int = 25\nmsg: str = f'User {name} is {age} years old'";
5233

5334
let chunk = modules::parser::Parser::new(source, modules::lexer::lexer(source)).parse();
5435

compiler/src/modules/lexer.rs

Lines changed: 52 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,8 @@ pub struct LexerState {
4040
pending: VecDeque<(TokenType, usize, usize, usize)>,
4141
indent_stack: Vec<usize>,
4242
nesting: u32,
43-
line: usize
43+
line: usize,
44+
fstring_stack: Vec<(u8, bool, usize)>
4445

4546
}
4647

@@ -110,51 +111,53 @@ fn handle_indent (lex: &mut Lexer<TokenType>) -> logos::Skip {
110111
}
111112

112113
fn lex_fstring_body(lex: &mut Lexer<TokenType>, quote: u8, triple: bool, body_start: usize) {
113-
114-
/*
115-
Lex f-string body, pushing FstringMiddle and FstringEnd to pending.
114+
/*
115+
Scans f-string bytes, suspending at `{` to let the main lexer resume.
116116
*/
117-
118-
let s = lex.span();
119-
120-
let mut depth = 0usize;
121-
let mut had_expr = false;
122-
let mut pos = 0usize;
123-
124117
let bytes = lex.remainder().as_bytes();
118+
let mut pos = 0usize;
125119

126120
while pos < bytes.len() {
127-
128121
let closes = if triple {
129122
bytes.get(pos..pos + 3) == Some(&[quote, quote, quote])
130123
} else {
131-
bytes[pos] == quote && depth == 0
124+
bytes[pos] == quote
132125
};
133126

134127
if closes {
135-
let quote_len = if triple { 3 } else { 1 };
136-
let body_end = body_start + pos;
137-
let end = body_end + quote_len;
138-
139-
if had_expr {
140-
lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_end));
128+
if pos > 0 {
129+
lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
141130
}
142-
131+
let quote_len = if triple { 3 } else { 1 };
143132
lex.bump(pos + quote_len);
144-
lex.extras.pending.push_back((TokenType::FstringEnd, lex.extras.line, body_end, end));
145-
133+
lex.extras.pending.push_back((TokenType::FstringEnd, lex.extras.line, body_start + pos, body_start + pos + quote_len));
146134
return;
147135
}
148136

149137
match bytes[pos] {
150138
b'\\' => pos += 2,
151-
b'{' => { had_expr = true; depth = (depth + 1).min(MAX_FSTRING_DEPTH); pos += 1; }
152-
b'}' => { depth = depth.saturating_sub(1); pos += 1; }
153-
_ => pos += 1
139+
b'{' if bytes.get(pos + 1) != Some(&b'{') => {
140+
// Emit any literal text that precedes the '{'
141+
if pos > 0 {
142+
lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
143+
}
144+
145+
if lex.extras.fstring_stack.len() >= MAX_FSTRING_DEPTH {
146+
lex.extras.pending.push_back((TokenType::Endmarker, lex.extras.line, body_start + pos, body_start + pos));
147+
lex.bump(pos + 1);
148+
return;
149+
}
150+
// Emit Lbrace
151+
lex.extras.pending.push_back((TokenType::Lbrace, lex.extras.line, body_start + pos, body_start + pos + 1));
152+
// Save the f-string state so the body scan can resume after the '}'
153+
lex.extras.fstring_stack.push((quote, triple, body_start + pos + 1));
154+
// Advance the lexer past the '{' — logos takes back control
155+
lex.bump(pos + 1);
156+
return; // ← STOP here; logos tokenizes the expression
157+
}
158+
_ => pos += 1, // NOTE(review): the first brace of an escaped "{{" lands here and advances one byte, so the second brace is re-tested on the next iteration and may open an expression — confirm escape handling
154159
}
155-
156160
}
157-
158161
}
159162

160163
fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
@@ -184,6 +187,21 @@ fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
184187

185188
}
186189

190+
fn close_fstring_expr(lex: &mut Lexer<TokenType>) -> logos::Skip {
191+
/* Saves correct Rbrace span before resuming f-string body scan. */
192+
let span = lex.span(); // ← correct span, captured BEFORE any bump
193+
if let Some((quote, triple, _)) = lex.extras.fstring_stack.pop() {
194+
// Push Rbrace with the correct span onto pending. NOTE(review): this branch runs for every '}' while fstring_stack is non-empty, without consulting `nesting` — a '}' closing a dict/set literal inside the expression would presumably end the expression here; confirm
195+
lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
196+
// Resume the f-string scan — it may bump, which no longer matters because the span is already saved
197+
lex_fstring_body(lex, quote, triple, span.end);
198+
} else {
199+
lex.extras.nesting = lex.extras.nesting.saturating_sub(1);
200+
lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
201+
}
202+
logos::Skip // ← logos emits nothing itself; the pending queue delivers the token
203+
}
204+
187205
#[derive(Logos, Debug, PartialEq, Clone)]
188206
#[logos(extras = LexerState)]
189207
#[logos(skip r"[ \t\r]+")]
@@ -296,7 +314,7 @@ pub enum TokenType {
296314
#[token("[", open_bracket)] Lsqb,
297315
#[token("]", close_bracket)] Rsqb,
298316
#[token("{", open_bracket)] Lbrace,
299-
#[token("}", close_bracket)] Rbrace,
317+
#[token("}", close_fstring_expr)] Rbrace,
300318

301319
/*
302320
Token names
@@ -346,27 +364,26 @@ pub enum TokenType {
346364
pub fn lexer(source: &str) -> impl Iterator<Item = Token> + '_ {
347365

348366
/*
349-
Tokenizes Python source into a parser-ready stream, handling indentation and soft keywords.
367+
Tokenizes Python source into a parser-ready stream, resolving indentation, soft keywords, and f-string expression boundaries.
350368
*/
351369

370+
let source_len = source.len(); // ← longitud real del source
352371
let mut lex = TokenType::lexer(source);
353372
let mut done = false;
354373

355374
let mut stream = std::iter::from_fn(move || {
356375

357376
if let Some(tok) = lex.extras.pending.pop_front() { return Some(tok); }
358377

359-
let s = lex.span();
360-
361378
let result = match lex.next() {
362379
Some(Ok(tok)) => { let s = lex.span(); Some((tok, lex.extras.line, s.start, s.end)) },
363-
Some(Err(_)) => lex.extras.pending.is_empty().then_some((TokenType::Endmarker, lex.extras.line, s.start, s.end)),
364-
None if !done => { done = true; Some((TokenType::Endmarker, lex.extras.line, s.start, s.end)) }
380+
Some(Err(_)) => lex.extras.pending.is_empty().then_some((TokenType::Endmarker, lex.extras.line, source_len, source_len)),
381+
None if !done => { done = true; Some((TokenType::Endmarker, lex.extras.line, source_len, source_len)) }
365382
_ => None,
366383
};
367-
384+
368385
if let Some(t) = result { lex.extras.pending.push_back(t); }
369-
386+
370387
lex.extras.pending.pop_front()
371388

372389
}).peekable();

compiler/src/modules/parser.rs

Lines changed: 51 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,10 @@ use std::collections::HashMap;
4949
pub enum OpCode {
5050
LoadConst, LoadName, StoreName, Call, PopTop, ReturnValue,
5151
BuildString, CallPrint, CallLen, FormatValue, CallAbs, Minus,
52-
CallStr, CallInt, CallRange, Phi, Add
52+
CallStr, CallInt, CallRange, Phi, Add, CallType,
53+
CallFloat, CallBool, CallRound, CallMin, CallMax, CallSum,
54+
CallSorted, CallEnumerate, CallZip, CallList, CallTuple, CallDict,
55+
CallIsInstance, CallSet, CallInput, CallChr, CallOrd
5356
}
5457

5558
#[derive(Debug)] pub struct Instruction { pub opcode: OpCode, pub operand: u16 }
@@ -86,14 +89,14 @@ pub struct Parser<'src, I: Iterator<Item = Token>> {
8689
join_stack: Vec<JoinNode>,
8790
}
8891

89-
fn parse_string(s: &str) -> String { /* igual que antes */
92+
fn parse_string(s: &str) -> String {
9093
let is_raw = s.contains('r') || s.contains('R');
9194
let s = s.trim_start_matches(|c: char| "bBrRuU".contains(c));
9295
let inner = if s.starts_with("\"\"\"") || s.starts_with("'''") { &s[3..s.len()-3] } else { &s[1..s.len()-1] };
9396
if is_raw { inner.to_string() } else { unescape(inner) }
9497
}
9598

96-
fn unescape(s: &str) -> String { /* igual que antes */
99+
fn unescape(s: &str) -> String {
97100
let mut out = String::with_capacity(s.len());
98101
let mut chars = s.chars().peekable();
99102
while let Some(c) = chars.next() {
@@ -178,12 +181,12 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
178181
TokenType::False => self.emit_const(Value::Bool(false)),
179182
TokenType::None => self.emit_const(Value::None),
180183
TokenType::FstringStart => self.fstring(),
181-
TokenType::Minus => { self.expr(); self.chunk.emit(OpCode::Minus, 0); }
184+
TokenType::Minus => { self.expr(); self.chunk.emit(OpCode::Minus, 0); },
182185
_ => {}
183186
}
184187
}
185188

186-
fn parse_number(&mut self, raw: &str, kind: TokenType) { /* igual que antes */
189+
fn parse_number(&mut self, raw: &str, kind: TokenType) {
187190
let s = raw.replace('_', "");
188191
if kind == TokenType::Float {
189192
self.emit_const(Value::Float(s.parse().unwrap_or(0.0)));
@@ -201,7 +204,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
201204
self.chunk.emit(OpCode::LoadConst, i);
202205
}
203206

204-
fn name(&mut self, t: Token) { /* igual que antes */
207+
fn name(&mut self, t: Token) {
205208
let name = self.lexeme(&t).to_string();
206209
if matches!(self.peek(), Some(TokenType::Colon)) { self.advance(); self.advance(); }
207210
match self.peek() {
@@ -211,7 +214,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
211214
}
212215
}
213216

214-
fn assign(&mut self, name: String) { /* igual */
217+
fn assign(&mut self, name: String) {
215218
self.advance();
216219
self.expr();
217220
let ver = self.increment_version(&name);
@@ -220,7 +223,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
220223
self.chunk.emit(OpCode::StoreName, i);
221224
}
222225

223-
fn parse_args(&mut self) -> u16 { /* igual */
226+
fn parse_args(&mut self) -> u16 {
224227
self.advance();
225228
let mut argc = 0;
226229
while !matches!(self.peek(), Some(TokenType::Rpar) | None) {
@@ -231,13 +234,31 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
231234
argc
232235
}
233236

234-
fn call(&mut self, name: String) { /* igual */
237+
fn call(&mut self, name: String) {
235238
match name.as_str() {
236239
"print" => { let _ = self.parse_args(); self.chunk.emit(OpCode::CallPrint, 0); }
237240
"len" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallLen, a); }
238241
"abs" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallAbs, a); }
239242
"str" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallStr, a); }
240243
"int" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallInt, a); }
244+
"type" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallType, a); }
245+
"float" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallFloat, a); }
246+
"bool" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallBool, a); }
247+
"round" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallRound, a); }
248+
"min" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallMin, a); }
249+
"max" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallMax, a); }
250+
"sum" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallSum, a); }
251+
"sorted" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallSorted, a); }
252+
"enumerate" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallEnumerate, a); }
253+
"zip" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallZip, a); }
254+
"list" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallList, a); }
255+
"tuple" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallTuple, a); }
256+
"dict" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallDict, a); }
257+
"set" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallSet, a); }
258+
"input" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallInput, a); }
259+
"isinstance" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallIsInstance, a); }
260+
"chr" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallChr, a); }
261+
"ord" => { let a = self.parse_args(); self.chunk.emit(OpCode::CallOrd, a); }
241262
"range" => self.call_range(),
242263
_ => {
243264
let i = self.chunk.push_name(&name);
@@ -248,7 +269,8 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
248269
}
249270
}
250271

251-
fn call_range(&mut self) { /* igual */
272+
fn call_range(&mut self) {
273+
self.advance();
252274
let mut args = Vec::new();
253275
while !matches!(self.peek(), Some(TokenType::Rpar) | None) {
254276
let tok = self.advance();
@@ -258,14 +280,17 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
258280
if matches!(self.peek(), Some(TokenType::Comma)) { self.advance(); }
259281
}
260282
self.advance();
283+
261284
let (start, stop, step) = match args.as_slice() {
262-
[stop] => (0, *stop, 1),
263-
[start, stop] => (*start, *stop, 1),
285+
[stop] => (0, *stop, 1),
286+
[start, stop] => (*start, *stop, 1),
264287
[start, stop, step] => (*start, *stop, *step),
265-
_ => (0, 0, 1),
288+
_ => (0, 0, 1),
266289
};
267-
for v in [start, stop, step] { self.emit_const(Value::Int(v)); }
268-
self.chunk.emit(OpCode::CallRange, 3);
290+
291+
let i = self.chunk.push_const(Value::Range(start, stop, step));
292+
self.chunk.emit(OpCode::LoadConst, i);
293+
self.chunk.emit(OpCode::CallRange, 1);
269294
}
270295

271296
fn fstring(&mut self) {
@@ -274,60 +299,22 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
274299
match self.peek() {
275300
Some(TokenType::FstringMiddle) => {
276301
let t = self.advance();
277-
let mut rest = self.lexeme(&t);
278-
279-
while let Some(open) = rest.find('{') {
280-
if open > 0 {
281-
self.emit_const(Value::Str(rest[..open].to_string()));
282-
parts += 1;
283-
}
284-
rest = &rest[open + 1..];
285-
286-
if let Some(close) = rest.find('}') {
287-
let expr = rest[..close].trim();
288-
if !expr.is_empty() {
289-
self.parse_fstring_expr(expr);
290-
self.chunk.emit(OpCode::FormatValue, 0);
291-
parts += 1;
292-
}
293-
rest = &rest[close + 1..];
294-
} else {
295-
break;
296-
}
297-
}
298-
299-
if !rest.is_empty() {
300-
self.emit_const(Value::Str(rest.to_string()));
301-
parts += 1;
302+
self.emit_const(Value::Str(self.lexeme(&t).to_string()));
303+
parts += 1;
304+
}
305+
Some(TokenType::Lbrace) => {
306+
self.advance(); // consume '{'
307+
self.expr(); // ← parser normal, soporta TODO
308+
self.chunk.emit(OpCode::FormatValue, 0);
309+
parts += 1;
310+
if matches!(self.peek(), Some(TokenType::Rbrace)) {
311+
self.advance(); // consume '}'
302312
}
303313
}
304314
Some(TokenType::FstringEnd) => { self.advance(); break; }
305315
_ => break,
306316
}
307317
}
308-
if parts > 0 {
309-
self.chunk.emit(OpCode::BuildString, parts);
310-
}
311-
}
312-
313-
fn parse_fstring_expr(&mut self, expr: &str) {
314-
if expr.chars().all(|c| c.is_alphanumeric() || c == '_') && !expr.starts_with(char::is_numeric) {
315-
// Caso simple: {euler}
316-
self.emit_load_ssa(expr.to_string());
317-
} else if let Some(pos) = expr.find(" + ") {
318-
// Caso {euler + 5}
319-
let left = expr[..pos].trim();
320-
let right = expr[pos + 3..].trim();
321-
self.emit_load_ssa(left.to_string());
322-
if let Ok(num) = right.parse::<i64>() {
323-
self.emit_const(Value::Int(num));
324-
} else {
325-
self.emit_const(Value::Str(right.to_string()));
326-
}
327-
self.chunk.emit(OpCode::Add, 0);
328-
} else {
329-
// Fallback seguro
330-
self.emit_const(Value::Str(expr.to_string()));
331-
}
318+
if parts > 0 { self.chunk.emit(OpCode::BuildString, parts); }
332319
}
333320
}

0 commit comments

Comments
 (0)