Skip to content

Commit 08ffe92

Browse files
fix(compiler): rustc-style diagnostics and five interpreter correctness bugs
1 parent a2898b6 commit 08ffe92

16 files changed

Lines changed: 337 additions & 72 deletions

File tree

compiler/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ What this leaves is a small, fast, deterministic core: arithmetic with arbitrary
2020
* **Lexer**: Hand-written, LUT-driven scanner over CPython 3.13 token kinds. Tokens are `(start, end, kind)` offsets into the source buffer; no string copies during lexing.
2121
* **Parser**: Single-pass, Pratt precedence climbing. Emits SSA-versioned bytecode directly (`x` -> `x_1`, `x_2`) with explicit `Phi` opcodes at control-flow joins. No intermediate AST.
2222
* **Optimizer**: One peephole pass: constant folding over adjacent literal operands, plus dead-code compaction with jump remapping. Does not propagate through `LoadName`.
23-
* **VM**: Stack-based interpreter over a pre-compiled `Vec<ThreadedOp>` where operands are baked into typed enum variants. Dispatch is a flat `match` over the variant. One LoadAttr+Call superinstruction (`CallMethod`).
23+
* **VM**: Stack-based interpreter over `Vec<Instruction>` where each `Instruction` is `(opcode: OpCode, operand: u16)`. Dispatch is a flat `match` on the opcode (Rust lowers it to a jump table). One LoadAttr+Call superinstruction (`CallMethod` + `CallMethodArgs`), fused once per chunk and cached in `cache.fused_ref()`.
2424
* **Inline Caching**: Per-instruction type-recording cache for arithmetic and comparisons. After 4 stable hits the IC stores a `FastOp` (`AddInt`, `LtFloat`, ...) used as a speculative fast path with type-guard deopt.
2525
* **Template Memoization**: Pure functions called repeatedly with the same arguments return cached results after 2 hits, bypassing full execution.
2626
* **Memory**: NaN-boxed 64-bit `Val` (48-bit signed int, IEEE-754 float, bool, None, 28-bit heap index). Mark-and-sweep GC. Arbitrary-precision `BigInt` fallback for integers outside the 48-bit range.

compiler/src/main.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,18 @@ fn parse_args() -> (String, usize, bool, bool) {
5959
}
6060

6161
fn run(path: &str, sandbox: bool, verbosity: usize, quiet: bool) -> Result<(), String> {
62-
let src = if path.ends_with(".py") {
62+
let is_file = path.ends_with(".py");
63+
let src = if is_file {
6364
fs::read_to_string(path).map_err(|_| s!("io: cannot access '", str path, "'"))?
6465
} else {
6566
path.to_string()
6667
};
68+
let diag_path = if is_file { Some(path) } else { None };
6769

6870
let (mut chunk, errs) = Parser::new(&src, lexer(&src)).parse();
6971
if !errs.is_empty() {
7072
for e in &errs {
71-
eprint_msg(&s!("syntax: ", str &e.render_with_path(path)));
73+
eprint_msg(&e.render(&src, diag_path));
7274
}
7375
exit(1);
7476
}

compiler/src/modules/parser/control.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
106106
let loop_start = self.chunk.instructions.len() as u16;
107107
self.loop_starts.push(loop_start);
108108
self.loop_breaks.push(vec![]);
109+
self.loop_kinds.push(false);
109110

110111
self.expr();
111112
self.chunk.emit(OpCode::JumpIfFalse, 0);
@@ -123,6 +124,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
123124
}
124125

125126
self.loop_starts.pop();
127+
self.loop_kinds.pop();
126128
for pos in self.loop_breaks.pop().unwrap_or_default() {
127129
self.patch(pos);
128130
}
@@ -157,6 +159,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
157159
let loop_start = self.chunk.instructions.len() as u16;
158160
self.loop_starts.push(loop_start);
159161
self.loop_breaks.push(vec![]);
162+
self.loop_kinds.push(true);
160163

161164
self.chunk.emit(OpCode::ForIter, 0);
162165
let fi = self.chunk.instructions.len() - 1;
@@ -186,6 +189,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
186189
}
187190

188191
self.loop_starts.pop();
192+
self.loop_kinds.pop();
189193
for pos in self.loop_breaks.pop().unwrap_or_default() {
190194
self.patch(pos);
191195
}

compiler/src/modules/parser/expr.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,9 +222,9 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
222222
}
223223
}
224224
TokenType::Lambda => self.parse_lambda(),
225-
_ => {
226-
if t.kind != TokenType::Endmarker { self.error("unexpected token"); }
227-
}
225+
// Anchor at the consumed token so the caret points at the
226+
// structural marker (newline, dedent, etc.) the user actually wrote.
227+
_ => self.error_at(t.start, t.end, "expected expression"),
228228
}
229229
self.postfix_tail();
230230
}

compiler/src/modules/parser/literals.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
435435
i.opcode,
436436
OpCode::CallPrint
437437
| OpCode::StoreItem
438+
| OpCode::DelItem
438439
| OpCode::StoreAttr
439440
| OpCode::CallInput
440441
| OpCode::Global

compiler/src/modules/parser/mod.rs

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ pub struct Parser<'src, I: Iterator<Item = Token>> {
2323
pub(super) loop_starts: Vec<u16>,
2424
pub(super) last_line: usize,
2525
pub(super) loop_breaks: Vec<Vec<usize>>,
26+
// Parallel to loop_starts/loop_breaks: true for `for` loops (which push
27+
// an iter on iter_stack), false for `while`. Lets `break` emit PopIter
28+
// only when escaping a for-loop, so nested for/while combinations work.
29+
pub(super) loop_kinds: Vec<bool>,
2630
pub(super) expr_depth: usize,
2731
pub(super) saw_newline: bool,
2832
pub errors: Vec<Diagnostic>,
@@ -161,19 +165,22 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
161165
tok
162166
}
163167

168+
/* Push a Diagnostic anchored at the next token's span (or at end-of-source
169+
if we ran past EOF) and panic-mode sync to the next statement boundary
170+
so we can keep reporting downstream errors. */
164171
pub(super) fn error(&mut self, msg: &str) {
165-
let (line, byte_offset, end) = self
166-
.tokens
167-
.peek()
168-
.map(|t| (t.line, t.start, t.end))
169-
.unwrap_or((self.last_line, 0, 0));
170-
171-
let col = self.source[..byte_offset]
172-
.rfind('\n')
173-
.map(|line_start| byte_offset - line_start - 1)
174-
.unwrap_or(byte_offset);
175-
176-
self.errors.push(Diagnostic { line, col, end, msg: msg.to_string() });
172+
let n = self.source.len();
173+
let (start, end) = self.tokens.peek()
174+
.map(|t| (t.start, t.end))
175+
.unwrap_or((n, n));
176+
self.error_at(start, end, msg);
177+
}
178+
179+
/* Same as `error` but anchored at the caller-provided span. Use when
180+
a parser has already consumed the offending token and wants the
181+
diagnostic to point at it (not at whatever comes next). */
182+
pub(super) fn error_at(&mut self, start: usize, end: usize, msg: &str) {
183+
self.errors.push(Diagnostic { start, end, msg: msg.to_string() });
177184
loop {
178185
match self.tokens.peek().map(|t| t.kind) {
179186
None | Some(TokenType::Newline | TokenType::Dedent | TokenType::Endmarker) => break,
@@ -192,6 +199,11 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
192199
self.lexeme(&t).to_string()
193200
}
194201

202+
/* Surface user-visible tokens. Skips Newline (latching `saw_newline`),
203+
Nl, Comment. Treats Endmarker as None so `at_end()` and "loop while
204+
not closer/None" patterns terminate cleanly without explicit Endmarker
205+
checks at every site. The raw iterator (`self.tokens.peek()`) still
206+
sees Endmarker for diagnostic anchoring in `error()` / `eat()`. */
195207
pub(super) fn peek(&mut self) -> Option<TokenType> {
196208
loop {
197209
match self.tokens.peek().map(|t| t.kind) {
@@ -200,8 +212,8 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
200212
self.tokens.next();
201213
}
202214
Some(TokenType::Nl | TokenType::Comment) => { self.tokens.next(); }
215+
Some(TokenType::Endmarker) | None => return None,
203216
Some(k) => return Some(k),
204-
None => return None,
205217
}
206218
}
207219
}
@@ -210,16 +222,27 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
210222
self.chunk.instructions[pos].operand = self.chunk.instructions.len() as u16;
211223
}
212224

225+
/* Consume `kind` or push a diagnostic with a friendly description of
226+
what was actually found (quoted lexeme for normal tokens, a fixed label or
227+
kind name for structural / zero-width tokens, "EOF" at Endmarker or past end). */
213228
pub(super) fn eat(&mut self, kind: TokenType) {
214229
if matches!(self.peek(), Some(k) if k == kind) {
215230
self.advance();
216-
} else {
217-
let token_text = match self.tokens.peek() {
218-
Some(t) => &self.source[t.start..t.end],
219-
None => "EOF",
220-
};
221-
self.error(&s!("expected ", str kind.as_str(), ", got '", str token_text, "'"));
231+
return;
222232
}
233+
let label: alloc::string::String = match self.tokens.peek() {
234+
Some(t) if t.kind == TokenType::Endmarker => "EOF".to_string(),
235+
Some(t) if t.kind == TokenType::Newline || t.kind == TokenType::Nl => "newline".to_string(),
236+
Some(t) if t.kind == TokenType::Indent => "indent".to_string(),
237+
Some(t) if t.kind == TokenType::Dedent => "dedent".to_string(),
238+
Some(t) if t.start == t.end => t.kind.as_str().to_string(),
239+
Some(t) => {
240+
let mut s = alloc::string::String::with_capacity(t.end - t.start + 2);
241+
s.push('\''); s.push_str(&self.source[t.start..t.end]); s.push('\''); s
242+
}
243+
None => "EOF".to_string(),
244+
};
245+
self.error(&s!("expected ", str kind.as_str(), ", got ", str &label));
223246
}
224247

225248
pub(super) fn eat_if(&mut self, kind: TokenType) -> bool {
@@ -244,6 +267,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
244267
join_stack: Vec::new(),
245268
loop_starts: Vec::new(),
246269
loop_breaks: Vec::new(),
270+
loop_kinds: Vec::new(),
247271
saw_newline: false,
248272
expr_depth: 0,
249273
last_line: 0,
@@ -257,21 +281,33 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
257281
if self.at_end() { break; }
258282

259283
let produced_value = self.stmt();
260-
if !self.at_end() && produced_value { self.chunk.emit(OpCode::PopTop, 0); }
284+
// Always pop expression-statement results: the implicit ReturnValue
285+
// at chunk end returns Val::none() if the stack is empty.
286+
if produced_value { self.chunk.emit(OpCode::PopTop, 0); }
261287
}
262288

263289
if self.chunk.overflow {
264-
let line = self.errors.last().map(|e| e.line).unwrap_or(0);
290+
let n = self.source.len();
265291
self.errors.push(Diagnostic {
266-
line, col: 0, end: 0,
292+
start: n, end: n,
267293
msg: "program too large: exceeded maximum instruction limit".to_string()
268294
});
269295
}
270296

271297
if !self.errors.is_empty() {
298+
// Wipe ALL bytecode side-state so finalize_prev_slots doesn't
299+
// index `canonical` (built from `names`) with stale phi sources.
272300
self.chunk.instructions.clear();
273301
self.chunk.constants.clear();
274302
self.chunk.names.clear();
303+
self.chunk.phi_sources.clear();
304+
self.chunk.phi_map.clear();
305+
self.chunk.functions.clear();
306+
self.chunk.classes.clear();
307+
self.chunk.name_index.clear();
308+
self.chunk.nonlocals.clear();
309+
self.chunk.annotations.clear();
310+
self.loop_kinds.clear();
275311
}
276312

277313
self.chunk.emit(OpCode::ReturnValue, 0);

compiler/src/modules/parser/stmt.rs

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,23 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
5050
Some(TokenType::Yield) => {
5151
self.advance();
5252
if self.eat_if(TokenType::From) {
53+
// Lower `yield from <expr>` into a for-loop that re-yields
54+
// each value: GetIter pushes onto iter_stack, ForIter pulls
55+
// the next item, Yield re-yields it, PopTop discards the
56+
// (None) send value on resume, Jump back. LoadNone at the
57+
// end gives the expression a value (sub-generator return
58+
// value isn't tracked, so always None).
5359
self.expr();
54-
self.chunk.emit(OpCode::YieldFrom, 0);
55-
} else if matches!(self.peek(), Some(TokenType::Newline | TokenType::Endmarker)) {
60+
self.chunk.emit(OpCode::GetIter, 0);
61+
let loop_start = self.chunk.instructions.len() as u16;
62+
self.chunk.emit(OpCode::ForIter, 0);
63+
let fi = self.chunk.instructions.len() - 1;
64+
self.chunk.emit(OpCode::Yield, 0);
65+
self.chunk.emit(OpCode::PopTop, 0);
66+
self.chunk.emit(OpCode::Jump, loop_start);
67+
self.patch(fi);
68+
self.chunk.emit(OpCode::LoadNone, 0);
69+
} else if matches!(self.peek(), Some(TokenType::Newline) | None) {
5670
self.chunk.emit(OpCode::LoadNone, 0);
5771
self.chunk.emit(OpCode::Yield, 0);
5872
} else {
@@ -141,14 +155,27 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
141155
}
142156
Some(TokenType::Del) => {
143157
self.advance();
144-
let name = self.advance_text();
145-
let idx = self.push_ssa_name(&name, self.current_version(&name));
146-
self.chunk.emit(OpCode::Del, idx);
158+
loop {
159+
let name = self.advance_text();
160+
if self.eat_if(TokenType::Lsqb) {
161+
// del x[k]: load container + key, mutate in place.
162+
// Slice deletion (x[i:j]) is not supported — `expr()`
163+
// stops at `:` so the eat(Rsqb) below errors cleanly.
164+
self.emit_load_ssa(name);
165+
self.expr();
166+
self.eat(TokenType::Rsqb);
167+
self.chunk.emit(OpCode::DelItem, 0);
168+
} else {
169+
let idx = self.push_ssa_name(&name, self.current_version(&name));
170+
self.chunk.emit(OpCode::Del, idx);
171+
}
172+
if !self.eat_if(TokenType::Comma) { break; }
173+
}
147174
false
148175
}
149176
Some(TokenType::Raise) => {
150177
self.advance();
151-
if !matches!(self.peek(), Some(TokenType::Newline | TokenType::Endmarker)) {
178+
if !matches!(self.peek(), Some(TokenType::Newline) | None) {
152179
self.expr();
153180
if self.eat_if(TokenType::From) {
154181
self.expr();
@@ -166,6 +193,12 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
166193
if self.loop_breaks.is_empty() {
167194
self.error("'break' outside loop");
168195
} else {
196+
// For-loops own an iter on iter_stack; pop it before
197+
// jumping out, otherwise the surrounding for-iter would
198+
// read the abandoned iterator.
199+
if let Some(true) = self.loop_kinds.last() {
200+
self.chunk.emit(OpCode::PopIter, 0);
201+
}
169202
self.chunk.emit(OpCode::Jump, 0);
170203
if let Some(breaks) = self.loop_breaks.last_mut() {
171204
breaks.push(self.chunk.instructions.len() - 1);
@@ -251,7 +284,7 @@ impl<'src, I: Iterator<Item = Token>> Parser<'src, I> {
251284
break;
252285
}
253286
let produced_value = self.stmt();
254-
if !self.at_end() && produced_value {
287+
if produced_value {
255288
self.chunk.emit(OpCode::PopTop, 0);
256289
}
257290
if indented { continue; }

0 commit comments

Comments
 (0)