 /*
 `lexer.rs`
-Tokenizes Python source into a stream of spanned Token variants.
+Tokenizes Python source into offset-indexed tokens. Zero-copy, zero-alloc hot path, DFA-compiled via logos.
 
 Usage:
 ```rust
@@ -29,12 +29,13 @@ use std::cmp::Ordering;
 // A04:2021: prevent asymmetric DoS via deeply nested data structures: `handle_indent`, `lex_fstring_body`.
 const MAX_INDENT_DEPTH: usize = 100;
 const MAX_FSTRING_DEPTH: usize = 200;
+const MAX_SOURCE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 
 #[derive(Default)]
 pub struct LexerState {
 
     /*
-    Structure for: pending token queue, indentation stack, bracket nesting depth.
+    Pending queue, indentation stack, bracket depth, line counter, and f-string context.
     */
 
     pending: VecDeque<(TokenType, usize, usize, usize)>,
@@ -49,7 +50,7 @@ pub struct LexerState {
 pub struct Token {
 
     /*
-    Structure: kind (token type): line, start and end for indexes of the kind.
+    Token kind with line number and byte-offset span (start, end) into the source.
     */
 
     pub kind: TokenType,
@@ -63,81 +64,113 @@ pub struct Token {
 fn open_bracket(lex: &mut Lexer<TokenType>) { lex.extras.nesting += 1; }
 fn close_bracket(lex: &mut Lexer<TokenType>) { lex.extras.nesting = lex.extras.nesting.saturating_sub(1); }
 
-fn handle_indent(lex: &mut Lexer<TokenType>) -> logos::Skip {
+fn handle_indent(lex: &mut Lexer<TokenType>) -> logos::Skip {
 
-    /*
-    Decides if `\n` is a statement boundary or inside brackets.
+    /*
+    Emits Indent/Dedent/Newline tokens, or suppresses them inside bracketed expressions.
     */
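+    //
+    // e.g. "if x:\n    y\n" yields Newline then Indent before `y`, and a
+    // matching Dedent when the indentation drops back to zero.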
 
     let s = lex.span();
-
-    let src = lex.remainder();
-    let indent: Vec<u8> = src.bytes().take_while(|&b| b == b' ' || b == b'\t').collect();
-    let level = indent.len();
-    let line = s.end + level;
-
     let current_line = lex.extras.line;
+
     lex.extras.line += 1;
 
-    let next = src[indent.len()..].chars().next();
-
-    if lex.extras.nesting > 0 { lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end)); return logos::Skip; }
-    if indent.contains(&b' ') && indent.contains(&b'\t') { lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end)); lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end)); return logos::Skip; }
-    if matches!(next, Some('\n' | '\r' | '#')) { lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end)); return logos::Skip; }
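+    // Inside (), [], or {} the newline is non-logical (implicit line
+    // joining), so emit Nl and skip all indentation tracking.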
+    if lex.extras.nesting > 0 {
+        lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end));
+        return logos::Skip;
+    }
+
+    let bytes = lex.remainder().as_bytes();
 
+    let mut level = 0usize;
+    let mut has_space = false;
+    let mut has_tab = false;
+
+    while level < bytes.len() && (bytes[level] == b' ' || bytes[level] == b'\t') {
+        has_space |= bytes[level] == b' ';
+        has_tab |= bytes[level] == b'\t';
+        level += 1;
+    }
+
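+    // Mixing tabs and spaces is rejected outright: queue Newline then
+    // Endmarker so the parser stops, rather than guessing a tab width.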
+    if has_space && has_tab {
+        lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end));
+        lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end));
+        return logos::Skip;
+    }
+
+    if matches!(bytes.get(level), Some(&(b'\n' | b'\r' | b'#'))) {
+        lex.extras.pending.push_back((TokenType::Nl, current_line, s.start, s.end));
+        return logos::Skip;
+    }
+
+    let line = s.end + level;
     let current = *lex.extras.indent_stack.last().unwrap_or(&0);
 
     lex.extras.pending.push_back((TokenType::Newline, current_line, s.start, s.end));
 
     match level.cmp(&current) {
 
         Ordering::Greater => {
-            if lex.extras.indent_stack.len() >= MAX_INDENT_DEPTH { lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end)); return logos::Skip; }
+            if lex.extras.indent_stack.len() >= MAX_INDENT_DEPTH {
+                lex.extras.pending.push_back((TokenType::Endmarker, current_line, s.start, s.end));
+                return logos::Skip;
+            }
             lex.extras.indent_stack.push(level);
             lex.extras.pending.push_back((TokenType::Indent, lex.extras.line, line, line));
-        },
-
+        }
+
         Ordering::Less => while lex.extras.indent_stack.last().is_some_and(|&t| t > level) {
             lex.extras.indent_stack.pop();
             lex.extras.pending.push_back((TokenType::Dedent, lex.extras.line, line, line));
-        },
-
+        }
+
         Ordering::Equal => {}
-
+
     }
 
     logos::Skip
 
 }
 
 fn lex_fstring_body(lex: &mut Lexer<TokenType>, quote: u8, triple: bool, body_start: usize) {
+
     /*
-    Scans f-string bytes, suspending at `{` to let the main lexer resume.
+    Scans f-string bytes, emitting text segments and suspending at `{` for expression lexing.
     */
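+    //
+    // e.g. f"a{b}c" yields FstringStart, FstringMiddle("a"), Lbrace, Name,
+    // Rbrace, FstringMiddle("c"), FstringEnd.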
+
     let bytes = lex.remainder().as_bytes();
+
     let mut pos = 0usize;
 
     while pos < bytes.len() {
+
         let closes = if triple {
             bytes.get(pos..pos + 3) == Some(&[quote, quote, quote])
         } else {
             bytes[pos] == quote
         };
 
         if closes {
+
             if pos > 0 {
                 lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
             }
+
             let quote_len = if triple { 3 } else { 1 };
+
             lex.bump(pos + quote_len);
             lex.extras.pending.push_back((TokenType::FstringEnd, lex.extras.line, body_start + pos, body_start + pos + quote_len));
+
             return;
+
         }
 
         match bytes[pos] {
-            b'\\' => pos += 2,
+
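+            // Escapes skip two bytes; min() clamps a trailing lone backslash
+            // to the buffer end instead of reading past it.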
+            b'\\' => pos = (pos + 2).min(bytes.len()),
+
             b'{' if bytes.get(pos + 1) != Some(&b'{') => {
-                // Emit the preceding text
+
                 if pos > 0 {
                     lex.extras.pending.push_back((TokenType::FstringMiddle, lex.extras.line, body_start, body_start + pos));
                 }
@@ -147,37 +180,49 @@ fn lex_fstring_body(lex: &mut Lexer<TokenType>, quote: u8, triple: bool, body_st
                     lex.bump(pos + 1);
                     return;
                 }
-                // Emit Lbrace
+
                 lex.extras.pending.push_back((TokenType::Lbrace, lex.extras.line, body_start + pos, body_start + pos + 1));
-                // Save the f-string state to resume after the '}'
                 lex.extras.fstring_stack.push((quote, triple, body_start + pos + 1));
-                // Advance the lexer past the '{'; logos takes back control
                 lex.bump(pos + 1);
-                return; // STOP here: logos tokenizes the expression
+
+                return;
+
             }
+
             _ => pos += 1,
+
         }
+
     }
+
 }
 
 fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
 
-    /*
-    Detects f-string prefixes within identifier matches and delegates to lex_fstring_body.
+    /*
+    Reclassifies `f`/`fr`/`rf` identifiers as f-string starts when followed by a quote.
     */
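+    //
+    // e.g. `f"x"` becomes an f-string start here, while `f = 1` and
+    // `format(x)` keep their identifiers as ordinary Name tokens.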
 
     let s = lex.span();
+    let slice = lex.slice().as_bytes();
 
-    if !matches!(lex.slice().to_ascii_lowercase().as_str(), "f" | "fr" | "rf") { return Some(()); }
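+    // The byte-wise test replaces to_ascii_lowercase(), avoiding a String
+    // allocation on every identifier that passes through.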
+    let is_fprefix = match slice.len() {
+        1 => matches!(slice[0], b'f' | b'F'),
+        2 => matches!((slice[0], slice[1]), (b'f' | b'F', b'r' | b'R') | (b'r' | b'R', b'f' | b'F')),
+        _ => return Some(()),
+    };
+
+    if !is_fprefix { return Some(()); }
 
     let Some(&q) = lex.remainder().as_bytes().first() else { return Some(()); };
+
     if !matches!(q, b'"' | b'\'') { return Some(()); }
 
     let triple = lex.remainder().as_bytes().get(1) == Some(&q);
-
     let quote_len = if triple { 3 } else { 1 };
 
     lex.bump(quote_len);
+
     let body_start = s.end + quote_len;
     lex.extras.pending.push_back((TokenType::FstringStart, lex.extras.line, s.start, body_start));
 
@@ -188,18 +233,22 @@ fn lex_name_or_fstring(lex: &mut Lexer<TokenType>) -> Option<()> {
 }
 
 fn close_fstring_expr(lex: &mut Lexer<TokenType>) -> logos::Skip {
-    /* Saves correct Rbrace span before resuming f-string body scan. */
-    let span = lex.span(); // correct span BEFORE any bump
+
+    /*
+    Closes an f-string expression on `}`, emits Rbrace, and resumes f-string body scanning.
+    */
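+    //
+    // A `}` with no saved f-string context is an ordinary Rbrace closing a
+    // dict or set display, so only the bracket depth is adjusted.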
+
+    let span = lex.span();
+
     if let Some((quote, triple, _)) = lex.extras.fstring_stack.pop() {
-        // Push Rbrace with the correct span onto pending
         lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
-        // Resume the f-string; it may bump now that the span is saved
         lex_fstring_body(lex, quote, triple, span.end);
     } else {
         lex.extras.nesting = lex.extras.nesting.saturating_sub(1);
         lex.extras.pending.push_back((TokenType::Rbrace, lex.extras.line, span.start, span.end));
     }
-    logos::Skip // logos emits nothing here; pending handles it
+
+    logos::Skip
 }
 
 #[derive(Logos, Debug, PartialEq, Clone)]
@@ -362,49 +411,58 @@ pub enum TokenType {
 }
 
 pub fn lexer(source: &str) -> impl Iterator<Item = Token> + '_ {
-
+
     /*
-    Tokenizes Python source into a parser-ready stream, resolving indentation, soft keywords, and f-string expression boundaries.
+    Produces a parser-ready token stream with indentation, soft keywords, and f-string boundaries resolved.
     */
+
+    let source_len = source.len();
 
-    let source_len = source.len(); // actual length of the source
     let mut lex = TokenType::lexer(source);
     let mut done = false;
 
+    if source_len > MAX_SOURCE_SIZE {
+        lex.extras.pending.push_back((TokenType::Endmarker, 0, source_len, source_len));
+    }
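+    // Oversized input pre-queues an Endmarker, so the stream below yields it
+    // first and a parser stops before seeing any real tokens.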
+
     let mut stream = std::iter::from_fn(move || {
 
         if let Some(tok) = lex.extras.pending.pop_front() { return Some(tok); }
-
-        let result = match lex.next() {
-            Some(Ok(tok)) => { let s = lex.span(); Some((tok, lex.extras.line, s.start, s.end)) },
-            Some(Err(_)) => lex.extras.pending.is_empty().then_some((TokenType::Endmarker, lex.extras.line, source_len, source_len)),
-            None if !done => { done = true; Some((TokenType::Endmarker, lex.extras.line, source_len, source_len)) }
-            _ => None,
+
+        let tok = match lex.next() {
+            Some(Ok(tok)) => { let s = lex.span(); (tok, lex.extras.line, s.start, s.end) }
+            Some(Err(_)) if lex.extras.pending.is_empty() => (TokenType::Endmarker, lex.extras.line, source_len, source_len),
+            Some(Err(_)) => return lex.extras.pending.pop_front(),
+            None if !done => { done = true; (TokenType::Endmarker, lex.extras.line, source_len, source_len) }
+            None => return None,
         };
 
-        if let Some(t) = result { lex.extras.pending.push_back(t); }
-
-        lex.extras.pending.pop_front()
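+        // Callbacks may have queued tokens during lex.next(); drain those
+        // FIFO ahead of the token that triggered them.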
+        if lex.extras.pending.is_empty() {
+            Some(tok)
+        } else {
+            lex.extras.pending.push_back(tok);
+            lex.extras.pending.pop_front()
+        }
 
     }).peekable();
 
     let mut ended = false;
 
     std::iter::from_fn(move || {
-
-        if ended { return None; }
-
+
         let (tok, line, start, end) = stream.next()?;
-
+
+        if ended { return None; }
         if tok == TokenType::Endmarker { ended = true; }
 
-        let as_name = matches!(tok, TokenType::Match | TokenType::Case | TokenType::Type) && matches!(stream.peek(), Some((
+        let is_soft_keyword = matches!(tok, TokenType::Match | TokenType::Case | TokenType::Type);
+        let next_demotes = matches!(stream.peek(), Some((
             TokenType::Lpar | TokenType::Colon | TokenType::Equal |
-            TokenType::Comma | TokenType::Rpar | TokenType::Rsqb |
+            TokenType::Comma | TokenType::Rpar | TokenType::Rsqb |
             TokenType::Newline, _, _, _
         )) | None);
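+        // e.g. `match = 1` peeks Equal and demotes `match` to a plain Name,
+        // while `match x:` peeks a Name and keeps the soft keyword.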
 
-        let kind = if as_name { TokenType::Name } else { tok };
+        let kind = if is_soft_keyword && next_demotes { TokenType::Name } else { tok };
 
         Some(Token { kind, line, start, end })
 