@@ -227,6 +227,14 @@ export function parseBisonDocument(text: string): BisonDocument {
227227 // Skip empty lines and comments
228228 if ( ! trimmed || trimmed . startsWith ( '//' ) || trimmed . startsWith ( '/*' ) ) continue ;
229229
230+ // %token directive inside the rules section (Bison allows declaring tokens after %%).
231+ // Must be handled BEFORE rule-body processing to avoid contaminating rule symbols.
232+ if ( trimmed . startsWith ( '%token' ) && braceDepth === 0 ) {
233+ const tm = trimmed . match ( / ^ % t o k e n (?: \s + < ( [ ^ > ] + ) > ) ? \s + ( .+ ) / ) ;
234+ if ( tm ) parseTokenNames ( tm [ 2 ] , tm [ 1 ] , i , doc ) ;
235+ continue ;
236+ }
237+
230238 // Inside a multi-line action block: scan for $n refs and track brace depth.
231239 if ( braceDepth > 0 ) {
232240 if ( currentRule ) {
@@ -325,34 +333,52 @@ export function parseBisonDocument(text: string): BisonDocument {
325333 if ( ch === '{' ) braceDepth ++ ;
326334 if ( ch === '}' ) braceDepth = Math . max ( 0 , braceDepth - 1 ) ;
327335 }
328-
329- // %token directive in rules section (e.g., %token CHUNKS "_chunks")
330- const inlineTokenMatch = trimmed . match ( / ^ % t o k e n \s + ( [ A - Z _ ] [ A - Z 0 - 9 _ ] * ) \s * ( " .* " ) ? / ) ;
331- if ( inlineTokenMatch ) {
332- const name = inlineTokenMatch [ 1 ] ;
333- const alias = inlineTokenMatch [ 2 ] ?. replace ( / " / g, '' ) ;
334- const col = line . indexOf ( name ) ;
335- doc . tokens . set ( name , {
336- name,
337- alias,
338- location : Range . create ( i , col , i , col + name . length ) ,
339- } ) ;
340- }
341336 }
342337
343338 return doc ;
344339}
345340
/**
 * Encode the content of a Bison string literal (the text between the quotes)
 * into a unique, identifier-safe placeholder.
 *
 * Distinct contents always yield distinct placeholders, so `"+"` and `"{"`
 * can be told apart wherever placeholders are compared.
 *
 * Encoding, applied per Unicode code point:
 *   - code points <= 0xFF  → exactly two lowercase hex digits
 *   - code points >  0xFF  → the letter `u` followed by exactly six hex digits
 *
 * `u` is not a hex digit, so the two forms can never be confused and the
 * encoding is unambiguous (e.g. "\u1234" and "\x12\x34" no longer collide,
 * and two emoji sharing a high surrogate no longer collide).
 *
 * e.g.  "+"        → __s2b__
 *       "("        → __s28__
 *       "{"        → __s7b__
 *       "function" → __s66756e6374696f6e__
 *       "→"        → __su002192__
 *
 * Every placeholder starts with the lowercase prefix `__s` and consists only
 * of `[a-z0-9_]`, so it is a valid identifier that is never all-caps.
 *
 * @param content Raw text found between the double quotes (may be empty).
 * @returns The placeholder; the empty string yields `__s__`.
 */
function strLiteralPlaceholder(content: string): string {
  let hex = '';
  // for..of walks code points, so a surrogate pair is encoded as ONE unit.
  for (const ch of content) {
    const codePoint = ch.codePointAt(0) ?? 0;
    hex +=
      codePoint <= 0xff
        ? codePoint.toString(16).padStart(2, '0')
        : `u${codePoint.toString(16).padStart(6, '0')}`;
  }
  return `__s${hex}__`;
}
361+
/**
 * Swap every double-quoted literal in `text` (backslash escapes honoured)
 * for the placeholder produced by strLiteralPlaceholder from the literal's
 * inner content. Each placeholder is inserted with one space on either side.
 */
function replaceStringLiterals(text: string): string {
  // Group 1 captures the content between the quotes, escapes included.
  const quotedLiteral = /"((?:[^"\\]|\\.)*)"/g;
  const toPlaceholder = (_whole: string, inner: string): string =>
    ` ${strLiteralPlaceholder(inner)} `;
  return text.replace(quotedLiteral, toPlaceholder);
}
366+
346367/**
347368 * Extract all grammar symbols (identifiers) from a production RHS in order.
369+ *
370+ * String literals ("+" , "{", "function", …) ARE counted as symbols because
371+ * Bison treats them exactly like tokens in the $N position numbering.
372+ * They are replaced with unique hex-encoded placeholders so that the
373+ * second-symbol disambiguation in the shift/reduce heuristic can tell
374+ * `"("` apart from `"{"` (both have different placeholders).
348375 */
349376function extractSymbols ( text : string ) : string [ ] {
350- const cleaned = text
351- . replace ( / " (?: [ ^ " \\ ] | \\ .) * " / g, ' ' ) // remove strings
352- . replace ( / \{ [ ^ } ] * \} / g, ' ' ) // remove inline actions
353- . replace ( / % p r e c \s + \S + / g, ' ' ) // remove %prec TOKEN
354- . replace ( / % e m p t y / g, ' ' ) // remove %empty
355- . replace ( / \/ \/ .* $ / g, ' ' ) // remove line comments
377+ const cleaned = replaceStringLiterals ( text )
378+ . replace ( / \{ [ ^ } ] * \} / g, ' ' ) // remove inline actions
379+ . replace ( / % p r e c \s + \S + / g, ' ' ) // remove %prec TOKEN
380+ . replace ( / % e m p t y / g, ' ' ) // remove %empty
381+ . replace ( / \/ \/ .* $ / g, ' ' ) // remove line comments
356382 . trim ( ) ;
357383 const symbols : string [ ] = [ ] ;
358384 const regex = / \b ( [ a - z A - Z _ ] [ a - z A - Z 0 - 9 _ . ] * ) \b / g;
@@ -366,14 +392,17 @@ function extractSymbols(text: string): string[] {
366392/**
367393 * Extract the first terminal or non-terminal symbol from a production RHS.
368394 * Returns undefined for empty productions (%empty) or pure action blocks.
395+ *
396+ * String literals are replaced with unique hex-encoded placeholders so that
397+ * an alternative starting with "function" has a firstSymbol starting with
398+ * `__s` (not all-caps) and is therefore not confused with a real terminal.
369399 */
370400function getFirstSymbol ( text : string ) : string | undefined {
371- const cleaned = text
372- . replace ( / " (?: [ ^ " \\ ] | \\ .) * " / g, ' ' ) // remove strings
373- . replace ( / \{ [ ^ } ] * \} / g, ' ' ) // remove inline actions
374- . replace ( / % p r e c \s + \S + / g, ' ' ) // remove %prec TOKEN
375- . replace ( / % e m p t y / g, ' ' ) // remove %empty
376- . replace ( / \/ \/ .* $ / g, ' ' ) // remove line comments
401+ const cleaned = replaceStringLiterals ( text )
402+ . replace ( / \{ [ ^ } ] * \} / g, ' ' ) // remove inline actions
403+ . replace ( / % p r e c \s + \S + / g, ' ' ) // remove %prec TOKEN
404+ . replace ( / % e m p t y / g, ' ' ) // remove %empty
405+ . replace ( / \/ \/ .* $ / g, ' ' ) // remove line comments
377406 . trim ( ) ;
378407 const m = cleaned . match ( / ^ ( [ a - z A - Z _ ] [ a - z A - Z 0 - 9 _ . ] * ) / ) ;
379408 return m ? m [ 1 ] : undefined ;
@@ -427,6 +456,38 @@ function extractDollarRefs(text: string, lineNum: number, fullLine: string): Dol
427456}
428457
429458function extractRuleReferences ( text : string , lineNum : number , fullLine : string , doc : BisonDocument ) : void {
459+ // Track string literals used as token aliases in rule bodies (e.g. "+" instead of PLUS,
460+ // "{" instead of LBRACE). We use a char-by-char scanner so that:
461+ // • `"{"` at brace-depth 0 → alias `{` (rule body)
462+ // • `"{"` inside `{ std::string s = "{"; }` → ignored (brace-depth > 0, action block)
463+ {
464+ let braceDepth = 0 ;
465+ let inString = false ;
466+ let strStart = - 1 ;
467+ for ( let ci = 0 ; ci < text . length ; ci ++ ) {
468+ const ch = text [ ci ] ;
469+ if ( inString ) {
470+ if ( ch === '\\' ) { ci ++ ; continue ; } // escape: skip next char
471+ if ( ch === '"' ) {
472+ const alias = text . substring ( strStart , ci ) ; // content between quotes
473+ if ( alias ) {
474+ const rawStr = '"' + alias + '"' ;
475+ const col = fullLine . indexOf ( rawStr ) ;
476+ if ( ! doc . ruleReferences . has ( alias ) ) doc . ruleReferences . set ( alias , [ ] ) ;
477+ doc . ruleReferences . get ( alias ) ! . push (
478+ Range . create ( lineNum , col >= 0 ? col : 0 , lineNum , ( col >= 0 ? col : 0 ) + rawStr . length ) ,
479+ ) ;
480+ }
481+ inString = false ;
482+ }
483+ } else {
484+ if ( ch === '{' ) { braceDepth ++ ; }
485+ else if ( ch === '}' ) { braceDepth = Math . max ( 0 , braceDepth - 1 ) ; }
486+ else if ( ch === '"' && braceDepth === 0 ) { inString = true ; strStart = ci + 1 ; }
487+ }
488+ }
489+ }
490+
430491 // Find identifiers in rule bodies (potential token/nonterminal references)
431492 // Skip: strings, actions (braces), %prec keyword (but keep its token), %empty, comments
432493 const cleaned = text
0 commit comments