Skip to content

Commit 2cd1c07

Browse files
authored
Merge pull request #36 from theodevelop/audit/bison-review
audit: bison review & fixes
2 parents d71cc43 + 526b962 commit 2cd1c07

4 files changed

Lines changed: 102 additions & 14 deletions

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,7 @@ dist/
33
client/out/
44
server/out/
55
*.vsix
6-
.env
6+
.env
7+
tests/_*
8+
docs/
9+
.vscode/

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ All notable changes to the **Bison/Flex Language Support** extension will be doc
1313
- **Flex — lowercase start condition names** (audit-C): all start-condition regex patterns used `[A-Z_][A-Z0-9_]*` (uppercase only). SC names that are valid C identifiers but lowercase (e.g. `%x comment`) were silently ignored, skipping `flex/undefined-sc` and `flex/unused-sc` diagnostics for them entirely.
1414
- **Flex — single-tab action separator in abbreviation ref scan** (audit-D): the heuristic that separates the pattern from the action used `\s{2,}`, which did not match a single-tab separator. `{identifier}` tokens inside the C action body (e.g. compound literals) were falsely counted as abbreviation references, suppressing `flex/unused-abbrev`.
1515
- **Cleanup**: removed two dead entries in the catch-all pattern set that contained a literal newline character and could never match a rule line.
16+
- **Bison — lowercase/mixed-case tokens in precedence declarations** (audit-E): `%left`/`%right`/`%nonassoc` used an uppercase-only regex `[A-Z_][A-Z0-9_]*`, silently dropping tokens like `kPLUS` or `tTOKEN` from the precedence table. This caused false `bison/undeclared-token` warnings and incorrect shift/reduce heuristic results for such tokens.
17+
- **Bison — `$N` references after nested sub-blocks in inline actions** (audit-F): the `extractDollarRefs` scanner used `/\{([^}]*)\}/` which stops at the first `}`, missing `$N` references that appear after a nested `{ … }` block inside the same action (e.g. `{ if (cond) { log(); } $$ = $5; }`). Replaced with a brace-depth scanner; the same fix was applied to `extractSymbols`, `getFirstSymbol`, and `extractRuleReferences` for consistency.
1618

1719
---
1820

server/src/parser/bisonParser.ts

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ export function parseBisonDocument(text: string): BisonDocument {
166166
const precMatch = trimmed.match(/^%(left|right|nonassoc|precedence)\s+(.*)/);
167167
if (precMatch) {
168168
const kind = precMatch[1] as PrecedenceDeclaration['kind'];
169-
const rawSymbols = precMatch[2].match(/[A-Z_][A-Z0-9_]*|"[^"]*"/g) || [];
169+
const rawSymbols = precMatch[2].match(/[A-Za-z_][A-Za-z0-9_]*|"[^"]*"/g) || [];
170170
const symbols: string[] = [];
171171
const symbolRanges: Range[] = [];
172172
for (const raw of rawSymbols) {
@@ -389,6 +389,33 @@ function replaceStringLiterals(text: string): string {
389389
.replace(/'((?:[^'\\]|\\.)*)'/g, (_, content) => ` ${strLiteralPlaceholder(`'${content}'`)} `);
390390
}
391391

392+
/**
393+
* Remove all brace-balanced { ... } blocks from `text`, replacing each with `replacement`.
394+
* Handles arbitrarily nested braces, unlike /\{[^}]*\}/ which stops at the first `}`.
395+
* Unmatched `{` without a closing `}` (e.g. a multi-line action opener on its own line)
396+
* are left out of the result — the Phase 3 brace tracker handles them separately.
397+
*/
398+
function removeBalancedBraces(text: string, replacement: string = ' '): string {
399+
let result = '';
400+
let depth = 0;
401+
let pendingOpen = false; // true while inside a block that hasn't been closed yet
402+
for (let i = 0; i < text.length; i++) {
403+
if (text[i] === '{') {
404+
if (depth === 0) pendingOpen = true;
405+
depth++;
406+
} else if (text[i] === '}') {
407+
depth = Math.max(0, depth - 1);
408+
if (depth === 0 && pendingOpen) {
409+
result += replacement; // only emit placeholder when the block is fully closed
410+
pendingOpen = false;
411+
}
412+
} else if (depth === 0) {
413+
result += text[i];
414+
}
415+
}
416+
return result;
417+
}
418+
392419
/**
393420
* Extract all grammar symbols (identifiers) from a production RHS in order.
394421
*
@@ -399,8 +426,7 @@ function replaceStringLiterals(text: string): string {
399426
* `"("` apart from `"{"` (both have different placeholders).
400427
*/
401428
function extractSymbols(text: string): string[] {
402-
const cleaned = replaceStringLiterals(text)
403-
.replace(/\{[^}]*\}/g, ' __midaction__ ') // inline actions count as a symbol ($N position)
429+
const cleaned = removeBalancedBraces(replaceStringLiterals(text), ' __midaction__ ') // inline actions count as a symbol ($N position)
404430
.replace(/%prec\s+\S+/g, ' ') // remove %prec TOKEN
405431
.replace(/%empty/g, ' ') // remove %empty
406432
.replace(/\/\/.*$/g, ' ') // remove line comments
@@ -423,8 +449,7 @@ function extractSymbols(text: string): string[] {
423449
* `__s` (not all-caps) and is therefore not confused with a real terminal.
424450
*/
425451
function getFirstSymbol(text: string): string | undefined {
426-
const cleaned = replaceStringLiterals(text)
427-
.replace(/\{[^}]*\}/g, ' ') // remove inline actions
452+
const cleaned = removeBalancedBraces(replaceStringLiterals(text)) // remove inline actions
428453
.replace(/%prec\s+\S+/g, ' ') // remove %prec TOKEN
429454
.replace(/%empty/g, ' ') // remove %empty
430455
.replace(/\/\/.*$/g, ' ') // remove line comments
@@ -518,15 +543,27 @@ function parseTokenNames(text: string, type: string | undefined, lineNum: number
518543

519544
/**
520545
* Scan the inline action block(s) on a single line for $n references.
521-
* Only handles single-line { ... } blocks; multi-line actions are not detected here.
546+
* Uses a brace-depth scanner so that $n references appearing after a nested
547+
* sub-block (e.g. `{ if (x) { foo(); } $$ = $1; }`) are not missed.
548+
* Only handles single-line { ... } blocks; multi-line actions are tracked by
549+
* the caller (Phase 3 loop in parseBisonDocument).
522550
* $$ and $<type>n are deliberately skipped.
523551
*/
524552
function extractDollarRefs(text: string, lineNum: number, fullLine: string): DollarRef[] {
525553
const refs: DollarRef[] = [];
526-
const actionRegex = /\{([^}]*)\}/g;
527-
let actionMatch: RegExpExecArray | null;
528-
while ((actionMatch = actionRegex.exec(text)) !== null) {
529-
const actionContent = actionMatch[1];
554+
let i = 0;
555+
while (i < text.length) {
556+
if (text[i] !== '{') { i++; continue; }
557+
// Found the opening brace of an action block — scan to the matching '}'
558+
let depth = 1;
559+
let j = i + 1;
560+
while (j < text.length && depth > 0) {
561+
if (text[j] === '{') depth++;
562+
else if (text[j] === '}') depth--;
563+
j++;
564+
}
565+
// text[i+1 .. j-2] is the full content of this balanced action block
566+
const actionContent = text.substring(i + 1, j - 1);
530567
const dollarRegex = /\$(\d+)/g;
531568
let m: RegExpExecArray | null;
532569
while ((m = dollarRegex.exec(actionContent)) !== null) {
@@ -540,6 +577,7 @@ function extractDollarRefs(text: string, lineNum: number, fullLine: string): Dol
540577
range: Range.create(lineNum, col >= 0 ? col : 0, lineNum, (col >= 0 ? col : 0) + fullMatch.length),
541578
});
542579
}
580+
i = j; // advance past the entire balanced block
543581
}
544582
return refs;
545583
}
@@ -579,9 +617,7 @@ function extractRuleReferences(text: string, lineNum: number, fullLine: string,
579617

580618
// Find identifiers in rule bodies (potential token/nonterminal references)
581619
// Skip: strings, actions (braces), %prec keyword (but keep its token), %empty, comments
582-
const cleaned = text
583-
.replace(/"(?:[^"\\]|\\.)*"/g, '') // remove strings
584-
.replace(/\{[^}]*\}/g, '') // remove inline actions
620+
const cleaned = removeBalancedBraces(text.replace(/"(?:[^"\\]|\\.)*"/g, '')) // remove strings, then inline actions
585621
.replace(/%prec/g, '') // remove %prec keyword (keep the token name)
586622
.replace(/%empty/g, '') // remove %empty
587623
.replace(/\/\/.*$/g, ''); // remove line comments

tests/test-diagnostic-codes.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,53 @@ console.log('\n=== TEST: Flex audit checks ===');
505505
assert(unused.length === 0, 'Bug-D: abbreviation used in pattern before single-tab action must not produce flex/unused-abbrev');
506506
}
507507

508+
// ─────────────────────────────────────────────────────────────────────────────
509+
// Bison audit checks
510+
// ─────────────────────────────────────────────────────────────────────────────
511+
console.log('\n=== TEST: Bison audit checks ===');
512+
513+
{
514+
// audit-E: lowercase/mixed-case tokens in %left/%right/%nonassoc must be recorded
515+
// in doc.precedence so they don't produce false bison/undeclared-token diagnostics.
516+
const src = [
517+
'%token kPLUS kMINUS kNUM',
518+
'%left kPLUS kMINUS',
519+
'%%',
520+
'start : expr ;',
521+
'expr : expr kPLUS expr { $$ = $1 + $3; }',
522+
' | expr kMINUS expr { $$ = $1 - $3; }',
523+
' | kNUM',
524+
' ;',
525+
'%%',
526+
].join('\n');
527+
const doc = parseBisonDocument(src);
528+
assert(
529+
doc.precedence.length === 1 && doc.precedence[0].symbols.includes('kPLUS') && doc.precedence[0].symbols.includes('kMINUS'),
530+
'audit-E: lowercase tokens kPLUS/kMINUS recorded in doc.precedence',
531+
);
532+
const diags = computeBisonDiagnostics(doc, src);
533+
const undeclared = diags.filter(d => d.code === 'bison/undeclared-token');
534+
assert(undeclared.length === 0, 'audit-E: no false bison/undeclared-token for lowercase precedence tokens (got ' + undeclared.length + ')');
535+
}
536+
537+
{
538+
// audit-F: $N after a nested sub-block inside an action must be detected.
539+
// Pattern: A { if (cond) { skip(); } $5; } — 1 symbol, $5 is out-of-bounds.
540+
// Old /\{[^}]*\}/ missed $5 because it appears after the inner `}`.
541+
const src = [
542+
'%token A',
543+
'%%',
544+
'start : stmt ;',
545+
'stmt : A { if (1) { int x = 0; } $$ = $5; }',
546+
' ;',
547+
'%%',
548+
].join('\n');
549+
const doc = parseBisonDocument(src);
550+
const diags = computeBisonDiagnostics(doc, src);
551+
const oob = diags.filter(d => d.code === 'bison/out-of-bounds' && d.message.includes('$5'));
552+
assert(oob.length >= 1, 'audit-F: $5 after nested sub-block in action is detected as out-of-bounds');
553+
}
554+
508555
// ─────────────────────────────────────────────────────────────────────────────
509556
// Results
510557
// ─────────────────────────────────────────────────────────────────────────────

0 commit comments

Comments
 (0)