fix(web): treat parens and | as word chars when regex mode is enabled

brendan-kellam · claude · brendan-kellam · commit 6db2c5f9754e · 2026-03-25T23:22:43.000-07:00
Adds a Lezer 'regex' dialect to the query language grammar. In regex
mode, the tokenizers no longer emit openParen/closeParen tokens, so
parentheses and pipe characters are consumed as plain word characters
rather than query grouping operators. This fixes a bug where a query
like (test|render)< was incorrectly parsed as AND(ParenExpr, Term)
instead of a single regexp Term.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/packages/queryLanguage/src/parser.terms.ts b/packages/queryLanguage/src/parser.terms.ts
@@ -23,4 +23,5 @@ export const
   RepoSetExpr = 16,
   ParenExpr = 17,
   QuotedTerm = 18,
-  Term = 19
+  Term = 19,
+  Dialect_regex = 0
diff --git a/packages/queryLanguage/src/parser.ts b/packages/queryLanguage/src/parser.ts
@@ -13,6 +13,7 @@ export const parser = LRParser.deserialize({
   tokenData: "/U~R_XY!QYZ!Qpq!Qrs!`#T#U$S#V#W%i#Y#Z'R#`#a(_#b#c(|#c#d)X#d#e)p#f#g+]#g#h,w#j#k-`#m#n.s~!VRm~XY!QYZ!Qpq!Q~!cWOY!`Zr!`rs!{s#O!`#O#P#Q#P;'S!`;'S;=`#|<%lO!`~#QOw~~#TRO;'S!`;'S;=`#^;=`O!`~#aXOY!`Zr!`rs!{s#O!`#O#P#Q#P;'S!`;'S;=`#|;=`<%l!`<%lO!`~$PP;=`<%l!`~$VQ#b#c$]#f#g$h~$`P#m#n$c~$hO!R~~$kP#V#W$n~$qP#[#]$t~$wP#]#^$z~$}P#j#k%Q~%TP#X#Y%W~%ZP#W#X%^~%aP![!]%d~%iOq~~%lQ![!]%r#c#d%w~%wOx~~%zP#b#c%}~&QP#h#i&T~&WP#X#Y&Z~&^Q#b#c&d#l#m&p~&gP#h#i&j~&mP![!]%r~&sP#h#i&v~&yP![!]&|~'ROy~~'UR![!]'_#]#^'d#c#d'v~'dOz~~'gP#`#a'j~'mP#X#Y'p~'sP![!]'_~'yP#f#g'|~(PP#_#`(S~(VP![!](Y~(_O{~~(bP#T#U(e~(hP#b#c(k~(nP#Z#[(q~(tP![!](w~(|O!T~~)PP#c#d)S~)XOs~~)[P#b#c)_~)bP#`#a)e~)hP#m#n)k~)pOt~~)sQ#f#g)y#i#j*n~)|P#]#^*P~*SP#j#k*V~*YP#T#U*]~*`P#h#i*c~*fP#X#Y*i~*nO!Q~~*qP#U#V*t~*wP#`#a*z~*}P#]#^+Q~+TP#V#W+W~+]O!P~~+`Q![!]+f#X#Y+k~+kO!S~~+nQ#d#e+t#j#k,l~+wP#c#d+z~+}Q![!]+f#g#h,T~,WP#X#Y,Z~,^P#h#i,a~,dP![!],g~,lO!V~~,oP![!],r~,wOu~~,zP#m#n,}~-QP#a#b-T~-WP![!]-Z~-`O!U~~-cP#]#^-f~-iP#g#h-l~-oP#]#^-r~-uP#U#V-x~-{P#]#^.O~.RP#`#a.U~.XP#]#^.[~._P#h#i.b~.eP#m#n.h~.kP![!].n~.sO}~~.vP#X#Y.y~.|P#g#h/P~/UOr~",
   tokenizers: [negateToken, parenToken, wordToken, closeParenToken, orToken, 0],
   topRules: {"Program":[0,1]},
+  dialects: {regex: 0},
   tokenPrec: 200,
   termNames: {"0":"⚠","1":"@top","2":"OrExpr","3":"AndExpr","4":"NegateExpr","5":"PrefixExpr","6":"ArchivedExpr","7":"RevisionExpr","8":"ContentExpr","9":"ContextExpr","10":"FileExpr","11":"ForkExpr","12":"VisibilityExpr","13":"RepoExpr","14":"LangExpr","15":"SymExpr","16":"RepoSetExpr","17":"ParenExpr","18":"QuotedTerm","19":"Term","20":"expr+","21":"(or andExpr)+","22":"␄","23":"negate","24":"openParen","25":"word","26":"closeParen","27":"or","28":"%mainskip","29":"space","30":"query","31":"andExpr","32":"expr","33":"archivedKw","34":"\"yes\"","35":"\"no\"","36":"\"only\"","37":"revisionKw","38":"value","39":"quotedString","40":"contentKw","41":"contextKw","42":"fileKw","43":"forkKw","44":"forkValue","45":"visibilityKw","46":"visibilityValue","47":"\"public\"","48":"\"private\"","49":"\"any\"","50":"repoKw","51":"langKw","52":"symKw","53":"reposetKw"}
 })
diff --git a/packages/queryLanguage/src/query.grammar b/packages/queryLanguage/src/query.grammar
@@ -4,6 +4,8 @@
 @external tokens closeParenToken from "./tokens" { closeParen }
 @external tokens orToken from "./tokens" { or }
 
+@dialects { regex }
+
 @top Program { query }
 
 @precedence {
diff --git a/packages/queryLanguage/src/tokens.ts b/packages/queryLanguage/src/tokens.ts
@@ -1,5 +1,5 @@
 import { ExternalTokenizer, InputStream, Stack } from "@lezer/lr";
-import { negate, openParen, closeParen, word, or, ParenExpr } from "./parser.terms";
+import { negate, openParen, closeParen, word, or, Dialect_regex } from "./parser.terms";
 
 // Character codes
 const SPACE = 32;
@@ -243,9 +243,14 @@ function isInsideParenExpr(input: InputStream, stack: Stack): boolean {
  * This allows words like "(pr" or "func(arg)" to be parsed as single terms
  * while "(foo bar)" is parsed as a ParenExpr.
  */
-export const parenToken = new ExternalTokenizer((input) => {
+export const parenToken = new ExternalTokenizer((input, stack) => {
     if (input.next !== OPEN_PAREN) return;
-    
+
+    // In regex mode, parens are just word characters — don't emit openParen
+    if (stack.dialectEnabled(Dialect_regex)) {
+        return;
+    }
+
     if (hasBalancedParensAt(input, 0)) {
         // Found balanced parens - emit openParen (just the '(')
         input.advance();
@@ -263,6 +268,11 @@ export const parenToken = new ExternalTokenizer((input) => {
 export const closeParenToken = new ExternalTokenizer((input, stack) => {
     if (input.next !== CLOSE_PAREN) return;
 
+    // In regex mode, parens are just word characters — don't emit closeParen
+    if (stack.dialectEnabled(Dialect_regex)) {
+        return;
+    }
+
     // Check if we should emit closeParen (when inside a ParenExpr)
     if (isInsideParenExpr(input, stack)) {
         input.advance();
@@ -312,7 +322,20 @@ export const wordToken = new ExternalTokenizer((input, stack) => {
     if (startsWithPrefix(input)) {
         return;
     }
-    
+
+    // In regex mode: consume all non-whitespace characters as a single word.
+    // Parens and | are valid regex metacharacters, not query syntax in this mode.
+    if (stack.dialectEnabled(Dialect_regex)) {
+        const startPos = input.pos;
+        while (input.next !== EOF && !isWhitespace(input.next)) {
+            input.advance();
+        }
+        if (input.pos > startPos) {
+            input.acceptToken(word);
+        }
+        return;
+    }
+
     // If starts with '(' and has balanced parens, determine whether this is a
     // regex alternation value (e.g. file:(test|spec)) or a ParenExpr grouping.
     // We're in a value context when the immediately preceding non-whitespace char
@@ -419,24 +442,28 @@ export const orToken = new ExternalTokenizer((input) => {
  * External tokenizer for negation.
  * Only tokenizes `-` as negate when followed by a prefix keyword or balanced `(`.
  */
-export const negateToken = new ExternalTokenizer((input) => {
+export const negateToken = new ExternalTokenizer((input, stack) => {
     if (input.next !== DASH) return;
-    
+
     // Look ahead using peek to see what follows the dash (skipping whitespace)
     let offset = 1;
     while (isWhitespace(input.peek(offset))) {
         offset++;
     }
-    
+
     const chAfterDash = input.peek(offset);
-    
-    // Check if followed by opening paren that starts a balanced ParenExpr
-    if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
-        input.advance();
-        input.acceptToken(negate);
-        return;
+
+    // In normal mode: also check for balanced paren (negated group e.g. -(foo bar))
+    // In regex mode: skip this — parens are not query grouping operators, so emitting
+    // negate before a '(' would leave the parser without a matching ParenExpr to parse.
+    if (!stack.dialectEnabled(Dialect_regex)) {
+        if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
+            input.advance();
+            input.acceptToken(negate);
+            return;
+        }
     }
-    
+
     // Check if followed by a prefix keyword (by checking for keyword followed by colon)
     let foundColon = false;
     let peekOffset = offset;
diff --git a/packages/queryLanguage/test/grammar.regex.test.ts b/packages/queryLanguage/test/grammar.regex.test.ts
@@ -0,0 +1,15 @@
+import { parser as _parser } from "../src/parser";
+import { fileTests } from "@lezer/generator/dist/test";
+import { describe, it } from "vitest";
+import { fileURLToPath } from "url";
+import * as fs from "fs";
+import * as path from "path";
+
+const regexParser = _parser.configure({ dialect: "regex" });
+const caseDir = path.dirname(fileURLToPath(import.meta.url));
+
+describe("regex", () => {
+    for (const { name, run } of fileTests(fs.readFileSync(path.join(caseDir, "regex.txt"), "utf8"), "regex.txt")) {
+        it(name, () => run(regexParser));
+    }
+});
diff --git a/packages/queryLanguage/test/grammar.test.ts b/packages/queryLanguage/test/grammar.test.ts
@@ -11,6 +11,10 @@ for (const file of fs.readdirSync(caseDir)) {
     if (!/\.txt$/.test(file)) {
         continue;
     }
+    // regex.txt is tested separately in grammar.regex.test.ts using the regex dialect parser
+    if (file === "regex.txt") {
+        continue;
+    }
 
     let name = /^[^\.]*/.exec(file)?.[0];
     describe(name ?? "unknown", () => {
diff --git a/packages/queryLanguage/test/regex.txt b/packages/queryLanguage/test/regex.txt
@@ -0,0 +1,84 @@
+# Regex alternation in parens followed by literal char
+(test|render)<
+==>
+Program(Term)
+
+# Pipe character in a term (no parens)
+test|render
+==>
+Program(Term)
+
+# Parens with alternation — no trailing char
+(foo|bar)
+==>
+Program(Term)
+
+# Parens with alternation and trailing chars
+(foo|bar)baz
+==>
+Program(Term)
+
+# Complex regex: quantifiers, anchors, character classes
+^foo.*bar$
+==>
+Program(Term)
+
+# Character class
+[abc]+
+==>
+Program(Term)
+
+# Regex with escaped paren
+func\(arg\)
+==>
+Program(Term)
+
+# Two regex terms joined with OR keyword
+(test|render) or (foo|bar)
+==>
+Program(OrExpr(Term,Term))
+
+# Two regex terms implicitly ANDed
+(test|render) (foo|bar)
+==>
+Program(AndExpr(Term,Term))
+
+# File prefix still works in regex mode
+file:test.js
+==>
+Program(PrefixExpr(FileExpr))
+
+# Prefix filter combined with regex term
+file:test.js (test|render)<
+==>
+Program(AndExpr(PrefixExpr(FileExpr),Term))
+
+# Negation of prefix still works in regex mode
+-file:test.js
+==>
+Program(NegateExpr(PrefixExpr(FileExpr)))
+
+# Quoted string still works in regex mode
+"(test|render)"
+==>
+Program(QuotedTerm)
+
+# Multiple prefix filters with regex term
+file:test.js lang:TypeScript (render|mount)
+==>
+Program(AndExpr(PrefixExpr(FileExpr),PrefixExpr(LangExpr),Term))
+
+# Dash without prefix is a plain word (not negation)
+-pattern
+==>
+Program(Term)
+
+# 'or' at start of input is a plain word
+or
+==>
+Program(Term)
+
+# Regex with pipe at top level between prefix and term
+repo:myorg (init|setup)
+==>
+Program(AndExpr(PrefixExpr(RepoExpr),Term))
diff --git a/packages/web/src/features/search/parser.ts b/packages/web/src/features/search/parser.ts
@@ -35,6 +35,13 @@ const parser = _parser.configure({
     strict: true,
 });
 
+// In regex mode, parens and | are regex metacharacters, not query grouping operators.
+// The "regex" dialect makes the tokenizer treat them as plain word characters.
+const regexParser = _parser.configure({
+    strict: true,
+    dialect: "regex",
+});
+
 type ArchivedValue = 'yes' | 'no' | 'only';
 type VisibilityValue = 'public' | 'private' | 'any';
 type ForkValue = 'yes' | 'no' | 'only';
@@ -82,7 +89,9 @@ export const parseQuerySyntaxIntoIR = async ({
 
     try {
         // First parse the query into a Lezer tree.
-        const tree = parser.parse(query);
+        // In regex mode, use the regex dialect so parens/| are treated as word characters.
+        const activeParser = (options.isRegexEnabled ?? false) ? regexParser : parser;
+        const tree = activeParser.parse(query);
 
         // Then transform the tree into the intermediate representation.
         return transformTreeToIR({