Skip to content

Commit 2b35bb0

Browse files
fix(web): treat parens and | as word chars when regex mode is enabled (#1050)
fix(web): treat parens and | as word chars when regex mode is enabled (#1050)

* fix(web): treat parens and | as word chars when regex mode is enabled

  Adds a Lezer 'regex' dialect to the query language grammar. In regex mode, the tokenizers no longer emit openParen/closeParen tokens, so parentheses and pipe characters are consumed as plain word characters rather than query grouping operators. This fixes a bug where a query like `(test|render)<` was incorrectly parsed as `AND(ParenExpr, Term)` instead of a single regexp Term.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* chore: update CHANGELOG for #1050

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 313d2d2 commit 2b35bb0

File tree

9 files changed

+160
-16
lines changed

9 files changed

+160
-16
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313
### Fixed
1414
- Fixed Ask GitHub landing page chat box placement to be centered on the page instead of at the bottom. [#1046](https://github.com/sourcebot-dev/sourcebot/pull/1046)
1515
- Fixed issue where local git connections (`file://`) would fail when matching a file instead of a directory. [#1049](https://github.com/sourcebot-dev/sourcebot/pull/1049)
16+
- Fixed regex queries containing parentheses (e.g. `(test|render)<`) being incorrectly split into multiple search terms instead of treated as a single regex pattern. [#1050](https://github.com/sourcebot-dev/sourcebot/pull/1050)
1617

1718
## [4.16.2] - 2026-03-25
1819

packages/queryLanguage/src/parser.terms.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ export const
2323
RepoSetExpr = 16,
2424
ParenExpr = 17,
2525
QuotedTerm = 18,
26-
Term = 19
26+
Term = 19,
27+
Dialect_regex = 0

packages/queryLanguage/src/parser.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export const parser = LRParser.deserialize({
1313
tokenData: "/U~R_XY!QYZ!Qpq!Qrs!`#T#U$S#V#W%i#Y#Z'R#`#a(_#b#c(|#c#d)X#d#e)p#f#g+]#g#h,w#j#k-`#m#n.s~!VRm~XY!QYZ!Qpq!Q~!cWOY!`Zr!`rs!{s#O!`#O#P#Q#P;'S!`;'S;=`#|<%lO!`~#QOw~~#TRO;'S!`;'S;=`#^;=`O!`~#aXOY!`Zr!`rs!{s#O!`#O#P#Q#P;'S!`;'S;=`#|;=`<%l!`<%lO!`~$PP;=`<%l!`~$VQ#b#c$]#f#g$h~$`P#m#n$c~$hO!R~~$kP#V#W$n~$qP#[#]$t~$wP#]#^$z~$}P#j#k%Q~%TP#X#Y%W~%ZP#W#X%^~%aP![!]%d~%iOq~~%lQ![!]%r#c#d%w~%wOx~~%zP#b#c%}~&QP#h#i&T~&WP#X#Y&Z~&^Q#b#c&d#l#m&p~&gP#h#i&j~&mP![!]%r~&sP#h#i&v~&yP![!]&|~'ROy~~'UR![!]'_#]#^'d#c#d'v~'dOz~~'gP#`#a'j~'mP#X#Y'p~'sP![!]'_~'yP#f#g'|~(PP#_#`(S~(VP![!](Y~(_O{~~(bP#T#U(e~(hP#b#c(k~(nP#Z#[(q~(tP![!](w~(|O!T~~)PP#c#d)S~)XOs~~)[P#b#c)_~)bP#`#a)e~)hP#m#n)k~)pOt~~)sQ#f#g)y#i#j*n~)|P#]#^*P~*SP#j#k*V~*YP#T#U*]~*`P#h#i*c~*fP#X#Y*i~*nO!Q~~*qP#U#V*t~*wP#`#a*z~*}P#]#^+Q~+TP#V#W+W~+]O!P~~+`Q![!]+f#X#Y+k~+kO!S~~+nQ#d#e+t#j#k,l~+wP#c#d+z~+}Q![!]+f#g#h,T~,WP#X#Y,Z~,^P#h#i,a~,dP![!],g~,lO!V~~,oP![!],r~,wOu~~,zP#m#n,}~-QP#a#b-T~-WP![!]-Z~-`O!U~~-cP#]#^-f~-iP#g#h-l~-oP#]#^-r~-uP#U#V-x~-{P#]#^.O~.RP#`#a.U~.XP#]#^.[~._P#h#i.b~.eP#m#n.h~.kP![!].n~.sO}~~.vP#X#Y.y~.|P#g#h/P~/UOr~",
1414
tokenizers: [negateToken, parenToken, wordToken, closeParenToken, orToken, 0],
1515
topRules: {"Program":[0,1]},
16+
dialects: {regex: 0},
1617
tokenPrec: 200,
1718
termNames: {"0":"⚠","1":"@top","2":"OrExpr","3":"AndExpr","4":"NegateExpr","5":"PrefixExpr","6":"ArchivedExpr","7":"RevisionExpr","8":"ContentExpr","9":"ContextExpr","10":"FileExpr","11":"ForkExpr","12":"VisibilityExpr","13":"RepoExpr","14":"LangExpr","15":"SymExpr","16":"RepoSetExpr","17":"ParenExpr","18":"QuotedTerm","19":"Term","20":"expr+","21":"(or andExpr)+","22":"␄","23":"negate","24":"openParen","25":"word","26":"closeParen","27":"or","28":"%mainskip","29":"space","30":"query","31":"andExpr","32":"expr","33":"archivedKw","34":"\"yes\"","35":"\"no\"","36":"\"only\"","37":"revisionKw","38":"value","39":"quotedString","40":"contentKw","41":"contextKw","42":"fileKw","43":"forkKw","44":"forkValue","45":"visibilityKw","46":"visibilityValue","47":"\"public\"","48":"\"private\"","49":"\"any\"","50":"repoKw","51":"langKw","52":"symKw","53":"reposetKw"}
1819
})

packages/queryLanguage/src/query.grammar

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
@external tokens closeParenToken from "./tokens" { closeParen }
55
@external tokens orToken from "./tokens" { or }
66

7+
@dialects { regex }
8+
79
@top Program { query }
810

911
@precedence {

packages/queryLanguage/src/tokens.ts

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { ExternalTokenizer, InputStream, Stack } from "@lezer/lr";
2-
import { negate, openParen, closeParen, word, or, ParenExpr } from "./parser.terms";
2+
import { negate, openParen, closeParen, word, or, Dialect_regex } from "./parser.terms";
33

44
// Character codes
55
const SPACE = 32;
@@ -243,9 +243,14 @@ function isInsideParenExpr(input: InputStream, stack: Stack): boolean {
243243
* This allows words like "(pr" or "func(arg)" to be parsed as single terms
244244
* while "(foo bar)" is parsed as a ParenExpr.
245245
*/
246-
export const parenToken = new ExternalTokenizer((input) => {
246+
export const parenToken = new ExternalTokenizer((input, stack) => {
247247
if (input.next !== OPEN_PAREN) return;
248-
248+
249+
// In regex mode, parens are just word characters — don't emit openParen
250+
if (stack.dialectEnabled(Dialect_regex)) {
251+
return;
252+
}
253+
249254
if (hasBalancedParensAt(input, 0)) {
250255
// Found balanced parens - emit openParen (just the '(')
251256
input.advance();
@@ -263,6 +268,11 @@ export const parenToken = new ExternalTokenizer((input) => {
263268
export const closeParenToken = new ExternalTokenizer((input, stack) => {
264269
if (input.next !== CLOSE_PAREN) return;
265270

271+
// In regex mode, parens are just word characters — don't emit closeParen
272+
if (stack.dialectEnabled(Dialect_regex)) {
273+
return;
274+
}
275+
266276
// Check if we should emit closeParen (when inside a ParenExpr)
267277
if (isInsideParenExpr(input, stack)) {
268278
input.advance();
@@ -312,7 +322,20 @@ export const wordToken = new ExternalTokenizer((input, stack) => {
312322
if (startsWithPrefix(input)) {
313323
return;
314324
}
315-
325+
326+
// In regex mode: consume all non-whitespace characters as a single word.
327+
// Parens and | are valid regex metacharacters, not query syntax in this mode.
328+
if (stack.dialectEnabled(Dialect_regex)) {
329+
const startPos = input.pos;
330+
while (input.next !== EOF && !isWhitespace(input.next)) {
331+
input.advance();
332+
}
333+
if (input.pos > startPos) {
334+
input.acceptToken(word);
335+
}
336+
return;
337+
}
338+
316339
// If starts with '(' and has balanced parens, determine whether this is a
317340
// regex alternation value (e.g. file:(test|spec)) or a ParenExpr grouping.
318341
// We're in a value context when the immediately preceding non-whitespace char
@@ -419,24 +442,28 @@ export const orToken = new ExternalTokenizer((input) => {
419442
* External tokenizer for negation.
420443
* Only tokenizes `-` as negate when followed by a prefix keyword or balanced `(`.
421444
*/
422-
export const negateToken = new ExternalTokenizer((input) => {
445+
export const negateToken = new ExternalTokenizer((input, stack) => {
423446
if (input.next !== DASH) return;
424-
447+
425448
// Look ahead using peek to see what follows the dash (skipping whitespace)
426449
let offset = 1;
427450
while (isWhitespace(input.peek(offset))) {
428451
offset++;
429452
}
430-
453+
431454
const chAfterDash = input.peek(offset);
432-
433-
// Check if followed by opening paren that starts a balanced ParenExpr
434-
if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
435-
input.advance();
436-
input.acceptToken(negate);
437-
return;
455+
456+
// In normal mode: also check for balanced paren (negated group e.g. -(foo bar))
457+
// In regex mode: skip this — parens are not query grouping operators, so emitting
458+
// negate before a '(' would leave the parser without a matching ParenExpr to parse.
459+
if (!stack.dialectEnabled(Dialect_regex)) {
460+
if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
461+
input.advance();
462+
input.acceptToken(negate);
463+
return;
464+
}
438465
}
439-
466+
440467
// Check if followed by a prefix keyword (by checking for keyword followed by colon)
441468
let foundColon = false;
442469
let peekOffset = offset;
packages/queryLanguage/test/grammar.regex.test.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import { parser as _parser } from "../src/parser";
2+
import { fileTests } from "@lezer/generator/dist/test";
3+
import { describe, it } from "vitest";
4+
import { fileURLToPath } from "url";
5+
import * as fs from "fs";
6+
import * as path from "path";
7+
8+
const regexParser = _parser.configure({ dialect: "regex" });
9+
const caseDir = path.dirname(fileURLToPath(import.meta.url));
10+
11+
describe("regex", () => {
12+
for (const { name, run } of fileTests(fs.readFileSync(path.join(caseDir, "regex.txt"), "utf8"), "regex.txt")) {
13+
it(name, () => run(regexParser));
14+
}
15+
});

packages/queryLanguage/test/grammar.test.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ for (const file of fs.readdirSync(caseDir)) {
1111
if (!/\.txt$/.test(file)) {
1212
continue;
1313
}
14+
// regex.txt is tested separately in grammar.regex.test.ts using the regex dialect parser
15+
if (file === "regex.txt") {
16+
continue;
17+
}
1418

1519
let name = /^[^\.]*/.exec(file)?.[0];
1620
describe(name ?? "unknown", () => {
packages/queryLanguage/test/regex.txt

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Regex alternation in parens followed by literal char
2+
(test|render)<
3+
==>
4+
Program(Term)
5+
6+
# Pipe character in a term (no parens)
7+
test|render
8+
==>
9+
Program(Term)
10+
11+
# Parens with alternation — no trailing char
12+
(foo|bar)
13+
==>
14+
Program(Term)
15+
16+
# Parens with alternation and trailing chars
17+
(foo|bar)baz
18+
==>
19+
Program(Term)
20+
21+
# Complex regex: quantifiers, anchors, character classes
22+
^foo.*bar$
23+
==>
24+
Program(Term)
25+
26+
# Character class
27+
[abc]+
28+
==>
29+
Program(Term)
30+
31+
# Regex with escaped paren
32+
func\(arg\)
33+
==>
34+
Program(Term)
35+
36+
# Two regex terms joined with OR keyword
37+
(test|render) or (foo|bar)
38+
==>
39+
Program(OrExpr(Term,Term))
40+
41+
# Two regex terms implicitly ANDed
42+
(test|render) (foo|bar)
43+
==>
44+
Program(AndExpr(Term,Term))
45+
46+
# File prefix still works in regex mode
47+
file:test.js
48+
==>
49+
Program(PrefixExpr(FileExpr))
50+
51+
# Prefix filter combined with regex term
52+
file:test.js (test|render)<
53+
==>
54+
Program(AndExpr(PrefixExpr(FileExpr),Term))
55+
56+
# Negation of prefix still works in regex mode
57+
-file:test.js
58+
==>
59+
Program(NegateExpr(PrefixExpr(FileExpr)))
60+
61+
# Quoted string still works in regex mode
62+
"(test|render)"
63+
==>
64+
Program(QuotedTerm)
65+
66+
# Multiple prefix filters with regex term
67+
file:test.js lang:TypeScript (render|mount)
68+
==>
69+
Program(AndExpr(PrefixExpr(FileExpr),PrefixExpr(LangExpr),Term))
70+
71+
# Dash without prefix is a plain word (not negation)
72+
-pattern
73+
==>
74+
Program(Term)
75+
76+
# 'or' at start of input is a plain word
77+
or
78+
==>
79+
Program(Term)
80+
81+
# Regex with pipe at top level between prefix and term
82+
repo:myorg (init|setup)
83+
==>
84+
Program(AndExpr(PrefixExpr(RepoExpr),Term))

packages/web/src/features/search/parser.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ const parser = _parser.configure({
3535
strict: true,
3636
});
3737

38+
// In regex mode, parens and | are regex metacharacters, not query grouping operators.
39+
// The "regex" dialect makes the tokenizer treat them as plain word characters.
40+
const regexParser = _parser.configure({
41+
strict: true,
42+
dialect: "regex",
43+
});
44+
3845
type ArchivedValue = 'yes' | 'no' | 'only';
3946
type VisibilityValue = 'public' | 'private' | 'any';
4047
type ForkValue = 'yes' | 'no' | 'only';
@@ -82,7 +89,9 @@ export const parseQuerySyntaxIntoIR = async ({
8289

8390
try {
8491
// First parse the query into a Lezer tree.
85-
const tree = parser.parse(query);
92+
// In regex mode, use the regex dialect so parens/| are treated as word characters.
93+
const activeParser = (options.isRegexEnabled ?? false) ? regexParser : parser;
94+
const tree = activeParser.parse(query);
8695

8796
// Then transform the tree into the intermediate representation.
8897
return transformTreeToIR({

0 commit comments

Comments (0)