Skip to content

Commit 8b56ea8

Browse files
fix(query-language): preserve grouped filters in regex mode (#1138)
* fix(query-language): preserve grouped filters in regex mode
* chore(changelog): add entry for pr 1138
1 parent b2941c4 commit 8b56ea8

4 files changed

Lines changed: 250 additions & 20 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
- Fixed missing workflow permissions in `docs-broken-links.yml` by adding explicit `permissions: {}` to follow least privilege principle. [#1131](https://github.com/sourcebot-dev/sourcebot/pull/1131)
1515
- Fixed CodeQL missing-workflow-permissions alert by adding explicit empty permissions to `deploy-railway.yml`. [#1132](https://github.com/sourcebot-dev/sourcebot/pull/1132)
1616
- [EE] Fixed XSS vulnerability (CodeQL js/xss-through-exception) in OAuth redirect flow by blocking dangerous URI schemes (`javascript:`, `data:`, `vbscript:`) at registration, authorization, and redirect layers. [#1136](https://github.com/sourcebot-dev/sourcebot/pull/1136)
17+
- Fixed regex search parsing so query-style parenthesized groups with filters still work when regex mode is enabled. [#1138](https://github.com/sourcebot-dev/sourcebot/pull/1138)
1718

1819
## [4.16.11] - 2026-04-17
1920

packages/queryLanguage/src/tokens.ts

Lines changed: 225 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,15 @@ function isAlphaNumUnderscore(ch: number): boolean {
4242
* Checks if the input at current position matches the given string.
4343
*/
4444
function matchesString(input: InputStream, str: string): boolean {
45+
// Thin wrapper: offset 0 anchors the comparison at the character the stream
// is currently positioned on.
return matchesStringAt(input, 0, str);
46+
}
47+
48+
/**
49+
* Checks if the input at the given offset matches the given string.
50+
*/
51+
function matchesStringAt(input: InputStream, offset: number, str: string): boolean {
4552
for (let i = 0; i < str.length; i++) {
46-
if (input.peek(i) !== str.charCodeAt(i)) {
53+
if (input.peek(offset + i) !== str.charCodeAt(i)) {
4754
return false;
4855
}
4956
}
@@ -94,23 +101,54 @@ function isOrKeyword(input: InputStream): boolean {
94101
* Checks if current position starts with a prefix keyword.
95102
*/
96103
function startsWithPrefix(input: InputStream): boolean {
104+
// Thin wrapper: same check as startsWithPrefixAt, anchored at offset 0
// (the current stream position).
return startsWithPrefixAt(input, 0);
105+
}
106+
107+
/**
108+
* Checks if the input at the given offset starts with a prefix keyword.
109+
*/
110+
function startsWithPrefixAt(input: InputStream, offset: number): boolean {
97111
// Try every keyword in PREFIXES; the first one found at `offset` is enough.
for (const prefix of PREFIXES) {
98-
if (matchesString(input, prefix)) {
112+
if (matchesStringAt(input, offset, prefix)) {
99113
return true;
100114
}
101115
}
102116
// No prefix keyword begins at `offset`.
return false;
103117
}
104118

105119
/**
106-
* Checks if a '(' at the given offset starts a balanced ParenExpr.
107-
* Uses peek() to avoid modifying stream position.
108-
* Returns true if we find a matching ')' that closes the initial '('.
120+
* Advances past whitespace starting at the given offset.
121+
*/
122+
function skipWhitespace(input: InputStream, offset: number): number {
123+
// Only peek() is used, so the stream position itself is never moved; the
// caller receives a new lookahead offset instead.
// NOTE(review): termination at end of input relies on isWhitespace()
// returning false for whatever peek() yields past the end — confirm.
while (isWhitespace(input.peek(offset))) {
124+
offset++;
125+
}
126+
// First offset at or after the starting one that is not whitespace.
return offset;
127+
}
128+
129+
/**
130+
* Checks whether the character at the given offset is escaped by an odd number
131+
* of immediately preceding backslashes.
132+
*/
133+
function isEscapedAt(input: InputStream, offset: number): boolean {
134+
let backslashCount = 0;
135+
// Start at the character immediately before `offset` and walk left.
let currentOffset = offset - 1;
136+
137+
// Count the unbroken run of backslashes that directly precedes `offset`.
while (input.peek(currentOffset) === 92 /* backslash */) {
138+
backslashCount++;
139+
currentOffset--;
140+
}
141+
142+
// An even run (including zero) means the backslashes pair up and escape each
// other, leaving the character at `offset` unescaped; an odd run leaves one
// backslash over to escape it.
return backslashCount % 2 === 1;
143+
}
144+
145+
/**
146+
* Returns the offset of the closing ')' that matches the '(' at startOffset.
109147
* Handles escaped characters (backslash followed by any character).
110148
*/
111-
function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
149+
function findMatchingCloseParenOffset(input: InputStream, startOffset: number): number | null {
112150
if (input.peek(startOffset) !== OPEN_PAREN) {
113-
return false;
151+
return null;
114152
}
115153

116154
let offset = startOffset + 1;
@@ -131,15 +169,171 @@ function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
131169
} else if (ch === CLOSE_PAREN) {
132170
depth--;
133171
if (depth === 0) {
134-
return true;
172+
return offset;
135173
}
136174
}
137175
offset++;
138176
}
139177

178+
return null;
179+
}
180+
181+
/**
182+
* Checks if a '(' at the given offset starts a balanced ParenExpr.
183+
* Uses peek() to avoid modifying stream position.
184+
* Returns true if we find a matching ')' that closes the initial '('.
185+
* Handles escaped characters (backslash followed by any character).
186+
*/
187+
function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
188+
// A non-null result means a matching ')' was located; the offset itself is
// discarded because callers of this function only need a yes/no answer.
return findMatchingCloseParenOffset(input, startOffset) !== null;
189+
}
190+
191+
/**
 * Determines whether a balanced parenthesized expression should be treated as
 * query grouping in regex mode. This preserves query constructs like:
 *   (file:a or file:b)
 * while still allowing bare regex atoms like:
 *   (foo|bar)
 *
 * The contents between the parens are split into top-level tokens separated by
 * whitespace; quoted spans, nested parens, and backslash escapes keep a token
 * together. The parens count as query grouping when:
 *   - they are empty, or contain more than one top-level token;
 *   - the single token starts with a prefix keyword, optionally after '-';
 *   - the single token starts with a quote;
 *   - the single token starts (optionally after '-') with a '(' whose own
 *     group qualifies under these same rules.
 *
 * @param input Stream to inspect. Only peek() is used, so the stream position
 *              is left untouched.
 * @param startOffset Offset, relative to the current position, of the candidate '('.
 * @returns true when the parens should be tokenized as query grouping; false
 *          when they are unbalanced or look like part of a regex term.
 */
function isRegexQueryGroupingAt(input: InputStream, startOffset: number): boolean {
    const closeOffset = findMatchingCloseParenOffset(input, startOffset);
    if (closeOffset === null) {
        // Not a '(' at startOffset, or it is never closed.
        return false;
    }

    let offset = skipWhitespace(input, startOffset + 1);
    if (offset >= closeOffset) {
        return true; // Empty parens are always grouping syntax
    }

    const topLevelTokens: Array<{ start: number; end: number }> = [];

    while (offset < closeOffset) {
        const tokenStart = offset;
        let depth = 0;
        let inQuote = false;

        while (offset < closeOffset) {
            const ch = input.peek(offset);

            // A backslash escapes the following character, whatever it is.
            if (ch === 92 /* backslash */) {
                offset += 2;
                continue;
            }

            // Inside quotes everything is literal until the closing quote.
            if (inQuote) {
                offset++;
                if (ch === QUOTE) {
                    inQuote = false;
                }
                continue;
            }

            if (ch === QUOTE) {
                inQuote = true;
                offset++;
                continue;
            }

            if (ch === OPEN_PAREN) {
                depth++;
                offset++;
                continue;
            }

            if (ch === CLOSE_PAREN) {
                // A ')' at depth 0 can only appear here when this quote-aware
                // scan disagrees with the paren matcher about a quoted '(',
                // e.g. `("(" a) )`. Breaking without consuming it would leave
                // `offset` unchanged and make the outer loop spin forever, so
                // the stray ')' is consumed as an ordinary token character.
                if (depth > 0) {
                    depth--;
                }
                offset++;
                continue;
            }

            // Whitespace outside nested parens ends the current token.
            if (depth === 0 && isWhitespace(ch)) {
                break;
            }

            offset++;
        }

        topLevelTokens.push({ start: tokenStart, end: offset });
        offset = skipWhitespace(input, offset);
    }

    // Several whitespace-separated terms can only be a query expression.
    if (topLevelTokens.length !== 1) {
        return true;
    }

    const [{ start, end }] = topLevelTokens;
    const firstCh = input.peek(start);

    if (startsWithPrefixAt(input, start)) {
        return true;
    }

    if (firstCh === DASH) {
        // Negation: decide based on what is being negated.
        const afterDash = skipWhitespace(input, start + 1);

        if (startsWithPrefixAt(input, afterDash)) {
            return true;
        }

        if (input.peek(afterDash) === OPEN_PAREN && afterDash < end) {
            return isRegexQueryGroupingAt(input, afterDash);
        }

        return false;
    }

    if (firstCh === QUOTE) {
        return true;
    }

    // A nested group is grouping syntax only if its own contents qualify.
    if (firstCh === OPEN_PAREN && start < end) {
        return isRegexQueryGroupingAt(input, start);
    }

    return false;
}
142299

300+
/**
 * Finds the offset of the '(' that matches the current ')' at offset 0.
 * Handles escaped characters (backslash followed by any character): escaped
 * parens are ignored while scanning, and an escaped ')' at offset 0 (the regex
 * literal `\)`) has no matching '(' at all.
 *
 * NOTE(review): Lezer's InputStream.peek documentation warns that lookbehind
 * beyond 25 code units is not tracked for incremental parsing — confirm this
 * tokenizer is only used for full (non-incremental) parses.
 *
 * @param input Stream positioned on the candidate ')'. Only `next` and peek()
 *              are read, so the stream position is left untouched.
 * @returns The (negative) offset of the matching '(' relative to the current
 *          position, or null when the current character is not an unescaped
 *          ')' or no matching '(' precedes it.
 */
function findMatchingOpenParenOffset(input: InputStream): number | null {
    // The forward scans skip escaped characters, so an escaped ')' must not be
    // paired with anything here either; otherwise `\)` inside a regex term
    // would be mistaken for the end of an enclosing query group.
    if (input.next !== CLOSE_PAREN || isEscapedAt(input, 0)) {
        return null;
    }

    let offset = -1;
    let depth = 1; // the ')' at offset 0 is already open on the stack

    while (true) {
        const ch = input.peek(offset);

        if (ch === EOF) {
            // Ran off the start of the input without closing the ')'.
            return null;
        }

        // Escaped characters never count as parens.
        if (isEscapedAt(input, offset)) {
            offset--;
            continue;
        }

        if (ch === CLOSE_PAREN) {
            depth++;
        } else if (ch === OPEN_PAREN) {
            depth--;
            if (depth === 0) {
                return offset;
            }
        }

        offset--;
    }
}
336+
143337
/**
144338
* Checks if we're currently inside a ParenExpr by looking backwards in the input
145339
* to count unmatched opening parens that likely started a ParenExpr.
@@ -246,15 +440,20 @@ function isInsideParenExpr(input: InputStream, stack: Stack): boolean {
246440
export const parenToken = new ExternalTokenizer((input, stack) => {
247441
// This tokenizer only ever considers a '(' at the current position.
if (input.next !== OPEN_PAREN) return;
248442

249-
// In regex mode, parens are just word characters — don't emit openParen
250443
if (stack.dialectEnabled(Dialect_regex)) {
251-
return;
444+
// In regex mode, only treat parens as grouping syntax when the contents
445+
// clearly look like a query expression. Otherwise they remain part of a
446+
// regex term, e.g. (foo|bar).
447+
if (!isRegexQueryGroupingAt(input, 0)) {
448+
return;
449+
}
252450
}
253451

254452
// NOTE(review): in regex mode isRegexQueryGroupingAt has already located the
// matching ')' through findMatchingCloseParenOffset, so this call repeats
// that scan; it is still required for the non-regex path.
if (hasBalancedParensAt(input, 0)) {
255453
// Found balanced parens - emit openParen (just the '(')
256454
input.advance();
257455
input.acceptToken(openParen);
456+
return;
258457
}
259458
// If unbalanced, don't emit anything - let wordToken handle it
260459
});
@@ -268,15 +467,21 @@ export const parenToken = new ExternalTokenizer((input, stack) => {
268467
export const closeParenToken = new ExternalTokenizer((input, stack) => {
269468
// This tokenizer only ever considers a ')' at the current position.
if (input.next !== CLOSE_PAREN) return;
270469

271-
// In regex mode, parens are just word characters — don't emit closeParen
272470
if (stack.dialectEnabled(Dialect_regex)) {
471+
// Look backwards for the '(' this ')' pairs with, then ask whether that '('
// opened a query group. Only in that case is the ')' emitted as closeParen;
// otherwise nothing is emitted here.
const matchingOpenOffset = findMatchingOpenParenOffset(input);
472+
if (matchingOpenOffset === null || !isRegexQueryGroupingAt(input, matchingOpenOffset)) {
473+
return;
474+
}
475+
input.advance();
476+
input.acceptToken(closeParen);
273477
// Regex mode never falls through to the isInsideParenExpr check below.
return;
274478
}
275479

276480
// Check if we should emit closeParen (when inside a ParenExpr)
277481
if (isInsideParenExpr(input, stack)) {
278482
input.advance();
279483
input.acceptToken(closeParen);
484+
return;
280485
}
281486
// Otherwise, don't emit - let wordToken handle ')' as part of a word
282487
});
@@ -324,10 +529,16 @@ export const wordToken = new ExternalTokenizer((input, stack) => {
324529
}
325530

326531
// In regex mode: consume all non-whitespace characters as a single word.
327-
// Parens and | are valid regex metacharacters, not query syntax in this mode.
532+
// Parens remain part of the word unless they clearly start/end a query group.
328533
if (stack.dialectEnabled(Dialect_regex)) {
329534
const startPos = input.pos;
330535
while (input.next !== EOF && !isWhitespace(input.next)) {
536+
if (input.next === CLOSE_PAREN) {
537+
const matchingOpenOffset = findMatchingOpenParenOffset(input);
538+
if (matchingOpenOffset !== null && isRegexQueryGroupingAt(input, matchingOpenOffset)) {
539+
break;
540+
}
541+
}
331542
input.advance();
332543
}
333544
if (input.pos > startPos) {
@@ -454,10 +665,8 @@ export const negateToken = new ExternalTokenizer((input, stack) => {
454665
const chAfterDash = input.peek(offset);
455666

456667
// In normal mode: also check for balanced paren (negated group e.g. -(foo bar))
457-
// In regex mode: skip this — parens are not query grouping operators, so emitting
458-
// negate before a '(' would leave the parser without a matching ParenExpr to parse.
459-
if (!stack.dialectEnabled(Dialect_regex)) {
460-
if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
668+
if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) {
669+
if (!stack.dialectEnabled(Dialect_regex) || isRegexQueryGroupingAt(input, offset)) {
461670
input.advance();
462671
input.acceptToken(negate);
463672
return;
@@ -492,4 +701,3 @@ export const negateToken = new ExternalTokenizer((input, stack) => {
492701

493702
// Otherwise, don't tokenize as negate (let word handle it)
494703
});
495-

packages/queryLanguage/test/regex.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,26 @@ file:test.js (test|render)<
5353
==>
5454
Program(AndExpr(PrefixExpr(FileExpr),Term))
5555

56+
# Parenthesized prefix OR group still works in regex mode
57+
(file:.yarnrc.yml or file:README.md)
58+
==>
59+
Program(ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr))))
60+
61+
# Regex query with grouped prefix filters
62+
repo:^github\.com/sourcebot\x2ddev/sourcebot$ (file:.yarnrc.yml or file:README.md)
63+
==>
64+
Program(AndExpr(PrefixExpr(RepoExpr),ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr)))))
65+
66+
# Multiple filters with a parenthesized regex OR group
67+
repo:^github\.com/sourcebot-dev/.+$ lang:Dockerfile ( FROM\s+node or RUN\s+npm )
68+
==>
69+
Program(AndExpr(PrefixExpr(RepoExpr),PrefixExpr(LangExpr),ParenExpr(OrExpr(Term,Term))))
70+
71+
# Negated grouped prefix filters still work in regex mode
72+
-(file:test or file:spec)
73+
==>
74+
Program(NegateExpr(ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr)))))
75+
5676
# Negation of prefix still works in regex mode
5777
-file:test.js
5878
==>

packages/web/src/features/search/parser.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ const parser = _parser.configure({
3535
strict: true,
3636
});
3737

38-
// In regex mode, parens and | are regex metacharacters, not query grouping operators.
39-
// The "regex" dialect makes the tokenizer treat them as plain word characters.
38+
// In regex mode, bare regex parens should stay part of a term while query-like
39+
// parenthesized expressions (e.g. grouped filters) should still parse as groups.
4040
const regexParser = _parser.configure({
4141
strict: true,
4242
dialect: "regex",
@@ -89,7 +89,8 @@ export const parseQuerySyntaxIntoIR = async ({
8989

9090
try {
9191
// First parse the query into a Lezer tree.
92-
// In regex mode, use the regex dialect so parens/| are treated as word characters.
92+
// In regex mode, use the regex dialect so bare regex parens stay inside
93+
// a term while query-like parenthesized groups still tokenize correctly.
9394
const activeParser = (options.isRegexEnabled ?? false) ? regexParser : parser;
9495
const tree = activeParser.parse(query);
9596

0 commit comments

Comments
 (0)