diff --git a/CHANGELOG.md b/CHANGELOG.md index a7f2003e7..7d8639266 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed missing workflow permissions in `docs-broken-links.yml` by adding explicit `permissions: {}` to follow least privilege principle. [#1131](https://github.com/sourcebot-dev/sourcebot/pull/1131) - Fixed CodeQL missing-workflow-permissions alert by adding explicit empty permissions to `deploy-railway.yml`. [#1132](https://github.com/sourcebot-dev/sourcebot/pull/1132) - [EE] Fixed XSS vulnerability (CodeQL js/xss-through-exception) in OAuth redirect flow by blocking dangerous URI schemes (`javascript:`, `data:`, `vbscript:`) at registration, authorization, and redirect layers. [#1136](https://github.com/sourcebot-dev/sourcebot/pull/1136) +- Fixed regex search parsing so query-style parenthesized groups with filters still work when regex mode is enabled. [#1138](https://github.com/sourcebot-dev/sourcebot/pull/1138) ## [4.16.11] - 2026-04-17 diff --git a/packages/queryLanguage/src/tokens.ts b/packages/queryLanguage/src/tokens.ts index 5a59dbcc6..1bb69f531 100644 --- a/packages/queryLanguage/src/tokens.ts +++ b/packages/queryLanguage/src/tokens.ts @@ -42,8 +42,15 @@ function isAlphaNumUnderscore(ch: number): boolean { * Checks if the input at current position matches the given string. */ function matchesString(input: InputStream, str: string): boolean { + return matchesStringAt(input, 0, str); +} + +/** + * Checks if the input at the given offset matches the given string. + */ +function matchesStringAt(input: InputStream, offset: number, str: string): boolean { for (let i = 0; i < str.length; i++) { - if (input.peek(i) !== str.charCodeAt(i)) { + if (input.peek(offset + i) !== str.charCodeAt(i)) { return false; } } @@ -94,8 +101,15 @@ function isOrKeyword(input: InputStream): boolean { * Checks if current position starts with a prefix keyword. */ function startsWithPrefix(input: InputStream): boolean { + return startsWithPrefixAt(input, 0); +} + +/** + * Checks if the input at the given offset starts with a prefix keyword. + */ +function startsWithPrefixAt(input: InputStream, offset: number): boolean { for (const prefix of PREFIXES) { - if (matchesString(input, prefix)) { + if (matchesStringAt(input, offset, prefix)) { return true; } } @@ -103,14 +117,38 @@ function startsWithPrefix(input: InputStream): boolean { } /** - * Checks if a '(' at the given offset starts a balanced ParenExpr. - * Uses peek() to avoid modifying stream position. - * Returns true if we find a matching ')' that closes the initial '('. + * Advances past whitespace starting at the given offset. + */ +function skipWhitespace(input: InputStream, offset: number): number { + while (isWhitespace(input.peek(offset))) { + offset++; + } + return offset; +} + +/** + * Checks whether the character at the given offset is escaped by an odd number + * of immediately preceding backslashes. + */ +function isEscapedAt(input: InputStream, offset: number): boolean { + let backslashCount = 0; + let currentOffset = offset - 1; + + while (input.peek(currentOffset) === 92 /* backslash */) { + backslashCount++; + currentOffset--; + } + + return backslashCount % 2 === 1; +} + +/** + * Returns the offset of the closing ')' that matches the '(' at startOffset. * Handles escaped characters (backslash followed by any character). */ -function hasBalancedParensAt(input: InputStream, startOffset: number): boolean { +function findMatchingCloseParenOffset(input: InputStream, startOffset: number): number | null { if (input.peek(startOffset) !== OPEN_PAREN) { - return false; + return null; } let offset = startOffset + 1; @@ -131,15 +169,171 @@ function hasBalancedParensAt(input: InputStream, startOffset: number): boolean { } else if (ch === CLOSE_PAREN) { depth--; if (depth === 0) { - return true; + return offset; } } offset++; } + return null; +} + +/** + * Checks if a '(' at the given offset starts a balanced ParenExpr. + * Uses peek() to avoid modifying stream position. + * Returns true if we find a matching ')' that closes the initial '('. + * Handles escaped characters (backslash followed by any character). + */ +function hasBalancedParensAt(input: InputStream, startOffset: number): boolean { + return findMatchingCloseParenOffset(input, startOffset) !== null; +} + +/** + * Determines whether a balanced parenthesized expression should be treated as + * query grouping in regex mode. This preserves query constructs like: + * (file:a or file:b) + * while still allowing bare regex atoms like: + * (foo|bar) + */ +function isRegexQueryGroupingAt(input: InputStream, startOffset: number): boolean { + const closeOffset = findMatchingCloseParenOffset(input, startOffset); + if (closeOffset === null) { + return false; + } + + let offset = skipWhitespace(input, startOffset + 1); + if (offset >= closeOffset) { + return true; // Empty parens are always grouping syntax + } + + const topLevelTokens: Array<{ start: number; end: number }> = []; + + while (offset < closeOffset) { + const tokenStart = offset; + let depth = 0; + let inQuote = false; + + while (offset < closeOffset) { + const ch = input.peek(offset); + + if (ch === 92 /* backslash */) { + offset += 2; + continue; + } + + if (inQuote) { + offset++; + if (ch === QUOTE) { + inQuote = false; + } + continue; + } + + if (ch === QUOTE) { + inQuote = true; + offset++; + continue; + } + + if (ch === OPEN_PAREN) { + depth++; + offset++; + continue; + } + + if (ch === CLOSE_PAREN) { + if (depth === 0) { + break; + } + depth--; + offset++; + continue; + } + + if (depth === 0 && isWhitespace(ch)) { + break; + } + + offset++; + } + + topLevelTokens.push({ start: tokenStart, end: offset }); + offset = skipWhitespace(input, offset); + } + + if (topLevelTokens.length !== 1) { + return true; + } + + const [{ start, end }] = topLevelTokens; + const firstCh = input.peek(start); + + if (startsWithPrefixAt(input, start)) { + return true; + } + + if (firstCh === DASH) { + const afterDash = skipWhitespace(input, start + 1); + + if (startsWithPrefixAt(input, afterDash)) { + return true; + } + + if (input.peek(afterDash) === OPEN_PAREN && afterDash < end) { + return isRegexQueryGroupingAt(input, afterDash); + } + + return false; + } + + if (firstCh === QUOTE) { + return true; + } + + if (firstCh === OPEN_PAREN && start < end) { + return isRegexQueryGroupingAt(input, start); + } + return false; } +/** + * Finds the offset of the '(' that matches the current ')' at offset 0. + * Handles escaped characters (backslash followed by any character). + */ +function findMatchingOpenParenOffset(input: InputStream): number | null { + if (input.next !== CLOSE_PAREN) { + return null; + } + + let offset = -1; + let depth = 1; + + while (true) { + const ch = input.peek(offset); + + if (ch === EOF) { + return null; + } + + if (isEscapedAt(input, offset)) { + offset--; + continue; + } + + if (ch === CLOSE_PAREN) { + depth++; + } else if (ch === OPEN_PAREN) { + depth--; + if (depth === 0) { + return offset; + } + } + + offset--; + } +} + /** * Checks if we're currently inside a ParenExpr by looking backwards in the input * to count unmatched opening parens that likely started a ParenExpr. @@ -246,15 +440,20 @@ function isInsideParenExpr(input: InputStream, stack: Stack): boolean { export const parenToken = new ExternalTokenizer((input, stack) => { if (input.next !== OPEN_PAREN) return; - // In regex mode, parens are just word characters — don't emit openParen if (stack.dialectEnabled(Dialect_regex)) { - return; + // In regex mode, only treat parens as grouping syntax when the contents + // clearly look like a query expression. Otherwise they remain part of a + // regex term, e.g. (foo|bar). + if (!isRegexQueryGroupingAt(input, 0)) { + return; + } } if (hasBalancedParensAt(input, 0)) { // Found balanced parens - emit openParen (just the '(') input.advance(); input.acceptToken(openParen); + return; } // If unbalanced, don't emit anything - let wordToken handle it }); @@ -268,8 +467,13 @@ export const parenToken = new ExternalTokenizer((input, stack) => { export const closeParenToken = new ExternalTokenizer((input, stack) => { if (input.next !== CLOSE_PAREN) return; - // In regex mode, parens are just word characters — don't emit closeParen if (stack.dialectEnabled(Dialect_regex)) { + const matchingOpenOffset = findMatchingOpenParenOffset(input); + if (matchingOpenOffset === null || !isRegexQueryGroupingAt(input, matchingOpenOffset)) { + return; + } + input.advance(); + input.acceptToken(closeParen); return; } @@ -277,6 +481,7 @@ export const closeParenToken = new ExternalTokenizer((input, stack) => { if (isInsideParenExpr(input, stack)) { input.advance(); input.acceptToken(closeParen); + return; } // Otherwise, don't emit - let wordToken handle ')' as part of a word }); @@ -324,10 +529,16 @@ export const wordToken = new ExternalTokenizer((input, stack) => { } // In regex mode: consume all non-whitespace characters as a single word. - // Parens and | are valid regex metacharacters, not query syntax in this mode. + // Parens remain part of the word unless they clearly start/end a query group. if (stack.dialectEnabled(Dialect_regex)) { const startPos = input.pos; while (input.next !== EOF && !isWhitespace(input.next)) { + if (input.next === CLOSE_PAREN) { + const matchingOpenOffset = findMatchingOpenParenOffset(input); + if (matchingOpenOffset !== null && isRegexQueryGroupingAt(input, matchingOpenOffset)) { + break; + } + } input.advance(); } if (input.pos > startPos) { @@ -454,10 +665,8 @@ export const negateToken = new ExternalTokenizer((input, stack) => { const chAfterDash = input.peek(offset); // In normal mode: also check for balanced paren (negated group e.g. -(foo bar)) - // In regex mode: skip this — parens are not query grouping operators, so emitting - // negate before a '(' would leave the parser without a matching ParenExpr to parse. - if (!stack.dialectEnabled(Dialect_regex)) { - if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) { + if (chAfterDash === OPEN_PAREN && hasBalancedParensAt(input, offset)) { + if (!stack.dialectEnabled(Dialect_regex) || isRegexQueryGroupingAt(input, offset)) { input.advance(); input.acceptToken(negate); return; @@ -492,4 +701,3 @@ export const negateToken = new ExternalTokenizer((input, stack) => { // Otherwise, don't tokenize as negate (let word handle it) }); - diff --git a/packages/queryLanguage/test/regex.txt b/packages/queryLanguage/test/regex.txt index a0d4814f5..1d63508e2 100644 --- a/packages/queryLanguage/test/regex.txt +++ b/packages/queryLanguage/test/regex.txt @@ -53,6 +53,26 @@ file:test.js (test|render)< ==> Program(AndExpr(PrefixExpr(FileExpr),Term)) +# Parenthesized prefix OR group still works in regex mode +(file:.yarnrc.yml or file:README.md) +==> +Program(ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr)))) + +# Regex query with grouped prefix filters +repo:^github\.com/sourcebot\x2ddev/sourcebot$ (file:.yarnrc.yml or file:README.md) +==> +Program(AndExpr(PrefixExpr(RepoExpr),ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr))))) + +# Multiple filters with a parenthesized regex OR group +repo:^github\.com/sourcebot-dev/.+$ lang:Dockerfile ( FROM\s+node or RUN\s+npm ) +==> +Program(AndExpr(PrefixExpr(RepoExpr),PrefixExpr(LangExpr),ParenExpr(OrExpr(Term,Term)))) + +# Negated grouped prefix filters still work in regex mode +-(file:test or file:spec) +==> +Program(NegateExpr(ParenExpr(OrExpr(PrefixExpr(FileExpr),PrefixExpr(FileExpr))))) + # Negation of prefix still works in regex mode -file:test.js ==> diff --git a/packages/web/src/features/search/parser.ts b/packages/web/src/features/search/parser.ts index e90edcca3..3b188a825 100644 --- a/packages/web/src/features/search/parser.ts +++ b/packages/web/src/features/search/parser.ts @@ -35,8 +35,8 @@ const parser = _parser.configure({ strict: true, }); -// In regex mode, parens and | are regex metacharacters, not query grouping operators. -// The "regex" dialect makes the tokenizer treat them as plain word characters. +// In regex mode, bare regex parens should stay part of a term while query-like +// parenthesized expressions (e.g. grouped filters) should still parse as groups. const regexParser = _parser.configure({ strict: true, dialect: "regex", @@ -89,7 +89,8 @@ export const parseQuerySyntaxIntoIR = async ({ try { // First parse the query into a Lezer tree. - // In regex mode, use the regex dialect so parens/| are treated as word characters. + // In regex mode, use the regex dialect so bare regex parens stay inside + // a term while query-like parenthesized groups still tokenize correctly. const activeParser = (options.isRegexEnabled ?? false) ? regexParser : parser; const tree = activeParser.parse(query);