@@ -42,8 +42,15 @@ function isAlphaNumUnderscore(ch: number): boolean {
4242 * Checks if the input at current position matches the given string.
4343 */
function matchesString(input: InputStream, str: string): boolean {
  // Matching "at the current position" is the offset-aware check anchored at
  // offset zero, so delegate rather than duplicating the comparison loop.
  const currentPositionOffset = 0;
  return matchesStringAt(input, currentPositionOffset, str);
}
47+
48+ /**
49+ * Checks if the input at the given offset matches the given string.
50+ */
51+ function matchesStringAt ( input : InputStream , offset : number , str : string ) : boolean {
4552 for ( let i = 0 ; i < str . length ; i ++ ) {
46- if ( input . peek ( i ) !== str . charCodeAt ( i ) ) {
53+ if ( input . peek ( offset + i ) !== str . charCodeAt ( i ) ) {
4754 return false ;
4855 }
4956 }
@@ -94,23 +101,54 @@ function isOrKeyword(input: InputStream): boolean {
94101 * Checks if current position starts with a prefix keyword.
95102 */
function startsWithPrefix(input: InputStream): boolean {
  // The current position is offset zero of the offset-aware variant.
  const currentPositionOffset = 0;
  return startsWithPrefixAt(input, currentPositionOffset);
}
106+
/**
 * Reports whether any entry of PREFIXES matches the input beginning at the
 * given offset (relative to the current stream position).
 *
 * @param input - Stream to inspect; only peeked via matchesStringAt.
 * @param offset - Offset at which the candidate prefix keyword would begin.
 * @returns true as soon as one prefix matches, false when none do.
 */
function startsWithPrefixAt(input: InputStream, offset: number): boolean {
  for (const keyword of PREFIXES) {
    if (!matchesStringAt(input, offset, keyword)) {
      continue;
    }
    return true;
  }
  return false;
}
104118
/**
 * Moves forward over consecutive whitespace characters.
 *
 * @param input - Stream to inspect; only peeked.
 * @param offset - Offset at which to start looking.
 * @returns The first offset at or after `offset` whose character is not
 *   whitespace according to isWhitespace.
 */
function skipWhitespace(input: InputStream, offset: number): number {
  let cursor = offset;
  while (isWhitespace(input.peek(cursor))) {
    cursor += 1;
  }
  return cursor;
}
128+
/**
 * Tells whether the character at `offset` is escaped, i.e. is preceded by an
 * odd-length run of consecutive backslashes.
 *
 * @param input - Stream to inspect; only peeked.
 * @param offset - Offset of the character whose escaping is being tested.
 * @returns true when the run of backslashes directly before `offset` has odd
 *   length.
 */
function isEscapedAt(input: InputStream, offset: number): boolean {
  const BACKSLASH_CODE = 92;

  // Each backslash met while walking backwards flips the escaped state, so the
  // final state is the parity of the run length.
  let escaped = false;
  for (let cursor = offset - 1; input.peek(cursor) === BACKSLASH_CODE; cursor--) {
    escaped = !escaped;
  }
  return escaped;
}
144+
145+ /**
146+ * Returns the offset of the closing ')' that matches the '(' at startOffset.
109147 * Handles escaped characters (backslash followed by any character).
110148 */
111- function hasBalancedParensAt ( input : InputStream , startOffset : number ) : boolean {
149+ function findMatchingCloseParenOffset ( input : InputStream , startOffset : number ) : number | null {
112150 if ( input . peek ( startOffset ) !== OPEN_PAREN ) {
113- return false ;
151+ return null ;
114152 }
115153
116154 let offset = startOffset + 1 ;
@@ -131,15 +169,171 @@ function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
131169 } else if ( ch === CLOSE_PAREN ) {
132170 depth -- ;
133171 if ( depth === 0 ) {
134- return true ;
172+ return offset ;
135173 }
136174 }
137175 offset ++ ;
138176 }
139177
178+ return null ;
179+ }
180+
/**
 * Reports whether the '(' at `startOffset` has a matching ')'.
 *
 * This is a boolean view over findMatchingCloseParenOffset: the parens are
 * balanced exactly when that search yields an offset instead of null.
 *
 * @param input - Stream to inspect.
 * @param startOffset - Offset at which the opening '(' is expected.
 * @returns true when a matching close paren was found.
 */
function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
  const closeOffset = findMatchingCloseParenOffset(input, startOffset);
  return closeOffset !== null;
}
190+
/**
 * Scans a single top-level token inside a parenthesized group.
 *
 * A token ends at the first whitespace character seen outside nested parens
 * and outside a quoted section, or at `closeOffset`. A backslash skips itself
 * and the following character. A ')' that has no matching '(' inside the
 * token is consumed as an ordinary character, so the scan always makes
 * progress.
 *
 * @param input - Stream to inspect; only peeked.
 * @param tokenStart - Offset of the token's first character.
 * @param closeOffset - Offset of the ')' that closes the enclosing group.
 * @returns The offset just past the token (may exceed `closeOffset` by one
 *   when a trailing backslash pair straddles it).
 */
function scanRegexGroupTokenEnd(input: InputStream, tokenStart: number, closeOffset: number): number {
  let offset = tokenStart;
  let depth = 0;
  let inQuote = false;

  while (offset < closeOffset) {
    const ch = input.peek(offset);

    if (ch === 92 /* backslash */) {
      offset += 2;
      continue;
    }

    if (inQuote) {
      if (ch === QUOTE) {
        inQuote = false;
      }
      offset++;
      continue;
    }

    if (ch === QUOTE) {
      inQuote = true;
    } else if (ch === OPEN_PAREN) {
      depth++;
    } else if (ch === CLOSE_PAREN) {
      // An unmatched ')' before closeOffset can appear when this quote-aware
      // scan disagrees with the close-paren search about nesting. Stopping on
      // it without consuming it would leave the caller unable to advance, so
      // it is treated as a plain character.
      if (depth > 0) {
        depth--;
      }
    } else if (depth === 0 && isWhitespace(ch)) {
      break;
    }

    offset++;
  }

  return offset;
}

/**
 * Determines whether a balanced parenthesized expression should be treated as
 * query grouping in regex mode. This preserves query constructs like:
 *   (file:a or file:b)
 * while still allowing bare regex atoms like:
 *   (foo|bar)
 *
 * Decision rules, in order:
 *  - no matching ')' for the '(' at `startOffset`: not grouping;
 *  - nothing but whitespace between the parens: grouping;
 *  - more than one whitespace-separated top-level token: grouping;
 *  - a single token that starts with a prefix keyword or a quote: grouping;
 *  - a single token of the form '-' followed by a prefix keyword: grouping;
 *  - a single token that is (optionally '-' then) a nested '(': decided by the
 *    nested group;
 *  - anything else: not grouping.
 *
 * @param input - Stream to inspect; only peeked, never advanced.
 * @param startOffset - Offset of the opening '('.
 * @returns true when the parens should be tokenized as query grouping.
 */
function isRegexQueryGroupingAt(input: InputStream, startOffset: number): boolean {
  const closeOffset = findMatchingCloseParenOffset(input, startOffset);
  if (closeOffset === null) {
    return false;
  }

  const start = skipWhitespace(input, startOffset + 1);
  if (start >= closeOffset) {
    return true; // Empty parens are always grouping syntax
  }

  const end = scanRegexGroupTokenEnd(input, start, closeOffset);

  // Any content after the first token means there are at least two top-level
  // tokens, which is always grouping; no need to scan the rest.
  if (skipWhitespace(input, end) < closeOffset) {
    return true;
  }

  const firstCh = input.peek(start);

  if (startsWithPrefixAt(input, start)) {
    return true;
  }

  if (firstCh === DASH) {
    const afterDash = skipWhitespace(input, start + 1);

    if (startsWithPrefixAt(input, afterDash)) {
      return true;
    }

    if (input.peek(afterDash) === OPEN_PAREN && afterDash < end) {
      return isRegexQueryGroupingAt(input, afterDash);
    }

    return false;
  }

  if (firstCh === QUOTE) {
    return true;
  }

  if (firstCh === OPEN_PAREN && start < end) {
    return isRegexQueryGroupingAt(input, start);
  }

  return false;
}
142299
/**
 * Finds the offset of the '(' that matches the current ')' at offset 0.
 * Handles escaped characters: a paren preceded by an odd number of
 * backslashes is ignored while scanning, and an escaped ')' at offset 0 is
 * not a closing paren at all.
 *
 * @param input - Stream positioned on the candidate ')'; only peeked.
 *   NOTE(review): relies on peek() accepting negative offsets and returning
 *   EOF before the start of the input - confirm against the InputStream docs.
 * @returns The (negative) offset of the matching '(' relative to the current
 *   position, or null when the current character is not an unescaped ')' or
 *   no matching '(' exists before it.
 */
function findMatchingOpenParenOffset(input: InputStream): number | null {
  if (input.next !== CLOSE_PAREN) {
    return null;
  }

  // An escaped ')' is literal text, so it must not be paired with an earlier
  // '(' the way a real closing paren would be.
  if (isEscapedAt(input, 0)) {
    return null;
  }

  let offset = -1;
  let depth = 1;

  while (true) {
    const ch = input.peek(offset);

    if (ch === EOF) {
      return null;
    }

    // Escaped characters never open or close a group.
    if (isEscapedAt(input, offset)) {
      offset--;
      continue;
    }

    if (ch === CLOSE_PAREN) {
      depth++;
    } else if (ch === OPEN_PAREN) {
      depth--;
      if (depth === 0) {
        return offset;
      }
    }

    offset--;
  }
}
336+
143337/**
144338 * Checks if we're currently inside a ParenExpr by looking backwards in the input
145339 * to count unmatched opening parens that likely started a ParenExpr.
@@ -246,15 +440,20 @@ function isInsideParenExpr(input: InputStream, stack: Stack): boolean {
export const parenToken = new ExternalTokenizer((input, stack) => {
  // Only a '(' at the current position can open a group.
  if (input.next !== OPEN_PAREN) return;

  // In regex mode, only treat parens as grouping syntax when the contents
  // clearly look like a query expression. Otherwise they remain part of a
  // regex term, e.g. (foo|bar).
  const regexMode = stack.dialectEnabled(Dialect_regex);
  if (regexMode && !isRegexQueryGroupingAt(input, 0)) return;

  // If unbalanced, don't emit anything - let wordToken handle it.
  if (!hasBalancedParensAt(input, 0)) return;

  // Found balanced parens - emit openParen (just the '(').
  input.advance();
  input.acceptToken(openParen);
});
@@ -268,15 +467,21 @@ export const parenToken = new ExternalTokenizer((input, stack) => {
export const closeParenToken = new ExternalTokenizer((input, stack) => {
  if (input.next !== CLOSE_PAREN) return;

  let emitsCloseParen: boolean;
  if (stack.dialectEnabled(Dialect_regex)) {
    // Regex mode: the ')' closes a group only when its matching '(' exists
    // and that pair qualifies as query grouping.
    const openOffset = findMatchingOpenParenOffset(input);
    emitsCloseParen = openOffset !== null && isRegexQueryGroupingAt(input, openOffset);
  } else {
    // Normal mode: emit closeParen when inside a ParenExpr.
    emitsCloseParen = isInsideParenExpr(input, stack);
  }

  // Otherwise, don't emit - let wordToken handle ')' as part of a word.
  if (!emitsCloseParen) return;

  input.advance();
  input.acceptToken(closeParen);
});
@@ -324,10 +529,16 @@ export const wordToken = new ExternalTokenizer((input, stack) => {
324529 }
325530
326531 // In regex mode: consume all non-whitespace characters as a single word.
327- // Parens and | are valid regex metacharacters, not query syntax in this mode .
532+ // Parens remain part of the word unless they clearly start/end a query group .
328533 if ( stack . dialectEnabled ( Dialect_regex ) ) {
329534 const startPos = input . pos ;
330535 while ( input . next !== EOF && ! isWhitespace ( input . next ) ) {
536+ if ( input . next === CLOSE_PAREN ) {
537+ const matchingOpenOffset = findMatchingOpenParenOffset ( input ) ;
538+ if ( matchingOpenOffset !== null && isRegexQueryGroupingAt ( input , matchingOpenOffset ) ) {
539+ break ;
540+ }
541+ }
331542 input . advance ( ) ;
332543 }
333544 if ( input . pos > startPos ) {
@@ -454,10 +665,8 @@ export const negateToken = new ExternalTokenizer((input, stack) => {
454665 const chAfterDash = input . peek ( offset ) ;
455666
456667 // In normal mode: also check for balanced paren (negated group e.g. -(foo bar))
457- // In regex mode: skip this — parens are not query grouping operators, so emitting
458- // negate before a '(' would leave the parser without a matching ParenExpr to parse.
459- if ( ! stack . dialectEnabled ( Dialect_regex ) ) {
460- if ( chAfterDash === OPEN_PAREN && hasBalancedParensAt ( input , offset ) ) {
668+ if ( chAfterDash === OPEN_PAREN && hasBalancedParensAt ( input , offset ) ) {
669+ if ( ! stack . dialectEnabled ( Dialect_regex ) || isRegexQueryGroupingAt ( input , offset ) ) {
461670 input . advance ( ) ;
462671 input . acceptToken ( negate ) ;
463672 return ;
@@ -492,4 +701,3 @@ export const negateToken = new ExternalTokenizer((input, stack) => {
492701
493702 // Otherwise, don't tokenize as negate (let word handle it)
494703} ) ;
495-
0 commit comments