// Source: sourcebot/packages/queryLanguage/src/tokens.ts at 04354d775a0e842beca0fc799a7e89d5b7ccf99f (sourcebot-dev/sourcebot, GitHub)
import { ExternalTokenizer, InputStream, Stack } from "@lezer/lr";
import { negate, openParen, closeParen, word, or, Dialect_regex } from "./parser.terms";

// Character codes that the tokenizers compare against values read from the input stream.
const SPACE = 32,       // ' '
    TAB = 9,            // '\t'
    NEWLINE = 10,       // '\n'
    QUOTE = 34,         // '"'
    OPEN_PAREN = 40,    // '('
    CLOSE_PAREN = 41,   // ')'
    DASH = 45,          // '-'
    COLON = 58,         // ':'
    EOF = -1;           // sentinel compared against stream reads to detect end of input

// Prefix keywords (including their trailing ':') that the word tokenizer must
// leave alone so they are not swallowed as plain words. Short aliases are
// listed directly after the long form they abbreviate.
const PREFIXES = [
    "archived:",
    "rev:",
    "content:",
    "c:",           // alias of content:
    "context:",
    "file:",
    "f:",           // alias of file:
    "fork:",
    "visibility:",
    "repo:",
    "r:",           // alias of repo:
    "lang:",
    "sym:",
    "reposet:",
];

/**
 * Reports whether a character code is one of the whitespace characters this
 * tokenizer recognizes: space, tab, or newline.
 */
function isWhitespace(ch: number): boolean {
    switch (ch) {
        case SPACE:
        case TAB:
        case NEWLINE:
            return true;
        default:
            return false;
    }
}

/**
 * Reports whether a character code is an ASCII letter, an ASCII digit, or '_'.
 */
function isAlphaNumUnderscore(ch: number): boolean {
    const isUpper = ch >= 65 && ch <= 90;   // 'A'..'Z'
    const isLower = ch >= 97 && ch <= 122;  // 'a'..'z'
    const isDigit = ch >= 48 && ch <= 57;   // '0'..'9'
    const isUnderscore = ch === 95;         // '_'
    return isUpper || isLower || isDigit || isUnderscore;
}

/**
 * Tests whether the characters starting at the stream's current position
 * spell out `str`. Only peeks; the stream position is not moved. Scanning
 * stops at the first mismatching character. An empty `str` always matches.
 */
function matchesString(input: InputStream, str: string): boolean {
    let matched = 0;
    while (matched < str.length && input.peek(matched) === str.charCodeAt(matched)) {
        matched++;
    }
    return matched === str.length;
}

/**
 * Decides whether the text at the current position is an "or" that should be
 * treated as the OR operator rather than as an ordinary word.
 *
 * All of the following must hold:
 * - the next two characters are "or";
 * - the stream is not at position 0 (a leading "or" is a plain term);
 * - the character right after "or" exists and is not a letter, digit or '_'
 *   (so "orange" is not split);
 * - after skipping any whitespace that follows, there is still input left
 *   (a trailing "or" is a plain term).
 *
 * Only peeks; the stream position is not moved.
 */
function isOrKeyword(input: InputStream): boolean {
    const spellsOr = input.peek(0) === 111 /* 'o' */ && input.peek(1) === 114 /* 'r' */;
    if (!spellsOr || input.pos === 0) {
        return false;
    }

    // The character directly after "or" must exist and must not continue an identifier.
    const following = input.peek(2);
    if (following === EOF || isAlphaNumUnderscore(following)) {
        return false;
    }

    // Skip whitespace after "or"; an operator needs a right-hand operand.
    let lookahead = 2;
    while (isWhitespace(input.peek(lookahead))) {
        lookahead++;
    }
    return input.peek(lookahead) !== EOF;
}

/**
 * Reports whether the input at the current position begins with any entry of
 * PREFIXES. Entries are tried in list order and the search stops at the first
 * match.
 */
function startsWithPrefix(input: InputStream): boolean {
    return PREFIXES.some((candidate) => matchesString(input, candidate));
}

/**
 * Determines whether the '(' located `startOffset` characters from the
 * current position has a matching ')' later in the input.
 *
 * Scans forward with peek() only, so the stream position is untouched.
 * A backslash hides the character after it: the pair is skipped and never
 * counted as a paren.
 *
 * @param input - Stream to inspect.
 * @param startOffset - Offset (relative to the current position) of the candidate '('.
 * @returns true if the character at `startOffset` is '(' and its closing ')'
 *          is found before end of input; false otherwise.
 */
function hasBalancedParensAt(input: InputStream, startOffset: number): boolean {
    if (input.peek(startOffset) !== OPEN_PAREN) {
        return false;
    }

    // Number of '(' still waiting for their ')'; starts at 1 for the initial paren.
    let pending = 1;
    let cursor = startOffset + 1;

    for (let ch = input.peek(cursor); ch !== EOF; ch = input.peek(cursor)) {
        if (ch === 92 /* backslash */) {
            // Step over the backslash and whatever it escapes.
            cursor += 2;
            continue;
        }

        if (ch === CLOSE_PAREN && --pending === 0) {
            return true;
        }
        if (ch === OPEN_PAREN) {
            pending++;
        }
        cursor++;
    }

    // Ran out of input before the initial '(' was closed.
    return false;
}

/**
 * Heuristic check for "are we inside a ParenExpr?" that works purely on the
 * text before the current position.
 *
 * Walks backwards (at most 1000 characters) keeping a count of ')' seen that
 * still need a partner. Each '(' that looks like a grouping paren cancels one
 * of them; if such a '(' is found when nothing is waiting, it is unmatched and
 * the function returns true.
 *
 * A '(' is treated as a grouping paren only when the character before it is
 * start-of-input, whitespace, '-' (negation) or ':' (prefix value), or when
 * it directly follows another '(' that itself sits at such a boundary
 * (e.g. "((hello))"). Any other '(' is assumed to belong to a word such as
 * "test()" and is ignored.
 *
 * A character preceded by a single backslash is escaped and skipped; a
 * character preceded by two backslashes is not considered escaped.
 */
function hasUnmatchedOpenParen(input: InputStream): boolean {
    const BACKSLASH = 92;
    const MAX_LOOKBACK = 1000;

    // Characters that may sit directly in front of a grouping '('.
    const isGroupBoundary = (c: number): boolean =>
        c === EOF || isWhitespace(c) || c === DASH || c === COLON;

    // Closing parens seen so far (scanning right-to-left) that still need a '('.
    let closersAwaitingOpen = 0;

    for (let back = -1; back >= -MAX_LOOKBACK; back--) {
        const ch = input.peek(back);
        if (ch === EOF) {
            // Start of input reached. An unmatched grouping '(' would already
            // have returned true below, so nothing is open here.
            return false;
        }

        const before = input.peek(back - 1);

        // Escaped character: a lone backslash in front hides it from paren counting.
        if (before === BACKSLASH && input.peek(back - 2) !== BACKSLASH) {
            continue;
        }

        if (ch === CLOSE_PAREN) {
            closersAwaitingOpen++;
            continue;
        }
        if (ch !== OPEN_PAREN) {
            continue;
        }

        // Grouping paren: at a boundary itself, or nested right after a '('
        // that is at a boundary.
        const startsGroup =
            isGroupBoundary(before) ||
            (before === OPEN_PAREN && isGroupBoundary(input.peek(back - 2)));

        if (startsGroup && --closersAwaitingOpen < 0) {
            // More grouping '(' than ')' behind us: one is still open.
            return true;
        }
    }

    // Lookback limit reached without finding an open group.
    return false;
}

/**
 * Reports whether the tokenizer should consider itself inside a ParenExpr.
 *
 * The parser stack is consulted first: if it can shift `closeParen` right
 * now, the answer is yes. Otherwise the text-based heuristic
 * hasUnmatchedOpenParen() is used, which covers the case where the parser
 * would first have to reduce before it could shift `closeParen`.
 */
function isInsideParenExpr(input: InputStream, stack: Stack): boolean {
    return stack.canShift(closeParen) || hasUnmatchedOpenParen(input);
}

/**
 * External tokenizer for '('.
 *
 * Emits `openParen` (covering just the '(' character) only when the regex
 * dialect is off and the paren has a matching ')' later in the input. In
 * every other case nothing is emitted, which leaves the '(' for wordToken:
 * that is how "(pr" or "func(arg)" end up as single terms while "(foo bar)"
 * becomes a ParenExpr.
 */
export const parenToken = new ExternalTokenizer((input, stack) => {
    const opensGroup =
        input.next === OPEN_PAREN &&
        // In regex mode parens are ordinary word characters.
        !stack.dialectEnabled(Dialect_regex) &&
        hasBalancedParensAt(input, 0);

    if (!opensGroup) {
        return;
    }

    input.advance();
    input.acceptToken(openParen);
});

/**
 * External tokenizer for ')'.
 *
 * Emits `closeParen` when the regex dialect is off and isInsideParenExpr()
 * says we are inside a ParenExpr (either the parser can shift `closeParen`
 * immediately, or the look-behind heuristic finds an open group). Otherwise
 * nothing is emitted and the ')' is left for wordToken to absorb into a word.
 */
export const closeParenToken = new ExternalTokenizer((input, stack) => {
    const closesGroup =
        input.next === CLOSE_PAREN &&
        // In regex mode parens are ordinary word characters.
        !stack.dialectEnabled(Dialect_regex) &&
        isInsideParenExpr(input, stack);

    if (!closesGroup) {
        return;
    }

    input.advance();
    input.acceptToken(closeParen);
});

/**
 * Consumes a backslash escape at the current stream position, if there is one.
 *
 * The backslash is always consumed. The character after it is consumed too,
 * unless it is whitespace or end of input, so an escape never extends a word
 * across whitespace. The effect is that an escaped '(' or ')' is taken as
 * plain word content and is never interpreted as a paren, matching how
 * hasBalancedParensAt() and hasUnmatchedOpenParen() treat escapes.
 *
 * @param input - Stream positioned at the character to inspect.
 * @returns true if the stream was advanced; false if the current character
 *          is not a backslash (the stream is left untouched).
 */
function consumeEscapedChar(input: InputStream): boolean {
    if (input.next !== 92 /* backslash */) {
        return false;
    }
    input.advance(); // the backslash itself
    if (input.next !== EOF && !isWhitespace(input.next)) {
        input.advance(); // the escaped character
    }
    return true;
}

/**
 * External tokenizer for words - allows '(' and ')' when not part of a ParenExpr.
 *
 * Rules:
 * - Don't match if starts with whitespace or EOF
 * - Don't match if starts with a valid quotedString (an opening '"' with a
 *   closing '"' before the next newline / EOF); an unclosed quote is a word
 * - Don't match if starts with the "or" keyword
 * - Don't match if starts with a prefix keyword
 * - In the regex dialect, consume every non-whitespace character as one word
 * - If starts with a balanced '(':
 *     - directly after a ':' (ignoring whitespace) it is a prefix value such
 *       as file:(test|spec); consume it as a word with depth tracking
 *     - otherwise don't match (let parenToken open a ParenExpr)
 * - Otherwise, consume everything up to whitespace, including '(' and ')',
 *   except stop at a ')' when inside a ParenExpr (let closeParenToken handle it)
 * - A backslash escapes the following non-whitespace character: an escaped
 *   '(' or ')' is always word content and never affects paren handling
 */
export const wordToken = new ExternalTokenizer((input, stack) => {
    // Can't start with whitespace or EOF
    if (isWhitespace(input.next) || input.next === EOF) {
        return;
    }

    // Check for valid quoted string (starts with " and has closing ")
    if (input.next === QUOTE) {
        // Look for closing quote
        let offset = 1;
        while (true) {
            const ch = input.peek(offset);
            if (ch === EOF || ch === NEWLINE) break; // Unclosed quote - treat as word
            if (ch === QUOTE) return; // Valid quoted string - let grammar handle it
            if (ch === 92 /* backslash */) offset++; // Skip escaped char
            offset++;
        }
        // Unclosed quote - fall through to treat as word
    }

    // Don't match 'or' keyword (followed by non-alphanumeric)
    if (isOrKeyword(input)) {
        return;
    }

    // Don't match prefix keywords
    if (startsWithPrefix(input)) {
        return;
    }

    // In regex mode: consume all non-whitespace characters as a single word.
    // Parens and | are valid regex metacharacters, not query syntax in this mode.
    if (stack.dialectEnabled(Dialect_regex)) {
        const startPos = input.pos;
        while (input.next !== EOF && !isWhitespace(input.next)) {
            input.advance();
        }
        if (input.pos > startPos) {
            input.acceptToken(word);
        }
        return;
    }

    // If starts with '(' and has balanced parens, determine whether this is a
    // regex alternation value (e.g. file:(test|spec)) or a ParenExpr grouping.
    // We're in a value context when the immediately preceding non-whitespace char
    // is ':', meaning we're right after a prefix keyword. In that case consume the
    // entire '(...)' as a word using depth-tracking so the consuming loop doesn't
    // stop early at ')'. Otherwise defer to parenToken for grouping.
    let inValueParenContext = false;
    if (input.next === OPEN_PAREN && hasBalancedParensAt(input, 0)) {
        let backOffset = -1;
        while (isWhitespace(input.peek(backOffset))) {
            backOffset--;
        }
        if (input.peek(backOffset) === COLON) {
            inValueParenContext = true;
        } else {
            return; // Not a value context — defer to parenToken for grouping
        }
    }

    const startPos = input.pos;

    if (inValueParenContext) {
        // Consume the parenthesized pattern with depth tracking so we consume
        // the matching ')' without stopping early. A ')' at depth 0 means we've
        // hit an outer ParenExpr closing paren — stop without consuming it.
        let depth = 0;
        while (input.next !== EOF) {
            // Escaped characters (notably "\(" and "\)") must not change the
            // depth, otherwise this loop disagrees with hasBalancedParensAt()
            // about where the value ends.
            if (consumeEscapedChar(input)) continue;

            const ch = input.next;
            if (isWhitespace(ch)) break;
            if (ch === OPEN_PAREN) {
                depth++;
            } else if (ch === CLOSE_PAREN) {
                if (depth === 0) break; // outer ParenExpr closing — don't consume
                depth--;
            }
            input.advance();
        }
    } else {
        // Consume characters
        while (input.next !== EOF) {
            // An escaped ')' is word content, never a ParenExpr terminator.
            if (consumeEscapedChar(input)) continue;

            const ch = input.next;

            // Stop at whitespace
            if (isWhitespace(ch)) break;

            // Stop at ')' if closeParenToken would handle it (when inside a ParenExpr)
            // This allows "(hello)" to work while "test)" becomes a single word
            if (ch === CLOSE_PAREN && isInsideParenExpr(input, stack)) {
                break;
            }

            // Don't stop at '(' in the middle - just consume it
            // (balanced paren check at START is handled above)

            input.advance();
        }
    }

    if (input.pos > startPos) {
        input.acceptToken(word);
    }
});

/**
 * External tokenizer for the 'or' keyword.
 *
 * Emits `or` (covering exactly the two characters "or") when isOrKeyword()
 * accepts the current position, i.e. when:
 * 1. It's NOT at the start of input (to treat "or test" as a term)
 * 2. It's followed by a non-alphanumeric, non-underscore character (to avoid "orange")
 * 3. It's NOT at EOF, and not followed only by whitespace (to treat "test or"
 *    as two terms, not an OR expression)
 *
 * The decision is delegated to isOrKeyword() so that this tokenizer and
 * wordToken (which uses the same helper to step aside) cannot drift apart.
 */
export const orToken = new ExternalTokenizer((input) => {
    if (!isOrKeyword(input)) return;

    // Valid OR operator - emit it
    input.advance(); // 'o'
    input.advance(); // 'r'
    input.acceptToken(or);
});

/**
 * External tokenizer for negation ('-').
 *
 * After a '-', any whitespace is skipped and `negate` (covering just the
 * dash) is emitted in two situations:
 *
 * 1. Outside the regex dialect, when what follows is a '(' that has a
 *    matching ')' — a negated group such as -(foo bar). In regex mode parens
 *    are not grouping operators, so this case is skipped there.
 * 2. When a ':' is reached before any whitespace, '(', ')', '"' or end of
 *    input — the shape of a prefix keyword such as -file:foo.
 *    NOTE(review): this only looks for the colon; the text in front of it is
 *    not checked against PREFIXES.
 *
 * In all other cases nothing is emitted, leaving the dash to be tokenized as
 * part of a word.
 */
export const negateToken = new ExternalTokenizer((input, stack) => {
    if (input.next !== DASH) return;

    // Offset of the first non-whitespace character after the dash.
    let after = 1;
    while (isWhitespace(input.peek(after))) {
        after++;
    }

    const emitNegate = (): void => {
        input.advance(); // consume only the '-'
        input.acceptToken(negate);
    };

    // Case 1: negated group (never in regex mode).
    const negatesGroup =
        !stack.dialectEnabled(Dialect_regex) &&
        input.peek(after) === OPEN_PAREN &&
        hasBalancedParensAt(input, after);
    if (negatesGroup) {
        emitNegate();
        return;
    }

    // Case 2: scan ahead for a ':' before any delimiter.
    for (let scan = after; ; scan++) {
        const ch = input.peek(scan);
        if (ch === COLON) {
            emitNegate();
            return;
        }
        const isDelimiter =
            ch === EOF ||
            isWhitespace(ch) ||
            ch === OPEN_PAREN ||
            ch === CLOSE_PAREN ||
            ch === QUOTE;
        if (isDelimiter) {
            // Not a prefix keyword — let the word tokenizer take the dash.
            return;
        }
    }
});