Skip to content

Commit 3ea62a8

Browse files
h3n4l and claude committed
feat(mongodb): add MongoDB Shell (mongosh) parser
Add ANTLR grammar for parsing MongoDB shell syntax with Go code generation. Supported features: - Shell commands: show dbs, show databases, show collections - Database statements: db.collection.method(...) with method chains - Collection access: dot notation, bracket notation, getCollection() - Read methods: find(), findOne() - Cursor modifiers: sort(), limit(), skip(), projection(), project() - Helper functions as distinct AST nodes: ObjectId(), ISODate(), Date(), UUID(), Long(), NumberLong(), Int32(), NumberInt(), Double(), Decimal128(), NumberDecimal(), Timestamp(), RegExp() - Document syntax: unquoted/quoted keys, nested documents, arrays, trailing commas - Regex literals: /pattern/flags - Literals: strings (single/double quoted), numbers, booleans, null - Comments: line (//) and block (/* */) The 'new' keyword is intentionally not supported for helper functions. When users write 'new ObjectId()', the parser generates a helpful error message using ANTLR's NotifyErrorListeners mechanism. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 12c6f17 commit 3ea62a8

21 files changed

Lines changed: 10683 additions & 0 deletions

mongodb/Makefile

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
# Default target: regenerate the parser sources, then run the parser tests.
all: build test

# Generate the Go lexer, parser and visitor from the two grammars into the
# current directory, under Go package "mongodb".
build:
	@echo "Building MongoDB Shell parser..."
	antlr -Dlanguage=Go -package mongodb -visitor -o . MongoShellLexer.g4 MongoShellParser.g4

# Run only the tests whose name matches TestMongoShellParser.
test:
	go test -v -run TestMongoShellParser

# None of these targets produce a file of the same name.
.PHONY: all build test

mongodb/MongoShellLexer.g4

Lines changed: 132 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,132 @@
1+
/*
 * MongoDB Shell (mongosh) Lexer Grammar
 * For use with ANTLR 4
 *
 * Disambiguation notes:
 *  - ANTLR lexers take the longest match; when two rules match the same
 *    text, the rule listed first wins. Keyword and helper-name rules must
 *    therefore stay above IDENTIFIER, and the comment rules above
 *    REGEX_LITERAL.
 *  - Comments and whitespace are sent to the HIDDEN channel, so the parser
 *    never sees them.
 */

lexer grammar MongoShellLexer;

// Keywords
SHOW: 'show';
DBS: 'dbs';
DATABASES: 'databases';
COLLECTIONS: 'collections';
DB: 'db';
NEW: 'new';
TRUE: 'true';
FALSE: 'false';
NULL: 'null';
GET_COLLECTION: 'getCollection';
GET_COLLECTION_NAMES: 'getCollectionNames';

// Helper function names (recognized as distinct tokens)
OBJECT_ID: 'ObjectId';
ISO_DATE: 'ISODate';
DATE: 'Date';
UUID: 'UUID';
LONG: 'Long';
NUMBER_LONG: 'NumberLong';
INT32: 'Int32';
NUMBER_INT: 'NumberInt';
DOUBLE: 'Double';
DECIMAL128: 'Decimal128';
NUMBER_DECIMAL: 'NumberDecimal';
TIMESTAMP: 'Timestamp';
REG_EXP: 'RegExp';

// Read methods (find, findOne) and cursor modifiers
FIND: 'find';
FIND_ONE: 'findOne';
SORT: 'sort';
LIMIT: 'limit';
SKIP_: 'skip';
PROJECTION: 'projection';
PROJECT: 'project';

// Punctuation
LPAREN: '(';
RPAREN: ')';
LBRACE: '{';
RBRACE: '}';
LBRACKET: '[';
RBRACKET: ']';
COLON: ':';
COMMA: ',';
DOT: '.';
SEMI: ';';

// A lone '$'. Operator names such as $gt or $lt are NOT split into
// DOLLAR + IDENTIFIER: IDENTIFIER accepts a leading '$' and is the longer
// match, so DOLLAR is only produced when '$' is not followed by an
// identifier character.
DOLLAR: '$';

// Comments - must come before REGEX_LITERAL to properly capture /* ... */
LINE_COMMENT
    : '//' ~[\r\n]* -> channel(HIDDEN)
    ;

BLOCK_COMMENT
    : '/*' .*? '*/' -> channel(HIDDEN)
    ;

// Regex literal: /pattern/flags (single line, non-empty pattern)
REGEX_LITERAL
    : '/' REGEX_BODY '/' REGEX_FLAGS?
    ;

// The body may not begin with '*'. Without this restriction a one-line block
// comment directly followed by a flag letter (for example a comment followed
// immediately by "show") would lex as a regex literal, because that match is
// longer than BLOCK_COMMENT. ECMAScript applies the same first-character rule.
fragment REGEX_BODY
    : REGEX_FIRST_CHAR REGEX_CHAR*
    ;

fragment REGEX_FIRST_CHAR
    : ~[*/\r\n\\]
    | '\\' .
    ;

// Any character except '/', a line break or a bare backslash, or a
// backslash followed by any single character (e.g. an escaped '/').
fragment REGEX_CHAR
    : ~[/\r\n\\]
    | '\\' .
    ;

fragment REGEX_FLAGS
    : [gimsuy]+
    ;

// Numbers: optional leading '-', then either an integer with optional
// fraction or a leading-dot fraction, each with an optional exponent.
NUMBER
    : '-'? INT ('.' [0-9]+)? EXPONENT?
    | '-'? '.' [0-9]+ EXPONENT?
    ;

// Integer part without superfluous leading zeros.
fragment INT
    : '0'
    | [1-9] [0-9]*
    ;

fragment EXPONENT
    : [eE] [+-]? [0-9]+
    ;

// Strings - both single and double quoted
DOUBLE_QUOTED_STRING
    : '"' (ESC | ~["\\])* '"'
    ;

SINGLE_QUOTED_STRING
    : '\'' (ESC | ~['\\])* '\''
    ;

// Accepted escapes: \" \\ \/ \b \f \n \r \t \' and \uXXXX.
// Any other backslash sequence makes the string fail to tokenize.
fragment ESC
    : '\\' (["\\/bfnrt] | UNICODE | '\'')
    ;

fragment UNICODE
    : 'u' HEX HEX HEX HEX
    ;

fragment HEX
    : [0-9a-fA-F]
    ;

// Identifiers - for unquoted keys, collection names, method names
// Allows $-prefixed identifiers for MongoDB operators like $gt, $in, etc.
// Must stay below every keyword rule so keywords win same-length ties.
IDENTIFIER
    : [$_a-zA-Z] [$_a-zA-Z0-9]*
    ;

// Whitespace
WS
    : [ \t\r\n]+ -> channel(HIDDEN)
    ;

mongodb/MongoShellParser.g4

Lines changed: 255 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,255 @@
1+
/*
 * MongoDB Shell (mongosh) Parser Grammar
 * For use with ANTLR 4
 *
 * Supports MVP read operations:
 * - Shell commands: show dbs, show databases, show collections
 * - Database statements: db.collection.method(...)
 * - Read methods: find(), findOne()
 * - Cursor modifiers: sort(), limit(), skip(), projection(), project()
 * - Helper functions: ObjectId(), ISODate(), UUID(), Long(), etc.
 * - Document syntax with unquoted keys and trailing commas
 *
 * The embedded actions call p.NotifyErrorListeners, i.e. they are written
 * for the Go target.
 */

parser grammar MongoShellParser;

options { tokenVocab=MongoShellLexer; }

// Entry point - a program can contain multiple statements
program
    : statement* EOF
    ;

// A statement is either a shell command or a database statement;
// the terminating semicolon is optional.
statement
    : shellCommand SEMI?
    | dbStatement SEMI?
    ;

// Shell commands: show dbs, show databases, show collections
shellCommand
    : SHOW (DBS | DATABASES)    # showDatabases
    | SHOW COLLECTIONS          # showCollections
    ;

// Database statements: db.collection.method(...) or db.getCollectionNames()
dbStatement
    : DB DOT GET_COLLECTION_NAMES LPAREN RPAREN methodChain?    # getCollectionNames
    | DB collectionAccess methodChain                           # collectionOperation
    ;

// Collection access patterns: db.name, db["name"], db.getCollection("name")
collectionAccess
    : DOT identifier                                    # dotAccess
    | LBRACKET stringLiteral RBRACKET                   # bracketAccess
    | DOT GET_COLLECTION LPAREN stringLiteral RPAREN    # getCollectionAccess
    ;

// Method chain: one or more method calls chained with dots
methodChain
    : DOT methodCall (DOT methodCall)*
    ;

// Method call: methodName(arguments?)
// The typed alternatives come first; genericMethod also accepts the same
// method names (identifier includes the keyword tokens), so calls whose
// argument shape does not fit a typed rule still parse as genericMethod.
methodCall
    : findMethod
    | findOneMethod
    | sortMethod
    | limitMethod
    | skipMethod
    | projectionMethod
    | genericMethod
    ;

// Specific method rules for better AST structure
findMethod
    : FIND LPAREN argument? RPAREN
    ;

findOneMethod
    : FIND_ONE LPAREN argument? RPAREN
    ;

sortMethod
    : SORT LPAREN document RPAREN
    ;

limitMethod
    : LIMIT LPAREN NUMBER RPAREN
    ;

skipMethod
    : SKIP_ LPAREN NUMBER RPAREN
    ;

projectionMethod
    : (PROJECTION | PROJECT) LPAREN document RPAREN
    ;

// Generic method for extensibility (other methods will be caught here)
genericMethod
    : identifier LPAREN arguments? RPAREN
    ;

// Arguments: comma-separated list of values, trailing comma allowed
arguments
    : argument (COMMA argument)* COMMA?
    ;

argument
    : value
    ;

// Document: { key: value, ... } with optional trailing comma
document
    : LBRACE (pair (COMMA pair)* COMMA?)? RBRACE
    ;

// Key-value pair
pair
    : key COLON value
    ;

// Key: can be unquoted identifier or quoted string
key
    : identifier        # unquotedKey
    | stringLiteral     # quotedKey
    ;

// Value: document, array, helper function, regex, or literal
value
    : document              # documentValue
    | array                 # arrayValue
    | helperFunction        # helperValue
    | REGEX_LITERAL         # regexLiteralValue
    | regExpConstructor     # regexpConstructorValue
    | literal               # literalValue
    ;

// Array: [ value, ... ] with optional trailing comma
array
    : LBRACKET (value (COMMA value)* COMMA?)? RBRACKET
    ;

// Helper functions - each is a distinct node type for easy AST walking
// Note: 'new' keyword is not supported. Use ObjectId(), ISODate(), Date() directly.
//
// The "NEW X" alternatives below exist only to report a targeted error.
// The action fires right after the constructor name (so the error is
// reported at the same position as before); the optional argument list
// that follows is then consumed so that it does not trigger a second,
// generic "extraneous input '('" error and token-skipping recovery.
helperFunction
    : objectIdHelper
    | isoDateHelper
    | dateHelper
    | uuidHelper
    | longHelper
    | int32Helper
    | doubleHelper
    | decimal128Helper
    | timestampHelper
    ;

// ObjectId("hex") or ObjectId()
objectIdHelper
    : OBJECT_ID LPAREN stringLiteral? RPAREN
    | NEW OBJECT_ID { p.NotifyErrorListeners("'new' keyword is not supported. Use ObjectId() directly", nil, nil) }
      (LPAREN stringLiteral? RPAREN)?
    ;

// ISODate("iso-string") or ISODate()
isoDateHelper
    : ISO_DATE LPAREN stringLiteral? RPAREN
    | NEW ISO_DATE { p.NotifyErrorListeners("'new' keyword is not supported. Use ISODate() directly", nil, nil) }
      (LPAREN stringLiteral? RPAREN)?
    ;

// Date() or Date("string") or Date(timestamp)
dateHelper
    : DATE LPAREN (stringLiteral | NUMBER)? RPAREN
    | NEW DATE { p.NotifyErrorListeners("'new' keyword is not supported. Use Date() directly", nil, nil) }
      (LPAREN (stringLiteral | NUMBER)? RPAREN)?
    ;

// UUID("uuid-string")
uuidHelper
    : UUID LPAREN stringLiteral RPAREN
    ;

// Long(n), Long("n"), NumberLong(n), NumberLong("n")
longHelper
    : (LONG | NUMBER_LONG) LPAREN (NUMBER | stringLiteral) RPAREN
    ;

// Int32(n), NumberInt(n)
int32Helper
    : (INT32 | NUMBER_INT) LPAREN NUMBER RPAREN
    ;

// Double(n)
doubleHelper
    : DOUBLE LPAREN NUMBER RPAREN
    ;

// Decimal128("n"), NumberDecimal("n")
decimal128Helper
    : (DECIMAL128 | NUMBER_DECIMAL) LPAREN stringLiteral RPAREN
    ;

// Timestamp({t: n, i: n}) or Timestamp(t, i)
timestampHelper
    : TIMESTAMP LPAREN document RPAREN                  # timestampDocHelper
    | TIMESTAMP LPAREN NUMBER COMMA NUMBER RPAREN       # timestampArgsHelper
    ;

// RegExp("pattern") or RegExp("pattern", "flags") constructor
regExpConstructor
    : REG_EXP LPAREN stringLiteral (COMMA stringLiteral)? RPAREN
    | NEW REG_EXP { p.NotifyErrorListeners("'new' keyword is not supported. Use RegExp() directly", nil, nil) }
      (LPAREN stringLiteral (COMMA stringLiteral)? RPAREN)?
    ;

// Literals
literal
    : stringLiteral     # stringLiteralValue
    | NUMBER            # numberLiteral
    | TRUE              # trueLiteral
    | FALSE             # falseLiteral
    | NULL              # nullLiteral
    ;

// String literal - both single and double quoted
stringLiteral
    : DOUBLE_QUOTED_STRING
    | SINGLE_QUOTED_STRING
    ;

// Identifier - used for unquoted keys, collection names, method names
// Includes MongoDB operators like $gt, $in, etc.
// Note: "$gt" reaches the parser as one IDENTIFIER token (the lexer's
// IDENTIFIER rule accepts a leading '$'); the DOLLAR IDENTIFIER alternative
// only applies when '$' arrives as a separate token.
identifier
    : IDENTIFIER
    | DOLLAR IDENTIFIER
    // Keywords that can also be used as identifiers
    | SHOW
    | DBS
    | DATABASES
    | COLLECTIONS
    | DB
    | NEW
    | TRUE
    | FALSE
    | NULL
    | FIND
    | FIND_ONE
    | SORT
    | LIMIT
    | SKIP_
    | PROJECTION
    | PROJECT
    | GET_COLLECTION
    | GET_COLLECTION_NAMES
    | OBJECT_ID
    | ISO_DATE
    | DATE
    | UUID
    | LONG
    | NUMBER_LONG
    | INT32
    | NUMBER_INT
    | DOUBLE
    | DECIMAL128
    | NUMBER_DECIMAL
    | TIMESTAMP
    | REG_EXP
    ;

0 commit comments

Comments
 (0)