Skip to content

Commit abeb08d

Browse files
committed
initial commit for grammar API
Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent 8073b4e commit abeb08d

11 files changed

Lines changed: 943 additions & 8 deletions

File tree

async-query-core/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ configurations {
4343
}
4444

4545
dependencies {
46-
antlr "org.antlr:antlr4:4.7.1"
46+
antlr "org.antlr:antlr4:4.13.2"
4747

4848
implementation project(':core')
4949
implementation 'org.json:json:20231013'

common/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ repositories {
3333
}
3434

3535
dependencies {
36-
api "org.antlr:antlr4-runtime:4.7.1"
36+
api "org.antlr:antlr4-runtime:4.13.2"
3737
api group: 'com.google.guava', name: 'guava', version: "${guava_version}"
3838
api group: 'org.apache.logging.log4j', name: 'log4j-core', version:"${versions.log4j}"
3939
api group: 'org.apache.commons', name: 'commons-lang3', version: "${commons_lang3_version}"
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.executor.autocomplete;
7+
8+
import java.util.Map;
9+
import lombok.Builder;
10+
import lombok.Data;
11+
12+
/**
13+
* Autocomplete artifact bundle containing everything needed for client-side grammar-based
14+
* autocomplete.
15+
*
16+
* <p>This bundle is language-agnostic and can be used for PPL, SQL, or any ANTLR-based language. It
17+
* contains:
18+
*
19+
* <ul>
20+
* <li>Serialized ATN data for lexer and parser (for antlr4ng runtime)
21+
* <li>Vocabulary and rule names (for token/rule interpretation)
22+
* <li>Static catalogs (commands, functions, keywords, snippets)
23+
* <li>Token classification mapping (for suggestion categorization)
24+
* </ul>
25+
*
26+
* <p>Frontend uses this bundle to:
27+
*
28+
* <ol>
29+
* <li>Deserialize ATNs with antlr4ng
30+
* <li>Create LexerInterpreter and ParserInterpreter
31+
* <li>Use antlr4-c3 to find valid tokens at cursor
32+
* <li>Generate suggestions from catalogs
33+
* </ol>
34+
*/
35+
@Data
36+
@Builder
37+
public class AutocompleteArtifact {
38+
39+
// ============================================================================
40+
// Identity & versioning
41+
// ============================================================================
42+
43+
/** Bundle version (increment when format changes) */
44+
private String bundleVersion;
45+
46+
/**
47+
* Hash of grammar sources + ANTLR version. Used for cache validation via ETag. Format:
48+
* "sha256:abc123..."
49+
*/
50+
private String grammarHash;
51+
52+
// ============================================================================
53+
// Lexer ATN & metadata
54+
// ============================================================================
55+
56+
/**
57+
* Serialized lexer ATN as int array. Frontend uses directly: new
58+
* ATNDeserializer().deserialize(lexerSerializedATN)
59+
*/
60+
private int[] lexerSerializedATN;
61+
62+
/** Lexer rule names (e.g., ["SEARCH", "WHERE", "PIPE", ...]) */
63+
private String[] lexerRuleNames;
64+
65+
/** Channel names (e.g., ["DEFAULT_TOKEN_CHANNEL", "WHITESPACE", "ERRORCHANNEL"]) */
66+
private String[] channelNames;
67+
68+
/** Mode names (e.g., ["DEFAULT_MODE"]) */
69+
private String[] modeNames;
70+
71+
// ============================================================================
72+
// Parser ATN & metadata
73+
// ============================================================================
74+
75+
/**
76+
* Serialized parser ATN as int array. Frontend uses directly: new
77+
* ATNDeserializer().deserialize(parserSerializedATN)
78+
*/
79+
private int[] parserSerializedATN;
80+
81+
/** Parser rule names (e.g., ["root", "pplStatement", "commands", ...]) */
82+
private String[] parserRuleNames;
83+
84+
/** Start rule index (usually 0 for "root" rule) */
85+
private int startRuleIndex;
86+
87+
// ============================================================================
88+
// Vocabulary
89+
// ============================================================================
90+
91+
/**
92+
* Literal names from vocabulary. Index = token type. Values are literal tokens with quotes, or
93+
* null. Example: ["<INVALID>", "'search'", "'where'", "'|'", null, null, ...]
94+
*/
95+
private String[] literalNames;
96+
97+
/**
98+
* Symbolic names from vocabulary. Index = token type. Values are token symbolic names, or null.
99+
* Example: ["<INVALID>", "SEARCH", "WHERE", "PIPE", "ID", "INTEGER", ...]
100+
*/
101+
private String[] symbolicNames;
102+
103+
/**
104+
* Optional display names (user-friendly token names). If not provided, frontend uses literal or
105+
* symbolic names.
106+
*/
107+
private String[] displayNames;
108+
109+
// ============================================================================
110+
// Token classification
111+
// ============================================================================
112+
113+
/**
114+
* Mapping from token symbolic name to suggestion category. Used by frontend to classify antlr4-c3
115+
* token candidates into suggestion types.
116+
*
117+
* <p>Example: { "SEARCH": "COMMAND", "WHERE": "COMMAND", "BY": "KEYWORD", "COUNT": "FUNCTION",
118+
* "AND": "OPERATOR" }
119+
*/
120+
private Map<String, String> tokenTypeToCategory;
121+
}
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.executor.autocomplete;
7+
8+
import java.nio.charset.StandardCharsets;
9+
import java.security.MessageDigest;
10+
import java.security.NoSuchAlgorithmException;
11+
import lombok.extern.log4j.Log4j2;
12+
import org.antlr.v4.runtime.Lexer;
13+
import org.antlr.v4.runtime.Parser;
14+
import org.antlr.v4.runtime.Vocabulary;
15+
16+
/**
17+
* Utility class for extracting ANTLR grammar artifacts (ATN, vocabulary, rule names) from generated
18+
* parser/lexer classes.
19+
*
20+
* <p>This class handles the low-level details of:
21+
*
22+
* <ul>
23+
* <li>Converting ANTLR's Java String ATN format to int[] for JSON transfer
24+
* <li>Extracting vocabulary (literal and symbolic names)
25+
* <li>Extracting rule names via public ANTLR APIs
26+
* <li>Computing grammar hash for versioning
27+
* </ul>
28+
*
29+
* <p>Language-specific builders (PPL, SQL) use this class to build their autocomplete bundles.
30+
*/
31+
@Log4j2
32+
public class GrammarArtifactBuilder {
33+
34+
/**
35+
* Extract literal names from vocabulary.
36+
*
37+
* <p>Returns array where index = token type, value = literal token (with quotes) or null.
38+
*
39+
* <p>Example: ["<INVALID>", "'search'", "'where'", "'|'", null, ...]
40+
*
41+
* @param vocabulary Parser vocabulary
42+
* @return Array of literal names
43+
*/
44+
public static String[] extractLiteralNames(Vocabulary vocabulary) {
45+
int maxTokenType = vocabulary.getMaxTokenType();
46+
String[] names = new String[maxTokenType + 1];
47+
48+
for (int i = 0; i <= maxTokenType; i++) {
49+
String literal = vocabulary.getLiteralName(i);
50+
// Keep nulls as nulls (no literal representation)
51+
names[i] = literal;
52+
}
53+
54+
log.debug("Extracted {} literal names", names.length);
55+
return names;
56+
}
57+
58+
/**
59+
* Extract symbolic names from vocabulary.
60+
*
61+
* <p>Returns array where index = token type, value = symbolic token name or null.
62+
*
63+
* <p>Example: ["<INVALID>", "SEARCH", "WHERE", "PIPE", "ID", ...]
64+
*
65+
* @param vocabulary Parser vocabulary
66+
* @return Array of symbolic names
67+
*/
68+
public static String[] extractSymbolicNames(Vocabulary vocabulary) {
69+
int maxTokenType = vocabulary.getMaxTokenType();
70+
String[] names = new String[maxTokenType + 1];
71+
72+
for (int i = 0; i <= maxTokenType; i++) {
73+
String symbolic = vocabulary.getSymbolicName(i);
74+
// Keep nulls as nulls (no symbolic name)
75+
names[i] = symbolic;
76+
}
77+
78+
log.debug("Extracted {} symbolic names", names.length);
79+
return names;
80+
}
81+
82+
/**
83+
* Extract rule names from parser.
84+
*
85+
* <p>Parser.getRuleNames() is public API.
86+
*
87+
* @param parser Parser instance
88+
* @return Array of rule names
89+
*/
90+
public static String[] extractParserRuleNames(Parser parser) {
91+
String[] ruleNames = parser.getRuleNames();
92+
log.debug("Extracted {} parser rule names", ruleNames.length);
93+
return ruleNames;
94+
}
95+
96+
/**
97+
* Extract rule names from lexer.
98+
*
99+
* <p>Lexer.getRuleNames() is public API (Lexer extends Recognizer).
100+
*
101+
* @param lexer Lexer instance
102+
* @return Array of lexer rule names
103+
*/
104+
public static String[] extractLexerRuleNames(Lexer lexer) {
105+
String[] ruleNames = lexer.getRuleNames();
106+
log.debug("Extracted {} lexer rule names", ruleNames.length);
107+
return ruleNames;
108+
}
109+
110+
/**
111+
* Extract channel names from lexer.
112+
*
113+
* <p>ANTLR 4.x exposes channel names via getChannelNames() method in generated lexers.
114+
* This method dynamically extracts the actual channel names from the lexer instance.
115+
*
116+
* @param lexer Lexer instance
117+
* @return Array of channel names
118+
*/
119+
public static String[] extractChannelNames(Lexer lexer) {
120+
String[] channelNames = lexer.getChannelNames();
121+
log.debug("Extracted {} channel names from lexer", channelNames.length);
122+
return channelNames;
123+
}
124+
125+
/**
126+
* Extract mode names from lexer.
127+
*
128+
* <p>ANTLR 4.x exposes mode names via getModeNames() method in generated lexers.
129+
* This method dynamically extracts the actual mode names from the lexer instance.
130+
*
131+
* @param lexer Lexer instance
132+
* @return Array of mode names
133+
*/
134+
public static String[] extractModeNames(Lexer lexer) {
135+
String[] modeNames = lexer.getModeNames();
136+
log.debug("Extracted {} mode names from lexer", modeNames.length);
137+
return modeNames;
138+
}
139+
140+
/**
141+
* Compute grammar hash from ATN data (recommended).
142+
*
143+
* <p>This method hashes the serialized ATN arrays directly, which:
144+
*
145+
* <ul>
146+
* <li>Always available at runtime (no classpath dependencies)
147+
* <li>Reflects the actual artifact being served
148+
* <li>Changes when grammar changes (ATN structure changes)
149+
* </ul>
150+
*
151+
* @param lexerATN Serialized lexer ATN as int array
152+
* @param parserATN Serialized parser ATN as int array
153+
* @param antlrVersion ANTLR tool version (e.g., "4.13.2")
154+
* @return Hash string in format "sha256:abc123..."
155+
*/
156+
public static String computeGrammarHash(int[] lexerATN, int[] parserATN, String antlrVersion) {
157+
try {
158+
MessageDigest digest = MessageDigest.getInstance("SHA-256");
159+
160+
// Hash lexer ATN data
161+
for (int value : lexerATN) {
162+
digest.update((byte) (value >> 8));
163+
digest.update((byte) value);
164+
}
165+
166+
// Hash parser ATN data
167+
for (int value : parserATN) {
168+
digest.update((byte) (value >> 8));
169+
digest.update((byte) value);
170+
}
171+
172+
// Hash ANTLR version to detect generator changes
173+
digest.update(antlrVersion.getBytes(StandardCharsets.UTF_8));
174+
175+
// Compute hash
176+
byte[] hashBytes = digest.digest();
177+
String result = "sha256:" + bytesToHex(hashBytes);
178+
179+
log.info("Computed grammar hash from ATN data: {}", result);
180+
return result;
181+
182+
} catch (NoSuchAlgorithmException e) {
183+
// SHA-256 is required by Java specification, this should never happen
184+
throw new IllegalStateException("SHA-256 algorithm not available", e);
185+
}
186+
}
187+
188+
189+
/**
190+
* Convert byte array to hex string.
191+
*
192+
* @param bytes Input bytes
193+
* @return Hex string (lowercase)
194+
*/
195+
private static String bytesToHex(byte[] bytes) {
196+
StringBuilder sb = new StringBuilder(bytes.length * 2);
197+
for (byte b : bytes) {
198+
sb.append(String.format("%02x", b & 0xFF));
199+
}
200+
return sb.toString();
201+
}
202+
}

legacy/build.gradle

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ dependencies {
120120
api project(':opensearch')
121121

122122
// ANTLR gradle plugin and runtime dependency
123-
antlr "org.antlr:antlr4:4.7.1"
124-
implementation "org.antlr:antlr4-runtime:4.7.1"
123+
antlr "org.antlr:antlr4:4.13.2"
124+
implementation "org.antlr:antlr4-runtime:4.13.2"
125125
compileOnly group: 'javax.servlet', name: 'servlet-api', version:'2.5'
126126

127127
testImplementation group: 'org.hamcrest', name: 'hamcrest-core', version:'2.2'

plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
import org.opensearch.sql.opensearch.storage.OpenSearchDataSourceFactory;
9595
import org.opensearch.sql.opensearch.storage.script.CompoundedScriptEngine;
9696
import org.opensearch.sql.plugin.config.OpenSearchPluginModule;
97+
import org.opensearch.sql.plugin.rest.RestPPLGrammarAction;
9798
import org.opensearch.sql.plugin.rest.RestPPLQueryAction;
9899
import org.opensearch.sql.plugin.rest.RestPPLStatsAction;
99100
import org.opensearch.sql.plugin.rest.RestQuerySettingsAction;
@@ -163,6 +164,7 @@ public List<RestHandler> getRestHandlers(
163164

164165
return Arrays.asList(
165166
new RestPPLQueryAction(),
167+
new RestPPLGrammarAction(),
166168
new RestSqlAction(settings, injector),
167169
new RestSqlStatsAction(settings, restController),
168170
new RestPPLStatsAction(settings, restController),

0 commit comments

Comments
 (0)