Skip to content

Commit 76e9c78

Browse files
committed
adjusting ignore token set to be lexical/internal only
Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent c803ab0 commit 76e9c78

2 files changed

Lines changed: 132 additions & 58 deletions

File tree

ppl/src/main/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilder.java

Lines changed: 33 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,22 @@ public class PPLGrammarBundleBuilder {
2727
private static final String ANTLR_VERSION =
2828
org.antlr.v4.runtime.RuntimeMetaData.getRuntimeVersion();
2929
private static final String BUNDLE_VERSION = "1.0";
30+
private static final Set<String> INTERNAL_NON_LITERAL_TOKENS =
31+
new HashSet<>(
32+
Arrays.asList(
33+
"ID",
34+
"NUMERIC_ID",
35+
"ID_DATE_SUFFIX",
36+
"CLUSTER",
37+
"TIME_SNAP",
38+
"SPANLENGTH",
39+
"DECIMAL_SPANLENGTH",
40+
"DQUOTA_STRING",
41+
"SQUOTA_STRING",
42+
"BQUOTA_STRING",
43+
"LINE_COMMENT",
44+
"BLOCK_COMMENT",
45+
"ERROR_RECOGNITION"));
3046

3147
public GrammarBundle build() {
3248
OpenSearchPPLLexer lexer = new OpenSearchPPLLexer(CharStreams.fromString(""));
@@ -59,7 +75,7 @@ public GrammarBundle build() {
5975
.literalNames(literalNames)
6076
.symbolicNames(symbolicNames)
6177
.tokenDictionary(buildTokenDictionary(vocabulary))
62-
.ignoredTokens(buildIgnoredTokens())
78+
.ignoredTokens(buildIgnoredTokens(vocabulary))
6379
.rulesToVisit(buildRulesToVisit(parser.getRuleNames()))
6480
.build();
6581
}
@@ -90,65 +106,32 @@ private static Map<String, Integer> buildTokenDictionary(Vocabulary vocabulary)
90106
}
91107

92108
/**
93-
* Build the list of token type IDs to ignore for autocomplete. Mirrors the frontend
94-
* getIgnoredTokens() logic: explicitly ignore AS/IN, then ignore two contiguous token ranges
95-
* minus operatorsToInclude.
109+
* Build token type IDs to ignore for autocomplete.
96110
*
97-
* <p>Range 1 (relevance/internal tokens): MATCH .. ERROR_RECOGNITION — covers relevance
98-
* functions, search parameters, span literals, IDs, quoted strings, and error tokens.
99-
*
100-
* <p>Range 2 (keywords/functions/operators): CASE .. CAST — covers CASE/ELSE, IN, EXISTS,
101-
* NOT/OR/AND/XOR, TRUE/FALSE, REGEXP, datetime parts, data type keywords, punctuation,
102-
* aggregate functions, math/text/date functions, and CAST.
103-
*
104-
* <p>Tokens in {@code operatorsToInclude} are kept as suggestions even if they fall within
105-
* an ignored range.
111+
* <p>Only lexical/internal tokens are ignored (identifiers, literals, quoted-string tokens,
112+
* comments, and error token). User-facing commands/functions/operators are intentionally kept so
113+
* completion dynamically reflects grammar changes.
106114
*/
107-
private static int[] buildIgnoredTokens() {
108-
// Verify range boundaries match expected token IDs. If the grammar changes and
109-
// shifts token ordinals, these assertions surface the problem at build time.
110-
assert OpenSearchPPLParser.MATCH == 427
111-
: "MATCH token ID shifted — update ignored range start";
112-
assert OpenSearchPPLParser.ERROR_RECOGNITION == 488
113-
: "ERROR_RECOGNITION token ID shifted — update ignored range end";
114-
assert OpenSearchPPLParser.CASE == 142
115-
: "CASE token ID shifted — update ignored range start";
116-
assert OpenSearchPPLParser.CAST == 387
117-
: "CAST token ID shifted — update ignored range end";
118-
119-
Set<Integer> operatorsToInclude = new HashSet<>(Arrays.asList(
120-
OpenSearchPPLParser.PIPE, OpenSearchPPLParser.EQUAL, OpenSearchPPLParser.COMMA,
121-
OpenSearchPPLParser.NOT_EQUAL, OpenSearchPPLParser.LESS, OpenSearchPPLParser.NOT_LESS,
122-
OpenSearchPPLParser.GREATER, OpenSearchPPLParser.NOT_GREATER,
123-
OpenSearchPPLParser.OR, OpenSearchPPLParser.AND,
124-
OpenSearchPPLParser.LT_PRTHS, OpenSearchPPLParser.RT_PRTHS,
125-
OpenSearchPPLParser.SPAN,
126-
OpenSearchPPLParser.MATCH, OpenSearchPPLParser.MATCH_PHRASE,
127-
OpenSearchPPLParser.MATCH_BOOL_PREFIX, OpenSearchPPLParser.MATCH_PHRASE_PREFIX,
128-
OpenSearchPPLParser.SQUOTA_STRING
129-
));
130-
115+
private static int[] buildIgnoredTokens(Vocabulary vocabulary) {
131116
List<Integer> ignored = new ArrayList<>();
132-
ignored.add(OpenSearchPPLParser.AS);
133-
ignored.add(OpenSearchPPLParser.IN);
134117

135-
// Range 1: MATCH .. ERROR_RECOGNITION
136-
for (int i = OpenSearchPPLParser.MATCH; i <= OpenSearchPPLParser.ERROR_RECOGNITION; i++) {
137-
if (!operatorsToInclude.contains(i)) {
138-
ignored.add(i);
139-
}
140-
}
141-
142-
// Range 2: CASE .. CAST
143-
for (int i = OpenSearchPPLParser.CASE; i <= OpenSearchPPLParser.CAST; i++) {
144-
if (!operatorsToInclude.contains(i)) {
145-
ignored.add(i);
118+
for (int tokenType = 0; tokenType <= vocabulary.getMaxTokenType(); tokenType++) {
119+
String symbolicName = vocabulary.getSymbolicName(tokenType);
120+
if (isLexicalInternalToken(symbolicName)) {
121+
ignored.add(tokenType);
146122
}
147123
}
148124

149125
return ignored.stream().mapToInt(Integer::intValue).toArray();
150126
}
151127

128+
private static boolean isLexicalInternalToken(String symbolicName) {
129+
if (symbolicName == null) {
130+
return false;
131+
}
132+
return symbolicName.endsWith("_LITERAL") || INTERNAL_NON_LITERAL_TOKENS.contains(symbolicName);
133+
}
134+
152135
/**
153136
* Build the list of parser rule indices for CodeCompletionCore preferredRules.
154137
* These rules trigger semantic suggestions (suggest fields, tables, functions, etc.).

ppl/src/test/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilderTest.java

Lines changed: 99 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,37 @@
66
package org.opensearch.sql.ppl.autocomplete;
77

88
import static org.junit.Assert.assertEquals;
9+
import static org.junit.Assert.assertFalse;
910
import static org.junit.Assert.assertNotNull;
1011
import static org.junit.Assert.assertTrue;
1112

13+
import java.util.Arrays;
14+
import java.util.HashSet;
1215
import java.util.Map;
16+
import java.util.Set;
1317
import org.junit.BeforeClass;
1418
import org.junit.Test;
1519
import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser;
1620

1721
public class PPLGrammarBundleBuilderTest {
1822

1923
private static final int EXPECTED_ATN_SERIALIZATION_VERSION = 4;
24+
private static final Set<String> EXPECTED_IGNORED_NON_LITERAL_SYMBOLS =
25+
new HashSet<>(
26+
Arrays.asList(
27+
"ID",
28+
"NUMERIC_ID",
29+
"ID_DATE_SUFFIX",
30+
"CLUSTER",
31+
"TIME_SNAP",
32+
"SPANLENGTH",
33+
"DECIMAL_SPANLENGTH",
34+
"DQUOTA_STRING",
35+
"SQUOTA_STRING",
36+
"BQUOTA_STRING",
37+
"LINE_COMMENT",
38+
"BLOCK_COMMENT",
39+
"ERROR_RECOGNITION"));
2040

2141
private static GrammarBundle bundle;
2242

@@ -150,13 +170,84 @@ public void testRulesToVisitAreNonEmpty() {
150170
}
151171

152172
@Test
153-
public void testIgnoredRangeBoundariesMatchGrammar() {
154-
// These assertions mirror the runtime assertions in buildIgnoredTokens().
155-
// If the grammar changes token ordinals, both this test and the builder assertions
156-
// will flag the issue.
157-
assertEquals("MATCH token ID", 427, OpenSearchPPLParser.MATCH);
158-
assertEquals("ERROR_RECOGNITION token ID", 488, OpenSearchPPLParser.ERROR_RECOGNITION);
159-
assertEquals("CASE token ID", 142, OpenSearchPPLParser.CASE);
160-
assertEquals("CAST token ID", 387, OpenSearchPPLParser.CAST);
173+
public void testIgnoredTokensContainOnlyLexicalInternalTokens() {
174+
Set<Integer> ignored = ignoredTokenSet();
175+
for (Integer tokenType : ignored) {
176+
String symbol = bundle.getSymbolicNames()[tokenType];
177+
assertTrue(
178+
"ignoredTokens should contain only lexical/internal tokens, got: "
179+
+ symbol
180+
+ " ("
181+
+ tokenType
182+
+ ")",
183+
symbol != null
184+
&& (symbol.endsWith("_LITERAL")
185+
|| EXPECTED_IGNORED_NON_LITERAL_SYMBOLS.contains(symbol)));
186+
}
187+
}
188+
189+
@Test
190+
public void testCommandAndKeywordTokensAreNotIgnored() {
191+
Set<Integer> ignored = ignoredTokenSet();
192+
assertFalse("LOOKUP should not be ignored", ignored.contains(OpenSearchPPLParser.LOOKUP));
193+
assertFalse("REPLACE should not be ignored", ignored.contains(OpenSearchPPLParser.REPLACE));
194+
assertFalse("REVERSE should not be ignored", ignored.contains(OpenSearchPPLParser.REVERSE));
195+
assertFalse("MVCOMBINE should not be ignored", ignored.contains(OpenSearchPPLParser.MVCOMBINE));
196+
assertFalse("MVEXPAND should not be ignored", ignored.contains(OpenSearchPPLParser.MVEXPAND));
197+
assertFalse("LEFT should not be ignored", ignored.contains(OpenSearchPPLParser.LEFT));
198+
assertFalse("RIGHT should not be ignored", ignored.contains(OpenSearchPPLParser.RIGHT));
199+
assertFalse("AS should not be ignored", ignored.contains(OpenSearchPPLParser.AS));
200+
assertFalse("IN should not be ignored", ignored.contains(OpenSearchPPLParser.IN));
201+
}
202+
203+
@Test
204+
public void testExpressionFunctionTokensAreNotIgnored() {
205+
Set<Integer> ignored = ignoredTokenSet();
206+
assertFalse("MVAPPEND should not be ignored", ignored.contains(OpenSearchPPLParser.MVAPPEND));
207+
assertFalse("MVJOIN should not be ignored", ignored.contains(OpenSearchPPLParser.MVJOIN));
208+
assertFalse("MVINDEX should not be ignored", ignored.contains(OpenSearchPPLParser.MVINDEX));
209+
}
210+
211+
@Test
212+
public void testNewerGrammarKeywordsAreNotIgnoredWhenPresent() {
213+
// These tokens exist in newer grammar variants (for example graph lookup support).
214+
// Keep this test tolerant so it works across branches with different grammar revisions.
215+
assertTokenNotIgnoredIfPresent("GRAPHLOOKUP");
216+
assertTokenNotIgnoredIfPresent("START_FIELD");
217+
assertTokenNotIgnoredIfPresent("FROM_FIELD");
218+
assertTokenNotIgnoredIfPresent("TO_FIELD");
219+
assertTokenNotIgnoredIfPresent("MAX_DEPTH");
220+
assertTokenNotIgnoredIfPresent("DEPTH_FIELD");
221+
assertTokenNotIgnoredIfPresent("DIRECTION");
222+
assertTokenNotIgnoredIfPresent("UNI");
223+
assertTokenNotIgnoredIfPresent("BI");
224+
assertTokenNotIgnoredIfPresent("SUPPORT_ARRAY");
225+
assertTokenNotIgnoredIfPresent("BATCH_MODE");
226+
assertTokenNotIgnoredIfPresent("USE_PIT");
227+
}
228+
229+
private static Set<Integer> ignoredTokenSet() {
230+
Set<Integer> ignored = new HashSet<>();
231+
for (int tokenType : bundle.getIgnoredTokens()) {
232+
ignored.add(tokenType);
233+
}
234+
return ignored;
235+
}
236+
237+
private static void assertTokenNotIgnoredIfPresent(String symbolicTokenName) {
238+
int tokenType = tokenTypeBySymbolicName(symbolicTokenName);
239+
if (tokenType >= 0) {
240+
assertFalse(symbolicTokenName + " should not be ignored", ignoredTokenSet().contains(tokenType));
241+
}
242+
}
243+
244+
private static int tokenTypeBySymbolicName(String symbolicTokenName) {
245+
String[] symbols = bundle.getSymbolicNames();
246+
for (int i = 0; i < symbols.length; i++) {
247+
if (symbolicTokenName.equals(symbols[i])) {
248+
return i;
249+
}
250+
}
251+
return -1;
161252
}
162253
}

0 commit comments

Comments
 (0)