Adjust the ignored-token set to contain only lexical/internal tokens

mengweieric · mengweieric · commit 76e9c78615f0 · 2026-03-03T06:29:55.000-08:00
Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilder.java
@@ -27,6 +27,22 @@ public class PPLGrammarBundleBuilder {
   private static final String ANTLR_VERSION =
       org.antlr.v4.runtime.RuntimeMetaData.getRuntimeVersion();
   private static final String BUNDLE_VERSION = "1.0";
+  private static final Set<String> INTERNAL_NON_LITERAL_TOKENS =
+      new HashSet<>(
+          Arrays.asList(
+              "ID",
+              "NUMERIC_ID",
+              "ID_DATE_SUFFIX",
+              "CLUSTER",
+              "TIME_SNAP",
+              "SPANLENGTH",
+              "DECIMAL_SPANLENGTH",
+              "DQUOTA_STRING",
+              "SQUOTA_STRING",
+              "BQUOTA_STRING",
+              "LINE_COMMENT",
+              "BLOCK_COMMENT",
+              "ERROR_RECOGNITION"));
 
   public GrammarBundle build() {
     OpenSearchPPLLexer lexer = new OpenSearchPPLLexer(CharStreams.fromString(""));
@@ -59,7 +75,7 @@ public GrammarBundle build() {
         .literalNames(literalNames)
         .symbolicNames(symbolicNames)
         .tokenDictionary(buildTokenDictionary(vocabulary))
-        .ignoredTokens(buildIgnoredTokens())
+        .ignoredTokens(buildIgnoredTokens(vocabulary))
         .rulesToVisit(buildRulesToVisit(parser.getRuleNames()))
         .build();
   }
@@ -90,65 +106,32 @@ private static Map<String, Integer> buildTokenDictionary(Vocabulary vocabulary)
   }
 
   /**
-   * Build the list of token type IDs to ignore for autocomplete. Mirrors the frontend
-   * getIgnoredTokens() logic: explicitly ignore AS/IN, then ignore two contiguous token ranges
-   * minus operatorsToInclude.
+   * Build token type IDs to ignore for autocomplete.
    *
-   * <p>Range 1 (relevance/internal tokens): MATCH .. ERROR_RECOGNITION — covers relevance
-   * functions, search parameters, span literals, IDs, quoted strings, and error tokens.
-   *
-   * <p>Range 2 (keywords/functions/operators): CASE .. CAST — covers CASE/ELSE, IN, EXISTS,
-   * NOT/OR/AND/XOR, TRUE/FALSE, REGEXP, datetime parts, data type keywords, punctuation,
-   * aggregate functions, math/text/date functions, and CAST.
-   *
-   * <p>Tokens in {@code operatorsToInclude} are kept as suggestions even if they fall within
-   * an ignored range.
+   * <p>Only lexical/internal tokens are ignored: identifiers, {@code *_LITERAL} tokens, span and
+   * time-snap tokens, quoted strings, comments, and the error token. User-facing commands,
+   * functions, and operators are intentionally kept so completion reflects grammar changes.
    */
-  private static int[] buildIgnoredTokens() {
-    // Verify range boundaries match expected token IDs. If the grammar changes and
-    // shifts token ordinals, these assertions surface the problem at build time.
-    assert OpenSearchPPLParser.MATCH == 427
-        : "MATCH token ID shifted — update ignored range start";
-    assert OpenSearchPPLParser.ERROR_RECOGNITION == 488
-        : "ERROR_RECOGNITION token ID shifted — update ignored range end";
-    assert OpenSearchPPLParser.CASE == 142
-        : "CASE token ID shifted — update ignored range start";
-    assert OpenSearchPPLParser.CAST == 387
-        : "CAST token ID shifted — update ignored range end";
-
-    Set<Integer> operatorsToInclude = new HashSet<>(Arrays.asList(
-        OpenSearchPPLParser.PIPE, OpenSearchPPLParser.EQUAL, OpenSearchPPLParser.COMMA,
-        OpenSearchPPLParser.NOT_EQUAL, OpenSearchPPLParser.LESS, OpenSearchPPLParser.NOT_LESS,
-        OpenSearchPPLParser.GREATER, OpenSearchPPLParser.NOT_GREATER,
-        OpenSearchPPLParser.OR, OpenSearchPPLParser.AND,
-        OpenSearchPPLParser.LT_PRTHS, OpenSearchPPLParser.RT_PRTHS,
-        OpenSearchPPLParser.SPAN,
-        OpenSearchPPLParser.MATCH, OpenSearchPPLParser.MATCH_PHRASE,
-        OpenSearchPPLParser.MATCH_BOOL_PREFIX, OpenSearchPPLParser.MATCH_PHRASE_PREFIX,
-        OpenSearchPPLParser.SQUOTA_STRING
-    ));
-
+  private static int[] buildIgnoredTokens(Vocabulary vocabulary) {
     List<Integer> ignored = new ArrayList<>();
-    ignored.add(OpenSearchPPLParser.AS);
-    ignored.add(OpenSearchPPLParser.IN);
 
-    // Range 1: MATCH .. ERROR_RECOGNITION
-    for (int i = OpenSearchPPLParser.MATCH; i <= OpenSearchPPLParser.ERROR_RECOGNITION; i++) {
-      if (!operatorsToInclude.contains(i)) {
-        ignored.add(i);
-      }
-    }
-
-    // Range 2: CASE .. CAST
-    for (int i = OpenSearchPPLParser.CASE; i <= OpenSearchPPLParser.CAST; i++) {
-      if (!operatorsToInclude.contains(i)) {
-        ignored.add(i);
+    for (int tokenType = 0; tokenType <= vocabulary.getMaxTokenType(); tokenType++) {
+      String symbolicName = vocabulary.getSymbolicName(tokenType);
+      if (isLexicalInternalToken(symbolicName)) {
+        ignored.add(tokenType);
       }
     }
 
     return ignored.stream().mapToInt(Integer::intValue).toArray();
   }
 
+  /** Tells whether {@code symbolicName} names a lexical/internal token; null never matches. */
+  private static boolean isLexicalInternalToken(String symbolicName) {
+    return symbolicName != null
+        && (INTERNAL_NON_LITERAL_TOKENS.contains(symbolicName)
+            || symbolicName.endsWith("_LITERAL"));
+  }
+
   /**
    * Build the list of parser rule indices for CodeCompletionCore preferredRules.
    * These rules trigger semantic suggestions (suggest fields, tables, functions, etc.).
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/autocomplete/PPLGrammarBundleBuilderTest.java
@@ -6,17 +6,37 @@
 package org.opensearch.sql.ppl.autocomplete;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser;
 
 public class PPLGrammarBundleBuilderTest {
 
   private static final int EXPECTED_ATN_SERIALIZATION_VERSION = 4;
+  private static final Set<String> EXPECTED_IGNORED_NON_LITERAL_SYMBOLS =
+      new HashSet<>(
+          Arrays.asList(
+              "ID",
+              "NUMERIC_ID",
+              "ID_DATE_SUFFIX",
+              "CLUSTER",
+              "TIME_SNAP",
+              "SPANLENGTH",
+              "DECIMAL_SPANLENGTH",
+              "DQUOTA_STRING",
+              "SQUOTA_STRING",
+              "BQUOTA_STRING",
+              "LINE_COMMENT",
+              "BLOCK_COMMENT",
+              "ERROR_RECOGNITION"));
 
   private static GrammarBundle bundle;
 
@@ -150,13 +170,84 @@ public void testRulesToVisitAreNonEmpty() {
   }
 
   @Test
-  public void testIgnoredRangeBoundariesMatchGrammar() {
-    // These assertions mirror the runtime assertions in buildIgnoredTokens().
-    // If the grammar changes token ordinals, both this test and the builder assertions
-    // will flag the issue.
-    assertEquals("MATCH token ID", 427, OpenSearchPPLParser.MATCH);
-    assertEquals("ERROR_RECOGNITION token ID", 488, OpenSearchPPLParser.ERROR_RECOGNITION);
-    assertEquals("CASE token ID", 142, OpenSearchPPLParser.CASE);
-    assertEquals("CAST token ID", 387, OpenSearchPPLParser.CAST);
+  public void testIgnoredTokensContainOnlyLexicalInternalTokens() {
+    Set<Integer> ignored = ignoredTokenSet();
+    for (Integer tokenType : ignored) {
+      String symbol = bundle.getSymbolicNames()[tokenType];
+      assertTrue(
+          "ignoredTokens should contain only lexical/internal tokens, got: "
+              + symbol
+              + " ("
+              + tokenType
+              + ")",
+          symbol != null
+              && (symbol.endsWith("_LITERAL")
+                  || EXPECTED_IGNORED_NON_LITERAL_SYMBOLS.contains(symbol)));
+    }
+  }
+
+  @Test
+  public void testCommandAndKeywordTokensAreNotIgnored() {
+    Set<Integer> ignored = ignoredTokenSet();
+    assertFalse("LOOKUP should not be ignored", ignored.contains(OpenSearchPPLParser.LOOKUP));
+    assertFalse("REPLACE should not be ignored", ignored.contains(OpenSearchPPLParser.REPLACE));
+    assertFalse("REVERSE should not be ignored", ignored.contains(OpenSearchPPLParser.REVERSE));
+    assertFalse("MVCOMBINE should not be ignored", ignored.contains(OpenSearchPPLParser.MVCOMBINE));
+    assertFalse("MVEXPAND should not be ignored", ignored.contains(OpenSearchPPLParser.MVEXPAND));
+    assertFalse("LEFT should not be ignored", ignored.contains(OpenSearchPPLParser.LEFT));
+    assertFalse("RIGHT should not be ignored", ignored.contains(OpenSearchPPLParser.RIGHT));
+    assertFalse("AS should not be ignored", ignored.contains(OpenSearchPPLParser.AS));
+    assertFalse("IN should not be ignored", ignored.contains(OpenSearchPPLParser.IN));
+  }
+
+  @Test
+  public void testExpressionFunctionTokensAreNotIgnored() {
+    Set<Integer> ignored = ignoredTokenSet();
+    assertFalse("MVAPPEND should not be ignored", ignored.contains(OpenSearchPPLParser.MVAPPEND));
+    assertFalse("MVJOIN should not be ignored", ignored.contains(OpenSearchPPLParser.MVJOIN));
+    assertFalse("MVINDEX should not be ignored", ignored.contains(OpenSearchPPLParser.MVINDEX));
+  }
+
+  @Test
+  public void testNewerGrammarKeywordsAreNotIgnoredWhenPresent() {
+    // These tokens exist in newer grammar variants (for example graph lookup support).
+    // Keep this test tolerant so it works across branches with different grammar revisions.
+    assertTokenNotIgnoredIfPresent("GRAPHLOOKUP");
+    assertTokenNotIgnoredIfPresent("START_FIELD");
+    assertTokenNotIgnoredIfPresent("FROM_FIELD");
+    assertTokenNotIgnoredIfPresent("TO_FIELD");
+    assertTokenNotIgnoredIfPresent("MAX_DEPTH");
+    assertTokenNotIgnoredIfPresent("DEPTH_FIELD");
+    assertTokenNotIgnoredIfPresent("DIRECTION");
+    assertTokenNotIgnoredIfPresent("UNI");
+    assertTokenNotIgnoredIfPresent("BI");
+    assertTokenNotIgnoredIfPresent("SUPPORT_ARRAY");
+    assertTokenNotIgnoredIfPresent("BATCH_MODE");
+    assertTokenNotIgnoredIfPresent("USE_PIT");
+  }
+
+  private static Set<Integer> ignoredTokenSet() {
+    Set<Integer> ignored = new HashSet<>();
+    for (int tokenType : bundle.getIgnoredTokens()) {
+      ignored.add(tokenType);
+    }
+    return ignored;
+  }
+
+  private static void assertTokenNotIgnoredIfPresent(String symbolicTokenName) {
+    // A token absent from this grammar revision resolves to -1 and trivially passes.
+    int tokenType = tokenTypeBySymbolicName(symbolicTokenName);
+    boolean ignored = tokenType >= 0 && ignoredTokenSet().contains(tokenType);
+    assertFalse(symbolicTokenName + " should not be ignored", ignored);
+  }
+
+  private static int tokenTypeBySymbolicName(String symbolicTokenName) {
+    String[] symbols = bundle.getSymbolicNames();
+    for (int i = 0; i < symbols.length; i++) {
+      if (symbolicTokenName.equals(symbols[i])) {
+        return i;
+      }
+    }
+    return -1;
   }
 }