Support split eval function (#4814)

ahkcs · web-flow · commit 5dca84f73315 · 2025-12-08T13:47:56.000-08:00
* Support split eval function

Signed-off-by: Kai Huang <ahkcs@amazon.com>

# Conflicts:
#	core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
#	integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
#	ppl/src/main/antlr/OpenSearchPPLLexer.g4
#	ppl/src/main/antlr/OpenSearchPPLParser.g4
#	ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
#	ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java

* doctest

Signed-off-by: Kai Huang <ahkcs@amazon.com>

* Update test cases

Signed-off-by: Kai Huang <ahkcs@amazon.com>

* Update to not use UDF

Signed-off-by: Kai Huang <ahkcs@amazon.com>

---------

Signed-off-by: Kai Huang <ahkcs@amazon.com>
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
@@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
   MVAPPEND(FunctionName.of("mvappend")),
   MVJOIN(FunctionName.of("mvjoin")),
   MVINDEX(FunctionName.of("mvindex")),
+  SPLIT(FunctionName.of("split")),
   MVDEDUP(FunctionName.of("mvdedup")),
   FORALL(FunctionName.of("forall")),
   EXISTS(FunctionName.of("exists")),
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
@@ -195,6 +195,7 @@
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
+import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
@@ -989,6 +990,34 @@ void populate() {
                   builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
           PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
 
+      // Register SPLIT(str, delimiter) as a CASE over two Calcite library calls:
+      //   - delimiter <> '': SqlLibraryOperators.SPLIT(str, delimiter)
+      //   - delimiter = '':  REGEXP_EXTRACT_ALL(str, '.'), one array element per '.' match
+      register(
+          SPLIT,
+          (FunctionImp2)
+              (builder, str, delimiter) -> {
+                // Predicate that selects the empty-delimiter branch: delimiter = ''
+                RexNode emptyString = builder.makeLiteral("");
+                RexNode isEmptyDelimiter =
+                    builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
+
+                // Empty delimiter: REGEXP_EXTRACT_ALL with the '.' pattern, one match per character.
+                // NOTE(review): '.' may skip line terminators unless DOTALL applies -- confirm.
+                RexNode dotPattern = builder.makeLiteral(".");
+                RexNode splitChars =
+                    builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern);
+
+                // Non-empty delimiter: delegate to the library SPLIT operator.
+                RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
+
+                // Both branch expressions are built up front; CASE picks one via the predicate:
+                // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
+                return builder.makeCall(
+                    SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
+              },
+          PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
+
       // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
       register(
           MVINDEX,
diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst
@@ -186,6 +186,60 @@ Example::
     | 120    |
     +--------+
 
+SPLIT
+-----
+
+Description
+>>>>>>>>>>>
+
+Usage: split(str, delimiter) splits the string str on each occurrence of delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the string into one value per character. If the delimiter does not occur in the string, the result is an array containing only the original string. If the input string is empty, the result is an empty array.
+
+Argument type: str: STRING, delimiter: STRING
+
+Return type: ARRAY of STRING
+
+Example::
+
+    os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +------------------------------------+
+    | result                             |
+    |------------------------------------|
+    | [buttercup,rarity,tenderhoof,dash] |
+    +------------------------------------+
+
+    os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +------------------+
+    | result           |
+    |------------------|
+    | [1a2b3c4,567890] |
+    +------------------+
+
+    os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +-----------+
+    | result    |
+    |-----------|
+    | [a,b,c,d] |
+    +-----------+
+
+    os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +--------------+
+    | result       |
+    |--------------|
+    | [name,value] |
+    +--------------+
+
+    os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +---------+
+    | result  |
+    |---------|
+    | [hello] |
+    +---------+
+
 MVJOIN
 ------
 
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
@@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException {
     // Should preserve first occurrence order: z, a, b, c
     verifyDataRows(actual, rows(List.of("z", "a", "b", "c")));
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
+                    + " split(test, ';') | head 1 | fields result",
+                TEST_INDEX_BANK));
+    // One-character delimiter: the five ';'-separated names come back as one array value.
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
+                    + " fields result",
+                TEST_INDEX_BANK));
+    // Multi-character delimiter: 'def' is removed as a unit, leaving the text on either side.
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("1a2b3c4", "567890")));
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    // Empty delimiter: every character of 'abcd' becomes its own array element.
+    verifyDataRows(actual, rows(List.of("a", "b", "c", "d")));
+  }
 }
diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4
@@ -445,6 +445,7 @@ MVAPPEND:                           'MVAPPEND';
 MVJOIN:                             'MVJOIN';
 MVINDEX:                            'MVINDEX';
 MVDEDUP:                            'MVDEDUP';
+SPLIT:                              'SPLIT';
 FORALL:                             'FORALL';
 FILTER:                             'FILTER';
 TRANSFORM:                          'TRANSFORM';
diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4
@@ -1098,6 +1098,7 @@ collectionFunctionName
     | MVJOIN
     | MVINDEX
     | MVDEDUP
+    | SPLIT
     | FORALL
     | EXISTS
     | FILTER
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
@@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() {
             + "LIMIT 1";
     verifyPPLToSparkSQL(root, expectedSparkSql);
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() {
+    String ppl =
+        "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
+            + " 1 | fields result";
+    RelNode root = getRelNode(ppl);
+    // The plan keeps both branches: CASE(delimiter = '', REGEXP_EXTRACT_ALL(..), SPLIT(..)).
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
+            + " result=[CASE(=(';', ''),"
+            + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.'),"
+            + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', "
+            + "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
+            + "`result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() {
+    String ppl =
+        "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
+            + " fields result";
+    RelNode root = getRelNode(ppl);
+    // A multi-character delimiter goes through the same CASE wrapper in the expected plan.
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
+            + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR,"
+            + " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', "
+            + "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() {
+    String ppl =
+        "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result";
+    RelNode root = getRelNode(ppl);
+
+    // Empty delimiter: the CASE is kept as-is; its '' = '' condition picks REGEXP_EXTRACT_ALL.
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
+            + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR,"
+            + " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') "
+            + "ELSE SPLIT('abcd', '') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
 }
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
@@ -845,6 +845,22 @@ public void testMvindex() {
         anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
   }
 
+  @Test
+  public void testSplit() {
+    // Literal input string and literal delimiter are both masked as ***
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));
+    // A field reference is rendered as "identifier"; only the literal delimiter is masked
+    assertEquals(
+        "source=table | eval identifier=split(identifier,***) | fields + identifier",
+        anonymize("source=t | eval result=split(text, ',') | fields result"));
+    // An empty-string delimiter is masked like any other literal
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('abcd', '') | fields result"));
+  }
+
   @Test
   public void testMvdedup() {
     // Test mvdedup with array containing duplicates