Skip to content

Commit 5dca84f

Browse files
authored
Support split eval function (#4814)
* Support split eval function Signed-off-by: Kai Huang <ahkcs@amazon.com> # Conflicts: # core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java # integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java # ppl/src/main/antlr/OpenSearchPPLLexer.g4 # ppl/src/main/antlr/OpenSearchPPLParser.g4 # ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java # ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java * doctest Signed-off-by: Kai Huang <ahkcs@amazon.com> * Update test cases Signed-off-by: Kai Huang <ahkcs@amazon.com> * Update to not use UDF Signed-off-by: Kai Huang <ahkcs@amazon.com> --------- Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent d4daa34 commit 5dca84f

8 files changed

Lines changed: 218 additions & 0 deletions

File tree

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
7575
MVAPPEND(FunctionName.of("mvappend")),
7676
MVJOIN(FunctionName.of("mvjoin")),
7777
MVINDEX(FunctionName.of("mvindex")),
78+
SPLIT(FunctionName.of("split")),
7879
MVDEDUP(FunctionName.of("mvdedup")),
7980
FORALL(FunctionName.of("forall")),
8081
EXISTS(FunctionName.of("exists")),

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@
195195
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
196196
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
197197
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
198+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
198199
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
199200
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
200201
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
@@ -989,6 +990,34 @@ void populate() {
989990
builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
990991
PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
991992

993+
// Register SPLIT(str, delimiter) as a CASE expression over two Calcite library operators:
994+
// Case 1: delimiter is not the empty string -> SqlLibraryOperators.SPLIT(str, delimiter)
995+
// Case 2: delimiter is the empty string -> REGEXP_EXTRACT_ALL(str, '.'), one element per match
996+
register(
997+
SPLIT,
998+
(FunctionImp2)
999+
(builder, str, delimiter) -> {
1000+
// Build the condition `delimiter = ''` that selects the per-character branch.
1001+
RexNode emptyString = builder.makeLiteral("");
1002+
RexNode isEmptyDelimiter =
1003+
builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
1004+
1005+
// For an empty delimiter: split into characters using REGEXP_EXTRACT_ALL with the '.'
1006+
// pattern, which matches each individual character.
// NOTE(review): in java.util.regex, '.' does not match line terminators by default; if
// REGEXP_EXTRACT_ALL is backed by it, newline characters would be dropped -- confirm.
1007+
RexNode dotPattern = builder.makeLiteral(".");
1008+
RexNode splitChars =
1009+
builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern);
1010+
1011+
// For a non-empty delimiter: use the standard SPLIT operator.
1012+
RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
1013+
1014+
// Both branches are always built into the plan; CASE picks one of them:
1015+
// CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
1016+
return builder.makeCall(
1017+
SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
1018+
},
// Both arguments must belong to the CHARACTER type family.
1019+
PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
1020+
9921021
// Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
9931022
register(
9941023
MVINDEX,

docs/user/ppl/functions/collection.rst

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,60 @@ Example::
186186
| 120 |
187187
+--------+
188188

189+
SPLIT
190+
-----
191+
192+
Description
193+
>>>>>>>>>>>
194+
195+
Usage: split(str, delimiter) splits the string on each occurrence of the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.
196+
197+
Argument type: str: STRING, delimiter: STRING
198+
199+
Return type: ARRAY of STRING
200+
201+
Example::
202+
203+
os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
204+
fetched rows / total rows = 1/1
205+
+------------------------------------+
206+
| result |
207+
|------------------------------------|
208+
| [buttercup,rarity,tenderhoof,dash] |
209+
+------------------------------------+
210+
211+
os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
212+
fetched rows / total rows = 1/1
213+
+------------------+
214+
| result |
215+
|------------------|
216+
| [1a2b3c4,567890] |
217+
+------------------+
218+
219+
os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
220+
fetched rows / total rows = 1/1
221+
+-----------+
222+
| result |
223+
|-----------|
224+
| [a,b,c,d] |
225+
+-----------+
226+
227+
os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
228+
fetched rows / total rows = 1/1
229+
+--------------+
230+
| result |
231+
|--------------|
232+
| [name,value] |
233+
+--------------+
234+
235+
os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
236+
fetched rows / total rows = 1/1
237+
+---------+
238+
| result |
239+
|---------|
240+
| [hello] |
241+
+---------+
242+
189243
MVJOIN
190244
------
191245

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException {
567567
// Should preserve first occurrence order: z, a, b, c
568568
verifyDataRows(actual, rows(List.of("z", "a", "b", "c")));
569569
}
570+
571+
// Runs split(test, ';') on a semicolon-separated literal against TEST_INDEX_BANK and checks
// that the result column is an array holding each segment in order.
@Test
572+
public void testSplitWithSemicolonDelimiter() throws IOException {
573+
JSONObject actual =
574+
executeQuery(
575+
String.format(
576+
"source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
577+
+ " split(test, ';') | head 1 | fields result",
578+
TEST_INDEX_BANK));
579+
580+
verifySchema(actual, schema("result", "array"));
581+
// Five segments are expected, in the same order as they appear in the input string.
verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
582+
}
583+
584+
// Runs split(test, 'def') with a three-character delimiter against TEST_INDEX_BANK and checks
// that the input is cut into the text before and after the delimiter.
@Test
585+
public void testSplitWithMultiCharDelimiter() throws IOException {
586+
JSONObject actual =
587+
executeQuery(
588+
String.format(
589+
"source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
590+
+ " fields result",
591+
TEST_INDEX_BANK));
592+
593+
verifySchema(actual, schema("result", "array"));
594+
// The delimiter itself ('def') does not appear in either expected element.
verifyDataRows(actual, rows(List.of("1a2b3c4", "567890")));
595+
}
596+
597+
// Runs split(test, '') against TEST_INDEX_BANK and checks that an empty delimiter produces
// one array element per character of the input.
@Test
598+
public void testSplitWithEmptyDelimiter() throws IOException {
599+
JSONObject actual =
600+
executeQuery(
601+
String.format(
602+
"source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
603+
TEST_INDEX_BANK));
604+
605+
verifySchema(actual, schema("result", "array"));
606+
// Empty delimiter splits into individual characters
607+
verifyDataRows(actual, rows(List.of("a", "b", "c", "d")));
608+
}
570609
}

ppl/src/main/antlr/OpenSearchPPLLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,7 @@ MVAPPEND: 'MVAPPEND';
445445
MVJOIN: 'MVJOIN';
446446
MVINDEX: 'MVINDEX';
447447
MVDEDUP: 'MVDEDUP';
448+
SPLIT: 'SPLIT';
448449
FORALL: 'FORALL';
449450
FILTER: 'FILTER';
450451
TRANSFORM: 'TRANSFORM';

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,7 @@ collectionFunctionName
10981098
| MVJOIN
10991099
| MVINDEX
11001100
| MVDEDUP
1101+
| SPLIT
11011102
| FORALL
11021103
| EXISTS
11031104
| FILTER

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() {
290290
+ "LIMIT 1";
291291
verifyPPLToSparkSQL(root, expectedSparkSql);
292292
}
293+
294+
// Checks the logical plan and the Spark SQL produced for split(test, ';'): the call is
// expected to appear as CASE(delimiter = '', REGEXP_EXTRACT_ALL(..., '.'), SPLIT(...)).
@Test
295+
public void testSplitWithSemicolonDelimiter() {
296+
String ppl =
297+
"source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
298+
+ " 1 | fields result";
299+
RelNode root = getRelNode(ppl);
300+
301+
// Expected plan: the eval'd `test` literal is inlined into both CASE branches.
String expectedLogical =
302+
"LogicalProject(result=[$9])\n"
303+
+ " LogicalSort(fetch=[1])\n"
304+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
305+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
306+
+ " result=[CASE(=(';', ''),"
307+
+ " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.'),"
308+
+ " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
309+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
310+
verifyLogical(root, expectedLogical);
311+
312+
// Expected Spark SQL rendering of the same CASE expression.
String expectedSparkSql =
313+
"SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', "
314+
+ "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
315+
+ "`result`\n"
316+
+ "FROM `scott`.`EMP`\n"
317+
+ "LIMIT 1";
318+
verifyPPLToSparkSQL(root, expectedSparkSql);
319+
}
320+
321+
// Checks the logical plan and the Spark SQL produced for split(test, 'def'), i.e. a
// delimiter longer than one character.
@Test
322+
public void testSplitWithMultiCharDelimiter() {
323+
String ppl =
324+
"source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
325+
+ " fields result";
326+
RelNode root = getRelNode(ppl);
327+
328+
// Expected plan: CASE over the empty-delimiter check with both branches present.
String expectedLogical =
329+
"LogicalProject(result=[$9])\n"
330+
+ " LogicalSort(fetch=[1])\n"
331+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
332+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
333+
+ " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR,"
334+
+ " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
335+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
336+
verifyLogical(root, expectedLogical);
337+
338+
// Expected Spark SQL rendering of the same CASE expression.
String expectedSparkSql =
339+
"SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', "
340+
+ "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
341+
+ "FROM `scott`.`EMP`\n"
342+
+ "LIMIT 1";
343+
verifyPPLToSparkSQL(root, expectedSparkSql);
344+
}
345+
346+
// Checks the logical plan and the Spark SQL produced for split(test, ''). The plan keeps the
// full CASE expression; this test does not evaluate which branch is taken.
@Test
347+
public void testSplitWithEmptyDelimiter() {
348+
String ppl =
349+
"source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result";
350+
RelNode root = getRelNode(ppl);
351+
352+
// With empty delimiter, should split into individual characters
353+
String expectedLogical =
354+
"LogicalProject(result=[$9])\n"
355+
+ " LogicalSort(fetch=[1])\n"
356+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
357+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
358+
+ " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR,"
359+
+ " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
360+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
361+
verifyLogical(root, expectedLogical);
362+
363+
// Expected Spark SQL rendering of the same CASE expression.
String expectedSparkSql =
364+
"SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') "
365+
+ "ELSE SPLIT('abcd', '') END `result`\n"
366+
+ "FROM `scott`.`EMP`\n"
367+
+ "LIMIT 1";
368+
verifyPPLToSparkSQL(root, expectedSparkSql);
369+
}
293370
}

ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,22 @@ public void testMvindex() {
845845
anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
846846
}
847847

848+
// Checks how anonymize() renders queries that call split(): in the expected output, string
// literals appear as *** and field/index names appear as generic placeholders.
@Test
849+
public void testSplit() {
850+
// Both arguments are string literals, so both are expected to be masked.
851+
assertEquals(
852+
"source=table | eval identifier=split(***,***) | fields + identifier",
853+
anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));
854+
// First argument is a field reference: expected as `identifier`, the literal as ***.
855+
assertEquals(
856+
"source=table | eval identifier=split(identifier,***) | fields + identifier",
857+
anonymize("source=t | eval result=split(text, ',') | fields result"));
858+
// Empty-string delimiter (the split-into-characters form) is expected to be masked too.
859+
assertEquals(
860+
"source=table | eval identifier=split(***,***) | fields + identifier",
861+
anonymize("source=t | eval result=split('abcd', '') | fields result"));
862+
}
863+
848864
@Test
849865
public void testMvdedup() {
850866
// Test mvdedup with array containing duplicates

0 commit comments

Comments
 (0)