Skip to content

Commit d88108e

Browse files
committed
unit test - initial commit
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 8852bc7 commit d88108e

1 file changed

Lines changed: 172 additions & 0 deletions

File tree

Lines changed: 172 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,172 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.ppl.calcite;
7+
8+
import org.apache.calcite.rel.RelNode;
9+
import org.apache.calcite.test.CalciteAssert;
10+
import org.junit.Test;
11+
12+
public class CalcitePPLRegexTest extends CalcitePPLAbstractTest {
13+
public CalcitePPLRegexTest() {
14+
super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL);
15+
}
16+
17+
@Test
18+
public void testRegexBasic() {
19+
String ppl = "source=EMP | regex ENAME='A.*' | fields ENAME, JOB";
20+
RelNode root = getRelNode(ppl);
21+
String expectedLogical =
22+
"LogicalProject(ENAME=[$1], JOB=[$2])\n"
23+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n"
24+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
25+
verifyLogical(root, expectedLogical);
26+
27+
String expectedSparkSql =
28+
"SELECT `ENAME`, `JOB`\n"
29+
+ "FROM `scott`.`EMP`\n"
30+
+ "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')";
31+
verifyPPLToSparkSQL(root, expectedSparkSql);
32+
}
33+
34+
@Test
35+
public void testRegexChainedFilters() {
36+
String ppl = "source=EMP | regex ENAME='A.*' | regex JOB='.*CLERK' | fields ENAME, JOB";
37+
RelNode root = getRelNode(ppl);
38+
String expectedLogical =
39+
"LogicalProject(ENAME=[$1], JOB=[$2])\n"
40+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($2, '.*CLERK':VARCHAR)])\n"
41+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n"
42+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
43+
verifyLogical(root, expectedLogical);
44+
45+
String expectedSparkSql =
46+
"SELECT `ENAME`, `JOB`\n"
47+
+ "FROM (SELECT *\n"
48+
+ "FROM `scott`.`EMP`\n"
49+
+ "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')) `t`\n"
50+
+ "WHERE REGEXP_CONTAINS(`JOB`, '.*CLERK')";
51+
verifyPPLToSparkSQL(root, expectedSparkSql);
52+
}
53+
54+
@Test
55+
public void testRegexWithNotEqual() {
56+
String ppl = "source=EMP | regex ENAME!='A.*' | fields ENAME, JOB";
57+
RelNode root = getRelNode(ppl);
58+
String expectedLogical =
59+
"LogicalProject(ENAME=[$1], JOB=[$2])\n"
60+
+ " LogicalFilter(condition=[NOT(REGEXP_CONTAINS($1, 'A.*':VARCHAR))])\n"
61+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
62+
verifyLogical(root, expectedLogical);
63+
64+
String expectedSparkSql =
65+
"SELECT `ENAME`, `JOB`\n"
66+
+ "FROM `scott`.`EMP`\n"
67+
+ "WHERE NOT REGEXP_CONTAINS(`ENAME`, 'A.*')";
68+
verifyPPLToSparkSQL(root, expectedSparkSql);
69+
}
70+
71+
@Test
72+
public void testRegexComplexPattern() {
73+
String ppl = "source=EMP | regex ENAME='[A-Z]{2,}' | fields ENAME";
74+
RelNode root = getRelNode(ppl);
75+
String expectedLogical =
76+
"LogicalProject(ENAME=[$1])\n"
77+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '[A-Z]{2,}':VARCHAR)])\n"
78+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
79+
verifyLogical(root, expectedLogical);
80+
81+
String expectedSparkSql =
82+
"SELECT `ENAME`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`ENAME`, '[A-Z]{2,}')";
83+
verifyPPLToSparkSQL(root, expectedSparkSql);
84+
}
85+
86+
@Test
87+
public void testRegexWithEscapedCharacters() {
88+
String ppl = "source=EMP | regex JOB='SALES\\sMAN' | fields JOB";
89+
RelNode root = getRelNode(ppl);
90+
String expectedLogical =
91+
"LogicalProject(JOB=[$2])\n"
92+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($2, 'SALES\\sMAN':VARCHAR)])\n"
93+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
94+
verifyLogical(root, expectedLogical);
95+
96+
String expectedSparkSql =
97+
"SELECT `JOB`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`JOB`, 'SALES\\sMAN')";
98+
verifyPPLToSparkSQL(root, expectedSparkSql);
99+
}
100+
101+
@Test
102+
public void testRegexChainedCommands() {
103+
String ppl = "source=EMP | regex ENAME='A.*' | fields ENAME | sort ENAME | head 5";
104+
RelNode root = getRelNode(ppl);
105+
String expectedLogical =
106+
"LogicalSort(sort0=[$0], dir0=[ASC-nulls-first], fetch=[5])\n"
107+
+ " LogicalProject(ENAME=[$1])\n"
108+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n"
109+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
110+
verifyLogical(root, expectedLogical);
111+
112+
String expectedSparkSql =
113+
"SELECT `ENAME`\n"
114+
+ "FROM `scott`.`EMP`\n"
115+
+ "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')\n"
116+
+ "ORDER BY `ENAME`\n"
117+
+ "LIMIT 5";
118+
verifyPPLToSparkSQL(root, expectedSparkSql);
119+
}
120+
121+
@Test
122+
public void testRegexWithAggregation() {
123+
String ppl = "source=EMP | regex JOB='.*CLERK' | stats count() by JOB";
124+
RelNode root = getRelNode(ppl);
125+
String expectedLogical =
126+
"LogicalProject(count()=[$1], JOB=[$0])\n"
127+
+ " LogicalAggregate(group=[{0}], count()=[COUNT()])\n"
128+
+ " LogicalProject(JOB=[$2])\n"
129+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($2, '.*CLERK':VARCHAR)])\n"
130+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
131+
verifyLogical(root, expectedLogical);
132+
133+
String expectedSparkSql =
134+
"SELECT COUNT(*) `count()`, `JOB`\n"
135+
+ "FROM `scott`.`EMP`\n"
136+
+ "WHERE REGEXP_CONTAINS(`JOB`, '.*CLERK')\n"
137+
+ "GROUP BY `JOB`";
138+
verifyPPLToSparkSQL(root, expectedSparkSql);
139+
}
140+
141+
@Test
142+
public void testRegexCaseInsensitive() {
143+
String ppl = "source=EMP | regex ENAME='(?i)smith' | fields ENAME";
144+
RelNode root = getRelNode(ppl);
145+
String expectedLogical =
146+
"LogicalProject(ENAME=[$1])\n"
147+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(?i)smith':VARCHAR)])\n"
148+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
149+
verifyLogical(root, expectedLogical);
150+
151+
String expectedSparkSql =
152+
"SELECT `ENAME`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`ENAME`, '(?i)smith')";
153+
verifyPPLToSparkSQL(root, expectedSparkSql);
154+
}
155+
156+
@Test
157+
public void testRegexWithNumericPattern() {
158+
String ppl = "source=EMP | regex SAL='[0-9]{4,}' | fields ENAME, SAL";
159+
RelNode root = getRelNode(ppl);
160+
String expectedLogical =
161+
"LogicalProject(ENAME=[$1], SAL=[$5])\n"
162+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($5, '[0-9]{4,}':VARCHAR)])\n"
163+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
164+
verifyLogical(root, expectedLogical);
165+
166+
String expectedSparkSql =
167+
"SELECT `ENAME`, `SAL`\n"
168+
+ "FROM `scott`.`EMP`\n"
169+
+ "WHERE REGEXP_CONTAINS(`SAL`, '[0-9]{4,}')";
170+
verifyPPLToSparkSQL(root, expectedSparkSql);
171+
}
172+
}

0 commit comments

Comments
 (0)