Skip to content

Commit 5b31a47

Browse files
committed
Add UT for rex
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 457c8c8 commit 5b31a47

1 file changed

Lines changed: 221 additions & 0 deletions

File tree

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.ppl.calcite;
7+
8+
import org.apache.calcite.rel.RelNode;
9+
import org.apache.calcite.test.CalciteAssert;
10+
import org.junit.Test;
11+
12+
public class CalcitePPLRexTest extends CalcitePPLAbstractTest {
13+
public CalcitePPLRexTest() {
14+
super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL);
15+
}
16+
17+
@Test
18+
public void testRexBasicFieldExtraction() {
19+
String ppl = "source=EMP | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first";
20+
RelNode root = getRelNode(ppl);
21+
String expectedLogical =
22+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)])\n"
23+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]).*')])\n"
24+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
25+
verifyLogical(root, expectedLogical);
26+
27+
String expectedSparkSql =
28+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`\n"
29+
+ "FROM `scott`.`EMP`\n"
30+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]).*')";
31+
verifyPPLToSparkSQL(root, expectedSparkSql);
32+
}
33+
34+
@Test
35+
public void testRexMultipleNamedGroups() {
36+
String ppl =
37+
"source=EMP | rex field=ENAME '(?<first>[A-Z])(?<rest>.*)' | fields ENAME, first, rest";
38+
RelNode root = getRelNode(ppl);
39+
String expectedLogical =
40+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 1)],"
41+
+ " rest=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 2)])\n"
42+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])(.*)')])\n"
43+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
44+
verifyLogical(root, expectedLogical);
45+
46+
String expectedSparkSql =
47+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 1) `first`,"
48+
+ " `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 2) `rest`\n"
49+
+ "FROM `scott`.`EMP`\n"
50+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])(.*)')";
51+
verifyPPLToSparkSQL(root, expectedSparkSql);
52+
}
53+
54+
@Test
55+
public void testRexWithMaxMatch() {
56+
String ppl =
57+
"source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=3 | fields ENAME, letter";
58+
RelNode root = getRelNode(ppl);
59+
String expectedLogical =
60+
"LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 3)])\n"
61+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])')])\n"
62+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
63+
verifyLogical(root, expectedLogical);
64+
65+
String expectedSparkSql =
66+
"SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 3) `letter`\n"
67+
+ "FROM `scott`.`EMP`\n"
68+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])')";
69+
verifyPPLToSparkSQL(root, expectedSparkSql);
70+
}
71+
72+
@Test
73+
public void testRexSedMode() {
74+
String ppl = "source=EMP | rex field=ENAME mode=sed 's/A/X/' | fields ENAME";
75+
RelNode root = getRelNode(ppl);
76+
String expectedLogical =
77+
"LogicalProject(ENAME=[REX_SED($1, 's/A/X/')])\n"
78+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
79+
verifyLogical(root, expectedLogical);
80+
81+
String expectedSparkSql =
82+
"SELECT `REX_SED`(`ENAME`, 's/A/X/') `ENAME`\n" + "FROM `scott`.`EMP`";
83+
verifyPPLToSparkSQL(root, expectedSparkSql);
84+
}
85+
86+
@Test
87+
public void testRexWithOffsetField() {
88+
String ppl =
89+
"source=EMP | rex field=ENAME '(?<first>[A-Z])' offset_field=pos | fields ENAME, first,"
90+
+ " pos";
91+
RelNode root = getRelNode(ppl);
92+
String expectedLogical =
93+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])', 1)],"
94+
+ " pos=[REX_OFFSET($1, '(?<first>[A-Z])')])\n"
95+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])')])\n"
96+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
97+
verifyLogical(root, expectedLogical);
98+
99+
String expectedSparkSql =
100+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])', 1) `first`,"
101+
+ " `REX_OFFSET`(`ENAME`, '(?<first>[A-Z])') `pos`\n"
102+
+ "FROM `scott`.`EMP`\n"
103+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])')";
104+
verifyPPLToSparkSQL(root, expectedSparkSql);
105+
}
106+
107+
@Test
108+
public void testRexChainedCommands() {
109+
String ppl =
110+
"source=EMP | rex field=ENAME '(?<firstinitial>^.)' | rex field=JOB '(?<jobtype>\\w+)' |"
111+
+ " fields ENAME, JOB, firstinitial, jobtype";
112+
RelNode root = getRelNode(ppl);
113+
String expectedLogical =
114+
"LogicalProject(ENAME=[$1], JOB=[$2], firstinitial=[$8], jobtype=[REX_EXTRACT($2,"
115+
+ " '(?<jobtype>\\w+)', 1)])\n"
116+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($2, '(\\w+)')])\n"
117+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
118+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], firstinitial=[REX_EXTRACT($1,"
119+
+ " '(?<firstinitial>^.)', 1)])\n"
120+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(^.)')])\n"
121+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
122+
verifyLogical(root, expectedLogical);
123+
124+
String expectedSparkSql =
125+
"SELECT `ENAME`, `JOB`, `firstinitial`, `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1)"
126+
+ " `jobtype`\n"
127+
+ "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`,"
128+
+ " `REX_EXTRACT`(`ENAME`, '(?<firstinitial>^.)', 1) `firstinitial`\n"
129+
+ "FROM `scott`.`EMP`\n"
130+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '(^.)')) `t0`\n"
131+
+ "WHERE REGEXP_CONTAINS(`JOB`, '(\\w+)')";
132+
verifyPPLToSparkSQL(root, expectedSparkSql);
133+
}
134+
135+
@Test
136+
public void testRexWithWhereClause() {
137+
String ppl =
138+
"source=EMP | where SAL > 1000 | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first,"
139+
+ " SAL";
140+
RelNode root = getRelNode(ppl);
141+
String expectedLogical =
142+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)], SAL=[$5])\n"
143+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]).*')])\n"
144+
+ " LogicalFilter(condition=[>($5, 1000)])\n"
145+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
146+
verifyLogical(root, expectedLogical);
147+
148+
String expectedSparkSql =
149+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`, `SAL`\n"
150+
+ "FROM (SELECT *\n"
151+
+ "FROM `scott`.`EMP`\n"
152+
+ "WHERE `SAL` > 1000) `t`\n"
153+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]).*')";
154+
verifyPPLToSparkSQL(root, expectedSparkSql);
155+
}
156+
157+
@Test
158+
public void testRexWithAggregation() {
159+
String ppl = "source=EMP | rex field=JOB '(?<jobtype>\\w+)' | stats count() by jobtype";
160+
RelNode root = getRelNode(ppl);
161+
String expectedLogical =
162+
"LogicalProject(count()=[$1], jobtype=[$0])\n"
163+
+ " LogicalAggregate(group=[{0}], count()=[COUNT()])\n"
164+
+ " LogicalProject(jobtype=[REX_EXTRACT($2, '(?<jobtype>\\w+)', 1)])\n"
165+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($2, '(\\w+)')])\n"
166+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
167+
verifyLogical(root, expectedLogical);
168+
169+
String expectedSparkSql =
170+
"SELECT COUNT(*) `count()`, `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1) `jobtype`\n"
171+
+ "FROM `scott`.`EMP`\n"
172+
+ "WHERE REGEXP_CONTAINS(`JOB`, '(\\w+)')\n"
173+
+ "GROUP BY `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1)";
174+
verifyPPLToSparkSQL(root, expectedSparkSql);
175+
}
176+
177+
@Test
178+
public void testRexComplexPattern() {
179+
String ppl =
180+
"source=EMP | rex field=ENAME '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)' | fields ENAME,"
181+
+ " prefix, suffix";
182+
RelNode root = getRelNode(ppl);
183+
String expectedLogical =
184+
"LogicalProject(ENAME=[$1], prefix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)',"
185+
+ " 1)], suffix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)])\n"
186+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]{2})([A-Z]+)')])\n"
187+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
188+
verifyLogical(root, expectedLogical);
189+
190+
String expectedSparkSql =
191+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 1)"
192+
+ " `prefix`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)"
193+
+ " `suffix`\n"
194+
+ "FROM `scott`.`EMP`\n"
195+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]{2})([A-Z]+)')";
196+
verifyPPLToSparkSQL(root, expectedSparkSql);
197+
}
198+
199+
@Test
200+
public void testRexWithSort() {
201+
String ppl =
202+
"source=EMP | rex field=ENAME '(?<firstletter>^.)' | fields ENAME, firstletter | sort"
203+
+ " firstletter | head 5";
204+
RelNode root = getRelNode(ppl);
205+
String expectedLogical =
206+
"LogicalSort(sort0=[$1], dir0=[ASC-nulls-first], fetch=[5])\n"
207+
+ " LogicalProject(ENAME=[$1], firstletter=[REX_EXTRACT($1, '(?<firstletter>^.)',"
208+
+ " 1)])\n"
209+
+ " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(^.)')])\n"
210+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
211+
verifyLogical(root, expectedLogical);
212+
213+
String expectedSparkSql =
214+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<firstletter>^.)', 1) `firstletter`\n"
215+
+ "FROM `scott`.`EMP`\n"
216+
+ "WHERE REGEXP_CONTAINS(`ENAME`, '(^.)')\n"
217+
+ "ORDER BY 2\n"
218+
+ "LIMIT 5";
219+
verifyPPLToSparkSQL(root, expectedSparkSql);
220+
}
221+
}

0 commit comments

Comments
 (0)