|
| 1 | +/* |
| 2 | + * Copyright OpenSearch Contributors |
| 3 | + * SPDX-License-Identifier: Apache-2.0 |
| 4 | + */ |
| 5 | + |
| 6 | +package org.opensearch.sql.ppl.calcite; |
| 7 | + |
| 8 | +import org.apache.calcite.rel.RelNode; |
| 9 | +import org.apache.calcite.test.CalciteAssert; |
| 10 | +import org.junit.Test; |
| 11 | + |
| 12 | +public class CalcitePPLRexTest extends CalcitePPLAbstractTest { |
| 13 | + public CalcitePPLRexTest() { |
| 14 | + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); |
| 15 | + } |
| 16 | + |
| 17 | + @Test |
| 18 | + public void testRexBasicFieldExtraction() { |
| 19 | + String ppl = "source=EMP | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first"; |
| 20 | + RelNode root = getRelNode(ppl); |
| 21 | + String expectedLogical = |
| 22 | + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)])\n" |
| 23 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]).*')])\n" |
| 24 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 25 | + verifyLogical(root, expectedLogical); |
| 26 | + |
| 27 | + String expectedSparkSql = |
| 28 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`\n" |
| 29 | + + "FROM `scott`.`EMP`\n" |
| 30 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]).*')"; |
| 31 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 32 | + } |
| 33 | + |
| 34 | + @Test |
| 35 | + public void testRexMultipleNamedGroups() { |
| 36 | + String ppl = |
| 37 | + "source=EMP | rex field=ENAME '(?<first>[A-Z])(?<rest>.*)' | fields ENAME, first, rest"; |
| 38 | + RelNode root = getRelNode(ppl); |
| 39 | + String expectedLogical = |
| 40 | + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 1)]," |
| 41 | + + " rest=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 2)])\n" |
| 42 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])(.*)')])\n" |
| 43 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 44 | + verifyLogical(root, expectedLogical); |
| 45 | + |
| 46 | + String expectedSparkSql = |
| 47 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 1) `first`," |
| 48 | + + " `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 2) `rest`\n" |
| 49 | + + "FROM `scott`.`EMP`\n" |
| 50 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])(.*)')"; |
| 51 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 52 | + } |
| 53 | + |
| 54 | + @Test |
| 55 | + public void testRexWithMaxMatch() { |
| 56 | + String ppl = |
| 57 | + "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=3 | fields ENAME, letter"; |
| 58 | + RelNode root = getRelNode(ppl); |
| 59 | + String expectedLogical = |
| 60 | + "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 3)])\n" |
| 61 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])')])\n" |
| 62 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 63 | + verifyLogical(root, expectedLogical); |
| 64 | + |
| 65 | + String expectedSparkSql = |
| 66 | + "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 3) `letter`\n" |
| 67 | + + "FROM `scott`.`EMP`\n" |
| 68 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])')"; |
| 69 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 70 | + } |
| 71 | + |
| 72 | + @Test |
| 73 | + public void testRexSedMode() { |
| 74 | + String ppl = "source=EMP | rex field=ENAME mode=sed 's/A/X/' | fields ENAME"; |
| 75 | + RelNode root = getRelNode(ppl); |
| 76 | + String expectedLogical = |
| 77 | + "LogicalProject(ENAME=[REX_SED($1, 's/A/X/')])\n" |
| 78 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 79 | + verifyLogical(root, expectedLogical); |
| 80 | + |
| 81 | + String expectedSparkSql = |
| 82 | + "SELECT `REX_SED`(`ENAME`, 's/A/X/') `ENAME`\n" + "FROM `scott`.`EMP`"; |
| 83 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 84 | + } |
| 85 | + |
| 86 | + @Test |
| 87 | + public void testRexWithOffsetField() { |
| 88 | + String ppl = |
| 89 | + "source=EMP | rex field=ENAME '(?<first>[A-Z])' offset_field=pos | fields ENAME, first," |
| 90 | + + " pos"; |
| 91 | + RelNode root = getRelNode(ppl); |
| 92 | + String expectedLogical = |
| 93 | + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])', 1)]," |
| 94 | + + " pos=[REX_OFFSET($1, '(?<first>[A-Z])')])\n" |
| 95 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z])')])\n" |
| 96 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 97 | + verifyLogical(root, expectedLogical); |
| 98 | + |
| 99 | + String expectedSparkSql = |
| 100 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])', 1) `first`," |
| 101 | + + " `REX_OFFSET`(`ENAME`, '(?<first>[A-Z])') `pos`\n" |
| 102 | + + "FROM `scott`.`EMP`\n" |
| 103 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z])')"; |
| 104 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 105 | + } |
| 106 | + |
| 107 | + @Test |
| 108 | + public void testRexChainedCommands() { |
| 109 | + String ppl = |
| 110 | + "source=EMP | rex field=ENAME '(?<firstinitial>^.)' | rex field=JOB '(?<jobtype>\\w+)' |" |
| 111 | + + " fields ENAME, JOB, firstinitial, jobtype"; |
| 112 | + RelNode root = getRelNode(ppl); |
| 113 | + String expectedLogical = |
| 114 | + "LogicalProject(ENAME=[$1], JOB=[$2], firstinitial=[$8], jobtype=[REX_EXTRACT($2," |
| 115 | + + " '(?<jobtype>\\w+)', 1)])\n" |
| 116 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($2, '(\\w+)')])\n" |
| 117 | + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," |
| 118 | + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], firstinitial=[REX_EXTRACT($1," |
| 119 | + + " '(?<firstinitial>^.)', 1)])\n" |
| 120 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(^.)')])\n" |
| 121 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 122 | + verifyLogical(root, expectedLogical); |
| 123 | + |
| 124 | + String expectedSparkSql = |
| 125 | + "SELECT `ENAME`, `JOB`, `firstinitial`, `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1)" |
| 126 | + + " `jobtype`\n" |
| 127 | + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," |
| 128 | + + " `REX_EXTRACT`(`ENAME`, '(?<firstinitial>^.)', 1) `firstinitial`\n" |
| 129 | + + "FROM `scott`.`EMP`\n" |
| 130 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '(^.)')) `t0`\n" |
| 131 | + + "WHERE REGEXP_CONTAINS(`JOB`, '(\\w+)')"; |
| 132 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 133 | + } |
| 134 | + |
| 135 | + @Test |
| 136 | + public void testRexWithWhereClause() { |
| 137 | + String ppl = |
| 138 | + "source=EMP | where SAL > 1000 | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first," |
| 139 | + + " SAL"; |
| 140 | + RelNode root = getRelNode(ppl); |
| 141 | + String expectedLogical = |
| 142 | + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)], SAL=[$5])\n" |
| 143 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]).*')])\n" |
| 144 | + + " LogicalFilter(condition=[>($5, 1000)])\n" |
| 145 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 146 | + verifyLogical(root, expectedLogical); |
| 147 | + |
| 148 | + String expectedSparkSql = |
| 149 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`, `SAL`\n" |
| 150 | + + "FROM (SELECT *\n" |
| 151 | + + "FROM `scott`.`EMP`\n" |
| 152 | + + "WHERE `SAL` > 1000) `t`\n" |
| 153 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]).*')"; |
| 154 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 155 | + } |
| 156 | + |
| 157 | + @Test |
| 158 | + public void testRexWithAggregation() { |
| 159 | + String ppl = "source=EMP | rex field=JOB '(?<jobtype>\\w+)' | stats count() by jobtype"; |
| 160 | + RelNode root = getRelNode(ppl); |
| 161 | + String expectedLogical = |
| 162 | + "LogicalProject(count()=[$1], jobtype=[$0])\n" |
| 163 | + + " LogicalAggregate(group=[{0}], count()=[COUNT()])\n" |
| 164 | + + " LogicalProject(jobtype=[REX_EXTRACT($2, '(?<jobtype>\\w+)', 1)])\n" |
| 165 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($2, '(\\w+)')])\n" |
| 166 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 167 | + verifyLogical(root, expectedLogical); |
| 168 | + |
| 169 | + String expectedSparkSql = |
| 170 | + "SELECT COUNT(*) `count()`, `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1) `jobtype`\n" |
| 171 | + + "FROM `scott`.`EMP`\n" |
| 172 | + + "WHERE REGEXP_CONTAINS(`JOB`, '(\\w+)')\n" |
| 173 | + + "GROUP BY `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1)"; |
| 174 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 175 | + } |
| 176 | + |
| 177 | + @Test |
| 178 | + public void testRexComplexPattern() { |
| 179 | + String ppl = |
| 180 | + "source=EMP | rex field=ENAME '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)' | fields ENAME," |
| 181 | + + " prefix, suffix"; |
| 182 | + RelNode root = getRelNode(ppl); |
| 183 | + String expectedLogical = |
| 184 | + "LogicalProject(ENAME=[$1], prefix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)'," |
| 185 | + + " 1)], suffix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)])\n" |
| 186 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '([A-Z]{2})([A-Z]+)')])\n" |
| 187 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 188 | + verifyLogical(root, expectedLogical); |
| 189 | + |
| 190 | + String expectedSparkSql = |
| 191 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 1)" |
| 192 | + + " `prefix`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)" |
| 193 | + + " `suffix`\n" |
| 194 | + + "FROM `scott`.`EMP`\n" |
| 195 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '([A-Z]{2})([A-Z]+)')"; |
| 196 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 197 | + } |
| 198 | + |
| 199 | + @Test |
| 200 | + public void testRexWithSort() { |
| 201 | + String ppl = |
| 202 | + "source=EMP | rex field=ENAME '(?<firstletter>^.)' | fields ENAME, firstletter | sort" |
| 203 | + + " firstletter | head 5"; |
| 204 | + RelNode root = getRelNode(ppl); |
| 205 | + String expectedLogical = |
| 206 | + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first], fetch=[5])\n" |
| 207 | + + " LogicalProject(ENAME=[$1], firstletter=[REX_EXTRACT($1, '(?<firstletter>^.)'," |
| 208 | + + " 1)])\n" |
| 209 | + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(^.)')])\n" |
| 210 | + + " LogicalTableScan(table=[[scott, EMP]])\n"; |
| 211 | + verifyLogical(root, expectedLogical); |
| 212 | + |
| 213 | + String expectedSparkSql = |
| 214 | + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<firstletter>^.)', 1) `firstletter`\n" |
| 215 | + + "FROM `scott`.`EMP`\n" |
| 216 | + + "WHERE REGEXP_CONTAINS(`ENAME`, '(^.)')\n" |
| 217 | + + "ORDER BY 2\n" |
| 218 | + + "LIMIT 5"; |
| 219 | + verifyPPLToSparkSQL(root, expectedSparkSql); |
| 220 | + } |
| 221 | +} |
0 commit comments