Skip to content

Commit 6bb8b7a

Browse files
committed
feat(core): emit 4-arg regexp_replace with 'g' flag for SIMPLE patterns
`buildParseRelNode` for `ParseMethod.PATTERNS` lowered through PPL's REPLACE handler, which always emits Calcite's 3-arg `REGEXP_REPLACE_3`. That works on the V2 / Calcite path (Calcite's default is replace-all), but the analytics-engine route converts the call to substrait + DataFusion, and DataFusion's `regexp_replace` defaults to first-match-only without an explicit "g" flag. The dashboard test for `source = bank | patterns email mode=label` returned `<*>@pyrami.com` instead of `<*>@<*>.<*>` because only the first `[a-zA-Z0-9]+` run was replaced. Bypass the REPLACE handler for the PATTERNS branch and emit `REGEXP_REPLACE_PG_4` directly with a constant "g" flag. Same semantics on V2 / Calcite (Calcite's REGEXP_REPLACE_PG_4 with "g" = replace-all); fixes the analytics-engine path. CalcitePPLPatternsTest plan-string expectations updated to match the 4-arg form. 17/17 unit tests pass. IT result on analytics-engine route: testSimplePatternLabelMode_NotShowNumberedToken now passes. Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 8ac6c07 commit 6bb8b7a

2 files changed

Lines changed: 50 additions & 33 deletions

File tree

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4301,9 +4301,26 @@ private void buildParseRelNode(Parse node, CalcitePlanContext context) {
43014301
}
43024302
List<RexNode> newFields = new ArrayList<>();
43034303
for (String groupCandidate : groupCandidates) {
4304-
RexNode innerRex =
4305-
PPLFuncImpTable.INSTANCE.resolve(
4306-
context.rexBuilder, ParseUtils.BUILTIN_FUNCTION_MAP.get(parseMethod), rexNodeList);
4304+
RexNode innerRex;
4305+
if (ParseMethod.PATTERNS.equals(parseMethod)) {
4306+
// Emit `regexp_replace(field, pattern, replacement, "g")` directly so the replacement
4307+
// is global (every match replaced). DataFusion's `regexp_replace` defaults to FIRST
4308+
// match only without the "g" flag — using the 3-arg form via the REPLACE handler
4309+
// produces `<*>@pyrami.com` instead of `<*>@<*>.<*>` on the analytics-engine route.
4310+
// Calcite's REGEXP_REPLACE_PG_4 with "g" matches what `replaceAll` does, so V2 /
4311+
// Calcite-path semantics are preserved.
4312+
RexNode globalFlag =
4313+
context.rexBuilder.makeLiteral(
4314+
"g", context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true);
4315+
innerRex =
4316+
context.rexBuilder.makeCall(
4317+
SqlLibraryOperators.REGEXP_REPLACE_PG_4,
4318+
ArrayUtils.add(rexNodeList, globalFlag));
4319+
} else {
4320+
innerRex =
4321+
PPLFuncImpTable.INSTANCE.resolve(
4322+
context.rexBuilder, ParseUtils.BUILTIN_FUNCTION_MAP.get(parseMethod), rexNodeList);
4323+
}
43074324
if (!ParseMethod.PATTERNS.equals(parseMethod)) {
43084325
newFields.add(
43094326
PPLFuncImpTable.INSTANCE.resolve(

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLPatternsTest.java

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ public void testPatternsLabelMode_NotShowNumberedToken_ForSimplePatternMethod()
3535
String expectedLogical =
3636
"LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS"
3737
+ " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
38-
+ " '<*>':VARCHAR))])\n"
38+
+ " '<*>':VARCHAR, 'g':VARCHAR))])\n"
3939
+ " LogicalTableScan(table=[[scott, EMP]])\n";
4040
verifyLogical(root, expectedLogical);
4141

4242
String expectedSparkSql =
4343
"SELECT `ENAME`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
44-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END `patterns_field`\n"
44+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END `patterns_field`\n"
4545
+ "FROM `scott`.`EMP`";
4646
verifyPPLToSparkSQL(root, expectedSparkSql);
4747
}
@@ -56,18 +56,18 @@ public void testPatternsLabelMode_ShowNumberedToken_ForSimplePatternMethod() {
5656
String expectedLogical =
5757
"LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1,"
5858
+ " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1,"
59-
+ " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR)), $1), 'pattern'))],"
59+
+ " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR, 'g':VARCHAR)), $1), 'pattern'))],"
6060
+ " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS"
6161
+ " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
62-
+ " '<*>':VARCHAR)), $1), 'tokens'))])\n"
62+
+ " '<*>':VARCHAR, 'g':VARCHAR)), $1), 'tokens'))])\n"
6363
+ " LogicalTableScan(table=[[scott, EMP]])\n";
6464
verifyLogical(root, expectedLogical);
6565

6666
String expectedSparkSql =
6767
"SELECT `ENAME`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN"
68-
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `ENAME`)['pattern'] AS"
68+
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END, `ENAME`)['pattern'] AS"
6969
+ " STRING) `patterns_field`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR"
70-
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END,"
70+
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END,"
7171
+ " `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n"
7272
+ "FROM `scott`.`EMP`";
7373
verifyPPLToSparkSQL(root, expectedSparkSql);
@@ -83,18 +83,18 @@ public void testPatternsLabelModeWithCustomPattern_ShowNumberedToken_ForSimplePa
8383
String expectedLogical =
8484
"LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1,"
8585
+ " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1,"
86-
+ " '[A-H]':VARCHAR, '<*>':VARCHAR)), $1), 'pattern'))],"
86+
+ " '[A-H]':VARCHAR, '<*>':VARCHAR, 'g':VARCHAR)), $1), 'pattern'))],"
8787
+ " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS"
88-
+ " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR)),"
88+
+ " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR, 'g':VARCHAR)),"
8989
+ " $1), 'tokens'))])\n"
9090
+ " LogicalTableScan(table=[[scott, EMP]])\n";
9191
verifyLogical(root, expectedLogical);
9292

9393
String expectedSparkSql =
9494
"SELECT `ENAME`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN"
95-
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END, `ENAME`)['pattern'] AS STRING)"
95+
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>', 'g') END, `ENAME`)['pattern'] AS STRING)"
9696
+ " `patterns_field`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` ="
97-
+ " '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END, `ENAME`)['tokens'] AS"
97+
+ " '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>', 'g') END, `ENAME`)['tokens'] AS"
9898
+ " MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n"
9999
+ "FROM `scott`.`EMP`";
100100
verifyPPLToSparkSQL(root, expectedSparkSql);
@@ -108,13 +108,13 @@ public void testPatternsLabelModeWithCustomField_NotShowNumberedToken_ForSimpleP
108108

109109
String expectedLogical =
110110
"LogicalProject(ENAME=[$1], upper=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR),"
111-
+ " '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR))])\n"
111+
+ " '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR, 'g':VARCHAR))])\n"
112112
+ " LogicalTableScan(table=[[scott, EMP]])\n";
113113
verifyLogical(root, expectedLogical);
114114

115115
String expectedSparkSql =
116116
"SELECT `ENAME`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
117-
+ " REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END `upper`\n"
117+
+ " REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>', 'g') END `upper`\n"
118118
+ "FROM `scott`.`EMP`";
119119
verifyPPLToSparkSQL(root, expectedSparkSql);
120120
}
@@ -130,19 +130,19 @@ public void testPatternsLabelModeWithPartitionBy_ShowNumberedToken_SimplePattern
130130
"LogicalProject(ENAME=[$1], DEPTNO=[$7],"
131131
+ " patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL"
132132
+ " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
133-
+ " '<*>':VARCHAR)), $1), 'pattern'))],"
133+
+ " '<*>':VARCHAR, 'g':VARCHAR)), $1), 'pattern'))],"
134134
+ " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS"
135135
+ " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
136-
+ " '<*>':VARCHAR)), $1), 'tokens'))])\n"
136+
+ " '<*>':VARCHAR, 'g':VARCHAR)), $1), 'tokens'))])\n"
137137
+ " LogicalTableScan(table=[[scott, EMP]])\n";
138138
verifyLogical(root, expectedLogical);
139139

140140
String expectedSparkSql =
141141
"SELECT `ENAME`, `DEPTNO`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME`"
142-
+ " = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END,"
142+
+ " = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END,"
143143
+ " `ENAME`)['pattern'] AS STRING) `patterns_field`, TRY_CAST(PATTERN_PARSER(CASE"
144144
+ " WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`,"
145-
+ " '[a-zA-Z0-9]+', '<*>') END, `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >)"
145+
+ " '[a-zA-Z0-9]+', '<*>', 'g') END, `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >)"
146146
+ " `tokens`\n"
147147
+ "FROM `scott`.`EMP`";
148148
verifyPPLToSparkSQL(root, expectedSparkSql);
@@ -248,18 +248,18 @@ public void testPatternsAggregationMode_NotShowNumberedToken_ForSimplePatternMet
248248
"LogicalAggregate(group=[{1}], pattern_count=[COUNT($1)], sample_logs=[TAKE($0, $2)])\n"
249249
+ " LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL"
250250
+ " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
251-
+ " '<*>':VARCHAR))], $f9=[10])\n"
251+
+ " '<*>':VARCHAR, 'g':VARCHAR))], $f9=[10])\n"
252252
+ " LogicalTableScan(table=[[scott, EMP]])\n";
253253
verifyLogical(root, expectedLogical);
254254

255255
String expectedSparkSql =
256256
"SELECT CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`,"
257-
+ " '[a-zA-Z0-9]+', '<*>') END `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR"
258-
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END)"
257+
+ " '[a-zA-Z0-9]+', '<*>', 'g') END `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR"
258+
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END)"
259259
+ " `pattern_count`, `TAKE`(`ENAME`, 10) `sample_logs`\n"
260260
+ "FROM `scott`.`EMP`\n"
261261
+ "GROUP BY CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
262-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END";
262+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END";
263263
verifyPPLToSparkSQL(root, expectedSparkSql);
264264
}
265265

@@ -276,21 +276,21 @@ public void testPatternsAggregationMode_ShowNumberedToken_ForSimplePatternMethod
276276
+ " $2)])\n"
277277
+ " LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL"
278278
+ " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR,"
279-
+ " '<*>':VARCHAR))], $f9=[10])\n"
279+
+ " '<*>':VARCHAR, 'g':VARCHAR))], $f9=[10])\n"
280280
+ " LogicalTableScan(table=[[scott, EMP]])\n";
281281
verifyLogical(root, expectedLogical);
282282

283283
String expectedSparkSql =
284284
"SELECT TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
285-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`, 10))['pattern']"
285+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END, `TAKE`(`ENAME`, 10))['pattern']"
286286
+ " AS STRING) `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN"
287-
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END) `pattern_count`,"
287+
+ " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END) `pattern_count`,"
288288
+ " TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
289-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`, 10))['tokens']"
289+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END, `TAKE`(`ENAME`, 10))['tokens']"
290290
+ " AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`, `TAKE`(`ENAME`, 10) `sample_logs`\n"
291291
+ "FROM `scott`.`EMP`\n"
292292
+ "GROUP BY CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
293-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END";
293+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END";
294294
verifyPPLToSparkSQL(root, expectedSparkSql);
295295
}
296296

@@ -307,22 +307,22 @@ public void testPatternsAggregationModeWithGroupBy_ShowNumberedToken_ForSimplePa
307307
+ " $3)])\n"
308308
+ " LogicalProject(ENAME=[$1], DEPTNO=[$7], patterns_field=[CASE(SEARCH($1,"
309309
+ " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1,"
310-
+ " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))], $f9=[10])\n"
310+
+ " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR, 'g':VARCHAR))], $f9=[10])\n"
311311
+ " LogicalTableScan(table=[[scott, EMP]])\n";
312312
verifyLogical(root, expectedLogical);
313313

314314
String expectedSparkSql =
315315
"SELECT `DEPTNO`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN ''"
316-
+ " ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`,"
316+
+ " ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END, `TAKE`(`ENAME`,"
317317
+ " 10))['pattern'] AS STRING) `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR"
318-
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END)"
318+
+ " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END)"
319319
+ " `pattern_count`, TRY_CAST(PATTERN_PARSER(CASE WHEN `ENAME` IS NULL OR `ENAME` = ''"
320-
+ " THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`,"
320+
+ " THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END, `TAKE`(`ENAME`,"
321321
+ " 10))['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`, `TAKE`(`ENAME`, 10)"
322322
+ " `sample_logs`\n"
323323
+ "FROM `scott`.`EMP`\n"
324324
+ "GROUP BY `DEPTNO`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE"
325-
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END";
325+
+ " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>', 'g') END";
326326
verifyPPLToSparkSQL(root, expectedSparkSql);
327327
}
328328

0 commit comments

Comments
 (0)