Skip to content

Commit ec6da64

Browse files
committed
[Analytics Backend / DataFusion] Append "g" flag in RegexpReplaceAdapter for 3-arg calls
DataFusion's regexp_replace defaults to first-match-only without an explicit flag; Calcite's REGEXP_REPLACE_3 is already replace-all. PPL relies on the Calcite contract (every match replaced — used by SIMPLE patterns, regex_replace, rex mode=sed), so on the DataFusion path the adapter now rewrites every 3-arg REGEXP_REPLACE_3 to 4-arg REGEXP_REPLACE_PG_4(..., "g"), preserving the same end-user semantics across backends.

Companion change in opensearch-project/sql#5467: the SQL core no longer emits the 'g' flag itself — that DataFusion-specific concern now lives only here.

Two existing unit tests were updated to expect the new always-global behavior (testAdaptPassesThroughWhenNoQuoteBlock → testAdaptAppendsGlobalFlagFor3Arg, testAdaptPassesThroughNonLiteralPattern → testAdaptAppendsGlobalFlagForNonLiteralPattern). The \Q rewrite, $N backreference rewrite, and 4-arg-with-flags pass-through are unchanged.

Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 9c7d359 commit ec6da64

2 files changed

Lines changed: 33 additions & 12 deletions

File tree

sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/RegexpReplaceAdapter.java

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import org.apache.calcite.rex.RexCall;
1414
import org.apache.calcite.rex.RexLiteral;
1515
import org.apache.calcite.rex.RexNode;
16+
import org.apache.calcite.sql.fun.SqlLibraryOperators;
17+
import org.apache.calcite.sql.type.SqlTypeName;
1618
import org.opensearch.analytics.spi.FieldStorageInfo;
1719
import org.opensearch.analytics.spi.ScalarFunctionAdapter;
1820

@@ -96,22 +98,35 @@ public RexNode adapt(RexCall original, List<FieldStorageInfo> fieldStorage, RelO
9698
}
9799
}
98100

99-
if (rewrittenPattern == null && rewrittenReplacement == null) {
101+
// Append "g" so DataFusion's regexp_replace (first-match-only by default) replaces
102+
// every match — matching PPL's contract on the V2/Calcite path where the 3-arg form
103+
// is already replace-all. Pure 3-arg calls become REGEXP_REPLACE_PG_4(..., "g").
104+
boolean appendGlobalFlag = original.getOperator() == SqlLibraryOperators.REGEXP_REPLACE_3
105+
&& original.getOperands().size() == 3;
106+
107+
if (rewrittenPattern == null && rewrittenReplacement == null && !appendGlobalFlag) {
100108
return original;
101109
}
102110

103111
RexBuilder rexBuilder = cluster.getRexBuilder();
104112
// makeLiteral(String) infers a CHAR type sized to the rewritten string. Reusing the
105113
// original literal's type would right-pad to the OLD length (e.g. CHAR(23) → 8 trailing
106114
// spaces after a 15-char rewrite), corrupting the value at runtime.
107-
List<RexNode> newOperands = new ArrayList<>(original.getOperands().size());
115+
List<RexNode> newOperands = new ArrayList<>(original.getOperands().size() + (appendGlobalFlag ? 1 : 0));
108116
newOperands.add(original.getOperands().get(0));
109117
newOperands.add(rewrittenPattern != null ? rexBuilder.makeLiteral(rewrittenPattern) : patternOperand);
110118
newOperands.add(rewrittenReplacement != null ? rexBuilder.makeLiteral(rewrittenReplacement) : replacementOperand);
111119
// Append any trailing operand (the flags string in the 4-arg form) verbatim.
112120
for (int i = 3; i < original.getOperands().size(); i++) {
113121
newOperands.add(original.getOperands().get(i));
114122
}
123+
if (appendGlobalFlag) {
124+
newOperands.add(rexBuilder.makeLiteral(
125+
"g",
126+
rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR),
127+
true));
128+
return rexBuilder.makeCall(original.getType(), SqlLibraryOperators.REGEXP_REPLACE_PG_4, newOperands);
129+
}
115130
return rexBuilder.makeCall(original.getType(), original.getOperator(), newOperands);
116131
}
117132

sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/RegexpReplaceAdapterTests.java

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -165,30 +165,36 @@ public void testAdaptRewritesPatternLiteral() {
165165
assertEquals("Java \\Q…\\E rewritten to plain regex", "^BUSINESS(.*?)$", ((RexLiteral) newPatternNode).getValueAs(String.class));
166166
}
167167

168-
public void testAdaptPassesThroughWhenNoQuoteBlock() {
169-
// Pattern doesn't contain \Q — adapter must return the call unchanged (identity).
168+
public void testAdaptAppendsGlobalFlagFor3Arg() {
169+
// 3-arg REGEXP_REPLACE_3 with no \Q / $N rewrites still gets rewritten to 4-arg
170+
// REGEXP_REPLACE_PG_4 with "g" so DataFusion's regexp_replace (first-match-only by
171+
// default) matches Calcite's already-replace-all 3-arg semantics.
170172
RexNode field = rexBuilder.makeInputRef(varcharType, 0);
171173
RexNode pattern = rexBuilder.makeLiteral("^OFFICE.*$");
172174
RexNode replacement = rexBuilder.makeLiteral("OFC");
173175
RexCall original = (RexCall) rexBuilder.makeCall(SqlLibraryOperators.REGEXP_REPLACE_3, List.of(field, pattern, replacement));
174176

175-
RexNode adapted = adapter.adapt(original, List.of(), cluster);
177+
RexCall adapted = (RexCall) adapter.adapt(original, List.of(), cluster);
176178

177-
assertSame("identity — no rewrite when pattern has no \\Q", original, adapted);
179+
assertSame("operator switched to PG_4", SqlLibraryOperators.REGEXP_REPLACE_PG_4, adapted.getOperator());
180+
assertEquals("4 operands after append", 4, adapted.getOperands().size());
181+
assertTrue("trailing operand is a literal", adapted.getOperands().get(3) instanceof RexLiteral);
182+
assertEquals("trailing flag is \"g\"", "g", ((RexLiteral) adapted.getOperands().get(3)).getValueAs(String.class));
178183
}
179184

180-
public void testAdaptPassesThroughNonLiteralPattern() {
181-
// Pattern is a column reference (not a literal) — adapter cannot rewrite at planning
182-
// time; pass through and let DataFusion error at runtime if the value is incompatible.
183-
// Replacement is a plain literal with no $, so neither transform fires.
185+
public void testAdaptAppendsGlobalFlagForNonLiteralPattern() {
186+
// Pattern is a column reference (not a literal) — \Q / $N rewrites can't fire, but the
187+
// 3-arg → 4-arg-with-"g" rewrite still applies so DataFusion replaces every match.
184188
RexNode field = rexBuilder.makeInputRef(varcharType, 0);
185189
RexNode patternRef = rexBuilder.makeInputRef(varcharType, 1);
186190
RexNode replacement = rexBuilder.makeLiteral("X");
187191
RexCall original = (RexCall) rexBuilder.makeCall(SqlLibraryOperators.REGEXP_REPLACE_3, List.of(field, patternRef, replacement));
188192

189-
RexNode adapted = adapter.adapt(original, List.of(), cluster);
193+
RexCall adapted = (RexCall) adapter.adapt(original, List.of(), cluster);
190194

191-
assertSame("non-literal pattern must pass through", original, adapted);
195+
assertSame("operator switched to PG_4", SqlLibraryOperators.REGEXP_REPLACE_PG_4, adapted.getOperator());
196+
assertEquals("pattern reference preserved", patternRef, adapted.getOperands().get(1));
197+
assertEquals("4 operands", 4, adapted.getOperands().size());
192198
}
193199

194200
public void testAdaptRewritesReplacementOnly() {

0 commit comments

Comments
 (0)