Skip to content

Commit a6da3b1

Browse files
committed
support mode=extract and update doc
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 4a70b52 commit a6da3b1

4 files changed

Lines changed: 65 additions & 6 deletions

File tree

docs/user/ppl/cmd/rex.rst

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,16 @@ rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<strin
2323

2424
* field: mandatory. The field must be a string field to extract data from.
2525
* pattern: mandatory string. In extract mode, a regular expression that must contain at least one named capture group using ``(?<name>pattern)`` syntax; each named capture group is extracted into a new field. In sed mode, a sed-style expression as described under ``mode`` below.
26-
* mode: optional. Either ``extract`` (default) or ``sed``. In extract mode, creates new fields from named capture groups. In sed mode, performs text substitution on the field using sed-style patterns:
26+
* mode: optional. Either ``extract`` (default) or ``sed``.
2727

28-
- ``s/pattern/replacement/`` - Replace first occurrence
29-
- ``s/pattern/replacement/g`` - Replace all occurrences (global)
30-
- ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number)
31-
- ``y/from_chars/to_chars/`` - Character-by-character transliteration
32-
- Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement
28+
- **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
29+
- **sed mode**: Performs text substitution on the field using sed-style patterns:
30+
31+
- ``s/pattern/replacement/`` - Replace first occurrence
32+
- ``s/pattern/replacement/g`` - Replace all occurrences (global)
33+
- ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number)
34+
- ``y/from_chars/to_chars/`` - Character-by-character transliteration
35+
- Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement
3336

3437
* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays.
3538
* offset_field: optional string. Field name to store the character offset positions of matches.

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ rexOption
257257
: MAX_MATCH EQUAL maxMatch=integerLiteral
258258
| OFFSET_FIELD EQUAL offsetField=qualifiedName
259259
| MODE EQUAL SED
260+
| MODE EQUAL EXTRACT
260261
;
261262
patternsMethod
262263
: PUNCT

ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -929,6 +929,9 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx)
929929
if (optionCtx.MODE() != null && optionCtx.SED() != null) {
930930
mode = Rex.RexMode.SED;
931931
}
932+
if (optionCtx.MODE() != null && optionCtx.EXTRACT() != null) {
933+
mode = Rex.RexMode.EXTRACT;
934+
}
932935
}
933936

934937
return new Rex(field, pattern, maxMatch, offsetField, mode);

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,4 +278,56 @@ public void testRexSedModeComplexPattern() {
278278
+ "FROM `scott`.`EMP`";
279279
verifyPPLToSparkSQL(root, expectedSparkSql);
280280
}
281+
282+
@Test
283+
public void testRexModeExtractExplicit() {
284+
String ppl =
285+
"source=EMP | rex field=ENAME mode=extract '(?<first>[A-Z]).*' | fields ENAME, first";
286+
RelNode root = getRelNode(ppl);
287+
String expectedLogical =
288+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)])\n"
289+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
290+
verifyLogical(root, expectedLogical);
291+
292+
String expectedSparkSql =
293+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`\n"
294+
+ "FROM `scott`.`EMP`";
295+
verifyPPLToSparkSQL(root, expectedSparkSql);
296+
}
297+
298+
@Test
299+
public void testRexModeExtractMultipleGroups() {
300+
String ppl =
301+
"source=EMP | rex field=ENAME mode=extract '(?<first>[A-Z])(?<rest>.*)' | fields ENAME,"
302+
+ " first, rest";
303+
RelNode root = getRelNode(ppl);
304+
String expectedLogical =
305+
"LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 1)],"
306+
+ " rest=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 2)])\n"
307+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
308+
verifyLogical(root, expectedLogical);
309+
310+
String expectedSparkSql =
311+
"SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 1) `first`,"
312+
+ " `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 2) `rest`\n"
313+
+ "FROM `scott`.`EMP`";
314+
verifyPPLToSparkSQL(root, expectedSparkSql);
315+
}
316+
317+
@Test
318+
public void testRexModeExtractWithMaxMatch() {
319+
String ppl =
320+
"source=EMP | rex field=ENAME mode=extract '(?<letter>[A-Z])' max_match=3 | fields ENAME,"
321+
+ " letter";
322+
RelNode root = getRelNode(ppl);
323+
String expectedLogical =
324+
"LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 3)])\n"
325+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
326+
verifyLogical(root, expectedLogical);
327+
328+
String expectedSparkSql =
329+
"SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 3) `letter`\n"
330+
+ "FROM `scott`.`EMP`";
331+
verifyPPLToSparkSQL(root, expectedSparkSql);
332+
}
281333
}

0 commit comments

Comments
 (0)