support mode=extract and update doc

RyanL1997 · RyanL1997 · commit a6da3b1f0646 · 2025-08-28T20:03:54.000-07:00
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
diff --git a/docs/user/ppl/cmd/rex.rst b/docs/user/ppl/cmd/rex.rst
@@ -23,13 +23,16 @@ rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<strin
 
 * field: mandatory. The field must be a string field to extract data from.
 * pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using ``(?<name>pattern)`` syntax.
-* mode: optional. Either ``extract`` (default) or ``sed``. In extract mode, creates new fields from named capture groups. In sed mode, performs text substitution on the field using sed-style patterns:
+* mode: optional. Either ``extract`` (default) or ``sed``.
 
-  - ``s/pattern/replacement/`` - Replace first occurrence
-  - ``s/pattern/replacement/g`` - Replace all occurrences (global)  
-  - ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number)
-  - ``y/from_chars/to_chars/`` - Character-by-character transliteration
-  - Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement
+  - **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
+  - **sed mode**: Performs text substitution on the field using sed-style patterns:
+
+    - ``s/pattern/replacement/`` - Replace first occurrence
+    - ``s/pattern/replacement/g`` - Replace all occurrences (global)  
+    - ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number)
+    - ``y/from_chars/to_chars/`` - Character-by-character transliteration
+    - Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement
 
 * max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays.
 * offset_field: optional string. Field name to store the character offset positions of matches.
diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4
@@ -257,6 +257,7 @@ rexOption
     : MAX_MATCH EQUAL maxMatch=integerLiteral
     | OFFSET_FIELD EQUAL offsetField=qualifiedName
     | MODE EQUAL SED
+    | MODE EQUAL EXTRACT
     ;
 patternsMethod
    : PUNCT
diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
@@ -929,6 +929,9 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx)
       if (optionCtx.MODE() != null && optionCtx.SED() != null) {
         mode = Rex.RexMode.SED;
       }
+      if (optionCtx.MODE() != null && optionCtx.EXTRACT() != null) {
+        mode = Rex.RexMode.EXTRACT;
+      }
     }
 
     return new Rex(field, pattern, maxMatch, offsetField, mode);
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java
@@ -278,4 +278,56 @@ public void testRexSedModeComplexPattern() {
             + "FROM `scott`.`EMP`";
     verifyPPLToSparkSQL(root, expectedSparkSql);
   }
+
+  @Test
+  public void testRexModeExtractExplicit() {
+    String ppl =
+        "source=EMP | rex field=ENAME mode=extract '(?<first>[A-Z]).*' | fields ENAME, first";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], first=[REGEXP_EXTRACT($1, '(?<first>[A-Z]).*', 1)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, REGEXP_EXTRACT(`ENAME`, '(?<first>[A-Z]).*', 1) `first`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexModeExtractMultipleGroups() {
+    String ppl =
+        "source=EMP | rex field=ENAME mode=extract '(?<first>[A-Z])(?<rest>.*)' | fields ENAME,"
+            + " first, rest";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 1)],"
+            + " rest=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 2)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 1) `first`,"
+            + " `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 2) `rest`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexModeExtractWithMaxMatch() {
+    String ppl =
+        "source=EMP | rex field=ENAME mode=extract '(?<letter>[A-Z])' max_match=3 | fields ENAME,"
+            + " letter";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 3)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 3) `letter`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -257,6 +257,7 @@ rexOption`
`257`	`257`	`: MAX_MATCH EQUAL maxMatch=integerLiteral`
`258`	`258`	`\| OFFSET_FIELD EQUAL offsetField=qualifiedName`
`259`	`259`	`\| MODE EQUAL SED`
	`260`	`+ \| MODE EQUAL EXTRACT`
`260`	`261`	`;`
`261`	`262`	`patternsMethod`
`262`	`263`	`: PUNCT`
Original file line number	Diff line number	Diff line change
`@@ -929,6 +929,9 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx)`
`929`	`929`	`if (optionCtx.MODE() != null && optionCtx.SED() != null) {`
`930`	`930`	`mode = Rex.RexMode.SED;`
`931`	`931`	`}`
	`932`	`+ if (optionCtx.MODE() != null && optionCtx.EXTRACT() != null) {`
	`933`	`+ mode = Rex.RexMode.EXTRACT;`
	`934`	`+ }`
`932`	`935`	`}`
`933`	`936`
`934`	`937`	`return new Rex(field, pattern, maxMatch, offsetField, mode);`