Skip to content

Commit 6b0dc64

Browse files
committed
Add max match limit implementation
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 1da67a4 commit 6b0dc64

6 files changed

Lines changed: 233 additions & 2 deletions

File tree

common/src/main/java/org/opensearch/sql/common/setting/Settings.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public enum Key {
2929
PATTERN_MODE("plugins.ppl.pattern.mode"),
3030
PATTERN_MAX_SAMPLE_COUNT("plugins.ppl.pattern.max.sample.count"),
3131
PATTERN_BUFFER_LIMIT("plugins.ppl.pattern.buffer.limit"),
32+
PPL_REX_MAX_MATCH_LIMIT("plugins.ppl.rex.max_match.limit"),
3233

3334
/** Enable Calcite as execution engine */
3435
CALCITE_ENGINE_ENABLED("plugins.calcite.enabled"),

docs/user/ppl/cmd/rex.rst

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ rex field=<field> <pattern> [max_match=<int>]
2323

2424
* field: mandatory. The field must be a string field to extract data from.
2525
* pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using ``(?<name>pattern)`` syntax.
26-
* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays.
26+
* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. A value of 0 requests unlimited matches, but is automatically capped at the configured limit (default: 10, configurable via ``plugins.ppl.rex.max_match.limit``). A value greater than the configured limit is rejected with an error.
2727

2828
Example 1: Basic Field Extraction
2929
==================================
@@ -135,6 +135,28 @@ Correct PPL query without underscores::
135135
+-----------------------+------------+-------------+
136136

137137

138+
Example 7: Max Match Limit Protection
139+
======================================
140+
141+
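This example demonstrates the ``max_match`` limit protection mechanism. When ``max_match=0`` (unlimited) is specified, the system automatically caps it at the configured limit to prevent memory exhaustion; an explicit value above the configured limit is rejected with an error.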
142+
143+
PPL query with max_match=0 automatically capped to default limit of 10::
144+
145+
os> source=accounts | rex field=address "(?<digit>\\d*)" max_match=0 | eval digit_count=array_length(digit) | fields address, digit_count | head 1 ;
146+
fetched rows / total rows = 1/1
147+
+-----------------+-------------+
148+
| address | digit_count |
149+
|-----------------+-------------|
150+
| 880 Holmes Lane | 10 |
151+
+-----------------+-------------+
152+
153+
PPL query exceeding the configured limit results in an error::
154+
155+
os> source=accounts | rex field=address "(?<digit>\\d*)" max_match=100 | fields address, digit | head 1 ;
156+
{'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'}
157+
Error: Query returned no data
158+
159+
138160
Comparison with Related Commands
139161
================================
140162

@@ -164,3 +186,10 @@ There are several important limitations with the rex command:
164186

165187
- Pattern must contain at least one named capture group
166188
- Regular capture groups ``(...)`` without names are not allowed
189+
190+
**Max Match Limit:**
191+
192+
- The ``max_match`` parameter is subject to a configurable system limit to prevent memory exhaustion
193+
- When ``max_match=0`` (unlimited) is specified, it is automatically capped at the configured limit (default: 10)
194+
- User-specified values exceeding the configured limit will result in an error
195+
- Users can adjust the limit via the ``plugins.ppl.rex.max_match.limit`` cluster setting. Setting this limit to a large value is not recommended as it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., ``\d*``, ``\w*``)

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import java.io.IOException;
1111
import org.json.JSONObject;
1212
import org.junit.jupiter.api.Test;
13+
import org.opensearch.sql.common.setting.Settings;
1314
import org.opensearch.sql.ppl.PPLIntegTestCase;
1415

1516
public class CalciteRexCommandIT extends PPLIntegTestCase {
@@ -147,4 +148,98 @@ public void testRexWithStatsCommand() throws IOException {
147148
assertFalse(domain.contains("@"));
148149
assertTrue(domain.matches("[a-z]+"));
149150
}
151+
152+
@Test
153+
public void testRexMaxMatchZeroLimitedToDefaultTen() throws IOException {
154+
JSONObject result =
155+
executeQuery(
156+
String.format(
157+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=0 | eval"
158+
+ " digit_count=array_length(digit) | fields address, digit_count | head 1",
159+
TEST_INDEX_ACCOUNT));
160+
161+
assertEquals(1, result.getJSONArray("datarows").length());
162+
// Should be capped at 10 matches
163+
assertEquals(10, result.getJSONArray("datarows").getJSONArray(0).get(1));
164+
}
165+
166+
@Test
167+
public void testRexMaxMatchExceedsDefaultLimit() throws IOException {
168+
try {
169+
executeQuery(
170+
String.format(
171+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d+)\\\" max_match=100 | fields"
172+
+ " address, digit",
173+
TEST_INDEX_ACCOUNT));
174+
fail("Should have thrown an exception for max_match exceeding default limit");
175+
} catch (Exception e) {
176+
assertTrue(e.getMessage().contains("exceeds the configured limit (10)"));
177+
assertTrue(e.getMessage().contains("Consider using a smaller max_match value"));
178+
}
179+
}
180+
181+
@Test
182+
public void testRexMaxMatchWithinDefaultLimit() throws IOException {
183+
JSONObject result =
184+
executeQuery(
185+
String.format(
186+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=5 | eval"
187+
+ " digit_count=array_length(digit) | fields address, digit_count | head 1",
188+
TEST_INDEX_ACCOUNT));
189+
190+
assertEquals(1, result.getJSONArray("datarows").length());
191+
// Should respect the specified limit of 5
192+
assertEquals(5, result.getJSONArray("datarows").getJSONArray(0).get(1));
193+
}
194+
195+
@Test
196+
public void testRexMaxMatchAtDefaultLimit() throws IOException {
197+
JSONObject result =
198+
executeQuery(
199+
String.format(
200+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=10 | eval"
201+
+ " digit_count=array_length(digit) | fields address, digit_count | head 1",
202+
TEST_INDEX_ACCOUNT));
203+
204+
assertEquals(1, result.getJSONArray("datarows").length());
205+
// Should accept exactly the limit
206+
assertEquals(10, result.getJSONArray("datarows").getJSONArray(0).get(1));
207+
}
208+
209+
@Test
210+
public void testRexMaxMatchConfigurableLimit() throws IOException {
211+
// Set a custom limit of 5
212+
updateClusterSettings(
213+
new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), "5"));
214+
215+
try {
216+
// Test that max_match=0 is capped to the new limit
217+
JSONObject result =
218+
executeQuery(
219+
String.format(
220+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=0 | eval"
221+
+ " digit_count=array_length(digit) | fields address, digit_count | head 1",
222+
TEST_INDEX_ACCOUNT));
223+
224+
assertEquals(1, result.getJSONArray("datarows").length());
225+
// Should be capped at the configured limit of 5
226+
assertEquals(5, result.getJSONArray("datarows").getJSONArray(0).get(1));
227+
228+
// Test that exceeding the custom limit throws an error
229+
try {
230+
executeQuery(
231+
String.format(
232+
"source=%s | rex field=address \\\"(?<digit>\\\\\\\\d+)\\\" max_match=10 | fields"
233+
+ " address, digit",
234+
TEST_INDEX_ACCOUNT));
235+
fail("Should have thrown an exception for max_match exceeding custom limit");
236+
} catch (Exception e) {
237+
assertTrue(e.getMessage().contains("exceeds the configured limit (5)"));
238+
assertTrue(e.getMessage().contains("adjust the plugins.ppl.rex.max_match.limit setting"));
239+
}
240+
} finally {
241+
updateClusterSettings(
242+
new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), null));
243+
}
244+
}
150245
}

opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,14 @@ public class OpenSearchSettings extends Settings {
9999
Setting.Property.NodeScope,
100100
Setting.Property.Dynamic);
101101

102+
/**
 * Integer cluster setting, keyed by {@code Key.PPL_REX_MAX_MATCH_LIMIT}, holding the upper bound
 * for the rex command's {@code max_match} argument. Declared node-scoped and dynamic.
 */
public static final Setting<?> PPL_REX_MAX_MATCH_LIMIT_SETTING =
103+
Setting.intSetting(
104+
Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(),
105+
10, // default limit
106+
1, // presumably the minimum accepted value (intSetting's minValue argument) -- confirm
107+
Setting.Property.NodeScope,
108+
Setting.Property.Dynamic);
109+
102110
public static final Setting<?> CALCITE_ENGINE_ENABLED_SETTING =
103111
Setting.boolSetting(
104112
Key.CALCITE_ENGINE_ENABLED.getKeyValue(),
@@ -327,6 +335,12 @@ public OpenSearchSettings(ClusterSettings clusterSettings) {
327335
Key.PATTERN_BUFFER_LIMIT,
328336
DEFAULT_PATTERN_BUFFER_LIMIT_SETTING,
329337
new Updater(Key.PATTERN_BUFFER_LIMIT));
338+
register(
339+
settingBuilder,
340+
clusterSettings,
341+
Key.PPL_REX_MAX_MATCH_LIMIT,
342+
PPL_REX_MAX_MATCH_LIMIT_SETTING,
343+
new Updater(Key.PPL_REX_MAX_MATCH_LIMIT));
330344
register(
331345
settingBuilder,
332346
clusterSettings,
@@ -531,6 +545,7 @@ public static List<Setting<?>> pluginSettings() {
531545
.add(DEFAULT_PATTERN_MODE_SETTING)
532546
.add(DEFAULT_PATTERN_MAX_SAMPLE_COUNT_SETTING)
533547
.add(DEFAULT_PATTERN_BUFFER_LIMIT_SETTING)
548+
.add(PPL_REX_MAX_MATCH_LIMIT_SETTING)
534549
.add(QUERY_MEMORY_LIMIT_SETTING)
535550
.add(QUERY_SIZE_LIMIT_SETTING)
536551
.add(METRICS_ROLLING_WINDOW_SETTING)

ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -982,7 +982,31 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx)
982982
}
983983
}
984984

985-
return new Rex(field, pattern, mode, maxMatch);
985+
int maxMatchLimit =
986+
(settings != null) ? settings.getSettingValue(Settings.Key.PPL_REX_MAX_MATCH_LIMIT) : 10;
987+
988+
int effectiveMaxMatch;
989+
if (maxMatch.isPresent()) {
990+
if (maxMatch.get() == 0) {
991+
effectiveMaxMatch = maxMatchLimit;
992+
} else if (maxMatch.get() > maxMatchLimit) {
993+
throw new IllegalArgumentException(
994+
String.format(
995+
"Rex command max_match value (%d) exceeds the configured limit (%d). "
996+
+ "Consider using a smaller max_match value"
997+
+ (settings != null
998+
? " or adjust the plugins.ppl.rex.max_match.limit setting."
999+
: "."),
1000+
maxMatch.get(),
1001+
maxMatchLimit));
1002+
} else {
1003+
effectiveMaxMatch = maxMatch.get();
1004+
}
1005+
} else {
1006+
effectiveMaxMatch = 1;
1007+
}
1008+
1009+
return new Rex(field, pattern, mode, Optional.of(effectiveMaxMatch));
9861010
}
9871011

9881012
/** Get original text in query. */

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,24 @@
55

66
package org.opensearch.sql.ppl.calcite;
77

8+
import static org.mockito.Mockito.doReturn;
9+
810
import org.apache.calcite.rel.RelNode;
911
import org.apache.calcite.test.CalciteAssert;
12+
import org.junit.Before;
1013
import org.junit.Test;
14+
import org.opensearch.sql.common.setting.Settings;
1115

1216
public class CalcitePPLRexTest extends CalcitePPLAbstractTest {
1317
/** Creates the test fixture on top of the {@code SCOTT_WITH_TEMPORAL} Calcite schema spec. */
public CalcitePPLRexTest() {
1418
super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL);
1519
}
1620

21+
@Before
22+
public void setUp() {
23+
doReturn(10).when(settings).getSettingValue(Settings.Key.PPL_REX_MAX_MATCH_LIMIT);
24+
}
25+
1726
@Test
1827
public void testRexBasicFieldExtraction() {
1928
String ppl = "source=EMP | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first";
@@ -159,4 +168,62 @@ public void testRexWithSort() {
159168
+ "LIMIT 5";
160169
verifyPPLToSparkSQL(root, expectedSparkSql);
161170
}
171+
172+
@Test
173+
public void testRexWithMaxMatchZero() {
174+
// Test that max_match=0 (unlimited) is capped to the configured limit
175+
String ppl =
176+
"source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=0 | fields ENAME, letter";
177+
RelNode root = getRelNode(ppl);
178+
String expectedLogical =
179+
"LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 10)])\n"
180+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
181+
verifyLogical(root, expectedLogical);
182+
183+
String expectedSparkSql =
184+
"SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 10) `letter`\n"
185+
+ "FROM `scott`.`EMP`";
186+
verifyPPLToSparkSQL(root, expectedSparkSql);
187+
}
188+
189+
@Test(expected = IllegalArgumentException.class)
190+
public void testRexWithMaxMatchExceedsLimit() {
191+
// Test that max_match exceeding the configured limit throws an exception
192+
String ppl =
193+
"source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=100 | fields ENAME, letter";
194+
getRelNode(ppl);
195+
}
196+
197+
@Test
198+
public void testRexWithMaxMatchWithinLimit() {
199+
String ppl =
200+
"source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=5 | fields ENAME, letter";
201+
RelNode root = getRelNode(ppl);
202+
String expectedLogical =
203+
"LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 5)])\n"
204+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
205+
verifyLogical(root, expectedLogical);
206+
207+
String expectedSparkSql =
208+
"SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 5) `letter`\n"
209+
+ "FROM `scott`.`EMP`";
210+
verifyPPLToSparkSQL(root, expectedSparkSql);
211+
}
212+
213+
@Test
214+
public void testRexWithMaxMatchAtLimit() {
215+
// Test that max_match exactly at the limit works
216+
String ppl =
217+
"source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=10 | fields ENAME, letter";
218+
RelNode root = getRelNode(ppl);
219+
String expectedLogical =
220+
"LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 10)])\n"
221+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
222+
verifyLogical(root, expectedLogical);
223+
224+
String expectedSparkSql =
225+
"SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 10) `letter`\n"
226+
+ "FROM `scott`.`EMP`";
227+
verifyPPLToSparkSQL(root, expectedSparkSql);
228+
}
162229
}

0 commit comments

Comments
 (0)