Skip to content

Commit d41d952

Browse files
committed
Add Google RE2/J linear time regular expression as alternative to Java regex
1 parent 06ef24c commit d41d952

23 files changed

Lines changed: 558 additions & 20 deletions

docs/ingestion/data-formats.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,74 @@ Besides text formats, Druid also supports binary formats such as [Orc](#orc) and
7373
Druid supports custom text data formats and can use the Regex input format to parse them. However, be aware doing this to
7474
parse data is less efficient than writing a native Java `InputFormat` extension, or using an external stream processor. We welcome contributions of new input formats.
7575

76+
## Regex engine configuration
77+
78+
The `regex` input format can use different regex engines. Select the engine with the following runtime property:
79+
80+
```properties
81+
druid.regex.engine=JAVA
82+
```
83+
84+
Supported values:
85+
86+
| Value | Description |
87+
|----------|--------------------------------------------------------------------------|
88+
| `JAVA` | Uses Java's built-in `java.util.regex.Pattern` engine. |
89+
| `RE2J` | Uses Google's RE2/J regex engine, which guarantees linear-time matching. |
90+
91+
Default value:
92+
93+
```properties
94+
druid.regex.engine=JAVA
95+
```
96+
97+
### RE2/J engine
98+
99+
Setting:
100+
101+
```properties
102+
druid.regex.engine=RE2J
103+
```
104+
105+
enables the RE2/J regex engine for `regex` input formats used by ingestion tasks.
106+
107+
RE2/J helps protect against catastrophic backtracking and Regular Expression Denial of Service (ReDoS) attacks by guaranteeing linear-time regex evaluation.
108+
109+
### Compatibility differences
110+
111+
RE2/J does not support all Java regex features.
112+
113+
Unsupported or partially supported features include:
114+
- backreferences
115+
- lookahead and lookbehind assertions
116+
- other constructs that require backtracking, such as possessive quantifiers and atomic groups
117+
118+
Patterns using unsupported constructs will fail during regex compilation.
119+
120+
### Example of catastrophic backtracking
121+
122+
The following Java regex may cause catastrophic backtracking:
123+
124+
```regex
125+
^(.*a){20}$
126+
```
127+
128+
against input such as:
129+
130+
```text
131+
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaX
132+
```
133+
134+
Using `RE2J` avoids this issue because its matching time grows linearly with the length of the input.
135+
136+
### Performance considerations
137+
138+
- `JAVA` supports a richer regex syntax (for example, backreferences and lookaround assertions), but some patterns can trigger catastrophic backtracking.
139+
- `RE2J` provides safer and more predictable runtime characteristics.
140+
- For trusted internal ingestion specs, `JAVA` may be preferred for compatibility.
141+
- For externally supplied regex patterns, `RE2J` is recommended.
142+
143+
76144
## Input format
77145

78146
You can use the `inputFormat` field to specify the data format for your input data.

extensions-core/druid-kerberos/pom.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@
248248
<groupId>org.eclipse.jetty</groupId>
249249
<artifactId>jetty-servlet</artifactId>
250250
</exclusion>
251+
<exclusion>
252+
<groupId>com.google.re2j</groupId>
253+
<artifactId>re2j</artifactId>
254+
</exclusion>
251255
</exclusions>
252256
</dependency>
253257
<dependency>

licenses.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5239,7 +5239,7 @@ name: RE2/J
52395239
license_category: binary
52405240
module: java-core
52415241
license_name: The Go license
5242-
version: 1.1
5242+
version: 1.7
52435243
license_file_path: licenses/bin/re2j.GO
52445244
libraries:
52455245
- com.google.re2j: re2j

processing/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,11 @@
292292
<groupId>io.timeandspace</groupId>
293293
<artifactId>cron-scheduler</artifactId>
294294
</dependency>
295+
<dependency>
296+
<groupId>com.google.re2j</groupId>
297+
<artifactId>re2j</artifactId>
298+
<version>1.7</version>
299+
</dependency>
295300

296301
<!-- com.lmax.disruptor is optional in log4j-core, so we explicitly include it here -->
297302
<dependency>

processing/src/main/java/org/apache/druid/data/input/impl/RegexInputFormat.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.apache.druid.data.input.impl;
2121

22+
import com.fasterxml.jackson.annotation.JacksonInject;
2223
import com.fasterxml.jackson.annotation.JsonCreator;
2324
import com.fasterxml.jackson.annotation.JsonIgnore;
2425
import com.fasterxml.jackson.annotation.JsonInclude;
@@ -29,12 +30,14 @@
2930
import org.apache.druid.data.input.InputEntityReader;
3031
import org.apache.druid.data.input.InputFormat;
3132
import org.apache.druid.data.input.InputRowSchema;
33+
import org.apache.druid.regex.RegexConfig;
34+
import org.apache.druid.regex.RegexPattern;
35+
import org.apache.druid.regex.RegexPatternFactory;
3236
import org.apache.druid.utils.CompressionUtils;
3337

3438
import javax.annotation.Nullable;
3539
import java.io.File;
3640
import java.util.List;
37-
import java.util.regex.Pattern;
3841

3942
public class RegexInputFormat implements InputFormat
4043
{
@@ -44,10 +47,11 @@ public class RegexInputFormat implements InputFormat
4447
private final String listDelimiter;
4548
private final List<String> columns;
4649
@JsonIgnore
47-
private final Supplier<Pattern> compiledPatternSupplier;
50+
private final Supplier<RegexPattern> compiledPatternSupplier;
4851

4952
@JsonCreator
5053
public RegexInputFormat(
54+
@JacksonInject RegexConfig regexConfig,
5155
@JsonProperty("pattern") String pattern,
5256
@JsonProperty("listDelimiter") @Nullable String listDelimiter,
5357
@JsonProperty("columns") @Nullable List<String> columns
@@ -56,7 +60,7 @@ public RegexInputFormat(
5660
this.pattern = pattern;
5761
this.listDelimiter = listDelimiter;
5862
this.columns = columns;
59-
this.compiledPatternSupplier = Suppliers.memoize(() -> Pattern.compile(pattern));
63+
this.compiledPatternSupplier = Suppliers.memoize(() -> RegexPatternFactory.compile(regexConfig.getEngine(), pattern));
6064
}
6165

6266
@JsonProperty

processing/src/main/java/org/apache/druid/data/input/impl/RegexReader.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,19 @@
3030
import org.apache.druid.java.util.common.parsers.ParseException;
3131
import org.apache.druid.java.util.common.parsers.ParserUtils;
3232
import org.apache.druid.java.util.common.parsers.Parsers;
33+
import org.apache.druid.regex.RegexMatcher;
34+
import org.apache.druid.regex.RegexPattern;
3335

3436
import javax.annotation.Nullable;
3537
import java.util.ArrayList;
3638
import java.util.Collections;
3739
import java.util.List;
3840
import java.util.Map;
39-
import java.util.regex.Matcher;
40-
import java.util.regex.Pattern;
4141

4242
public class RegexReader extends TextReader.Strings
4343
{
4444
private final String pattern;
45-
private final Pattern compiledPattern;
45+
private final RegexPattern compiledPattern;
4646
private final Function<String, Object> transformationFunction;
4747

4848
private List<String> columns;
@@ -51,7 +51,7 @@ public class RegexReader extends TextReader.Strings
5151
InputRowSchema inputRowSchema,
5252
InputEntity source,
5353
String pattern,
54-
Pattern compiledPattern,
54+
RegexPattern compiledPattern,
5555
@Nullable String listDelimiter,
5656
@Nullable List<String> columns
5757
)
@@ -83,7 +83,7 @@ protected List<Map<String, Object>> toMap(String intermediateRow)
8383
private Map<String, Object> parseLine(String line)
8484
{
8585
try {
86-
final Matcher matcher = compiledPattern.matcher(line);
86+
final RegexMatcher matcher = compiledPattern.matcher(line);
8787

8888
if (!matcher.matches()) {
8989
throw new ParseException(line, "Incorrect Regex: %s . No match found.", pattern);
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.druid.guice;
21+
22+
import com.google.inject.Binder;
23+
import com.google.inject.Module;
24+
import org.apache.druid.regex.RegexConfig;
25+
26+
/**
 * Guice module that binds {@link RegexConfig} to runtime properties under the
 * {@code druid.regex} prefix.
 */
28+
public class RegexEngineModule implements Module
29+
{
30+
/**
 * Registers the {@code druid.regex} configuration binding on the given binder.
 *
 * @param binder the Guice binder passed through to {@code JsonConfigProvider.bind}
 */
@Override
31+
public void configure(Binder binder)
32+
{
33+
// "druid.regex" is the property prefix handed to JsonConfigProvider for RegexConfig.
JsonConfigProvider.bind(binder, "druid.regex", RegexConfig.class);
34+
}
35+
}

processing/src/main/java/org/apache/druid/guice/StartupInjectorBuilder.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ public StartupInjectorBuilder()
6565
new JacksonModule(),
6666
new ConfigModule(),
6767
new ExpressionProcessingModule(),
68+
new RegexEngineModule(),
6869
binder -> binder.bind(DruidSecondaryModule.class),
6970
binder -> binder.bind(PropertiesValidator.class) // this gets properties injected, later call to validate checks
7071
);
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.druid.regex;
21+
22+
import java.util.regex.Matcher;
23+
24+
/**
 * {@link RegexMatcher} implementation that wraps a {@link java.util.regex.Matcher}
 * and forwards every call to it unchanged.
 */
public class JavaRegexMatcher implements RegexMatcher
25+
{
26+
// The wrapped java.util.regex matcher; all methods below delegate to it.
private final Matcher matcher;
27+
28+
/**
 * Wraps the given matcher.
 *
 * @param matcher the {@code java.util.regex.Matcher} to delegate to
 */
public JavaRegexMatcher(Matcher matcher)
29+
{
30+
this.matcher = matcher;
31+
}
32+
33+
/**
 * @return the result of {@link Matcher#matches()} on the wrapped matcher
 */
@Override
34+
public boolean matches()
35+
{
36+
return matcher.matches();
37+
}
38+
39+
/**
 * @return the result of {@link Matcher#find()} on the wrapped matcher
 */
@Override
40+
public boolean find()
41+
{
42+
return matcher.find();
43+
}
44+
45+
/**
 * @param group index of the capturing group, passed straight to {@link Matcher#group(int)}
 * @return the result of {@link Matcher#group(int)} on the wrapped matcher
 */
@Override
46+
public String group(int group)
47+
{
48+
return matcher.group(group);
49+
}
50+
51+
/**
 * @return the result of {@link Matcher#groupCount()} on the wrapped matcher
 */
@Override
52+
public int groupCount()
53+
{
54+
return matcher.groupCount();
55+
}
56+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.druid.regex;
21+
22+
import java.util.regex.Pattern;
23+
24+
/**
 * {@link RegexPattern} implementation backed by {@link java.util.regex.Pattern}.
 */
public class JavaRegexPattern implements RegexPattern
25+
{
26+
// Compiled once in the constructor and reused for every matcher() call.
private final Pattern pattern;
27+
28+
/**
 * Compiles the given expression with {@link Pattern#compile(String)}.
 *
 * @param regex the regular expression source to compile
 */
public JavaRegexPattern(String regex)
29+
{
30+
this.pattern = Pattern.compile(regex);
31+
}
32+
33+
/**
 * Creates a matcher for the given input.
 *
 * @param input the text to match against the compiled pattern
 * @return a new {@link JavaRegexMatcher} wrapping {@code pattern.matcher(input)}
 */
@Override
34+
public RegexMatcher matcher(String input)
35+
{
36+
return new JavaRegexMatcher(pattern.matcher(input));
37+
}
38+
}

0 commit comments

Comments
 (0)