Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions docs/ingestion/data-formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,74 @@ Besides text formats, Druid also supports binary formats such as [Orc](#orc) and
Druid supports custom text data formats and can use the Regex input format to parse them. However, be aware that parsing
data this way is less efficient than writing a native Java `InputFormat` extension or using an external stream processor. We welcome contributions of new input formats.

## Regex engine configuration

The `regex` input format supports configurable regex engines using the runtime property:

```properties
druid.regex.engine=JAVA
```

Supported values:

| Value | Description |
|----------|--------------------------------------------------------------------------|
| `JAVA` | Uses Java's built-in `java.util.regex.Pattern` engine. |
| `RE2J` | Uses Google's RE2/J regex engine with linear-time matching guarantees. |

Default value:

```properties
druid.regex.engine=JAVA
```

### RE2/J engine

Setting:

```properties
druid.regex.engine=RE2J
```

enables the RE2/J regex engine for the `regex` input format used by ingestion tasks.

RE2/J helps protect against catastrophic backtracking and Regular Expression Denial of Service (ReDoS) attacks by guaranteeing linear-time regex evaluation.

### Compatibility differences

RE2/J does not support all Java regex features.

Unsupported or partially supported features include:

- backreferences
- lookahead and lookbehind assertions
- possessive quantifiers and other constructs that depend on backtracking

Patterns using unsupported constructs will fail during regex compilation.

### Example of catastrophic backtracking

The following Java regex may cause catastrophic backtracking:

```regex
^(.*a){20}$
```

against input such as:

```text
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaX
```

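Using `RE2J` avoids this issue because its matching time grows linearly with the input length instead of exploring every backtracking path.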

### Performance considerations

- `JAVA` supports more regex syntax than `RE2J`, including backreferences and lookaround assertions.
- `RE2J` provides safer and more predictable runtime characteristics.
- For trusted internal ingestion specs, `JAVA` may be preferred for compatibility.
- For externally supplied regex patterns, `RE2J` is recommended.


## Input format

You can use the `inputFormat` field to specify the data format for your input data.
Expand Down
4 changes: 4 additions & 0 deletions extensions-core/druid-kerberos/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.re2j</groupId>
<artifactId>re2j</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
Expand Down
2 changes: 1 addition & 1 deletion licenses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5239,7 +5239,7 @@ name: RE2/J
license_category: binary
module: java-core
license_name: The Go license
version: 1.1
version: 1.7
license_file_path: licenses/bin/re2j.GO
libraries:
- com.google.re2j: re2j
Expand Down
5 changes: 5 additions & 0 deletions processing/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,11 @@
<groupId>io.timeandspace</groupId>
<artifactId>cron-scheduler</artifactId>
</dependency>
<dependency>
<groupId>com.google.re2j</groupId>
<artifactId>re2j</artifactId>
<version>1.7</version>
</dependency>

<!-- com.lmax.disruptor is optional in log4j-core, so we explicitly include it here -->
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.druid.data.input.impl;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
Expand All @@ -29,12 +30,14 @@
import org.apache.druid.data.input.InputEntityReader;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.regex.RegexConfig;
import org.apache.druid.regex.RegexPattern;
import org.apache.druid.regex.RegexPatternFactory;
import org.apache.druid.utils.CompressionUtils;

import javax.annotation.Nullable;
import java.io.File;
import java.util.List;
import java.util.regex.Pattern;

public class RegexInputFormat implements InputFormat
{
Expand All @@ -44,10 +47,11 @@ public class RegexInputFormat implements InputFormat
private final String listDelimiter;
private final List<String> columns;
@JsonIgnore
private final Supplier<Pattern> compiledPatternSupplier;
private final Supplier<RegexPattern> compiledPatternSupplier;

@JsonCreator
public RegexInputFormat(
@JacksonInject RegexConfig regexConfig,
@JsonProperty("pattern") String pattern,
@JsonProperty("listDelimiter") @Nullable String listDelimiter,
@JsonProperty("columns") @Nullable List<String> columns
Expand All @@ -56,7 +60,7 @@ public RegexInputFormat(
this.pattern = pattern;
this.listDelimiter = listDelimiter;
this.columns = columns;
this.compiledPatternSupplier = Suppliers.memoize(() -> Pattern.compile(pattern));
this.compiledPatternSupplier = Suppliers.memoize(() -> RegexPatternFactory.compile(regexConfig.getEngine(), pattern));
Comment thread
vivek807 marked this conversation as resolved.
}

@JsonProperty
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.java.util.common.parsers.ParserUtils;
import org.apache.druid.java.util.common.parsers.Parsers;
import org.apache.druid.regex.RegexMatcher;
import org.apache.druid.regex.RegexPattern;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexReader extends TextReader.Strings
{
private final String pattern;
private final Pattern compiledPattern;
private final RegexPattern compiledPattern;
private final Function<String, Object> transformationFunction;

private List<String> columns;
Expand All @@ -51,7 +51,7 @@ public class RegexReader extends TextReader.Strings
InputRowSchema inputRowSchema,
InputEntity source,
String pattern,
Pattern compiledPattern,
RegexPattern compiledPattern,
@Nullable String listDelimiter,
@Nullable List<String> columns
)
Expand Down Expand Up @@ -83,7 +83,7 @@ protected List<Map<String, Object>> toMap(String intermediateRow)
private Map<String, Object> parseLine(String line)
{
try {
final Matcher matcher = compiledPattern.matcher(line);
final RegexMatcher matcher = compiledPattern.matcher(line);

if (!matcher.matches()) {
throw new ParseException(line, "Incorrect Regex: %s . No match found.", pattern);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.guice;

import com.google.inject.Binder;
import com.google.inject.Module;
import org.apache.druid.regex.RegexConfig;

/**
 * Guice module that registers {@link RegexConfig} with {@link JsonConfigProvider}
 * under the {@code druid.regex} property base (for example {@code druid.regex.engine}).
 */
public class RegexEngineModule implements Module
{
  /** Property base handed to {@link JsonConfigProvider} for {@link RegexConfig}. */
  private static final String PROPERTY_BASE = "druid.regex";

  /**
   * Registers the {@link RegexConfig} binding on the supplied binder.
   *
   * @param binder the Guice binder being configured
   */
  @Override
  public void configure(final Binder binder)
  {
    JsonConfigProvider.bind(binder, PROPERTY_BASE, RegexConfig.class);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public StartupInjectorBuilder()
new JacksonModule(),
new ConfigModule(),
new ExpressionProcessingModule(),
new RegexEngineModule(),
binder -> binder.bind(DruidSecondaryModule.class),
binder -> binder.bind(PropertiesValidator.class) // this gets properties injected, later call to validate checks
);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.regex;

import java.util.regex.Matcher;

/**
 * {@link RegexMatcher} implementation backed by a {@link java.util.regex.Matcher}.
 * Every method forwards directly to the wrapped JDK matcher.
 */
public class JavaRegexMatcher implements RegexMatcher
{
  /** The wrapped JDK matcher that performs all the work. */
  private final Matcher delegate;

  /**
   * Wraps the given JDK matcher.
   *
   * @param matcher the {@link java.util.regex.Matcher} to delegate to
   */
  public JavaRegexMatcher(Matcher matcher)
  {
    delegate = matcher;
  }

  /**
   * @return the result of {@link Matcher#matches()} on the wrapped matcher
   */
  @Override
  public boolean matches()
  {
    return delegate.matches();
  }

  /**
   * @return the result of {@link Matcher#find()} on the wrapped matcher
   */
  @Override
  public boolean find()
  {
    return delegate.find();
  }

  /**
   * @param group index of the capturing group to read
   * @return the result of {@link Matcher#group(int)} on the wrapped matcher
   */
  @Override
  public String group(int group)
  {
    return delegate.group(group);
  }

  /**
   * @return the result of {@link Matcher#groupCount()} on the wrapped matcher
   */
  @Override
  public int groupCount()
  {
    return delegate.groupCount();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.regex;

import java.util.regex.Pattern;

/**
 * {@link RegexPattern} implementation backed by {@link java.util.regex.Pattern}.
 * The expression is compiled once in the constructor and reused for every
 * {@link #matcher(String)} call.
 */
public class JavaRegexPattern implements RegexPattern
{
  /** The compiled JDK pattern. */
  private final Pattern compiled;

  /**
   * Compiles the given expression with {@link Pattern#compile(String)}.
   *
   * @param regex the regular expression source
   * @throws java.util.regex.PatternSyntaxException if {@code regex} is not valid JDK regex syntax
   */
  public JavaRegexPattern(String regex)
  {
    compiled = Pattern.compile(regex);
  }

  /**
   * Creates a matcher for {@code input}, wrapped in a {@link JavaRegexMatcher}.
   *
   * @param input the text to match against the compiled pattern
   * @return a new {@link RegexMatcher} over {@code input}
   */
  @Override
  public RegexMatcher matcher(String input)
  {
    return new JavaRegexMatcher(compiled.matcher(input));
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.regex;

import com.google.re2j.Matcher;

/**
 * {@link RegexMatcher} implementation backed by an RE2/J {@link Matcher}
 * ({@code com.google.re2j.Matcher}). Every method forwards directly to the
 * wrapped matcher.
 */
public class Re2jRegexMatcher implements RegexMatcher
{
  /** The wrapped RE2/J matcher that performs all the work. */
  private final Matcher delegate;

  /**
   * Wraps the given RE2/J matcher.
   *
   * @param matcher the {@code com.google.re2j.Matcher} to delegate to
   */
  public Re2jRegexMatcher(Matcher matcher)
  {
    delegate = matcher;
  }

  /**
   * @return the result of {@code matches()} on the wrapped RE2/J matcher
   */
  @Override
  public boolean matches()
  {
    return delegate.matches();
  }

  /**
   * @return the result of {@code find()} on the wrapped RE2/J matcher
   */
  @Override
  public boolean find()
  {
    return delegate.find();
  }

  /**
   * @param group index of the capturing group to read
   * @return the result of {@code group(int)} on the wrapped RE2/J matcher
   */
  @Override
  public String group(int group)
  {
    return delegate.group(group);
  }

  /**
   * @return the result of {@code groupCount()} on the wrapped RE2/J matcher
   */
  @Override
  public int groupCount()
  {
    return delegate.groupCount();
  }
}
Loading
Loading