diff --git a/docs/ingestion/data-formats.md b/docs/ingestion/data-formats.md index 6a2aedc39764..1427ce0164d4 100644 --- a/docs/ingestion/data-formats.md +++ b/docs/ingestion/data-formats.md @@ -73,6 +73,74 @@ Besides text formats, Druid also supports binary formats such as [Orc](#orc) and Druid supports custom text data formats and can use the Regex input format to parse them. However, be aware doing this to parse data is less efficient than writing a native Java `InputFormat` extension, or using an external stream processor. We welcome contributions of new input formats. +## Regex engine configuration + +You can choose the regex engine used by the `regex` input format by setting the following runtime property: + +```properties +druid.regex.engine=JAVA +``` + +Supported values: + +| Value | Description | +|----------|--------------------------------------------------------------------------| +| `JAVA` | Uses Java's built-in `java.util.regex.Pattern` engine. | +| `RE2J` | Uses Google's RE2/J regex engine with linear-time matching guarantees. | + +Default value: + +```properties +druid.regex.engine=JAVA +``` + +### RE2/J engine + +Setting: + +```properties +druid.regex.engine=RE2J +``` + +enables the RE2/J regex engine for ingestion task `regex` input formats. + +RE2/J helps protect against catastrophic backtracking and Regular Expression Denial of Service (ReDoS) attacks by guaranteeing linear-time regex evaluation. + +### Compatibility differences + +RE2/J does not support all Java regex features. + +Unsupported or partially supported features include: +- back references +- look-ahead and look-behind assertions +- possessive quantifiers and other constructs that depend on backtracking + +Patterns using unsupported constructs will fail during regex compilation. + +### Example of catastrophic backtracking + +The following Java regex may cause catastrophic backtracking: + +```regex +^(.*a){20}$ +``` + +against input such as: + +```text +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaX +``` + +Using `RE2J` avoids this issue because its matching time grows linearly with the length of the input. 
+ +### Performance considerations + +- `JAVA` may support more advanced regex syntax and behavior. +- `RE2J` provides safer and more predictable runtime characteristics. +- For trusted internal ingestion specs, `JAVA` may be preferred for compatibility. +- For externally supplied regex patterns, `RE2J` is recommended. + + ## Input format You can use the `inputFormat` field to specify the data format for your input data. diff --git a/extensions-core/druid-kerberos/pom.xml b/extensions-core/druid-kerberos/pom.xml index f1843314434d..5700d5e24da2 100644 --- a/extensions-core/druid-kerberos/pom.xml +++ b/extensions-core/druid-kerberos/pom.xml @@ -248,6 +248,10 @@ org.eclipse.jetty jetty-servlet + + com.google.re2j + re2j + diff --git a/licenses.yaml b/licenses.yaml index 23f58a0d3b3d..d70eab9e1ebb 100644 --- a/licenses.yaml +++ b/licenses.yaml @@ -5239,7 +5239,7 @@ name: RE2/J license_category: binary module: java-core license_name: The Go license -version: 1.1 +version: 1.7 license_file_path: licenses/bin/re2j.GO libraries: - com.google.re2j: re2j diff --git a/processing/pom.xml b/processing/pom.xml index 48a4d4999ef0..4469d67fb518 100644 --- a/processing/pom.xml +++ b/processing/pom.xml @@ -292,6 +292,11 @@ io.timeandspace cron-scheduler + + com.google.re2j + re2j + 1.7 + diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/RegexInputFormat.java b/processing/src/main/java/org/apache/druid/data/input/impl/RegexInputFormat.java index a3057fa212b2..e807427a6862 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/RegexInputFormat.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/RegexInputFormat.java @@ -19,6 +19,7 @@ package org.apache.druid.data.input.impl; +import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; @@ -29,12 +30,14 @@ import 
org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRowSchema; +import org.apache.druid.regex.RegexConfig; +import org.apache.druid.regex.RegexPattern; +import org.apache.druid.regex.RegexPatternFactory; import org.apache.druid.utils.CompressionUtils; import javax.annotation.Nullable; import java.io.File; import java.util.List; -import java.util.regex.Pattern; public class RegexInputFormat implements InputFormat { @@ -44,10 +47,11 @@ public class RegexInputFormat implements InputFormat private final String listDelimiter; private final List columns; @JsonIgnore - private final Supplier compiledPatternSupplier; + private final Supplier compiledPatternSupplier; @JsonCreator public RegexInputFormat( + @JacksonInject RegexConfig regexConfig, @JsonProperty("pattern") String pattern, @JsonProperty("listDelimiter") @Nullable String listDelimiter, @JsonProperty("columns") @Nullable List columns @@ -56,7 +60,9 @@ public RegexInputFormat( this.pattern = pattern; this.listDelimiter = listDelimiter; this.columns = columns; - this.compiledPatternSupplier = Suppliers.memoize(() -> Pattern.compile(pattern)); + // Fall back to the default engine when no RegexConfig was supplied (e.g. direct construction with null), so the lazily evaluated supplier cannot fail later with a message-less NullPointerException. + final RegexConfig effectiveConfig = regexConfig == null ? new RegexConfig() : regexConfig; + this.compiledPatternSupplier = Suppliers.memoize(() -> RegexPatternFactory.compile(effectiveConfig.getEngine(), pattern)); } @JsonProperty diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/RegexReader.java b/processing/src/main/java/org/apache/druid/data/input/impl/RegexReader.java index 66f4d25b474e..5b6dbee41809 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/RegexReader.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/RegexReader.java @@ -30,19 +30,19 @@ import org.apache.druid.java.util.common.parsers.ParseException; import org.apache.druid.java.util.common.parsers.ParserUtils; import org.apache.druid.java.util.common.parsers.Parsers; +import org.apache.druid.regex.RegexMatcher; +import org.apache.druid.regex.RegexPattern; import 
javax.annotation.Nullable; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class RegexReader extends TextReader.Strings { private final String pattern; - private final Pattern compiledPattern; + private final RegexPattern compiledPattern; private final Function transformationFunction; private List columns; @@ -51,7 +51,7 @@ public class RegexReader extends TextReader.Strings InputRowSchema inputRowSchema, InputEntity source, String pattern, - Pattern compiledPattern, + RegexPattern compiledPattern, @Nullable String listDelimiter, @Nullable List columns ) @@ -83,7 +83,7 @@ protected List> toMap(String intermediateRow) private Map parseLine(String line) { try { - final Matcher matcher = compiledPattern.matcher(line); + final RegexMatcher matcher = compiledPattern.matcher(line); if (!matcher.matches()) { throw new ParseException(line, "Incorrect Regex: %s . No match found.", pattern); diff --git a/processing/src/main/java/org/apache/druid/guice/RegexEngineModule.java b/processing/src/main/java/org/apache/druid/guice/RegexEngineModule.java new file mode 100644 index 000000000000..8579078fd6dd --- /dev/null +++ b/processing/src/main/java/org/apache/druid/guice/RegexEngineModule.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.guice; + +import com.google.inject.Binder; +import com.google.inject.Module; +import org.apache.druid.regex.RegexConfig; + +/** + */ +public class RegexEngineModule implements Module +{ + @Override + public void configure(Binder binder) + { + JsonConfigProvider.bind(binder, "druid.regex", RegexConfig.class); + } +} diff --git a/processing/src/main/java/org/apache/druid/guice/StartupInjectorBuilder.java b/processing/src/main/java/org/apache/druid/guice/StartupInjectorBuilder.java index 70dd1cc51519..096de6464f0c 100644 --- a/processing/src/main/java/org/apache/druid/guice/StartupInjectorBuilder.java +++ b/processing/src/main/java/org/apache/druid/guice/StartupInjectorBuilder.java @@ -65,6 +65,7 @@ public StartupInjectorBuilder() new JacksonModule(), new ConfigModule(), new ExpressionProcessingModule(), + new RegexEngineModule(), binder -> binder.bind(DruidSecondaryModule.class), binder -> binder.bind(PropertiesValidator.class) // this gets properties injected, later call to validate checks ); diff --git a/processing/src/main/java/org/apache/druid/regex/JavaRegexMatcher.java b/processing/src/main/java/org/apache/druid/regex/JavaRegexMatcher.java new file mode 100644 index 000000000000..48fd8e88d627 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/JavaRegexMatcher.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.regex; + +import java.util.regex.Matcher; + +public class JavaRegexMatcher implements RegexMatcher +{ + private final Matcher matcher; + + public JavaRegexMatcher(Matcher matcher) + { + this.matcher = matcher; + } + + @Override + public boolean matches() + { + return matcher.matches(); + } + + @Override + public boolean find() + { + return matcher.find(); + } + + @Override + public String group(int group) + { + return matcher.group(group); + } + + @Override + public int groupCount() + { + return matcher.groupCount(); + } +} diff --git a/processing/src/main/java/org/apache/druid/regex/JavaRegexPattern.java b/processing/src/main/java/org/apache/druid/regex/JavaRegexPattern.java new file mode 100644 index 000000000000..04de6d799b09 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/JavaRegexPattern.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.regex; + +import java.util.regex.Pattern; + +public class JavaRegexPattern implements RegexPattern +{ + private final Pattern pattern; + + public JavaRegexPattern(String regex) + { + this.pattern = Pattern.compile(regex); + } + + @Override + public RegexMatcher matcher(String input) + { + return new JavaRegexMatcher(pattern.matcher(input)); + } +} diff --git a/processing/src/main/java/org/apache/druid/regex/Re2jRegexMatcher.java b/processing/src/main/java/org/apache/druid/regex/Re2jRegexMatcher.java new file mode 100644 index 000000000000..8a7f74adeab4 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/Re2jRegexMatcher.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.regex; + +import com.google.re2j.Matcher; + +public class Re2jRegexMatcher implements RegexMatcher +{ + private final Matcher matcher; + + public Re2jRegexMatcher(Matcher matcher) + { + this.matcher = matcher; + } + + @Override + public boolean matches() + { + return matcher.matches(); + } + + @Override + public boolean find() + { + return matcher.find(); + } + + @Override + public String group(int group) + { + return matcher.group(group); + } + + @Override + public int groupCount() + { + return matcher.groupCount(); + } +} diff --git a/processing/src/main/java/org/apache/druid/regex/Re2jRegexPattern.java b/processing/src/main/java/org/apache/druid/regex/Re2jRegexPattern.java new file mode 100644 index 000000000000..6e6b0584b920 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/Re2jRegexPattern.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.regex; + +import com.google.re2j.Pattern; + +public class Re2jRegexPattern implements RegexPattern +{ + private final Pattern pattern; + + public Re2jRegexPattern(String regex) + { + this.pattern = Pattern.compile(regex); + } + + @Override + public RegexMatcher matcher(String input) + { + return new Re2jRegexMatcher(pattern.matcher(input)); + } +} diff --git a/processing/src/main/java/org/apache/druid/regex/RegexConfig.java b/processing/src/main/java/org/apache/druid/regex/RegexConfig.java new file mode 100644 index 000000000000..37db9b6f7bb6 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/RegexConfig.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.regex; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class RegexConfig +{ + @JsonProperty + private RegexEngineType engine = RegexEngineType.JAVA; + + public RegexEngineType getEngine() + { + return engine; + } + + public void setEngine(RegexEngineType engine) + { + this.engine = engine; + } + + public static RegexConfig with(RegexEngineType engine) + { + final RegexConfig config = new RegexConfig(); + config.setEngine(engine); + + return config; + } + + @Override + public String toString() + { + return "RegexConfig {" + + "engine=" + engine + + '}'; + } +} diff --git a/processing/src/main/java/org/apache/druid/regex/RegexEngineType.java b/processing/src/main/java/org/apache/druid/regex/RegexEngineType.java new file mode 100644 index 000000000000..4226bcb2a75d --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/RegexEngineType.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.regex; + +public enum RegexEngineType +{ + JAVA, + RE2J +} diff --git a/processing/src/main/java/org/apache/druid/regex/RegexMatcher.java b/processing/src/main/java/org/apache/druid/regex/RegexMatcher.java new file mode 100644 index 000000000000..10c1deab6e24 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/RegexMatcher.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.regex; + +public interface RegexMatcher +{ + boolean matches(); + + boolean find(); + + String group(int group); + + int groupCount(); +} diff --git a/processing/src/main/java/org/apache/druid/regex/RegexPattern.java b/processing/src/main/java/org/apache/druid/regex/RegexPattern.java new file mode 100644 index 000000000000..b962a05ea217 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/RegexPattern.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.regex; + +public interface RegexPattern +{ + RegexMatcher matcher(String input); +} diff --git a/processing/src/main/java/org/apache/druid/regex/RegexPatternFactory.java b/processing/src/main/java/org/apache/druid/regex/RegexPatternFactory.java new file mode 100644 index 000000000000..ccf787b0551a --- /dev/null +++ b/processing/src/main/java/org/apache/druid/regex/RegexPatternFactory.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.regex; + +import java.util.Objects; + +/** + * Creates {@link RegexPattern} instances backed by the engine selected through a {@link RegexEngineType}. + */ +public class RegexPatternFactory +{ + private RegexPatternFactory() + { + // Static factory; never instantiated. + } + + /** + * Compiles {@code regex} with the requested engine. + * + * @param type engine used to compile the pattern + * @param regex regular expression source + * @return a {@link Re2jRegexPattern} for {@code RE2J}, or a {@link JavaRegexPattern} for {@code JAVA} + * @throws NullPointerException if {@code type} is null + * @throws IllegalArgumentException if {@code type} is not a known engine + */ + public static RegexPattern compile(RegexEngineType type, String regex) + { + Objects.requireNonNull(type, "type"); + switch (type) { + case RE2J: + return new Re2jRegexPattern(regex); + + case JAVA: + return new JavaRegexPattern(regex); + + default: + // Fail loudly instead of silently using the backtracking engine if a new enum value is added. + throw new IllegalArgumentException("Unsupported regex engine: " + type); + } + } +} diff --git a/processing/src/test/java/org/apache/druid/data/input/impl/RegexInputFormatTest.java b/processing/src/test/java/org/apache/druid/data/input/impl/RegexInputFormatTest.java index 74c4849de31d..55c54908b5a1 100644 --- a/processing/src/test/java/org/apache/druid/data/input/impl/RegexInputFormatTest.java +++ b/processing/src/test/java/org/apache/druid/data/input/impl/RegexInputFormatTest.java @@ -19,15 +19,25 @@ package org.apache.druid.data.input.impl; +import com.fasterxml.jackson.databind.InjectableValues; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.jsontype.NamedType; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputFormat; +import org.apache.druid.java.util.common.parsers.CloseableIterator; +import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.regex.RegexConfig; +import org.apache.druid.regex.RegexEngineType; import org.apache.druid.utils.CompressionUtils; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Map; public class RegexInputFormatTest @@ -40,10 +50,15 @@ public RegexInputFormatTest() mapper.registerSubtypes(new NamedType(RegexInputFormat.class, "regex")); } - @Test - public void testSerde() throws IOException + @ParameterizedTest + @EnumSource(RegexEngineType.class) + public void testSerde(RegexEngineType engineType) 
throws IOException { + final RegexConfig regexConfig = RegexConfig.with(engineType); + mapper.setInjectableValues(new InjectableValues.Std().addValue(RegexConfig.class, regexConfig)); + final RegexInputFormat expected = new RegexInputFormat( + regexConfig, "//[^\\r\\n]*[\\r\\n]", "|", ImmutableList.of("col1", "col2", "col3") @@ -57,10 +72,12 @@ public void testSerde() throws IOException Assertions.assertEquals(expected.getColumns(), fromJson.getColumns()); } - @Test - public void testIgnoreCompiledPatternInJson() throws IOException + @ParameterizedTest + @EnumSource(RegexEngineType.class) + public void testIgnoreCompiledPatternInJson(RegexEngineType engineType) throws IOException { final RegexInputFormat expected = new RegexInputFormat( + RegexConfig.with(engineType), "//[^\\r\\n]*[\\r\\n]", "|", ImmutableList.of("col1", "col2", "col3") @@ -71,21 +88,27 @@ public void testIgnoreCompiledPatternInJson() throws IOException Assertions.assertFalse(map.containsKey("compiledPattern")); } - @Test - public void test_getWeightedSize_withoutCompression() + @ParameterizedTest + @EnumSource(RegexEngineType.class) + public void test_getWeightedSize_withoutCompression(RegexEngineType engineType) { final RegexInputFormat format = new RegexInputFormat( + RegexConfig.with(engineType), "//[^\\r\\n]*[\\r\\n]", "|", ImmutableList.of("col1", "col2", "col3") ); final long unweightedSize = 100L; Assertions.assertEquals(unweightedSize, format.getWeightedSize("file.txt", unweightedSize)); + } - @Test - public void test_getWeightedSize_withGzCompression() + + @ParameterizedTest + @EnumSource(RegexEngineType.class) + public void test_getWeightedSize_withGzCompression(RegexEngineType engineType) { final RegexInputFormat format = new RegexInputFormat( + RegexConfig.with(engineType), "//[^\\r\\n]*[\\r\\n]", "|", ImmutableList.of("col1", "col2", "col3") @@ -96,4 +119,33 @@ public void test_getWeightedSize_withGzCompression() format.getWeightedSize("file.txt.gz", unweightedSize) ); } + + @Test + 
@Timeout(10) + public void test_backtracking() throws IOException + { + final RegexInputFormat inputFormat = new RegexInputFormat( + RegexConfig.with(RegexEngineType.RE2J), + "^(.*a){20}$", + null, + ImmutableList.of("value") + ); + + String maliciousInput = "a".repeat(50) + "X"; + InputEntityReader reader = inputFormat.createReader( + null, + new ByteEntity(maliciousInput.getBytes(StandardCharsets.UTF_8)), + null + ); + + try (CloseableIterator iterator = reader.read()) { + while (iterator.hasNext()) { + iterator.next(); + } + } + + catch (ParseException ignored) { + // expected for non-matching input + } + } } diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index f8b52fafbdff..0e9c3e0d26ae 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -50,6 +50,7 @@ import org.apache.druid.guice.MetadataConfigModule; import org.apache.druid.guice.MetadataManagerModule; import org.apache.druid.guice.QueryableModule; +import org.apache.druid.guice.RegexEngineModule; import org.apache.druid.guice.SegmentSchemaCacheModule; import org.apache.druid.guice.SupervisorCleanupModule; import org.apache.druid.guice.annotations.EscalatedGlobal; @@ -178,6 +179,7 @@ protected List getModules() modules.add(new QueryableModule()); } + modules.add(new RegexEngineModule()); modules.add( new Module() { diff --git a/services/src/main/java/org/apache/druid/cli/CliIndexer.java b/services/src/main/java/org/apache/druid/cli/CliIndexer.java index 7839ba7e6b46..6ffb17ad637e 100644 --- a/services/src/main/java/org/apache/druid/cli/CliIndexer.java +++ b/services/src/main/java/org/apache/druid/cli/CliIndexer.java @@ -47,6 +47,7 @@ import org.apache.druid.guice.QueryRunnerFactoryModule; import org.apache.druid.guice.QueryableModule; import org.apache.druid.guice.QueryablePeonModule; +import 
org.apache.druid.guice.RegexEngineModule; import org.apache.druid.guice.SegmentWranglerModule; import org.apache.druid.guice.ServerTypeConfig; import org.apache.druid.guice.annotations.AttemptId; @@ -243,7 +244,8 @@ public DataNodeService getDataNodeService(DruidServerConfig serverConfig) new MSQIndexingModule(), new MSQDurableStorageModule(), new MSQExternalDataSourceModule(), - new IndexerMemoryManagementModule() + new IndexerMemoryManagementModule(), + new RegexEngineModule() ); } } diff --git a/services/src/main/java/org/apache/druid/cli/CliMiddleManager.java b/services/src/main/java/org/apache/druid/cli/CliMiddleManager.java index e95506ebf89f..88a303022485 100644 --- a/services/src/main/java/org/apache/druid/cli/CliMiddleManager.java +++ b/services/src/main/java/org/apache/druid/cli/CliMiddleManager.java @@ -45,6 +45,7 @@ import org.apache.druid.guice.ManageLifecycle; import org.apache.druid.guice.MiddleManagerServiceModule; import org.apache.druid.guice.PolyBind; +import org.apache.druid.guice.RegexEngineModule; import org.apache.druid.guice.annotations.Self; import org.apache.druid.indexing.common.RetryPolicyFactory; import org.apache.druid.indexing.common.TaskStorageDirTracker; @@ -244,7 +245,8 @@ public WorkerNodeService getWorkerNodeService(WorkerConfig workerConfig) new LookupSerdeModule(), new MSQIndexingModule(), new MSQDurableStorageModule(), - new MSQExternalDataSourceModule() + new MSQExternalDataSourceModule(), + new RegexEngineModule() ); } diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index 78161ed098e5..6fa504c4d922 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -52,6 +52,7 @@ import org.apache.druid.guice.ManageLifecycle; import org.apache.druid.guice.MetadataManagerModule; import org.apache.druid.guice.PolyBind; +import 
org.apache.druid.guice.RegexEngineModule; import org.apache.druid.guice.SupervisorModule; import org.apache.druid.guice.annotations.Json; import org.apache.druid.indexing.common.RetryPolicyFactory; @@ -517,7 +518,8 @@ private void configureOverlordWebResources(Binder binder) new SamplerModule(), new MSQIndexingModule(), new MSQDurableStorageModule(), - new MSQExternalDataSourceModule() + new MSQExternalDataSourceModule(), + new RegexEngineModule() ); } diff --git a/services/src/main/java/org/apache/druid/cli/CliPeon.java b/services/src/main/java/org/apache/druid/cli/CliPeon.java index d234a6f37fec..722c7a983d72 100644 --- a/services/src/main/java/org/apache/druid/cli/CliPeon.java +++ b/services/src/main/java/org/apache/druid/cli/CliPeon.java @@ -61,6 +61,7 @@ import org.apache.druid.guice.QueryRunnerFactoryModule; import org.apache.druid.guice.QueryableModule; import org.apache.druid.guice.QueryablePeonModule; +import org.apache.druid.guice.RegexEngineModule; import org.apache.druid.guice.SegmentWranglerModule; import org.apache.druid.guice.ServerTypeConfig; import org.apache.druid.guice.annotations.AttemptId; @@ -219,6 +220,7 @@ protected List getModules() new SegmentWranglerModule(), new JoinableFactoryModule(), new IndexingServiceTaskLogsModule(properties), + new RegexEngineModule(), new Module() { @SuppressForbidden(reason = "System#out, System#err") diff --git a/website/.spelling b/website/.spelling index c85463563e13..3e43da39cde7 100644 --- a/website/.spelling +++ b/website/.spelling @@ -526,9 +526,11 @@ quantile quantiles queryable quickstart +RE2 realtime rebalance redis +ReDoS regexes reimported reindex