From 38de323d1544ebb1c190958cee13e755be1d603b Mon Sep 17 00:00:00 2001 From: Manisha Yadav Date: Mon, 8 Jun 2026 10:24:36 +0000 Subject: [PATCH 1/3] Add multiline input codec for grouping multi-line log events Signed-off-by: Manisha Yadav --- .../multiline-codecs/README.md | 123 ++++ .../multiline-codecs/build.gradle | 24 + .../codec/multiline/MultilineInputCodec.java | 227 +++++++ .../multiline/MultilineInputCodecConfig.java | 147 ++++ .../codec/multiline/MultilineWhat.java | 57 ++ .../codec/multiline/MultilineCodecsIT.java | 432 ++++++++++++ .../MultilineInputCodecConfigTest.java | 94 +++ .../multiline/MultilineInputCodecTest.java | 629 ++++++++++++++++++ .../codec/multiline/MultilineWhatTest.java | 50 ++ settings.gradle | 1 + 10 files changed, 1784 insertions(+) create mode 100644 data-prepper-plugins/multiline-codecs/README.md create mode 100644 data-prepper-plugins/multiline-codecs/build.gradle create mode 100644 data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java create mode 100644 data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java create mode 100644 data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java create mode 100644 data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java create mode 100644 data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java create mode 100644 data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java create mode 100644 data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java diff --git 
a/data-prepper-plugins/multiline-codecs/README.md b/data-prepper-plugins/multiline-codecs/README.md new file mode 100644 index 0000000000..62b61e547e --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/README.md @@ -0,0 +1,123 @@ +# Multiline Codecs + +This plugin provides a multiline input codec for Data Prepper that groups consecutive lines from an input stream into single events based on a configurable regex pattern. + +## Usages + +The multiline input codec can be configured with source plugins (e.g. S3 source, file source) in the pipeline file. + +### Use Cases + +- **Java/Kotlin stack traces**: Exception messages followed by `at ...` lines +- **Python tracebacks**: `Traceback` blocks spanning multiple lines +- **Timestamp-prefixed logs**: Logs where each entry starts with a timestamp and continuation lines don't +- **Multi-line JSON/XML in logs**: Structured data embedded across multiple lines within log entries +- **Custom log formats**: Any format where a recognizable pattern marks the start of a new event + +## Configuration Options + +| Option | Required | Type | Default | Description | +|---|---|---|---|---| +| `match` | Yes | String (regex) | - | A regular expression tested against each line; together with `negate`, it decides whether the line is a continuation line | +| `negate` | No | Boolean | `false` | When `false`, lines matching the pattern are continuation lines. When `true`, lines NOT matching the pattern are continuation lines | +| `what` | No | String | `previous` | Whether continuation lines belong to the `previous` or `next` event | +| `max_lines` | No | Integer | `500` | Maximum number of lines that can be combined into a single event | +| `max_length` | No | Integer | `10000` | Maximum character length of a combined multiline event | +| `line_separator` | No | String | `\n` | Separator string used when joining lines into a single event message | + +## How It Works + +The codec reads lines from the input stream and uses the `match` regex to determine event boundaries: + +1. 
**`negate=true` + `what=previous`** (most common): A new event starts when a line matches the pattern. Lines that do NOT match are appended to the preceding event. + +2. **`negate=false` + `what=previous`**: Lines that match the pattern are appended to the preceding event. + +3. **`negate=true` + `what=next`**: Lines that do NOT match the pattern are prepended to the next matching line. + +4. **`negate=false` + `what=next`**: Lines that match the pattern are prepended to the next non-matching line. + +## Examples + +### Java Stack Traces (timestamp-based grouping) + +Each log entry starts with a timestamp. Lines without a timestamp are continuations of the previous entry. + +```yaml +pipeline: + source: + s3: + codec: + multiline: + match: "^\\d{4}-\\d{2}-\\d{2}" + negate: true + what: previous +``` + +Input: +``` +2024-01-01 12:00:00 ERROR NullPointerException + at com.example.Service.method(Service.java:42) + at com.example.Main.run(Main.java:10) +2024-01-01 12:00:01 INFO Application recovered +``` + +Result: 2 events +- Event 1: The ERROR line with its full stack trace grouped together +- Event 2: The INFO line as a single event + +### Java Stack Traces (pattern-based grouping) + +Lines starting with whitespace followed by `at `, `...`, or `Caused by:` are continuations. + +```yaml +pipeline: + source: + s3: + codec: + multiline: + match: "^\\s+(at |\\.\\.\\.|Caused by:)" + negate: false + what: previous +``` + +### Python Tracebacks + +```yaml +pipeline: + source: + s3: + codec: + multiline: + match: "^Traceback|^\\s|^\\w+Error" + negate: false + what: previous +``` + +### Log Entries with Preamble (next mode) + +Lines starting with whitespace are prepended to the next non-indented line. + +```yaml +pipeline: + source: + s3: + codec: + multiline: + match: "^\\s" + negate: false + what: next +``` + +## Developer Guide + +This plugin is compatible with Java 11. 
See below: + +- [CONTRIBUTING](https://github.com/opensearch-project/data-prepper/blob/main/CONTRIBUTING.md) +- [monitoring](https://github.com/opensearch-project/data-prepper/blob/main/docs/monitoring.md) + +The following command runs the unit and integration tests: + +``` +./gradlew :data-prepper-plugins:multiline-codecs:test +``` diff --git a/data-prepper-plugins/multiline-codecs/build.gradle b/data-prepper-plugins/multiline-codecs/build.gradle new file mode 100644 index 0000000000..cf619d2062 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/build.gradle @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +plugins { + id 'java' +} + +dependencies { + implementation project(':data-prepper-api') + implementation 'com.fasterxml.jackson.core:jackson-annotations' + implementation libs.parquet.common + testImplementation project(':data-prepper-plugins:common') + testImplementation project(':data-prepper-test:test-event') +} + +test { + useJUnitPlatform() +} diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java new file mode 100644 index 0000000000..07eba72e35 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java @@ -0,0 +1,227 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +import org.opensearch.dataprepper.model.annotations.DataPrepperPlugin; +import org.opensearch.dataprepper.model.annotations.DataPrepperPluginConstructor; +import org.opensearch.dataprepper.model.codec.InputCodec; +import org.opensearch.dataprepper.model.event.Event; +import org.opensearch.dataprepper.model.event.EventFactory; +import org.opensearch.dataprepper.model.event.LogEventBuilder; +import org.opensearch.dataprepper.model.log.Log; +import org.opensearch.dataprepper.model.record.Record; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Collections; +import java.util.Objects; +import java.util.function.Consumer; +import java.util.regex.Pattern; + +/** + * An implementation of {@link InputCodec} which groups multiple lines from an input stream + * into single events based on a configurable regex pattern. + * + *

This is useful for ingesting logs where a single logical event spans multiple lines, + * such as Java stack traces, Python tracebacks, or any log format where entries begin with + * a recognizable pattern (e.g., a timestamp).

+ * + *

The codec supports two grouping modes via the {@code what} configuration:

+ * + * + *

The {@code negate} option controls which lines are considered continuation lines:

+ * + */ +@DataPrepperPlugin(name = "multiline", pluginType = InputCodec.class, pluginConfigurationType = MultilineInputCodecConfig.class) +public class MultilineInputCodec implements InputCodec { + + private static final Logger LOG = LoggerFactory.getLogger(MultilineInputCodec.class); + static final String MESSAGE_FIELD_NAME = "message"; + + private final Pattern pattern; + private final boolean negate; + private final MultilineWhat what; + private final int maxLines; + private final int maxLength; + private final String lineSeparator; + private final EventFactory eventFactory; + + @DataPrepperPluginConstructor + public MultilineInputCodec(final MultilineInputCodecConfig config, final EventFactory eventFactory) { + Objects.requireNonNull(config, "config must not be null"); + this.eventFactory = Objects.requireNonNull(eventFactory, "eventFactory must not be null"); + try { + this.pattern = Pattern.compile(config.getMatch()); + } catch (final Exception e) { + throw new IllegalArgumentException("Invalid regex pattern for 'match': " + config.getMatch(), e); + } + this.negate = config.getNegate(); + this.what = config.getWhat(); + this.maxLines = config.getMaxLines(); + this.maxLength = config.getMaxLength(); + this.lineSeparator = config.getLineSeparator(); + } + + @Override + public void parse(final InputStream inputStream, final Consumer> eventConsumer) throws IOException { + Objects.requireNonNull(inputStream, "inputStream must not be null"); + Objects.requireNonNull(eventConsumer, "eventConsumer must not be null"); + + try (final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) { + if (what == MultilineWhat.PREVIOUS) { + parsePreviousMode(reader, eventConsumer); + } else { + parseNextMode(reader, eventConsumer); + } + } + } + + /** + * In PREVIOUS mode, continuation lines are appended to the preceding event. + * A new event boundary is detected when a line is NOT a continuation line + * (i.e., it's a "start" line). 
+ */ + private void parsePreviousMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + final StringBuilder buffer = new StringBuilder(); + int lineCount = 0; + String line; + + while ((line = reader.readLine()) != null) { + final boolean isContinuation = isContinuationLine(line); + + if (!isContinuation && buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + } + + if (shouldFlush(buffer, lineCount, line)) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + } + } + + if (buffer.length() > 0) { + buffer.append(lineSeparator); + } + buffer.append(line); + lineCount++; + } + + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + } + } + + /** + * In NEXT mode, continuation lines are prepended to the following event. + * A new event boundary is detected when a line is NOT a continuation line, + * and the buffer (containing prior continuation lines) is combined with this line. + */ + private void parseNextMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + final StringBuilder buffer = new StringBuilder(); + int lineCount = 0; + boolean bufferHasNonContinuation = false; + String line; + + while ((line = reader.readLine()) != null) { + final boolean isContinuation = isContinuationLine(line); + + if (!isContinuation) { + if (bufferHasNonContinuation) { + // The buffer already has a complete event (non-continuation at end). + // Emit it and start fresh. + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + bufferHasNonContinuation = false; + } + // Append this non-continuation line to the buffer (with any preceding continuations). + if (buffer.length() > 0) { + buffer.append(lineSeparator); + } + buffer.append(line); + lineCount++; + bufferHasNonContinuation = true; + continue; + } + + // This is a continuation line. 
+ if (bufferHasNonContinuation) { + // Buffer has a complete event ending with non-continuation. + // Emit it, then start collecting continuations for the next event. + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + bufferHasNonContinuation = false; + } + + if (shouldFlush(buffer, lineCount, line)) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + } + } + + if (buffer.length() > 0) { + buffer.append(lineSeparator); + } + buffer.append(line); + lineCount++; + } + + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + } + } + + /** + * Determines if a line is a continuation line based on the pattern and negate settings. + * + *

When {@code negate=false}: a line matching the pattern IS a continuation line.

+ *

When {@code negate=true}: a line NOT matching the pattern IS a continuation line.

+ */ + boolean isContinuationLine(final String line) { + final boolean matches = pattern.matcher(line).find(); + return negate != matches; + } + + private boolean shouldFlush(final StringBuilder buffer, final int lineCount, final String nextLine) { + if (lineCount >= maxLines) { + LOG.debug("Flushing multiline event due to max_lines limit of {}", maxLines); + return true; + } + if (buffer.length() + lineSeparator.length() + nextLine.length() > maxLength) { + LOG.debug("Flushing multiline event due to max_length limit of {}", maxLength); + return true; + } + return false; + } + + private void emitEvent(final String message, final Consumer> eventConsumer) { + final Log event = eventFactory.eventBuilder(LogEventBuilder.class) + .withData(Collections.singletonMap(MESSAGE_FIELD_NAME, message)) + .build(); + eventConsumer.accept(new Record<>(event)); + } +} diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java new file mode 100644 index 0000000000..dc26290f3a --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java @@ -0,0 +1,147 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +import com.fasterxml.jackson.annotation.JsonProperty; +import jakarta.validation.constraints.AssertTrue; +import jakarta.validation.constraints.Min; +import jakarta.validation.constraints.NotEmpty; +import jakarta.validation.constraints.NotNull; + +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * Configuration class for the multiline input codec. + * + *

The multiline codec groups consecutive lines from an input stream into a single event + * based on a regex pattern. This is useful for log formats where a single logical event + * spans multiple lines (e.g., Java stack traces, multi-line application logs).

+ * + *

Example configuration for Java stack traces:

+ *
+ * codec:
+ *   multiline:
+ *     match: "^\\s+(at |\\.\\.\\.|Caused by:)"
+ *     negate: false
+ *     what: previous
+ * 
+ * + *

Example configuration for timestamp-prefixed logs:

+ *
+ * codec:
+ *   multiline:
+ *     match: "^\\d{4}-\\d{2}-\\d{2}"
+ *     negate: true
+ *     what: previous
+ * 
+ */ +public class MultilineInputCodecConfig { + + static final int DEFAULT_MAX_LINES = 500; + static final int DEFAULT_MAX_LENGTH = 10000; + static final String DEFAULT_LINE_SEPARATOR = "\n"; + + @NotEmpty(message = "match must not be empty") + @JsonProperty("match") + private String match; + + @NotNull(message = "negate must not be null") + @JsonProperty("negate") + private Boolean negate = false; + + @NotNull(message = "what must not be null") + @JsonProperty("what") + private MultilineWhat what = MultilineWhat.PREVIOUS; + + @Min(value = 1, message = "max_lines must be at least 1") + @JsonProperty("max_lines") + private int maxLines = DEFAULT_MAX_LINES; + + @Min(value = 1, message = "max_length must be at least 1") + @JsonProperty("max_length") + private int maxLength = DEFAULT_MAX_LENGTH; + + @NotNull(message = "line_separator must not be null") + @JsonProperty("line_separator") + private String lineSeparator = DEFAULT_LINE_SEPARATOR; + + /** + * The regex pattern tested against each line to decide, together with {@code negate}, whether it is a continuation line. + * + * @return The regex pattern string. + */ + public String getMatch() { + return match; + } + + /** + * Whether to negate the pattern match. + *

When false: lines matching the pattern are considered continuation lines.

+ *

When true: lines NOT matching the pattern are considered continuation lines.

+ * + * @return true if the pattern should be negated. + */ + public Boolean getNegate() { + return negate; + } + + /** + * Defines whether continuation lines (as decided by the pattern and {@code negate}) belong to the previous or next event. + * + * @return The multiline grouping direction. + */ + public MultilineWhat getWhat() { + return what; + } + + /** + * The maximum number of lines that can be combined into a single event. + * When this limit is reached, the accumulated lines are flushed as an event + * and a new accumulation begins. + * + * @return The maximum number of lines per event. + */ + public int getMaxLines() { + return maxLines; + } + + /** + * The maximum character length of a combined multiline event. + * When adding a line would exceed this limit, the accumulated lines are flushed as an event first; a single line longer than the limit is still emitted whole. + * + * @return The maximum character length per event. + */ + public int getMaxLength() { + return maxLength; + } + + /** + * The separator string to use when joining multiple lines into a single event message. + * + * @return The line separator string.
+ */ + public String getLineSeparator() { + return lineSeparator; + } + + @AssertTrue(message = "match must be a valid regular expression") + boolean isValidPattern() { + if (match == null || match.isEmpty()) { + return false; + } + try { + Pattern.compile(match); + return true; + } catch (final PatternSyntaxException e) { + return false; + } + } +} diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java new file mode 100644 index 0000000000..ab21b16ac5 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java @@ -0,0 +1,57 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Defines whether continuation lines are grouped with the previous or the next event. + */ +public enum MultilineWhat { + + /** + * Continuation lines are appended to the preceding event. + */ + PREVIOUS("previous"), + + /** + * Continuation lines are prepended to the following event.
+ */ + NEXT("next"); + + private static final Map OPTIONS_MAP = Arrays.stream(MultilineWhat.values()) + .collect(Collectors.toMap(MultilineWhat::toString, value -> value)); + + private final String name; + + MultilineWhat(final String name) { + this.name = name; + } + + @JsonCreator + public static MultilineWhat fromString(final String value) { + final MultilineWhat result = OPTIONS_MAP.get(value.toLowerCase()); + if (result == null) { + throw new IllegalArgumentException("Invalid value for 'what': " + value + ". Valid values are: " + OPTIONS_MAP.keySet()); + } + return result; + } + + @JsonValue + @Override + public String toString() { + return name; + } +} diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java new file mode 100644 index 0000000000..6d2b970db5 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java @@ -0,0 +1,432 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.opensearch.dataprepper.event.TestEventFactory; +import org.opensearch.dataprepper.model.event.Event; +import org.opensearch.dataprepper.model.event.EventFactory; +import org.opensearch.dataprepper.model.record.Record; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.function.Consumer; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +@ExtendWith(MockitoExtension.class) +public class MultilineCodecsIT { + + @Mock + private MultilineInputCodecConfig config; + + @Mock + private Consumer> eventConsumer; + + private final EventFactory eventFactory = TestEventFactory.getTestEventFactory(); + + @BeforeEach + void setUp() { + lenient().when(config.getMaxLines()).thenReturn(500); + lenient().when(config.getMaxLength()).thenReturn(50000); + lenient().when(config.getLineSeparator()).thenReturn("\n"); + } + + private MultilineInputCodec createObjectUnderTest() { + return new MultilineInputCodec(config, eventFactory); + } + + private InputStream toInputStream(final String content) { + return new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)); + } + + @Test + void parse_java_stack_trace_groups_exception_with_stack_frames() throws IOException { + 
lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + lenient().when(config.getNegate()).thenReturn(true); + lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + + final String input = + "2024-01-15 10:23:45.123 ERROR [main] com.example.UserService - Request failed\n" + + "java.lang.NullPointerException: null\n" + + "\tat com.example.UserService.getUser(UserService.java:42)\n" + + "\tat com.example.Controller.handle(Controller.java:28)\n" + + "Caused by: java.sql.SQLException: Connection refused\n" + + "\tat com.mysql.jdbc.Connection.connect(Connection.java:456)\n" + + "\t... 12 more\n" + + "2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying\n" + + "2024-01-15 10:23:46.789 WARN [worker-1] com.example.Cache - Cache miss\n"; + + createObjectUnderTest().parse(toInputStream(input), eventConsumer); + + final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); + verify(eventConsumer, times(3)).accept(captor.capture()); + + final List> records = captor.getAllValues(); + + // Event 1: ERROR log + stack trace (7 lines grouped) + final String event1 = records.get(0).getData().get("message", String.class); + assertThat(event1, notNullValue()); + assertThat(event1, containsString("NullPointerException")); + assertThat(event1, containsString("at com.example.UserService.getUser")); + assertThat(event1, containsString("Caused by: java.sql.SQLException")); + assertThat(event1, containsString("... 
12 more")); + + // Event 2: INFO single line + final String event2 = records.get(1).getData().get("message", String.class); + assertThat(event2, equalTo("2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying")); + + // Event 3: WARN single line + final String event3 = records.get(2).getData().get("message", String.class); + assertThat(event3, equalTo("2024-01-15 10:23:46.789 WARN [worker-1] com.example.Cache - Cache miss")); + } + + @Test + void parse_python_traceback_groups_traceback_with_error_line() throws IOException { + lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + lenient().when(config.getNegate()).thenReturn(true); + lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + + final String input = + "2024-03-20 08:15:00,123 INFO Starting application\n" + + "2024-03-20 08:15:02,789 ERROR Unhandled exception\n" + + "Traceback (most recent call last):\n" + + " File \"/app/worker.py\", line 45, in process\n" + + " result = transform(record)\n" + + "ValueError: invalid literal for int()\n" + + "2024-03-20 08:15:03,456 INFO Recovered\n"; + + createObjectUnderTest().parse(toInputStream(input), eventConsumer); + + final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); + verify(eventConsumer, times(3)).accept(captor.capture()); + + final List> records = captor.getAllValues(); + + // Event 1: single INFO line + assertThat(records.get(0).getData().get("message", String.class), + equalTo("2024-03-20 08:15:00,123 INFO Starting application")); + + // Event 2: ERROR + traceback (5 lines grouped) + final String event2 = records.get(1).getData().get("message", String.class); + assertThat(event2, containsString("ERROR Unhandled exception")); + assertThat(event2, containsString("Traceback (most recent call last):")); + assertThat(event2, containsString("File \"/app/worker.py\"")); + assertThat(event2, containsString("ValueError: invalid literal")); + + // Event 3: single INFO line + 
assertThat(records.get(2).getData().get("message", String.class), + equalTo("2024-03-20 08:15:03,456 INFO Recovered")); + } + + @Test + void parse_xml_multiline_logs_groups_xml_body_with_header() throws IOException { + lenient().when(config.getMatch()).thenReturn("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + lenient().when(config.getNegate()).thenReturn(true); + lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + + final String input = + "[2024-05-10 14:30:00.001] [INFO] Incoming request:\n" + + "\n" + + " value\n" + + "\n" + + "[2024-05-10 14:30:00.045] [INFO] Request processed\n"; + + createObjectUnderTest().parse(toInputStream(input), eventConsumer); + + final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); + verify(eventConsumer, times(2)).accept(captor.capture()); + + final List> records = captor.getAllValues(); + + // Event 1: log line + XML body (4 lines grouped) + final String event1 = records.get(0).getData().get("message", String.class); + assertThat(event1, containsString("[INFO] Incoming request:")); + assertThat(event1, containsString("")); + assertThat(event1, containsString("value")); + assertThat(event1, containsString("")); + + // Event 2: single line + assertThat(records.get(1).getData().get("message", String.class), + equalTo("[2024-05-10 14:30:00.045] [INFO] Request processed")); + } + + @Test + void parse_sql_multiline_logs_groups_query_with_header() throws IOException { + lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + lenient().when(config.getNegate()).thenReturn(true); + lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + + final String input = + "2024-07-01 09:00:01 [Query] thread_id=145 exec_time=0.003s\n" + + "SELECT u.id, u.name\n" + + "FROM users u\n" + + "WHERE u.active = 1\n" + + "ORDER BY u.name;\n" + + "2024-07-01 09:00:02 [Query] thread_id=146 exec_time=0.001s\n" + + "SELECT COUNT(*) FROM sessions;\n"; + + 
createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(2)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: query header + multi-line SQL (5 lines grouped)
+        final String event1 = records.get(0).getData().get("message", String.class);
+        assertThat(event1, containsString("[Query] thread_id=145"));
+        assertThat(event1, containsString("SELECT u.id, u.name"));
+        assertThat(event1, containsString("FROM users u"));
+        assertThat(event1, containsString("WHERE u.active = 1"));
+        assertThat(event1, containsString("ORDER BY u.name;"));
+
+        // Event 2: query header + single-line SQL (2 lines grouped)
+        final String event2 = records.get(1).getData().get("message", String.class);
+        assertThat(event2, containsString("[Query] thread_id=146"));
+        assertThat(event2, containsString("SELECT COUNT(*) FROM sessions;"));
+    }
+
+    @Test
+    void parse_syslog_ise_multiline_groups_continuation_lines() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+
+        final String input =
+                "<181>Jun 1 12:39:49 Infra-ISE CISE_Audit 0000000176 NOTICE Admin-Login: success\n" +
+                "<181>Jun 1 12:39:49 Infra-ISE CISE_Audit 0000000177 NOTICE OpenAPI: Response={\\\n" +
+                " \"version\" : \"1.0.0\",\\\n" +
+                " \"status\" : \"ok\"\\\n" +
+                "}, HttpCode=200\n" +
+                "<181>Jun 1 12:40:15 Infra-ISE CISE_Audit 0000000178 NOTICE Config-Change: added\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(3)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: single-line syslog
+        assertThat(records.get(0).getData().get("message", String.class),
+                containsString("Admin-Login: success"));
+
+        // Event 2: multiline syslog with JSON continuation (4 lines grouped)
+        final String event2 = records.get(1).getData().get("message", String.class);
+        assertThat(event2, containsString("OpenAPI: Response="));
+        assertThat(event2, containsString("\"version\" : \"1.0.0\""));
+        assertThat(event2, containsString("HttpCode=200"));
+
+        // Event 3: single-line syslog
+        assertThat(records.get(2).getData().get("message", String.class),
+                containsString("Config-Change: added"));
+    }
+
+    @Test
+    void parse_with_negate_false_groups_matching_lines_with_previous() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)");
+        lenient().when(config.getNegate()).thenReturn(false);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+
+        final String input =
+                "java.lang.RuntimeException: error\n" +
+                " at com.example.A.method(A.java:1)\n" +
+                " at com.example.B.method(B.java:2)\n" +
+                " Caused by: java.io.IOException\n" +
+                " at com.example.C.read(C.java:3)\n" +
+                " ... 5 more\n" +
+                "Application recovered\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(2)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: exception + all matching stack frames (6 lines grouped)
+        final String event1 = records.get(0).getData().get("message", String.class);
+        assertThat(event1, containsString("RuntimeException: error"));
+        assertThat(event1, containsString("at com.example.A.method"));
+        assertThat(event1, containsString("Caused by: java.io.IOException"));
+        assertThat(event1, containsString("... 5 more"));
+
+        // Event 2: non-matching line on its own
+        assertThat(records.get(1).getData().get("message", String.class),
+                equalTo("Application recovered"));
+    }
+
+    @Test
+    void parse_with_next_mode_prepends_continuation_to_following_event() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\s");
+        lenient().when(config.getNegate()).thenReturn(false);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.NEXT);
+
+        final String input =
+                " context-line-1\n" +
+                " context-line-2\n" +
+                "MAIN EVENT A\n" +
+                " context-line-3\n" +
+                "MAIN EVENT B\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(2)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: continuation lines + first non-continuation
+        final String event1 = records.get(0).getData().get("message", String.class);
+        assertThat(event1, containsString("context-line-1"));
+        assertThat(event1, containsString("context-line-2"));
+        assertThat(event1, containsString("MAIN EVENT A"));
+
+        // Event 2: continuation line + second non-continuation
+        final String event2 = records.get(1).getData().get("message", String.class);
+        assertThat(event2, containsString("context-line-3"));
+        assertThat(event2, containsString("MAIN EVENT B"));
+    }
+
+    @Test
+    void parse_respects_max_lines_limit() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\d{4}");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        lenient().when(config.getMaxLines()).thenReturn(3); // the 5-line group below must be split after 3 lines
+
+        final String input =
+                "2024 start\n" +
+                " line 2\n" +
+                " line 3\n" +
+                " line 4\n" +
+                " line 5\n" +
+                "2024 next event\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(3)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: first 3 lines (hit max_lines)
+        final String event1 = records.get(0).getData().get("message", String.class);
+        assertThat(event1, equalTo("2024 start\n line 2\n line 3"));
+
+        // Event 2: overflow lines
+        final String event2 = records.get(1).getData().get("message", String.class);
+        assertThat(event2, equalTo(" line 4\n line 5"));
+
+        // Event 3: next event
+        assertThat(records.get(2).getData().get("message", String.class),
+                equalTo("2024 next event"));
+    }
+
+    @Test
+    void parse_respects_max_length_limit() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\d{4}");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        lenient().when(config.getMaxLength()).thenReturn(25); // header plus continuation would exceed 25 characters
+
+        final String input =
+                "2024 start here\n" +
+                " long continuation line\n" +
+                "2024 next\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(3)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+
+        // Event 1: flushed due to max_length before adding continuation
+        assertThat(records.get(0).getData().get("message", String.class),
+                equalTo("2024 start here"));
+
+        // Event 2: continuation line on its own
+        assertThat(records.get(1).getData().get("message", String.class),
+                equalTo(" long continuation line"));
+
+        // Event 3: next event
+        assertThat(records.get(2).getData().get("message", String.class),
+                equalTo("2024 next"));
+    }
+
+    @Test
+    void parse_empty_input_produces_no_events() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\d{4}");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+
+        createObjectUnderTest().parse(toInputStream(""), eventConsumer);
+
+        verify(eventConsumer, times(0)).accept(ArgumentCaptor.forClass(Record.class).capture()); // capture() serves only as a match-any argument matcher
+    }
+
+    @Test
+    void parse_all_lines_are_single_events_when_all_match_pattern() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^\\d{4}");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+
+        final String input =
+                "2024 event one\n" +
+                "2024 event two\n" +
+                "2024 event three\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(3)).accept(captor.capture());
+
+        final List<Record<Event>> records = captor.getAllValues();
+        assertThat(records.get(0).getData().get("message", String.class), equalTo("2024 event one"));
+        assertThat(records.get(1).getData().get("message", String.class), equalTo("2024 event two"));
+        assertThat(records.get(2).getData().get("message", String.class), equalTo("2024 event three"));
+    }
+
+    @Test
+    void parse_all_lines_form_single_event_when_none_match_pattern() throws IOException {
+        lenient().when(config.getMatch()).thenReturn("^NEVER_MATCHES");
+        lenient().when(config.getNegate()).thenReturn(true);
+        lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+
+        final String input =
+                "line one\n" +
+                "line two\n" +
+                "line three\n";
+
+        createObjectUnderTest().parse(toInputStream(input), eventConsumer);
+
+        final ArgumentCaptor<Record<Event>> captor = ArgumentCaptor.forClass(Record.class);
+        verify(eventConsumer, times(1)).accept(captor.capture());
+
+        assertThat(captor.getValue().getData().get("message", String.class),
+                equalTo("line one\nline two\nline three"));
+    }
+}
diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java
b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java
new file mode 100644
index 0000000000..6d74973abf
--- /dev/null
+++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.dataprepper.plugins.codec.multiline;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.lang.reflect.Field;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+/** Unit tests for the defaults, getters and match-pattern validation of {@link MultilineInputCodecConfig}. */
+class MultilineInputCodecConfigTest {
+
+    @Test
+    void defaults_are_correct() {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        assertThat(objectUnderTest.getNegate(), equalTo(false));
+        assertThat(objectUnderTest.getWhat(), equalTo(MultilineWhat.PREVIOUS));
+        assertThat(objectUnderTest.getMaxLines(), equalTo(MultilineInputCodecConfig.DEFAULT_MAX_LINES));
+        assertThat(objectUnderTest.getMaxLength(), equalTo(MultilineInputCodecConfig.DEFAULT_MAX_LENGTH));
+        assertThat(objectUnderTest.getLineSeparator(), equalTo(MultilineInputCodecConfig.DEFAULT_LINE_SEPARATOR));
+        assertThat(objectUnderTest.getMatch(), equalTo(null));
+    }
+
+    @Test
+    void getMatch_returns_configured_value() throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "match", "^\\d{4}");
+        assertThat(objectUnderTest.getMatch(), equalTo("^\\d{4}"));
+    }
+
+    @Test
+    void isValidPattern_returns_true_for_valid_regex() throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "match", "^\\d{4}-\\d{2}-\\d{2}");
+        assertThat(objectUnderTest.isValidPattern(), equalTo(true));
+    }
+
+    @Test
+    void isValidPattern_returns_false_for_invalid_regex() throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "match", "[invalid(");
+        assertThat(objectUnderTest.isValidPattern(), equalTo(false));
+    }
+
+    @Test
+    void isValidPattern_returns_false_for_null_match() {
+        assertThat(new MultilineInputCodecConfig().isValidPattern(), equalTo(false));
+    }
+
+    @Test
+    void isValidPattern_returns_false_for_empty_match() throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "match", "");
+        assertThat(objectUnderTest.isValidPattern(), equalTo(false));
+    }
+
+    @ParameterizedTest
+    @ValueSource(ints = {1, 100, 1000})
+    void getMaxLines_returns_configured_value(final int maxLines) throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "maxLines", maxLines);
+        assertThat(objectUnderTest.getMaxLines(), equalTo(maxLines));
+    }
+
+    @ParameterizedTest
+    @ValueSource(ints = {1, 5000, 50000})
+    void getMaxLength_returns_configured_value(final int maxLength) throws Exception {
+        final MultilineInputCodecConfig objectUnderTest = new MultilineInputCodecConfig();
+        setField(objectUnderTest, "maxLength", maxLength);
+        assertThat(objectUnderTest.getMaxLength(), equalTo(maxLength));
+    }
+
+    /** Writes {@code newValue} into the named declared field of {@code target} via reflection, then revokes access again. */
+    private void setField(final Object target, final String name, final Object newValue) throws Exception {
+        final Field declaredField = target.getClass().getDeclaredField(name);
+        try {
+            declaredField.setAccessible(true);
+            declaredField.set(target, newValue);
+        } finally {
+            declaredField.setAccessible(false);
+        }
+    }
+}
diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java
b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java
new file mode 100644
index 0000000000..5738a82da7
--- /dev/null
+++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java
@@ -0,0 +1,629 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.dataprepper.plugins.codec.multiline;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Nested;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.opensearch.dataprepper.event.TestEventFactory;
+import org.opensearch.dataprepper.model.event.Event;
+import org.opensearch.dataprepper.model.event.EventFactory;
+import org.opensearch.dataprepper.model.record.Record;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.notNullValue;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.mockito.Mockito.when;
+
+@ExtendWith(MockitoExtension.class)
+class MultilineInputCodecTest {
+
+    @Mock
+    private MultilineInputCodecConfig config;
+
+    private final EventFactory eventFactory = TestEventFactory.getTestEventFactory();
+
+    private MultilineInputCodec createObjectUnderTest() {
+        return new MultilineInputCodec(config, eventFactory);
+    }
+
+    private InputStream toInputStream(final String content) {
+        return new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
+    }
+
+    private List<Record<Event>> parseContent(final String content) throws IOException {
+        final List<Record<Event>> events = new ArrayList<>();
+        createObjectUnderTest().parse(toInputStream(content), events::add); // collect every emitted record in order
+        return events;
+    }
+
+    @Test
+    void constructor_throws_if_config_is_null() {
+        assertThrows(NullPointerException.class, () -> new MultilineInputCodec(null, eventFactory));
+    }
+
+    @Test
+    void constructor_throws_if_eventFactory_is_null() {
+        assertThrows(NullPointerException.class, () -> new MultilineInputCodec(config, null));
+    }
+
+    @Test
+    void constructor_throws_if_match_pattern_is_invalid() {
+        when(config.getMatch()).thenReturn("[invalid(");
+
+        assertThrows(IllegalArgumentException.class, this::createObjectUnderTest);
+    }
+
+    @Test
+    void parse_throws_if_inputStream_is_null() {
+        when(config.getMatch()).thenReturn("^\\S");
+        when(config.getNegate()).thenReturn(true);
+        when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        when(config.getMaxLines()).thenReturn(500);
+        when(config.getMaxLength()).thenReturn(10000);
+        when(config.getLineSeparator()).thenReturn("\n");
+
+        final MultilineInputCodec codec = createObjectUnderTest();
+        assertThrows(NullPointerException.class, () -> codec.parse(null, events -> {}));
+    }
+
+    @Test
+    void parse_throws_if_consumer_is_null() {
+        when(config.getMatch()).thenReturn("^\\S");
+        when(config.getNegate()).thenReturn(true);
+        when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        when(config.getMaxLines()).thenReturn(500);
+        when(config.getMaxLength()).thenReturn(10000);
+        when(config.getLineSeparator()).thenReturn("\n");
+
+        final MultilineInputCodec codec = createObjectUnderTest();
+        assertThrows(NullPointerException.class, () -> codec.parse(toInputStream("test"), null));
+    }
+
+    @Test
+    void parse_empty_input_produces_no_events() throws IOException {
+        when(config.getMatch()).thenReturn("^\\S");
+        when(config.getNegate()).thenReturn(true);
+        when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        when(config.getMaxLines()).thenReturn(500);
+        when(config.getMaxLength()).thenReturn(10000);
+        when(config.getLineSeparator()).thenReturn("\n");
+
+        final List<Record<Event>> events = parseContent("");
+        assertThat(events.size(), equalTo(0));
+    }
+
+    @Test
+    void parse_single_line_produces_one_event() throws IOException {
+        when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}");
+        when(config.getNegate()).thenReturn(true);
+        when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+        when(config.getMaxLines()).thenReturn(500);
+        when(config.getMaxLength()).thenReturn(10000);
+        when(config.getLineSeparator()).thenReturn("\n");
+
+        final List<Record<Event>> events = parseContent("2024-01-01 INFO single line\n");
+        assertThat(events.size(), equalTo(1));
+        assertThat(events.get(0).getData().get("message", String.class), equalTo("2024-01-01 INFO single line"));
+    }
+
+    @Nested
+    class PreviousModeWithNegateTrue { // lines NOT matching the date prefix continue the previous event
+
+        @BeforeEach
+        void setUp() {
+            when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}");
+            when(config.getNegate()).thenReturn(true);
+            when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+            when(config.getMaxLines()).thenReturn(500);
+            when(config.getMaxLength()).thenReturn(10000);
+            when(config.getLineSeparator()).thenReturn("\n");
+        }
+
+        @Test
+        void groups_java_stack_trace_with_timestamp_start() throws IOException {
+            final String input = "2024-01-01 ERROR NullPointerException\n" +
+                    " at com.example.Service.method(Service.java:42)\n" +
+                    " at com.example.Main.run(Main.java:10)\n" +
+                    "2024-01-01 INFO Application recovered\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(2));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo("2024-01-01 ERROR NullPointerException\n" +
+                            " at com.example.Service.method(Service.java:42)\n" +
+                            " at com.example.Main.run(Main.java:10)"));
+            assertThat(events.get(1).getData().get("message", String.class),
+                    equalTo("2024-01-01 INFO Application recovered"));
+        }
+
+        @Test
+        void multiple_single_line_events_each_matching_pattern() throws IOException {
+            final String input = "2024-01-01 INFO line one\n" +
+                    "2024-01-02 INFO line two\n" +
+                    "2024-01-03 INFO line three\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(3));
+            assertThat(events.get(0).getData().get("message", String.class), equalTo("2024-01-01 INFO line one"));
+            assertThat(events.get(1).getData().get("message", String.class), equalTo("2024-01-02 INFO line two"));
+            assertThat(events.get(2).getData().get("message", String.class), equalTo("2024-01-03 INFO line three"));
+        }
+
+        @Test
+        void continuation_lines_at_beginning_are_grouped_as_first_event() throws IOException {
+            final String input = " orphan continuation line 1\n" +
+                    " orphan continuation line 2\n" +
+                    "2024-01-01 INFO first real entry\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(2));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo(" orphan continuation line 1\n orphan continuation line 2"));
+            assertThat(events.get(1).getData().get("message", String.class),
+                    equalTo("2024-01-01 INFO first real entry"));
+        }
+
+        @Test
+        void last_event_with_continuations_flushed_at_end_of_stream() throws IOException {
+            final String input = "2024-01-01 ERROR Exception occurred\n" +
+                    " at com.example.Foo.bar(Foo.java:1)\n" +
+                    " at com.example.Baz.run(Baz.java:2)\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(1));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo("2024-01-01 ERROR Exception occurred\n" +
+                            " at com.example.Foo.bar(Foo.java:1)\n" +
+                            " at com.example.Baz.run(Baz.java:2)"));
+        }
+
+        @Test
+        void no_lines_match_pattern_produces_single_event() throws IOException {
+            final String input = " continuation line 1\n" +
+                    " continuation line 2\n" +
+                    " continuation line 3\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(1));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo(" continuation line 1\n continuation line 2\n continuation line 3"));
+        }
+    }
+
+    @Nested
+    class PreviousModeWithNegateFalse { // lines matching the stack-frame pattern continue the previous event
+
+        @BeforeEach
+        void setUp() {
+            when(config.getMatch()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)");
+            when(config.getNegate()).thenReturn(false);
+            when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS);
+            when(config.getMaxLines()).thenReturn(500);
+            when(config.getMaxLength()).thenReturn(10000);
+            when(config.getLineSeparator()).thenReturn("\n");
+        }
+
+        @Test
+        void groups_stack_trace_lines_matching_pattern_with_previous() throws IOException {
+            final String input = "java.lang.NullPointerException: null\n" +
+                    " at com.example.Service.process(Service.java:42)\n" +
+                    " at com.example.Main.run(Main.java:10)\n" +
+                    "INFO: Recovery complete\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(2));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo("java.lang.NullPointerException: null\n" +
+                            " at com.example.Service.process(Service.java:42)\n" +
+                            " at com.example.Main.run(Main.java:10)"));
+            assertThat(events.get(1).getData().get("message", String.class),
+                    equalTo("INFO: Recovery complete"));
+        }
+
+        @Test
+        void caused_by_is_grouped_with_previous() throws IOException {
+            final String input = "java.lang.RuntimeException: error\n" +
+                    " at com.example.A.method(A.java:1)\n" +
+                    " Caused by: java.io.IOException\n" +
+                    " at com.example.B.read(B.java:5)\n" +
+                    "Next log entry\n";
+
+            final List<Record<Event>> events = parseContent(input);
+
+            assertThat(events.size(), equalTo(2));
+            assertThat(events.get(0).getData().get("message", String.class),
+                    equalTo("java.lang.RuntimeException: error\n" +
+                            " at com.example.A.method(A.java:1)\n" +
+                            " Caused by: java.io.IOException\n" +
+                            " at com.example.B.read(B.java:5)"));
+        }
+    }
+
@Nested + class NextMode { + + @BeforeEach + void setUp() { + when(config.getMatch()).thenReturn("^\\s"); + when(config.getNegate()).thenReturn(false); + when(config.getWhat()).thenReturn(MultilineWhat.NEXT); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + } + + @Test + void continuation_lines_prepended_to_next_event() throws IOException { + final String input = " header line 1\n" + + " header line 2\n" + + "MAIN LOG ENTRY\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(1)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo(" header line 1\n header line 2\nMAIN LOG ENTRY")); + } + + @Test + void multiple_groups_in_next_mode() throws IOException { + final String input = " context A\n" + + "EVENT A\n" + + " context B\n" + + "EVENT B\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo(" context A\nEVENT A")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo(" context B\nEVENT B")); + } + + @Test + void trailing_continuation_lines_flushed_at_end_of_stream() throws IOException { + final String input = "EVENT A\n" + + " trailing context 1\n" + + " trailing context 2\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("EVENT A")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo(" trailing context 1\n trailing context 2")); + } + + @Test + void no_continuation_lines_each_line_is_separate_event() throws IOException { + final String input = "EVENT A\n" + + "EVENT B\n" + + "EVENT C\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(3)); + assertThat(events.get(0).getData().get("message", 
String.class), equalTo("EVENT A")); + assertThat(events.get(1).getData().get("message", String.class), equalTo("EVENT B")); + assertThat(events.get(2).getData().get("message", String.class), equalTo("EVENT C")); + } + } + + @Nested + class NextModeMaxLinesLimit { + + @BeforeEach + void setUp() { + when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.NEXT); + when(config.getMaxLines()).thenReturn(3); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + } + + @Test + void flushes_continuation_lines_when_max_lines_exceeded_in_next_mode() throws IOException { + final String input = " ctx 1\n" + + " ctx 2\n" + + " ctx 3\n" + + " ctx 4\n" + + "2024 EVENT\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo(" ctx 1\n ctx 2\n ctx 3")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo(" ctx 4\n2024 EVENT")); + } + } + + @Nested + class NextModeWithNegateTrue { + + @BeforeEach + void setUp() { + when(config.getMatch()).thenReturn("^\\["); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.NEXT); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + } + + @Test + void lines_not_matching_pattern_are_prepended_to_next_matching_line() throws IOException { + final String input = "preamble line 1\n" + + "preamble line 2\n" + + "[2024-01-01] Log entry\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(1)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("preamble line 1\npreamble line 2\n[2024-01-01] Log entry")); + } + } + + @Nested + class MaxLinesLimit { + + @BeforeEach + void setUp() { + 
when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(3); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + } + + @Test + void flushes_event_when_max_lines_exceeded() throws IOException { + final String input = "2024-01-01 ERROR start\n" + + " line 2\n" + + " line 3\n" + + " line 4\n" + + " line 5\n" + + "2024-01-02 INFO next\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(3)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024-01-01 ERROR start\n line 2\n line 3")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo(" line 4\n line 5")); + assertThat(events.get(2).getData().get("message", String.class), + equalTo("2024-01-02 INFO next")); + } + } + + @Nested + class MaxLengthLimit { + + @BeforeEach + void setUp() { + when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(30); + when(config.getLineSeparator()).thenReturn("\n"); + } + + @Test + void flushes_event_when_max_length_exceeded() throws IOException { + final String input = "2024 start line here\n" + + " continuation is long\n" + + "2024 next entry\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(3)); + // First event is "2024 start line here" (20 chars) + // Adding "\n continuation is long" would be 20+1+22=43 > 30, so it flushes + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024 start line here")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo(" continuation is long")); + assertThat(events.get(2).getData().get("message", String.class), + 
equalTo("2024 next entry")); + } + } + + @Nested + class CustomLineSeparator { + + @BeforeEach + void setUp() { + when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\r\n"); + } + + @Test + void uses_custom_line_separator_when_joining() throws IOException { + final String input = "2024-01-01 ERROR start\n" + + " continuation\n" + + "2024-01-02 INFO next\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024-01-01 ERROR start\r\n continuation")); + } + } + + @Nested + class RealWorldScenarios { + + @Test + void python_traceback() throws IOException { + when(config.getMatch()).thenReturn("^Traceback|^\\s|^\\w+Error"); + when(config.getNegate()).thenReturn(false); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final String input = "2024-01-01 INFO Starting application\n" + + "Traceback (most recent call last):\n" + + " File \"main.py\", line 10, in \n" + + " result = process()\n" + + " File \"service.py\", line 5, in process\n" + + " return 1/0\n" + + "ZeroDivisionError: division by zero\n" + + "2024-01-01 INFO Recovered\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024-01-01 INFO Starting application\n" + + "Traceback (most recent call last):\n" + + " File \"main.py\", line 10, in \n" + + " result = process()\n" + + " File \"service.py\", line 5, in process\n" + + " return 1/0\n" + + "ZeroDivisionError: division by zero")); + 
assertThat(events.get(1).getData().get("message", String.class), + equalTo("2024-01-01 INFO Recovered")); + } + + @Test + void multiline_xml_in_logs() throws IOException { + when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final String input = "2024-01-01 Request body:\n" + + "\n" + + " value\n" + + "\n" + + "2024-01-01 Response sent\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024-01-01 Request body:\n\n value\n")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("2024-01-01 Response sent")); + } + + @Test + void log4j_multiline_with_nested_exception() throws IOException { + when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final String input = "2024-01-01T12:00:00 ERROR Application failed\n" + + "java.lang.RuntimeException: Outer\n" + + "\tat com.example.A.run(A.java:10)\n" + + "Caused by: java.io.IOException: Inner\n" + + "\tat com.example.B.read(B.java:20)\n" + + "\t... 
5 more\n" + + "2024-01-01T12:00:01 INFO Shutdown complete\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("2024-01-01T12:00:00 ERROR Application failed\n" + + "java.lang.RuntimeException: Outer\n" + + "\tat com.example.A.run(A.java:10)\n" + + "Caused by: java.io.IOException: Inner\n" + + "\tat com.example.B.read(B.java:20)\n" + + "\t... 5 more")); + } + } + + @Nested + class IsContinuationLineTests { + + @Test + void negate_false_matching_line_is_continuation() { + when(config.getMatch()).thenReturn("^\\s"); + when(config.getNegate()).thenReturn(false); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final MultilineInputCodec codec = createObjectUnderTest(); + assertThat(codec.isContinuationLine(" indented"), equalTo(true)); + assertThat(codec.isContinuationLine("not indented"), equalTo(false)); + } + + @Test + void negate_true_non_matching_line_is_continuation() { + when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final MultilineInputCodec codec = createObjectUnderTest(); + assertThat(codec.isContinuationLine(" no timestamp"), equalTo(true)); + assertThat(codec.isContinuationLine("2024 has timestamp"), equalTo(false)); + } + } + + @Test + void event_metadata_is_log_type() throws IOException { + when(config.getMatch()).thenReturn("^\\d{4}"); + when(config.getNegate()).thenReturn(true); + when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getMaxLines()).thenReturn(500); + 
when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + + final List> events = parseContent("2024-01-01 test\n"); + + assertThat(events.size(), equalTo(1)); + assertThat(events.get(0).getData(), notNullValue()); + assertThat(events.get(0).getData().getMetadata(), notNullValue()); + assertThat(events.get(0).getData().getMetadata().getEventType(), equalTo("LOG")); + } +} diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java new file mode 100644 index 0000000000..9928024685 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java @@ -0,0 +1,50 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; + +class MultilineWhatTest { + + @Test + void fromString_returns_PREVIOUS_for_previous() { + assertThat(MultilineWhat.fromString("previous"), equalTo(MultilineWhat.PREVIOUS)); + } + + @Test + void fromString_returns_NEXT_for_next() { + assertThat(MultilineWhat.fromString("next"), equalTo(MultilineWhat.NEXT)); + } + + @Test + void fromString_is_case_insensitive() { + assertThat(MultilineWhat.fromString("PREVIOUS"), equalTo(MultilineWhat.PREVIOUS)); + assertThat(MultilineWhat.fromString("NEXT"), equalTo(MultilineWhat.NEXT)); + assertThat(MultilineWhat.fromString("Previous"), equalTo(MultilineWhat.PREVIOUS)); + } + + @ParameterizedTest + @ValueSource(strings = {"invalid", "before", "after", ""}) + void fromString_throws_for_invalid_value(final String value) { + assertThrows(IllegalArgumentException.class, () -> MultilineWhat.fromString(value)); + } + + @Test + void toString_returns_correct_values() { + assertThat(MultilineWhat.PREVIOUS.toString(), equalTo("previous")); + assertThat(MultilineWhat.NEXT.toString(), equalTo("next")); + } +} diff --git a/settings.gradle b/settings.gradle index f6f07cc1b0..3409e170eb 100644 --- a/settings.gradle +++ b/settings.gradle @@ -169,6 +169,7 @@ include 'release:maven' include 'e2e-test:peerforwarder' include 'data-prepper-plugins:failures-common' include 'data-prepper-plugins:newline-codecs' +include 'data-prepper-plugins:multiline-codecs' include 'data-prepper-plugins:avro-codecs' include 'data-prepper-plugins:kafka-plugins' include 'data-prepper-plugins:user-agent-processor' From eb89a299d2e083194fbe5751d93c1269c072948a Mon Sep 17 00:00:00 2001 From: 
Manisha Yadav Date: Mon, 15 Jun 2026 13:49:31 +0000 Subject: [PATCH 2/3] Address review comments: Implement config as per the suggested new design Signed-off-by: Manisha Yadav --- .../multiline-codecs/README.md | 64 ++- .../multiline-codecs/build.gradle | 9 - .../codec/multiline/MultilineInputCodec.java | 190 +++++-- .../multiline/MultilineInputCodecConfig.java | 196 +++++--- .../codec/multiline/MultilineMode.java | 40 ++ .../codec/multiline/MultilineWhat.java | 57 --- .../codec/multiline/MultilineCodecsIT.java | 279 +++-------- .../MultilineInputCodecConfigTest.java | 108 ++-- .../multiline/MultilineInputCodecTest.java | 462 ++++++------------ .../codec/multiline/MultilineWhatTest.java | 50 -- 10 files changed, 627 insertions(+), 828 deletions(-) create mode 100644 data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineMode.java delete mode 100644 data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java delete mode 100644 data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java diff --git a/data-prepper-plugins/multiline-codecs/README.md b/data-prepper-plugins/multiline-codecs/README.md index 62b61e547e..bae5add698 100644 --- a/data-prepper-plugins/multiline-codecs/README.md +++ b/data-prepper-plugins/multiline-codecs/README.md @@ -12,36 +12,41 @@ The multiline input codec can be configured with source plugins (e.g. 
S3 source, - **Python tracebacks**: `Traceback` blocks spanning multiple lines - **Timestamp-prefixed logs**: Logs where each entry starts with a timestamp and continuation lines don't - **Multi-line JSON/XML in logs**: Structured data embedded across multiple lines within log entries -- **Custom log formats**: Any format where a recognizable pattern marks the start of a new event +- **Custom log formats**: Any format where a recognizable pattern marks the start or end of a new event ## Configuration Options +Exactly one of the four pattern fields must be specified: + | Option | Required | Type | Default | Description | |---|---|---|---|---| -| `match` | Yes | String (regex) | - | A regular expression pattern used to identify line boundaries | -| `negate` | No | Boolean | `false` | When `false`, lines matching the pattern are continuation lines. When `true`, lines NOT matching the pattern are continuation lines | -| `what` | No | String | `previous` | Whether continuation lines belong to the `previous` or `next` event | +| `event_start_pattern` | One of four | String (regex) | - | A new event begins at each line matching this pattern | +| `event_end_pattern` | One of four | String (regex) | - | An event ends at each line matching this pattern (inclusive) | +| `continuation_line_start_pattern` | One of four | String (regex) | - | Lines matching this pattern are continuations of the previous event | +| `continuation_line_end_pattern` | One of four | String (regex) | - | Lines matching this pattern are prepended to the next event | +| `omit_matched_section` | No | Boolean | `false` | When true, the matched portion of the line is omitted from the output | | `max_lines` | No | Integer | `500` | Maximum number of lines that can be combined into a single event | -| `max_length` | No | Integer | `10000` | Maximum character length of a combined multiline event | -| `line_separator` | No | String | `\n` | Separator string used when joining lines into a single event message | 
+| `max_length` | No | Integer | `10000` | Maximum character length of a combined multiline event. Note: a single line exceeding this limit will still be emitted as a complete event without truncation | +| `line_separator` | No | String | `\n` | Separator string used when joining lines into a single event message. Note: `BufferedReader.readLine()` strips original line endings, so the codec normalizes joined lines using this separator. Set to `""` for no separator | +| `encoding` | No | String | `UTF-8` | Character encoding to use when reading the input stream | ## How It Works -The codec reads lines from the input stream and uses the `match` regex to determine event boundaries: +The codec reads lines from the input stream and uses the configured pattern to determine event boundaries: -1. **`negate=true` + `what=previous`** (most common): A new event starts when a line matches the pattern. Lines that do NOT match are appended to the preceding event. +1. **`event_start_pattern`** (most common): Each line matching the pattern starts a new event. All subsequent non-matching lines are appended to it. -2. **`negate=false` + `what=previous`**: Lines that match the pattern are appended to the preceding event. +2. **`event_end_pattern`**: Lines are accumulated until a line matches the pattern. The matching line is included in the current event, and the next line starts a new event. -3. **`negate=true` + `what=next`**: Lines that do NOT match the pattern are prepended to the next matching line. +3. **`continuation_line_start_pattern`**: Lines matching the pattern are continuations of the previous event. Non-matching lines start new events. -4. **`negate=false` + `what=next`**: Lines that match the pattern are prepended to the next non-matching line. +4. **`continuation_line_end_pattern`**: Lines matching the pattern are prepended to the next non-matching line's event. 
## Examples -### Java Stack Traces (timestamp-based grouping) +### Java Stack Traces -Each log entry starts with a timestamp. Lines without a timestamp are continuations of the previous entry. +Each log entry starts with a timestamp. Lines without a timestamp (stack frames) are part of the previous entry. ```yaml pipeline: @@ -49,9 +54,7 @@ pipeline: s3: codec: multiline: - match: "^\\d{4}-\\d{2}-\\d{2}" - negate: true - what: previous + event_start_pattern: "^\\d{4}-\\d{2}-\\d{2}" ``` Input: @@ -62,13 +65,11 @@ Input: 2024-01-01 12:00:01 INFO Application recovered ``` -Result: 2 events -- Event 1: The ERROR line with its full stack trace grouped together -- Event 2: The INFO line as a single event +Result: 2 events (stack trace grouped with its ERROR line) -### Java Stack Traces (pattern-based grouping) +### Delimiter-Separated Entries -Lines starting with whitespace followed by `at `, `...`, or `Caused by:` are continuations. +Log entries are separated by a `---` line. ```yaml pipeline: @@ -76,12 +77,12 @@ pipeline: s3: codec: multiline: - match: "^\\s+(at |\\.\\.\\.|Caused by:)" - negate: false - what: previous + event_end_pattern: "^---$" ``` -### Python Tracebacks +### Stack Traces (continuation pattern) + +Lines starting with whitespace followed by `at `, `...`, or `Caused by:` are continuations. ```yaml pipeline: @@ -89,14 +90,12 @@ pipeline: s3: codec: multiline: - match: "^Traceback|^\\s|^\\w+Error" - negate: false - what: previous + continuation_line_start_pattern: "^\\s+(at |\\.\\.\\.|Caused by:)" ``` -### Log Entries with Preamble (next mode) +### Omitting Timestamps from Output -Lines starting with whitespace are prepended to the next non-indented line. 
+Strip the timestamp from each event's first line: ```yaml pipeline: @@ -104,9 +103,8 @@ pipeline: s3: codec: multiline: - match: "^\\s" - negate: false - what: next + event_start_pattern: "^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}\\s+" + omit_matched_section: true ``` ## Developer Guide diff --git a/data-prepper-plugins/multiline-codecs/build.gradle b/data-prepper-plugins/multiline-codecs/build.gradle index cf619d2062..ade99e2e54 100644 --- a/data-prepper-plugins/multiline-codecs/build.gradle +++ b/data-prepper-plugins/multiline-codecs/build.gradle @@ -7,18 +7,9 @@ * compatible open source license. */ -plugins { - id 'java' -} - dependencies { implementation project(':data-prepper-api') implementation 'com.fasterxml.jackson.core:jackson-annotations' - implementation libs.parquet.common testImplementation project(':data-prepper-plugins:common') testImplementation project(':data-prepper-test:test-event') } - -test { - useJUnitPlatform() -} diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java index 07eba72e35..e3e5e1dc9d 100644 --- a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java @@ -24,9 +24,11 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.Charset; import java.util.Collections; import java.util.Objects; import java.util.function.Consumer; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -37,46 +39,57 @@ * such as Java stack traces, Python tracebacks, or any log format where entries begin with * a recognizable pattern (e.g., a timestamp).

  *
- * <p>The codec supports two grouping modes via the {@code what} configuration:</p>
+ * <p>The codec supports four mutually exclusive pattern modes:</p>
  * <ul>
- *   <li>{@code previous}: Continuation lines are appended to the preceding event.</li>
- *   <li>{@code next}: Continuation lines are prepended to the following event.</li>
- * </ul>
- *
- * <p>The {@code negate} option controls which lines are considered continuation lines:</p>
- * <ul>
- *   <li>{@code negate=false}: Lines matching the pattern are continuation lines.</li>
- *   <li>{@code negate=true}: Lines NOT matching the pattern are continuation lines.</li>
+ *   <li>{@code event_start_pattern}: A new event begins at each matching line.</li>
+ *   <li>{@code event_end_pattern}: An event ends at each matching line (inclusive).</li>
+ *   <li>{@code continuation_line_start_pattern}: Matching lines are continuations of the previous event.</li>
+ *   <li>{@code continuation_line_end_pattern}: Matching lines are prepended to the next event.</li>
  * </ul>
*/ @DataPrepperPlugin(name = "multiline", pluginType = InputCodec.class, pluginConfigurationType = MultilineInputCodecConfig.class) public class MultilineInputCodec implements InputCodec { private static final Logger LOG = LoggerFactory.getLogger(MultilineInputCodec.class); - static final String MESSAGE_FIELD_NAME = "message"; + private static final String MESSAGE_FIELD_NAME = "message"; private final Pattern pattern; - private final boolean negate; - private final MultilineWhat what; + private final MultilineMode mode; + private final boolean omitMatchedSection; private final int maxLines; private final int maxLength; private final String lineSeparator; + private final Charset encoding; private final EventFactory eventFactory; @DataPrepperPluginConstructor public MultilineInputCodec(final MultilineInputCodecConfig config, final EventFactory eventFactory) { Objects.requireNonNull(config, "config must not be null"); this.eventFactory = Objects.requireNonNull(eventFactory, "eventFactory must not be null"); - try { - this.pattern = Pattern.compile(config.getMatch()); - } catch (final Exception e) { - throw new IllegalArgumentException("Invalid regex pattern for 'match': " + config.getMatch(), e); + + this.pattern = config.getCompiledPattern(); + if (this.pattern == null) { + throw new IllegalArgumentException("A valid pattern must be configured"); } - this.negate = config.getNegate(); - this.what = config.getWhat(); + + this.mode = resolveMode(config); + this.omitMatchedSection = config.getOmitMatchedSection(); this.maxLines = config.getMaxLines(); this.maxLength = config.getMaxLength(); this.lineSeparator = config.getLineSeparator(); + this.encoding = config.getEncoding(); + } + + private static MultilineMode resolveMode(final MultilineInputCodecConfig config) { + if (config.getEventStartPattern() != null) { + return MultilineMode.EVENT_START; + } else if (config.getEventEndPattern() != null) { + return MultilineMode.EVENT_END; + } else if 
(config.getContinuationLineStartPattern() != null) { + return MultilineMode.CONTINUATION_START; + } else { + return MultilineMode.CONTINUATION_END; + } } @Override @@ -84,35 +97,109 @@ public void parse(final InputStream inputStream, final Consumer> e Objects.requireNonNull(inputStream, "inputStream must not be null"); Objects.requireNonNull(eventConsumer, "eventConsumer must not be null"); - try (final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) { - if (what == MultilineWhat.PREVIOUS) { - parsePreviousMode(reader, eventConsumer); - } else { - parseNextMode(reader, eventConsumer); + try (final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, encoding))) { + switch (mode) { + case EVENT_START: + parseEventStartMode(reader, eventConsumer); + break; + case EVENT_END: + parseEventEndMode(reader, eventConsumer); + break; + case CONTINUATION_START: + parseContinuationStartMode(reader, eventConsumer); + break; + case CONTINUATION_END: + parseContinuationEndMode(reader, eventConsumer); + break; + default: + throw new IllegalStateException("Unknown multiline mode: " + mode); + } + } + } + + /** + * EVENT_START mode: A new event begins at each line matching the pattern. + * Non-matching lines are continuations of the preceding event. 
+ */ + private void parseEventStartMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + final StringBuilder buffer = new StringBuilder(); + int lineCount = 0; + String line; + + while ((line = reader.readLine()) != null) { + final boolean matches = pattern.matcher(line).find(); + + if (matches || shouldFlush(buffer, lineCount, line)) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + } } + + if (buffer.length() > 0) { + buffer.append(lineSeparator); + } + buffer.append(processLine(line, matches)); + lineCount++; + } + + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); } } /** - * In PREVIOUS mode, continuation lines are appended to the preceding event. - * A new event boundary is detected when a line is NOT a continuation line - * (i.e., it's a "start" line). + * EVENT_END mode: An event ends at each line matching the pattern (inclusive). + * The matching line is included in the current event, then a new event begins. 
*/ - private void parsePreviousMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + private void parseEventEndMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { final StringBuilder buffer = new StringBuilder(); int lineCount = 0; String line; while ((line = reader.readLine()) != null) { - final boolean isContinuation = isContinuationLine(line); + final boolean matches = pattern.matcher(line).find(); - if (!isContinuation && buffer.length() > 0) { + if (shouldFlush(buffer, lineCount, line)) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); + lineCount = 0; + } + } + + if (buffer.length() > 0) { + buffer.append(lineSeparator); + } + buffer.append(processLine(line, matches)); + lineCount++; + + if (matches) { emitEvent(buffer.toString(), eventConsumer); buffer.setLength(0); lineCount = 0; } + } - if (shouldFlush(buffer, lineCount, line)) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + } + } + + /** + * CONTINUATION_START mode: Lines matching the pattern are continuations of the previous event. + * Non-matching lines start new events. 
+ */ + private void parseContinuationStartMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + final StringBuilder buffer = new StringBuilder(); + int lineCount = 0; + String line; + + while ((line = reader.readLine()) != null) { + final boolean matches = pattern.matcher(line).find(); + + if (!matches || shouldFlush(buffer, lineCount, line)) { if (buffer.length() > 0) { emitEvent(buffer.toString(), eventConsumer); buffer.setLength(0); @@ -123,7 +210,7 @@ private void parsePreviousMode(final BufferedReader reader, final Consumer 0) { buffer.append(lineSeparator); } - buffer.append(line); + buffer.append(processLine(line, matches)); lineCount++; } @@ -133,42 +220,35 @@ private void parsePreviousMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + private void parseContinuationEndMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { final StringBuilder buffer = new StringBuilder(); int lineCount = 0; boolean bufferHasNonContinuation = false; String line; while ((line = reader.readLine()) != null) { - final boolean isContinuation = isContinuationLine(line); + final boolean matches = pattern.matcher(line).find(); - if (!isContinuation) { + if (!matches) { if (bufferHasNonContinuation) { - // The buffer already has a complete event (non-continuation at end). - // Emit it and start fresh. emitEvent(buffer.toString(), eventConsumer); buffer.setLength(0); lineCount = 0; bufferHasNonContinuation = false; } - // Append this non-continuation line to the buffer (with any preceding continuations). if (buffer.length() > 0) { buffer.append(lineSeparator); } - buffer.append(line); + buffer.append(processLine(line, false)); lineCount++; bufferHasNonContinuation = true; continue; } - // This is a continuation line. if (bufferHasNonContinuation) { - // Buffer has a complete event ending with non-continuation. - // Emit it, then start collecting continuations for the next event. 
emitEvent(buffer.toString(), eventConsumer); buffer.setLength(0); lineCount = 0; @@ -186,7 +266,7 @@ private void parseNextMode(final BufferedReader reader, final Consumer 0) { buffer.append(lineSeparator); } - buffer.append(line); + buffer.append(processLine(line, matches)); lineCount++; } @@ -195,17 +275,19 @@ private void parseNextMode(final BufferedReader reader, final ConsumerWhen {@code negate=false}: a line matching the pattern IS a continuation line.

- *

When {@code negate=true}: a line NOT matching the pattern IS a continuation line.

- */ - boolean isContinuationLine(final String line) { - final boolean matches = pattern.matcher(line).find(); - return negate != matches; + private String processLine(final String line, final boolean matches) { + if (!omitMatchedSection || !matches) { + return line; + } + final Matcher matcher = pattern.matcher(line); + return matcher.replaceFirst(""); } + /** + * Determines if the buffer should be flushed before appending the next line. + * Note: if a single line exceeds max_length on its own, it will still be emitted + * as a complete event without truncation. + */ private boolean shouldFlush(final StringBuilder buffer, final int lineCount, final String nextLine) { if (lineCount >= maxLines) { LOG.debug("Flushing multiline event due to max_lines limit of {}", maxLines); diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java index dc26290f3a..9eb76ce4fd 100644 --- a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java @@ -12,9 +12,12 @@ import com.fasterxml.jackson.annotation.JsonProperty; import jakarta.validation.constraints.AssertTrue; import jakarta.validation.constraints.Min; -import jakarta.validation.constraints.NotEmpty; import jakarta.validation.constraints.NotNull; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -22,25 +25,19 @@ * Configuration class for the multiline input codec. * *

The multiline codec groups consecutive lines from an input stream into a single event - * based on a regex pattern. This is useful for log formats where a single logical event - * spans multiple lines (e.g., Java stack traces, multi-line application logs).

+ * based on a regex pattern. Exactly one of the four pattern fields must be specified:

+ *
    + *
  • {@code event_start_pattern}: A new event begins at each line matching this pattern.
  • + *
  • {@code event_end_pattern}: An event ends at each line matching this pattern (inclusive).
  • + *
  • {@code continuation_line_start_pattern}: Lines matching this pattern are continuations of the previous event.
  • + *
  • {@code continuation_line_end_pattern}: Lines matching this pattern are prepended to the next event.
  • + *
* *

Example configuration for Java stack traces:

*
  * codec:
  *   multiline:
- *     match: "^\\s+(at |\\.\\.\\.|Caused by:)"
- *     negate: false
- *     what: previous
- * 
- * - *

Example configuration for timestamp-prefixed logs:

- *
- * codec:
- *   multiline:
- *     match: "^\\d{4}-\\d{2}-\\d{2}"
- *     negate: true
- *     what: previous
+ *     event_start_pattern: "^\\d{4}-\\d{2}-\\d{2}"
  * 
*/ public class MultilineInputCodecConfig { @@ -49,17 +46,20 @@ public class MultilineInputCodecConfig { static final int DEFAULT_MAX_LENGTH = 10000; static final String DEFAULT_LINE_SEPARATOR = "\n"; - @NotEmpty(message = "match must not be empty") - @JsonProperty("match") - private String match; + @JsonProperty("event_start_pattern") + private String eventStartPattern; + + @JsonProperty("event_end_pattern") + private String eventEndPattern; - @NotNull(message = "negate must not be null") - @JsonProperty("negate") - private Boolean negate = false; + @JsonProperty("continuation_line_start_pattern") + private String continuationLineStartPattern; - @NotNull(message = "what must not be null") - @JsonProperty("what") - private MultilineWhat what = MultilineWhat.PREVIOUS; + @JsonProperty("continuation_line_end_pattern") + private String continuationLineEndPattern; + + @JsonProperty("omit_matched_section") + private boolean omitMatchedSection = false; @Min(value = 1, message = "max_lines must be at least 1") @JsonProperty("max_lines") @@ -73,75 +73,139 @@ public class MultilineInputCodecConfig { @JsonProperty("line_separator") private String lineSeparator = DEFAULT_LINE_SEPARATOR; - /** - * The regex pattern used to identify line boundaries. - * - * @return The regex pattern string. - */ - public String getMatch() { - return match; + @JsonProperty("encoding") + private String encoding = StandardCharsets.UTF_8.name(); + + private Pattern compiledPattern; + private Charset encodingCharset; + + public String getEventStartPattern() { + return eventStartPattern; } - /** - * Whether to negate the pattern match. - *

When false: lines matching the pattern are considered continuation lines.

- *

When true: lines NOT matching the pattern are considered continuation lines.

- * - * @return true if the pattern should be negated. - */ - public Boolean getNegate() { - return negate; + public String getEventEndPattern() { + return eventEndPattern; } - /** - * Defines whether unmatched (continuation) lines belong to the previous or next event. - * - * @return The multiline grouping direction. - */ - public MultilineWhat getWhat() { - return what; + public String getContinuationLineStartPattern() { + return continuationLineStartPattern; + } + + public String getContinuationLineEndPattern() { + return continuationLineEndPattern; + } + + public boolean getOmitMatchedSection() { + return omitMatchedSection; } - /** - * The maximum number of lines that can be combined into a single event. - * When this limit is reached, the accumulated lines are flushed as an event - * and a new accumulation begins. - * - * @return The maximum number of lines per event. - */ public int getMaxLines() { return maxLines; } + public int getMaxLength() { + return maxLength; + } + + public String getLineSeparator() { + return lineSeparator; + } + /** - * The maximum character length of a combined multiline event. - * When this limit is reached, the accumulated lines are flushed as an event. + * Returns the validated Charset. The encoding is validated once during + * bean validation and stored to avoid repeated parsing. * - * @return The maximum character length per event. + * @return The validated Charset. */ - public int getMaxLength() { - return maxLength; + public Charset getEncoding() { + return encodingCharset; } /** - * The separator string to use when joining multiple lines into a single event message. + * Returns the compiled regex pattern. The pattern is compiled once during validation + * and reused to avoid duplicate compilation. * - * @return The line separator string. + * @return The compiled Pattern. 
*/ - public String getLineSeparator() { - return lineSeparator; + public Pattern getCompiledPattern() { + return compiledPattern; + } + + @AssertTrue(message = "Exactly one pattern field must be specified: event_start_pattern, event_end_pattern, " + + "continuation_line_start_pattern, or continuation_line_end_pattern") + boolean isExactlyOnePatternSpecified() { + int count = 0; + if (eventStartPattern != null) count++; + if (eventEndPattern != null) count++; + if (continuationLineStartPattern != null) count++; + if (continuationLineEndPattern != null) count++; + return count == 1; } - @AssertTrue(message = "match must be a valid regular expression") + @AssertTrue(message = "The specified pattern must be a valid regular expression") boolean isValidPattern() { - if (match == null || match.isEmpty()) { + final String patternString = getConfiguredPatternString(); + if (patternString == null || patternString.isEmpty()) { return false; } try { - Pattern.compile(match); + compiledPattern = Pattern.compile(patternString); return true; } catch (final PatternSyntaxException e) { return false; } } + + @AssertTrue(message = "The specified encoding must be a valid charset") + boolean isValidEncoding() { + if (encoding == null || encoding.isEmpty()) { + return false; + } + try { + encodingCharset = Charset.forName(encoding); + return true; + } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) { + return false; + } + } + + String getConfiguredPatternString() { + if (eventStartPattern != null) return eventStartPattern; + if (eventEndPattern != null) return eventEndPattern; + if (continuationLineStartPattern != null) return continuationLineStartPattern; + if (continuationLineEndPattern != null) return continuationLineEndPattern; + return null; + } + + static Builder builder() { + return new Builder(); + } + + static class Builder { + private final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); + + Builder withEventStartPattern(final 
String pattern) { + config.eventStartPattern = pattern; + return this; + } + + Builder withEventEndPattern(final String pattern) { + config.eventEndPattern = pattern; + return this; + } + + Builder withContinuationLineStartPattern(final String pattern) { + config.continuationLineStartPattern = pattern; + return this; + } + + Builder withContinuationLineEndPattern(final String pattern) { + config.continuationLineEndPattern = pattern; + return this; + } + + MultilineInputCodecConfig build() { + return config; + } + } } diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineMode.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineMode.java new file mode 100644 index 0000000000..fafa8cd0b0 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineMode.java @@ -0,0 +1,40 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.dataprepper.plugins.codec.multiline; + +/** + * Internal representation of the multiline grouping mode, determined from the configuration. + */ +enum MultilineMode { + + /** + * A new event starts at each line matching the pattern. + * Non-matching lines are continuations of the preceding event. + */ + EVENT_START, + + /** + * An event ends at each line matching the pattern (inclusive). + * The next line begins a new event. + */ + EVENT_END, + + /** + * Lines matching the pattern are continuations of the previous event. + * Non-matching lines start new events. + */ + CONTINUATION_START, + + /** + * Lines matching the pattern are prepended to the next event. + * Non-matching lines complete the event. 
+ */ + CONTINUATION_END +} diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java deleted file mode 100644 index ab21b16ac5..0000000000 --- a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhat.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.dataprepper.plugins.codec.multiline; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonValue; - -import java.util.Arrays; -import java.util.Map; -import java.util.stream.Collectors; - -/** - * Defines whether unmatched lines should be grouped with the previous or next matching line. - */ -public enum MultilineWhat { - - /** - * Unmatched lines are appended to the previous matching line's event. - */ - PREVIOUS("previous"), - - /** - * Unmatched lines are prepended to the next matching line's event. - */ - NEXT("next"); - - private static final Map OPTIONS_MAP = Arrays.stream(MultilineWhat.values()) - .collect(Collectors.toMap(MultilineWhat::toString, value -> value)); - - private final String name; - - MultilineWhat(final String name) { - this.name = name; - } - - @JsonCreator - public static MultilineWhat fromString(final String value) { - final MultilineWhat result = OPTIONS_MAP.get(value.toLowerCase()); - if (result == null) { - throw new IllegalArgumentException("Invalid value for 'what': " + value + ". 
Valid values are: " + OPTIONS_MAP.keySet()); - } - return result; - } - - @JsonValue - @Override - public String toString() { - return name; - } -} diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java index 6d2b970db5..f81b4fb424 100644 --- a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java @@ -26,9 +26,9 @@ import java.nio.charset.StandardCharsets; import java.util.List; import java.util.function.Consumer; +import java.util.regex.Pattern; import static org.hamcrest.CoreMatchers.equalTo; -import static org.hamcrest.CoreMatchers.notNullValue; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsString; import static org.mockito.Mockito.lenient; @@ -51,6 +51,8 @@ void setUp() { lenient().when(config.getMaxLines()).thenReturn(500); lenient().when(config.getMaxLength()).thenReturn(50000); lenient().when(config.getLineSeparator()).thenReturn("\n"); + lenient().when(config.getOmitMatchedSection()).thenReturn(false); + lenient().when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); } private MultilineInputCodec createObjectUnderTest() { @@ -62,10 +64,9 @@ private InputStream toInputStream(final String content) { } @Test - void parse_java_stack_trace_groups_exception_with_stack_frames() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + void parse_java_stack_trace_with_event_start_pattern() throws IOException { + 
lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); + lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); final String input = "2024-01-15 10:23:45.123 ERROR [main] com.example.UserService - Request failed\n" + @@ -74,46 +75,32 @@ void parse_java_stack_trace_groups_exception_with_stack_frames() throws IOExcept "\tat com.example.Controller.handle(Controller.java:28)\n" + "Caused by: java.sql.SQLException: Connection refused\n" + "\tat com.mysql.jdbc.Connection.connect(Connection.java:456)\n" + - "\t... 12 more\n" + - "2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying\n" + - "2024-01-15 10:23:46.789 WARN [worker-1] com.example.Cache - Cache miss\n"; + "2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); + verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: ERROR log + stack trace (7 lines grouped) final String event1 = records.get(0).getData().get("message", String.class); - assertThat(event1, notNullValue()); assertThat(event1, containsString("NullPointerException")); assertThat(event1, containsString("at com.example.UserService.getUser")); assertThat(event1, containsString("Caused by: java.sql.SQLException")); - assertThat(event1, containsString("... 
12 more")); - - // Event 2: INFO single line - final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, equalTo("2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying")); - - // Event 3: WARN single line - final String event3 = records.get(2).getData().get("message", String.class); - assertThat(event3, equalTo("2024-01-15 10:23:46.789 WARN [worker-1] com.example.Cache - Cache miss")); + assertThat(records.get(1).getData().get("message", String.class), + equalTo("2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying")); } @Test - void parse_python_traceback_groups_traceback_with_error_line() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + void parse_python_traceback_with_event_start_pattern() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); + lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); final String input = "2024-03-20 08:15:00,123 INFO Starting application\n" + "2024-03-20 08:15:02,789 ERROR Unhandled exception\n" + "Traceback (most recent call last):\n" + " File \"/app/worker.py\", line 45, in process\n" + - " result = transform(record)\n" + "ValueError: invalid literal for int()\n" + "2024-03-20 08:15:03,456 INFO Recovered\n"; @@ -123,28 +110,19 @@ void parse_python_traceback_groups_traceback_with_error_line() throws IOExceptio verify(eventConsumer, times(3)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: single INFO line assertThat(records.get(0).getData().get("message", String.class), equalTo("2024-03-20 08:15:00,123 INFO Starting application")); - - // Event 2: ERROR + traceback (5 lines grouped) 
final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, containsString("ERROR Unhandled exception")); - assertThat(event2, containsString("Traceback (most recent call last):")); - assertThat(event2, containsString("File \"/app/worker.py\"")); - assertThat(event2, containsString("ValueError: invalid literal")); - - // Event 3: single INFO line + assertThat(event2, containsString("Traceback")); + assertThat(event2, containsString("ValueError")); assertThat(records.get(2).getData().get("message", String.class), equalTo("2024-03-20 08:15:03,456 INFO Recovered")); } @Test - void parse_xml_multiline_logs_groups_xml_body_with_header() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + void parse_xml_multiline_with_event_start_pattern() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); + lenient().when(config.getEventStartPattern()).thenReturn("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); final String input = "[2024-05-10 14:30:00.001] [INFO] Incoming request:\n" + @@ -159,68 +137,24 @@ void parse_xml_multiline_logs_groups_xml_body_with_header() throws IOException { verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: log line + XML body (4 lines grouped) final String event1 = records.get(0).getData().get("message", String.class); - assertThat(event1, containsString("[INFO] Incoming request:")); assertThat(event1, containsString("")); - assertThat(event1, containsString("value")); assertThat(event1, containsString("")); - - // Event 2: single line assertThat(records.get(1).getData().get("message", String.class), equalTo("[2024-05-10 14:30:00.045] [INFO] Request 
processed")); } @Test - void parse_sql_multiline_logs_groups_query_with_header() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - - final String input = - "2024-07-01 09:00:01 [Query] thread_id=145 exec_time=0.003s\n" + - "SELECT u.id, u.name\n" + - "FROM users u\n" + - "WHERE u.active = 1\n" + - "ORDER BY u.name;\n" + - "2024-07-01 09:00:02 [Query] thread_id=146 exec_time=0.001s\n" + - "SELECT COUNT(*) FROM sessions;\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - - // Event 1: query header + multi-line SQL (5 lines grouped) - final String event1 = records.get(0).getData().get("message", String.class); - assertThat(event1, containsString("[Query] thread_id=145")); - assertThat(event1, containsString("SELECT u.id, u.name")); - assertThat(event1, containsString("FROM users u")); - assertThat(event1, containsString("WHERE u.active = 1")); - assertThat(event1, containsString("ORDER BY u.name;")); - - // Event 2: query header + single-line SQL (2 lines grouped) - final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, containsString("[Query] thread_id=146")); - assertThat(event2, containsString("SELECT COUNT(*) FROM sessions;")); - } - - @Test - void parse_syslog_ise_multiline_groups_continuation_lines() throws IOException { - lenient().when(config.getMatch()).thenReturn("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + void parse_syslog_ise_with_event_start_pattern() throws IOException { + 
lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}")); + lenient().when(config.getEventStartPattern()).thenReturn("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}"); final String input = - "<181>Jun 1 12:39:49 Infra-ISE CISE_Audit 0000000176 NOTICE Admin-Login: success\n" + - "<181>Jun 1 12:39:49 Infra-ISE CISE_Audit 0000000177 NOTICE OpenAPI: Response={\\\n" + - " \"version\" : \"1.0.0\",\\\n" + - " \"status\" : \"ok\"\\\n" + + "<181>Jun 1 12:39:49 Infra-ISE Audit NOTICE Admin-Login: success\n" + + "<181>Jun 1 12:39:49 Infra-ISE Audit NOTICE OpenAPI: Response={\n" + + " \"version\" : \"1.0.0\"\n" + "}, HttpCode=200\n" + - "<181>Jun 1 12:40:15 Infra-ISE CISE_Audit 0000000178 NOTICE Config-Change: added\n"; + "<181>Jun 1 12:40:15 Infra-ISE Audit NOTICE Config-Change: added\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); @@ -228,35 +162,26 @@ void parse_syslog_ise_multiline_groups_continuation_lines() throws IOException { verify(eventConsumer, times(3)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: single-line syslog assertThat(records.get(0).getData().get("message", String.class), containsString("Admin-Login: success")); - - // Event 2: multiline syslog with JSON continuation (4 lines grouped) final String event2 = records.get(1).getData().get("message", String.class); assertThat(event2, containsString("OpenAPI: Response=")); assertThat(event2, containsString("\"version\" : \"1.0.0\"")); assertThat(event2, containsString("HttpCode=200")); - - // Event 3: single-line syslog assertThat(records.get(2).getData().get("message", String.class), containsString("Config-Change: added")); } @Test - void parse_with_negate_false_groups_matching_lines_with_previous() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)"); - lenient().when(config.getNegate()).thenReturn(false); - 
lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + void parse_with_continuation_line_start_pattern() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\s+(at |\\.\\.\\.|Caused by:)")); + lenient().when(config.getContinuationLineStartPattern()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)"); final String input = "java.lang.RuntimeException: error\n" + " at com.example.A.method(A.java:1)\n" + - " at com.example.B.method(B.java:2)\n" + " Caused by: java.io.IOException\n" + " at com.example.C.read(C.java:3)\n" + - " ... 5 more\n" + "Application recovered\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); @@ -265,31 +190,25 @@ void parse_with_negate_false_groups_matching_lines_with_previous() throws IOExce verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: exception + all matching stack frames (6 lines grouped) final String event1 = records.get(0).getData().get("message", String.class); assertThat(event1, containsString("RuntimeException: error")); assertThat(event1, containsString("at com.example.A.method")); assertThat(event1, containsString("Caused by: java.io.IOException")); - assertThat(event1, containsString("... 
5 more")); - - // Event 2: non-matching line on its own assertThat(records.get(1).getData().get("message", String.class), equalTo("Application recovered")); } @Test - void parse_with_next_mode_prepends_continuation_to_following_event() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\s"); - lenient().when(config.getNegate()).thenReturn(false); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.NEXT); + void parse_with_event_end_pattern() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^---$")); + lenient().when(config.getEventEndPattern()).thenReturn("^---$"); final String input = - " context-line-1\n" + - " context-line-2\n" + - "MAIN EVENT A\n" + - " context-line-3\n" + - "MAIN EVENT B\n"; + "entry 1 line 1\n" + + "entry 1 line 2\n" + + "---\n" + + "entry 2 line 1\n" + + "---\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); @@ -297,136 +216,68 @@ void parse_with_next_mode_prepends_continuation_to_following_event() throws IOEx verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: continuation lines + first non-continuation - final String event1 = records.get(0).getData().get("message", String.class); - assertThat(event1, containsString("context-line-1")); - assertThat(event1, containsString("context-line-2")); - assertThat(event1, containsString("MAIN EVENT A")); - - // Event 2: continuation line + second non-continuation - final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, containsString("context-line-3")); - assertThat(event2, containsString("MAIN EVENT B")); + assertThat(records.get(0).getData().get("message", String.class), + equalTo("entry 1 line 1\nentry 1 line 2\n---")); + assertThat(records.get(1).getData().get("message", String.class), + equalTo("entry 2 line 1\n---")); } @Test - void parse_respects_max_lines_limit() throws IOException { - 
lenient().when(config.getMatch()).thenReturn("^\\d{4}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - lenient().when(config.getMaxLines()).thenReturn(3); + void parse_with_continuation_end_pattern() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\s")); + lenient().when(config.getContinuationLineEndPattern()).thenReturn("^\\s"); final String input = - "2024 start\n" + - " line 2\n" + - " line 3\n" + - " line 4\n" + - " line 5\n" + - "2024 next event\n"; + " context-line-1\n" + + " context-line-2\n" + + "MAIN EVENT A\n" + + " context-line-3\n" + + "MAIN EVENT B\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); + verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: first 3 lines (hit max_lines) final String event1 = records.get(0).getData().get("message", String.class); - assertThat(event1, equalTo("2024 start\n line 2\n line 3")); - - // Event 2: overflow lines + assertThat(event1, containsString("context-line-1")); + assertThat(event1, containsString("MAIN EVENT A")); final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, equalTo(" line 4\n line 5")); - - // Event 3: next event - assertThat(records.get(2).getData().get("message", String.class), - equalTo("2024 next event")); + assertThat(event2, containsString("context-line-3")); + assertThat(event2, containsString("MAIN EVENT B")); } @Test - void parse_respects_max_length_limit() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - 
lenient().when(config.getMaxLength()).thenReturn(25); + void parse_with_omit_matched_section() throws IOException { + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+")); + lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+"); + lenient().when(config.getOmitMatchedSection()).thenReturn(true); final String input = - "2024 start here\n" + - " long continuation line\n" + - "2024 next\n"; + "2024-01-01 ERROR something bad\n" + + " stack trace\n" + + "2024-01-02 INFO recovered\n"; createObjectUnderTest().parse(toInputStream(input), eventConsumer); final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); + verify(eventConsumer, times(2)).accept(captor.capture()); final List> records = captor.getAllValues(); - - // Event 1: flushed due to max_length before adding continuation assertThat(records.get(0).getData().get("message", String.class), - equalTo("2024 start here")); - - // Event 2: continuation line on its own + equalTo("ERROR something bad\n stack trace")); assertThat(records.get(1).getData().get("message", String.class), - equalTo(" long continuation line")); - - // Event 3: next event - assertThat(records.get(2).getData().get("message", String.class), - equalTo("2024 next")); + equalTo("INFO recovered")); } @Test void parse_empty_input_produces_no_events() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}")); + lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}"); createObjectUnderTest().parse(toInputStream(""), eventConsumer); verify(eventConsumer, times(0)).accept(ArgumentCaptor.forClass(Record.class).capture()); } - - @Test - void 
parse_all_lines_are_single_events_when_all_match_pattern() throws IOException { - lenient().when(config.getMatch()).thenReturn("^\\d{4}"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - - final String input = - "2024 event one\n" + - "2024 event two\n" + - "2024 event three\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - assertThat(records.get(0).getData().get("message", String.class), equalTo("2024 event one")); - assertThat(records.get(1).getData().get("message", String.class), equalTo("2024 event two")); - assertThat(records.get(2).getData().get("message", String.class), equalTo("2024 event three")); - } - - @Test - void parse_all_lines_form_single_event_when_none_match_pattern() throws IOException { - lenient().when(config.getMatch()).thenReturn("^NEVER_MATCHES"); - lenient().when(config.getNegate()).thenReturn(true); - lenient().when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - - final String input = - "line one\n" + - "line two\n" + - "line three\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(1)).accept(captor.capture()); - - assertThat(captor.getValue().getData().get("message", String.class), - equalTo("line one\nline two\nline three")); - } } diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java index 6d74973abf..74c5f97b9a 100644 --- 
a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfigTest.java @@ -10,12 +10,9 @@ package org.opensearch.dataprepper.plugins.codec.multiline; import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -import java.lang.reflect.Field; import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.nullValue; import static org.hamcrest.MatcherAssert.assertThat; class MultilineInputCodecConfigTest { @@ -24,71 +21,102 @@ class MultilineInputCodecConfigTest { void defaults_are_correct() { final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - assertThat(config.getNegate(), equalTo(false)); - assertThat(config.getWhat(), equalTo(MultilineWhat.PREVIOUS)); + assertThat(config.getEventStartPattern(), nullValue()); + assertThat(config.getEventEndPattern(), nullValue()); + assertThat(config.getContinuationLineStartPattern(), nullValue()); + assertThat(config.getContinuationLineEndPattern(), nullValue()); + assertThat(config.getOmitMatchedSection(), equalTo(false)); assertThat(config.getMaxLines(), equalTo(MultilineInputCodecConfig.DEFAULT_MAX_LINES)); assertThat(config.getMaxLength(), equalTo(MultilineInputCodecConfig.DEFAULT_MAX_LENGTH)); assertThat(config.getLineSeparator(), equalTo(MultilineInputCodecConfig.DEFAULT_LINE_SEPARATOR)); - assertThat(config.getMatch(), equalTo(null)); + assertThat(config.getConfiguredPatternString(), nullValue()); } @Test - void getMatch_returns_configured_value() throws Exception { - final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "match", "^\\d{4}"); - assertThat(config.getMatch(), equalTo("^\\d{4}")); + void 
isExactlyOnePatternSpecified_returns_true_for_event_start_pattern() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withEventStartPattern("^\\d{4}") + .build(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(true)); } @Test - void isValidPattern_returns_true_for_valid_regex() throws Exception { - final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "match", "^\\d{4}-\\d{2}-\\d{2}"); - assertThat(config.isValidPattern(), equalTo(true)); + void isExactlyOnePatternSpecified_returns_true_for_event_end_pattern() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withEventEndPattern("^---$") + .build(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(true)); } @Test - void isValidPattern_returns_false_for_invalid_regex() throws Exception { - final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "match", "[invalid("); - assertThat(config.isValidPattern(), equalTo(false)); + void isExactlyOnePatternSpecified_returns_true_for_continuation_line_start_pattern() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withContinuationLineStartPattern("^\\s") + .build(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(true)); } @Test - void isValidPattern_returns_false_for_null_match() { + void isExactlyOnePatternSpecified_returns_true_for_continuation_line_end_pattern() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withContinuationLineEndPattern("^\\s") + .build(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(true)); + } + + @Test + void isExactlyOnePatternSpecified_returns_false_when_none_specified() { final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(false)); + } + + @Test + void 
isExactlyOnePatternSpecified_returns_false_when_two_specified() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withEventStartPattern("^\\d{4}") + .withEventEndPattern("^---$") + .build(); + assertThat(config.isExactlyOnePatternSpecified(), equalTo(false)); + } + + @Test + void isValidPattern_returns_true_for_valid_regex() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withEventStartPattern("^\\d{4}-\\d{2}-\\d{2}") + .build(); + assertThat(config.isValidPattern(), equalTo(true)); + } + + @Test + void isValidPattern_returns_false_for_invalid_regex() { + final MultilineInputCodecConfig config = MultilineInputCodecConfig.builder() + .withEventStartPattern("[invalid(") + .build(); assertThat(config.isValidPattern(), equalTo(false)); } @Test - void isValidPattern_returns_false_for_empty_match() throws Exception { + void isValidPattern_returns_false_when_no_pattern_configured() { final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "match", ""); assertThat(config.isValidPattern(), equalTo(false)); } - @ParameterizedTest - @ValueSource(ints = {1, 100, 1000}) - void getMaxLines_returns_configured_value(final int maxLines) throws Exception { + @Test + void getConfiguredPatternString_returns_null_when_none_specified() { final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "maxLines", maxLines); - assertThat(config.getMaxLines(), equalTo(maxLines)); + assertThat(config.getConfiguredPatternString(), nullValue()); } - @ParameterizedTest - @ValueSource(ints = {1, 5000, 50000}) - void getMaxLength_returns_configured_value(final int maxLength) throws Exception { + @Test + void isValidEncoding_returns_true_for_default_utf8() { final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); - setField(config, "maxLength", maxLength); - assertThat(config.getMaxLength(), equalTo(maxLength)); + 
assertThat(config.isValidEncoding(), equalTo(true)); } - private void setField(final Object object, final String fieldName, final Object value) throws Exception { - final Field field = object.getClass().getDeclaredField(fieldName); - try { - field.setAccessible(true); - field.set(object, value); - } finally { - field.setAccessible(false); - } + @Test + void isValidEncoding_returns_true_for_valid_charset() { + final MultilineInputCodecConfig config = new MultilineInputCodecConfig(); + assertThat(config.isValidEncoding(), equalTo(true)); + assertThat(config.getEncoding().name(), equalTo("UTF-8")); } } diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java index 5738a82da7..a7f66c8306 100644 --- a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java @@ -26,6 +26,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.notNullValue; @@ -66,80 +67,39 @@ void constructor_throws_if_eventFactory_is_null() { } @Test - void constructor_throws_if_match_pattern_is_invalid() { - when(config.getMatch()).thenReturn("[invalid("); + void constructor_throws_if_no_pattern_configured() { + when(config.getCompiledPattern()).thenReturn(null); assertThrows(IllegalArgumentException.class, this::createObjectUnderTest); } @Test - void parse_throws_if_inputStream_is_null() { - when(config.getMatch()).thenReturn("^\\S"); - when(config.getNegate()).thenReturn(true); - 
when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final MultilineInputCodec codec = createObjectUnderTest(); - assertThrows(NullPointerException.class, () -> codec.parse(null, events -> {})); - } - - @Test - void parse_throws_if_consumer_is_null() { - when(config.getMatch()).thenReturn("^\\S"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final MultilineInputCodec codec = createObjectUnderTest(); - assertThrows(NullPointerException.class, () -> codec.parse(toInputStream("test"), null)); - } - - @Test - void parse_empty_input_produces_no_events() throws IOException { - when(config.getMatch()).thenReturn("^\\S"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); + void constructor_throws_if_pattern_is_invalid() { + when(config.getCompiledPattern()).thenReturn(null); - final List> events = parseContent(""); - assertThat(events.size(), equalTo(0)); + assertThrows(IllegalArgumentException.class, this::createObjectUnderTest); } - @Test - void parse_single_line_produces_one_event() throws IOException { - when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + private void setupConfig(final String patternStr) { + when(config.getCompiledPattern()).thenReturn(Pattern.compile(patternStr)); when(config.getMaxLines()).thenReturn(500); when(config.getMaxLength()).thenReturn(10000); 
when(config.getLineSeparator()).thenReturn("\n"); - - final List> events = parseContent("2024-01-01 INFO single line\n"); - assertThat(events.size(), equalTo(1)); - assertThat(events.get(0).getData().get("message", String.class), equalTo("2024-01-01 INFO single line")); + when(config.getOmitMatchedSection()).thenReturn(false); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); } @Nested - class PreviousModeWithNegateTrue { + class EventStartMode { @BeforeEach void setUp() { - when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); + setupConfig("^\\d{4}-\\d{2}-\\d{2}"); + when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}"); } @Test - void groups_java_stack_trace_with_timestamp_start() throws IOException { + void groups_stack_trace_with_timestamp_start() throws IOException { final String input = "2024-01-01 ERROR NullPointerException\n" + " at com.example.Service.method(Service.java:42)\n" + " at com.example.Main.run(Main.java:10)\n" + @@ -157,7 +117,7 @@ void groups_java_stack_trace_with_timestamp_start() throws IOException { } @Test - void multiple_single_line_events_each_matching_pattern() throws IOException { + void multiple_single_line_events() throws IOException { final String input = "2024-01-01 INFO line one\n" + "2024-01-02 INFO line two\n" + "2024-01-03 INFO line three\n"; @@ -171,64 +131,115 @@ void multiple_single_line_events_each_matching_pattern() throws IOException { } @Test - void continuation_lines_at_beginning_are_grouped_as_first_event() throws IOException { - final String input = " orphan continuation line 1\n" + - " orphan continuation line 2\n" + - "2024-01-01 INFO first real entry\n"; + void continuation_lines_at_beginning_grouped_as_first_event() throws 
IOException { + final String input = " orphan line 1\n" + + " orphan line 2\n" + + "2024-01-01 INFO first entry\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(2)); assertThat(events.get(0).getData().get("message", String.class), - equalTo(" orphan continuation line 1\n orphan continuation line 2")); + equalTo(" orphan line 1\n orphan line 2")); assertThat(events.get(1).getData().get("message", String.class), - equalTo("2024-01-01 INFO first real entry")); + equalTo("2024-01-01 INFO first entry")); } @Test - void last_event_with_continuations_flushed_at_end_of_stream() throws IOException { - final String input = "2024-01-01 ERROR Exception occurred\n" + - " at com.example.Foo.bar(Foo.java:1)\n" + - " at com.example.Baz.run(Baz.java:2)\n"; + void last_event_flushed_at_end_of_stream() throws IOException { + final String input = "2024-01-01 ERROR Exception\n" + + " at com.example.Foo.bar(Foo.java:1)\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(1)); assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01 ERROR Exception occurred\n" + - " at com.example.Foo.bar(Foo.java:1)\n" + - " at com.example.Baz.run(Baz.java:2)")); + equalTo("2024-01-01 ERROR Exception\n at com.example.Foo.bar(Foo.java:1)")); + } + + @Test + void empty_input_produces_no_events() throws IOException { + final List> events = parseContent(""); + assertThat(events.size(), equalTo(0)); } @Test - void no_lines_match_pattern_produces_single_event() throws IOException { - final String input = " continuation line 1\n" + - " continuation line 2\n" + - " continuation line 3\n"; + void no_lines_match_produces_single_event() throws IOException { + final String input = " line 1\n line 2\n line 3\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(1)); assertThat(events.get(0).getData().get("message", String.class), - equalTo(" continuation line 1\n continuation line 2\n continuation line 
3")); + equalTo(" line 1\n line 2\n line 3")); } } @Nested - class PreviousModeWithNegateFalse { + class EventEndMode { @BeforeEach void setUp() { - when(config.getMatch()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)"); - when(config.getNegate()).thenReturn(false); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); + setupConfig("^---$"); + when(config.getEventEndPattern()).thenReturn("^---$"); } @Test - void groups_stack_trace_lines_matching_pattern_with_previous() throws IOException { + void groups_lines_until_separator() throws IOException { + final String input = "line 1\n" + + "line 2\n" + + "---\n" + + "line 3\n" + + "line 4\n" + + "---\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("line 1\nline 2\n---")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("line 3\nline 4\n---")); + } + + @Test + void trailing_lines_without_end_marker_flushed() throws IOException { + final String input = "line 1\n" + + "---\n" + + "line 2\n" + + "line 3\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("line 1\n---")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("line 2\nline 3")); + } + + @Test + void single_line_matching_end_pattern() throws IOException { + final String input = "---\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(1)); + assertThat(events.get(0).getData().get("message", String.class), equalTo("---")); + } + } + + @Nested + class ContinuationStartMode { + + @BeforeEach + void setUp() { + setupConfig("^\\s+(at |\\.\\.\\.|Caused by:)"); + 
when(config.getContinuationLineStartPattern()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)"); + } + + @Test + void groups_stack_trace_lines_with_previous() throws IOException { final String input = "java.lang.NullPointerException: null\n" + " at com.example.Service.process(Service.java:42)\n" + " at com.example.Main.run(Main.java:10)\n" + @@ -246,7 +257,7 @@ void groups_stack_trace_lines_matching_pattern_with_previous() throws IOExceptio } @Test - void caused_by_is_grouped_with_previous() throws IOException { + void caused_by_grouped_with_previous() throws IOException { final String input = "java.lang.RuntimeException: error\n" + " at com.example.A.method(A.java:1)\n" + " Caused by: java.io.IOException\n" + @@ -265,16 +276,11 @@ void caused_by_is_grouped_with_previous() throws IOException { } @Nested - class NextMode { + class ContinuationEndMode { @BeforeEach void setUp() { - when(config.getMatch()).thenReturn("^\\s"); - when(config.getNegate()).thenReturn(false); - when(config.getWhat()).thenReturn(MultilineWhat.NEXT); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); + setupConfig("^\\s"); } @Test @@ -291,7 +297,7 @@ void continuation_lines_prepended_to_next_event() throws IOException { } @Test - void multiple_groups_in_next_mode() throws IOException { + void multiple_groups() throws IOException { final String input = " context A\n" + "EVENT A\n" + " context B\n" + @@ -307,25 +313,22 @@ void multiple_groups_in_next_mode() throws IOException { } @Test - void trailing_continuation_lines_flushed_at_end_of_stream() throws IOException { + void trailing_continuation_lines_flushed() throws IOException { final String input = "EVENT A\n" + - " trailing context 1\n" + - " trailing context 2\n"; + " trailing 1\n" + + " trailing 2\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(2)); - assertThat(events.get(0).getData().get("message", 
String.class), - equalTo("EVENT A")); + assertThat(events.get(0).getData().get("message", String.class), equalTo("EVENT A")); assertThat(events.get(1).getData().get("message", String.class), - equalTo(" trailing context 1\n trailing context 2")); + equalTo(" trailing 1\n trailing 2")); } @Test - void no_continuation_lines_each_line_is_separate_event() throws IOException { - final String input = "EVENT A\n" + - "EVENT B\n" + - "EVENT C\n"; + void no_continuation_lines_each_is_separate_event() throws IOException { + final String input = "EVENT A\nEVENT B\nEVENT C\n"; final List> events = parseContent(input); @@ -337,60 +340,69 @@ void no_continuation_lines_each_line_is_separate_event() throws IOException { } @Nested - class NextModeMaxLinesLimit { + class OmitMatchedSection { - @BeforeEach - void setUp() { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.NEXT); - when(config.getMaxLines()).thenReturn(3); + @Test + void event_start_pattern_omits_matched_section() throws IOException { + when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+")); + when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+"); + when(config.getMaxLines()).thenReturn(500); when(config.getMaxLength()).thenReturn(10000); when(config.getLineSeparator()).thenReturn("\n"); - } + when(config.getOmitMatchedSection()).thenReturn(true); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); - @Test - void flushes_continuation_lines_when_max_lines_exceeded_in_next_mode() throws IOException { - final String input = " ctx 1\n" + - " ctx 2\n" + - " ctx 3\n" + - " ctx 4\n" + - "2024 EVENT\n"; + final String input = "2024-01-01 ERROR something\n" + + " stack trace line\n" + + "2024-01-02 INFO recovered\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(2)); assertThat(events.get(0).getData().get("message", String.class), - 
equalTo(" ctx 1\n ctx 2\n ctx 3")); + equalTo("ERROR something\n stack trace line")); assertThat(events.get(1).getData().get("message", String.class), - equalTo(" ctx 4\n2024 EVENT")); + equalTo("INFO recovered")); } - } - - @Nested - class NextModeWithNegateTrue { - @BeforeEach - void setUp() { - when(config.getMatch()).thenReturn("^\\["); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.NEXT); + @Test + void event_end_pattern_omits_matched_section() throws IOException { + when(config.getCompiledPattern()).thenReturn(Pattern.compile("^---$")); + when(config.getEventEndPattern()).thenReturn("^---$"); when(config.getMaxLines()).thenReturn(500); when(config.getMaxLength()).thenReturn(10000); when(config.getLineSeparator()).thenReturn("\n"); + when(config.getOmitMatchedSection()).thenReturn(true); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); + + final String input = "line 1\nline 2\n---\nline 3\n---\n"; + + final List> events = parseContent(input); + + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("line 1\nline 2\n")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("line 3\n")); } @Test - void lines_not_matching_pattern_are_prepended_to_next_matching_line() throws IOException { - final String input = "preamble line 1\n" + - "preamble line 2\n" + - "[2024-01-01] Log entry\n"; + void omit_false_preserves_matched_section() throws IOException { + when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+")); + when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+"); + when(config.getMaxLines()).thenReturn(500); + when(config.getMaxLength()).thenReturn(10000); + when(config.getLineSeparator()).thenReturn("\n"); + when(config.getOmitMatchedSection()).thenReturn(false); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); + + final String input = 
"2024-01-01 ERROR something\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(1)); assertThat(events.get(0).getData().get("message", String.class), - equalTo("preamble line 1\npreamble line 2\n[2024-01-01] Log entry")); + equalTo("2024-01-01 ERROR something")); } } @@ -399,32 +411,28 @@ class MaxLinesLimit { @BeforeEach void setUp() { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}")); + when(config.getEventStartPattern()).thenReturn("^\\d{4}"); when(config.getMaxLines()).thenReturn(3); when(config.getMaxLength()).thenReturn(10000); when(config.getLineSeparator()).thenReturn("\n"); + when(config.getOmitMatchedSection()).thenReturn(false); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); } @Test void flushes_event_when_max_lines_exceeded() throws IOException { - final String input = "2024-01-01 ERROR start\n" + - " line 2\n" + - " line 3\n" + - " line 4\n" + - " line 5\n" + - "2024-01-02 INFO next\n"; + final String input = "2024 start\n line 2\n line 3\n line 4\n line 5\n2024 next\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(3)); assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01 ERROR start\n line 2\n line 3")); + equalTo("2024 start\n line 2\n line 3")); assertThat(events.get(1).getData().get("message", String.class), equalTo(" line 4\n line 5")); assertThat(events.get(2).getData().get("message", String.class), - equalTo("2024-01-02 INFO next")); + equalTo("2024 next")); } } @@ -433,25 +441,22 @@ class MaxLengthLimit { @BeforeEach void setUp() { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); + when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}")); 
+ when(config.getEventStartPattern()).thenReturn("^\\d{4}"); when(config.getMaxLines()).thenReturn(500); when(config.getMaxLength()).thenReturn(30); when(config.getLineSeparator()).thenReturn("\n"); + when(config.getOmitMatchedSection()).thenReturn(false); + when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); } @Test void flushes_event_when_max_length_exceeded() throws IOException { - final String input = "2024 start line here\n" + - " continuation is long\n" + - "2024 next entry\n"; + final String input = "2024 start line here\n continuation is long\n2024 next entry\n"; final List> events = parseContent(input); assertThat(events.size(), equalTo(3)); - // First event is "2024 start line here" (20 chars) - // Adding "\n continuation is long" would be 20+1+22=43 > 30, so it flushes assertThat(events.get(0).getData().get("message", String.class), equalTo("2024 start line here")); assertThat(events.get(1).getData().get("message", String.class), @@ -461,163 +466,10 @@ void flushes_event_when_max_length_exceeded() throws IOException { } } - @Nested - class CustomLineSeparator { - - @BeforeEach - void setUp() { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\r\n"); - } - - @Test - void uses_custom_line_separator_when_joining() throws IOException { - final String input = "2024-01-01 ERROR start\n" + - " continuation\n" + - "2024-01-02 INFO next\n"; - - final List> events = parseContent(input); - - assertThat(events.size(), equalTo(2)); - assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01 ERROR start\r\n continuation")); - } - } - - @Nested - class RealWorldScenarios { - - @Test - void python_traceback() throws IOException { - 
when(config.getMatch()).thenReturn("^Traceback|^\\s|^\\w+Error"); - when(config.getNegate()).thenReturn(false); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final String input = "2024-01-01 INFO Starting application\n" + - "Traceback (most recent call last):\n" + - " File \"main.py\", line 10, in \n" + - " result = process()\n" + - " File \"service.py\", line 5, in process\n" + - " return 1/0\n" + - "ZeroDivisionError: division by zero\n" + - "2024-01-01 INFO Recovered\n"; - - final List> events = parseContent(input); - - assertThat(events.size(), equalTo(2)); - assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01 INFO Starting application\n" + - "Traceback (most recent call last):\n" + - " File \"main.py\", line 10, in \n" + - " result = process()\n" + - " File \"service.py\", line 5, in process\n" + - " return 1/0\n" + - "ZeroDivisionError: division by zero")); - assertThat(events.get(1).getData().get("message", String.class), - equalTo("2024-01-01 INFO Recovered")); - } - - @Test - void multiline_xml_in_logs() throws IOException { - when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final String input = "2024-01-01 Request body:\n" + - "\n" + - " value\n" + - "\n" + - "2024-01-01 Response sent\n"; - - final List> events = parseContent(input); - - assertThat(events.size(), equalTo(2)); - assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01 Request body:\n\n value\n")); - assertThat(events.get(1).getData().get("message", String.class), - equalTo("2024-01-01 Response 
sent")); - } - - @Test - void log4j_multiline_with_nested_exception() throws IOException { - when(config.getMatch()).thenReturn("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final String input = "2024-01-01T12:00:00 ERROR Application failed\n" + - "java.lang.RuntimeException: Outer\n" + - "\tat com.example.A.run(A.java:10)\n" + - "Caused by: java.io.IOException: Inner\n" + - "\tat com.example.B.read(B.java:20)\n" + - "\t... 5 more\n" + - "2024-01-01T12:00:01 INFO Shutdown complete\n"; - - final List> events = parseContent(input); - - assertThat(events.size(), equalTo(2)); - assertThat(events.get(0).getData().get("message", String.class), - equalTo("2024-01-01T12:00:00 ERROR Application failed\n" + - "java.lang.RuntimeException: Outer\n" + - "\tat com.example.A.run(A.java:10)\n" + - "Caused by: java.io.IOException: Inner\n" + - "\tat com.example.B.read(B.java:20)\n" + - "\t... 
5 more")); - } - } - - @Nested - class IsContinuationLineTests { - - @Test - void negate_false_matching_line_is_continuation() { - when(config.getMatch()).thenReturn("^\\s"); - when(config.getNegate()).thenReturn(false); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final MultilineInputCodec codec = createObjectUnderTest(); - assertThat(codec.isContinuationLine(" indented"), equalTo(true)); - assertThat(codec.isContinuationLine("not indented"), equalTo(false)); - } - - @Test - void negate_true_non_matching_line_is_continuation() { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); - - final MultilineInputCodec codec = createObjectUnderTest(); - assertThat(codec.isContinuationLine(" no timestamp"), equalTo(true)); - assertThat(codec.isContinuationLine("2024 has timestamp"), equalTo(false)); - } - } - @Test void event_metadata_is_log_type() throws IOException { - when(config.getMatch()).thenReturn("^\\d{4}"); - when(config.getNegate()).thenReturn(true); - when(config.getWhat()).thenReturn(MultilineWhat.PREVIOUS); - when(config.getMaxLines()).thenReturn(500); - when(config.getMaxLength()).thenReturn(10000); - when(config.getLineSeparator()).thenReturn("\n"); + setupConfig("^\\d{4}"); + when(config.getEventStartPattern()).thenReturn("^\\d{4}"); final List> events = parseContent("2024-01-01 test\n"); diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java 
b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java deleted file mode 100644 index 9928024685..0000000000 --- a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineWhatTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.dataprepper.plugins.codec.multiline; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -import static org.hamcrest.CoreMatchers.equalTo; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.jupiter.api.Assertions.assertThrows; - -class MultilineWhatTest { - - @Test - void fromString_returns_PREVIOUS_for_previous() { - assertThat(MultilineWhat.fromString("previous"), equalTo(MultilineWhat.PREVIOUS)); - } - - @Test - void fromString_returns_NEXT_for_next() { - assertThat(MultilineWhat.fromString("next"), equalTo(MultilineWhat.NEXT)); - } - - @Test - void fromString_is_case_insensitive() { - assertThat(MultilineWhat.fromString("PREVIOUS"), equalTo(MultilineWhat.PREVIOUS)); - assertThat(MultilineWhat.fromString("NEXT"), equalTo(MultilineWhat.NEXT)); - assertThat(MultilineWhat.fromString("Previous"), equalTo(MultilineWhat.PREVIOUS)); - } - - @ParameterizedTest - @ValueSource(strings = {"invalid", "before", "after", ""}) - void fromString_throws_for_invalid_value(final String value) { - assertThrows(IllegalArgumentException.class, () -> MultilineWhat.fromString(value)); - } - - @Test - void toString_returns_correct_values() { - assertThat(MultilineWhat.PREVIOUS.toString(), equalTo("previous")); - assertThat(MultilineWhat.NEXT.toString(), 
equalTo("next")); - } -} From 71da7e028afef54c91a8ebf62668f39753fb90dd Mon Sep 17 00:00:00 2001 From: Manisha Yadav Date: Wed, 17 Jun 2026 09:50:28 +0000 Subject: [PATCH 3/3] Address review comments on new design implementation Signed-off-by: Manisha Yadav --- .../multiline-codecs/build.gradle | 1 + .../codec/multiline/MultilineInputCodec.java | 188 +++---------- .../multiline/MultilineInputCodecConfig.java | 18 +- .../codec/multiline/MultilineCodecsIT.java | 246 ++++-------------- .../multiline/MultilineInputCodecTest.java | 11 +- .../continuation-line-end-pattern.yaml | 15 ++ .../continuation-line-start-pattern.yaml | 15 ++ .../codec/multiline/event-end-pattern.yaml | 15 ++ .../codec/multiline/event-start-pattern.yaml | 15 ++ .../codec/multiline/omit-matched-section.yaml | 16 ++ 10 files changed, 172 insertions(+), 368 deletions(-) create mode 100644 data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-end-pattern.yaml create mode 100644 data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-start-pattern.yaml create mode 100644 data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-end-pattern.yaml create mode 100644 data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-start-pattern.yaml create mode 100644 data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/omit-matched-section.yaml diff --git a/data-prepper-plugins/multiline-codecs/build.gradle b/data-prepper-plugins/multiline-codecs/build.gradle index ade99e2e54..65a8a97804 100644 --- a/data-prepper-plugins/multiline-codecs/build.gradle +++ b/data-prepper-plugins/multiline-codecs/build.gradle @@ -12,4 +12,5 @@ dependencies { implementation 'com.fasterxml.jackson.core:jackson-annotations'
testImplementation project(':data-prepper-plugins:common') testImplementation project(':data-prepper-test:test-event') + testImplementation project(':data-prepper-test:plugin-test-framework') } diff --git a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java index e3e5e1dc9d..b345a2e1dd 100644 --- a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodec.java @@ -54,7 +54,8 @@ public class MultilineInputCodec implements InputCodec { private static final String MESSAGE_FIELD_NAME = "message"; private final Pattern pattern; - private final MultilineMode mode; + private final boolean boundaryOnMatch; + private final boolean flushAfter; private final boolean omitMatchedSection; private final int maxLines; private final int maxLength; @@ -72,7 +73,9 @@ public MultilineInputCodec(final MultilineInputCodecConfig config, final EventFa throw new IllegalArgumentException("A valid pattern must be configured"); } - this.mode = resolveMode(config); + final MultilineMode mode = resolveMode(config); + this.boundaryOnMatch = (mode == MultilineMode.EVENT_START || mode == MultilineMode.EVENT_END); + this.flushAfter = (mode == MultilineMode.EVENT_END || mode == MultilineMode.CONTINUATION_END); this.omitMatchedSection = config.getOmitMatchedSection(); this.maxLines = config.getMaxLines(); this.maxLength = config.getMaxLength(); @@ -97,190 +100,59 @@ public void parse(final InputStream inputStream, final Consumer> e Objects.requireNonNull(inputStream, "inputStream must not be null"); Objects.requireNonNull(eventConsumer, "eventConsumer must not be null"); - try (final BufferedReader 
reader = new BufferedReader(new InputStreamReader(inputStream, encoding))) { - switch (mode) { - case EVENT_START: - parseEventStartMode(reader, eventConsumer); - break; - case EVENT_END: - parseEventEndMode(reader, eventConsumer); - break; - case CONTINUATION_START: - parseContinuationStartMode(reader, eventConsumer); - break; - case CONTINUATION_END: - parseContinuationEndMode(reader, eventConsumer); - break; - default: - throw new IllegalStateException("Unknown multiline mode: " + mode); - } - } + final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, encoding)); + parseLines(reader, eventConsumer); } - /** - * EVENT_START mode: A new event begins at each line matching the pattern. - * Non-matching lines are continuations of the preceding event. - */ - private void parseEventStartMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { + private void parseLines(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { final StringBuilder buffer = new StringBuilder(); int lineCount = 0; String line; while ((line = reader.readLine()) != null) { final boolean matches = pattern.matcher(line).find(); + final boolean isBoundary = (boundaryOnMatch == matches); - if (matches || shouldFlush(buffer, lineCount, line)) { - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - } - } - - if (buffer.length() > 0) { - buffer.append(lineSeparator); - } - buffer.append(processLine(line, matches)); - lineCount++; - } - - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - } - } - - /** - * EVENT_END mode: An event ends at each line matching the pattern (inclusive). - * The matching line is included in the current event, then a new event begins. 
- */ - private void parseEventEndMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { - final StringBuilder buffer = new StringBuilder(); - int lineCount = 0; - String line; - - while ((line = reader.readLine()) != null) { - final boolean matches = pattern.matcher(line).find(); - - if (shouldFlush(buffer, lineCount, line)) { - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - } + if ((!flushAfter && isBoundary) || shouldFlush(buffer, lineCount, line)) { + flushIfNonEmpty(buffer, eventConsumer); + lineCount = 0; } - if (buffer.length() > 0) { - buffer.append(lineSeparator); - } - buffer.append(processLine(line, matches)); + appendLineToBuffer(buffer, processLine(line, matches)); lineCount++; - if (matches) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); + if (flushAfter && isBoundary) { + flushIfNonEmpty(buffer, eventConsumer); lineCount = 0; } } - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - } + flushIfNonEmpty(buffer, eventConsumer); } - /** - * CONTINUATION_START mode: Lines matching the pattern are continuations of the previous event. - * Non-matching lines start new events. 
- */ - private void parseContinuationStartMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { - final StringBuilder buffer = new StringBuilder(); - int lineCount = 0; - String line; - - while ((line = reader.readLine()) != null) { - final boolean matches = pattern.matcher(line).find(); - - if (!matches || shouldFlush(buffer, lineCount, line)) { - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - } - } - - if (buffer.length() > 0) { - buffer.append(lineSeparator); - } - buffer.append(processLine(line, matches)); - lineCount++; - } - - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); + private String processLine(final String line, final boolean matches) { + if (!omitMatchedSection || !matches) { + return line; } + final Matcher matcher = pattern.matcher(line); + return matcher.replaceFirst(""); } - /** - * CONTINUATION_END mode: Lines matching the pattern are prepended to the next event. - * Non-matching lines complete the current event. 
- */ - private void parseContinuationEndMode(final BufferedReader reader, final Consumer> eventConsumer) throws IOException { - final StringBuilder buffer = new StringBuilder(); - int lineCount = 0; - boolean bufferHasNonContinuation = false; - String line; - - while ((line = reader.readLine()) != null) { - final boolean matches = pattern.matcher(line).find(); - - if (!matches) { - if (bufferHasNonContinuation) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - bufferHasNonContinuation = false; - } - if (buffer.length() > 0) { - buffer.append(lineSeparator); - } - buffer.append(processLine(line, false)); - lineCount++; - bufferHasNonContinuation = true; - continue; - } - - if (bufferHasNonContinuation) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - bufferHasNonContinuation = false; - } - - if (shouldFlush(buffer, lineCount, line)) { - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); - buffer.setLength(0); - lineCount = 0; - } - } - - if (buffer.length() > 0) { - buffer.append(lineSeparator); - } - buffer.append(processLine(line, matches)); - lineCount++; + private void appendLineToBuffer(final StringBuilder buffer, final String processedLine) { + if (processedLine.isEmpty()) { + return; } - if (buffer.length() > 0) { - emitEvent(buffer.toString(), eventConsumer); + buffer.append(lineSeparator); } + buffer.append(processedLine); } - private String processLine(final String line, final boolean matches) { - if (!omitMatchedSection || !matches) { - return line; + private void flushIfNonEmpty(final StringBuilder buffer, final Consumer> eventConsumer) { + if (buffer.length() > 0) { + emitEvent(buffer.toString(), eventConsumer); + buffer.setLength(0); } - final Matcher matcher = pattern.matcher(line); - return matcher.replaceFirst(""); } /** diff --git 
a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java index 9eb76ce4fd..4bea8356e7 100644 --- a/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java +++ b/data-prepper-plugins/multiline-codecs/src/main/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecConfig.java @@ -77,7 +77,6 @@ public class MultilineInputCodecConfig { private String encoding = StandardCharsets.UTF_8.name(); private Pattern compiledPattern; - private Charset encodingCharset; public String getEventStartPattern() { return eventStartPattern; @@ -112,22 +111,23 @@ public String getLineSeparator() { } /** - * Returns the validated Charset. The encoding is validated once during - * bean validation and stored to avoid repeated parsing. + * Returns the Charset for the configured encoding name, resolved on each call. * - * @return The validated Charset. + * @return The Charset. */ public Charset getEncoding() { - return encodingCharset; + return Charset.forName(encoding); } /** - * Returns the compiled regex pattern. The pattern is compiled once during validation - * and reused to avoid duplicate compilation. + * Returns the compiled regex pattern, compiled on first access. * * @return The compiled Pattern.
*/ public Pattern getCompiledPattern() { + if (compiledPattern == null) { + compiledPattern = Pattern.compile(getConfiguredPatternString()); + } return compiledPattern; } @@ -149,7 +149,7 @@ boolean isValidPattern() { return false; } try { - compiledPattern = Pattern.compile(patternString); + Pattern.compile(patternString); return true; } catch (final PatternSyntaxException e) { return false; @@ -162,7 +162,7 @@ boolean isValidEncoding() { return false; } try { - encodingCharset = Charset.forName(encoding); + Charset.forName(encoding); return true; } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) { return false; diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java index f81b4fb424..ad67ec24df 100644 --- a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineCodecsIT.java @@ -9,64 +9,36 @@ package org.opensearch.dataprepper.plugins.codec.multiline; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.ArgumentCaptor; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; -import org.opensearch.dataprepper.event.TestEventFactory; +import org.opensearch.dataprepper.model.codec.InputCodec; import org.opensearch.dataprepper.model.event.Event; -import org.opensearch.dataprepper.model.event.EventFactory; import org.opensearch.dataprepper.model.record.Record; +import org.opensearch.dataprepper.test.plugins.DataPrepperPluginTest; +import org.opensearch.dataprepper.test.plugins.PluginConfigurationFile; +import 
org.opensearch.dataprepper.test.plugins.junit.BaseDataPrepperPluginStandardTestSuite; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.List; -import java.util.function.Consumer; -import java.util.regex.Pattern; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsString; -import static org.mockito.Mockito.lenient; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -@ExtendWith(MockitoExtension.class) -public class MultilineCodecsIT { +@DataPrepperPluginTest(pluginName = "multiline", pluginType = InputCodec.class) +public class MultilineCodecsIT extends BaseDataPrepperPluginStandardTestSuite { - @Mock - private MultilineInputCodecConfig config; - - @Mock - private Consumer> eventConsumer; - - private final EventFactory eventFactory = TestEventFactory.getTestEventFactory(); - - @BeforeEach - void setUp() { - lenient().when(config.getMaxLines()).thenReturn(500); - lenient().when(config.getMaxLength()).thenReturn(50000); - lenient().when(config.getLineSeparator()).thenReturn("\n"); - lenient().when(config.getOmitMatchedSection()).thenReturn(false); - lenient().when(config.getEncoding()).thenReturn(StandardCharsets.UTF_8); - } - - private MultilineInputCodec createObjectUnderTest() { - return new MultilineInputCodec(config, eventFactory); - } - - private InputStream toInputStream(final String content) { - return new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)); + private List> parseContent(final InputCodec codec, final String content) throws IOException { + final List> events = new ArrayList<>(); + codec.parse(new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)), events::add); + return events; } @Test - void parse_java_stack_trace_with_event_start_pattern() throws IOException { - 
lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); - lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + void parse_java_stack_trace_with_event_start_pattern( + @PluginConfigurationFile("event-start-pattern.yaml") final InputCodec codec) throws IOException { final String input = "2024-01-15 10:23:45.123 ERROR [main] com.example.UserService - Request failed\n" + @@ -77,105 +49,40 @@ void parse_java_stack_trace_with_event_start_pattern() throws IOException { "\tat com.mysql.jdbc.Connection.connect(Connection.java:456)\n" + "2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying\n"; - createObjectUnderTest().parse(toInputStream(input), eventConsumer); + final List> events = parseContent(codec, input); - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - final String event1 = records.get(0).getData().get("message", String.class); + assertThat(events.size(), equalTo(2)); + final String event1 = events.get(0).getData().get("message", String.class); assertThat(event1, containsString("NullPointerException")); assertThat(event1, containsString("at com.example.UserService.getUser")); assertThat(event1, containsString("Caused by: java.sql.SQLException")); - assertThat(records.get(1).getData().get("message", String.class), + assertThat(events.get(1).getData().get("message", String.class), equalTo("2024-01-15 10:23:45.456 INFO [main] com.example.UserService - Retrying")); } @Test - void parse_python_traceback_with_event_start_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); - lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); + void 
parse_with_event_end_pattern( + @PluginConfigurationFile("event-end-pattern.yaml") final InputCodec codec) throws IOException { final String input = - "2024-03-20 08:15:00,123 INFO Starting application\n" + - "2024-03-20 08:15:02,789 ERROR Unhandled exception\n" + - "Traceback (most recent call last):\n" + - " File \"/app/worker.py\", line 45, in process\n" + - "ValueError: invalid literal for int()\n" + - "2024-03-20 08:15:03,456 INFO Recovered\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - assertThat(records.get(0).getData().get("message", String.class), - equalTo("2024-03-20 08:15:00,123 INFO Starting application")); - final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, containsString("Traceback")); - assertThat(event2, containsString("ValueError")); - assertThat(records.get(2).getData().get("message", String.class), - equalTo("2024-03-20 08:15:03,456 INFO Recovered")); - } - - @Test - void parse_xml_multiline_with_event_start_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}")); - lenient().when(config.getEventStartPattern()).thenReturn("^\\[\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}"); - - final String input = - "[2024-05-10 14:30:00.001] [INFO] Incoming request:\n" + - "\n" + - " value\n" + - "\n" + - "[2024-05-10 14:30:00.045] [INFO] Request processed\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - final String event1 = records.get(0).getData().get("message", String.class); - 
assertThat(event1, containsString("")); - assertThat(event1, containsString("")); - assertThat(records.get(1).getData().get("message", String.class), - equalTo("[2024-05-10 14:30:00.045] [INFO] Request processed")); - } - - @Test - void parse_syslog_ise_with_event_start_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}")); - lenient().when(config.getEventStartPattern()).thenReturn("^<\\d+>[A-Z][a-z]{2}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2}"); - - final String input = - "<181>Jun 1 12:39:49 Infra-ISE Audit NOTICE Admin-Login: success\n" + - "<181>Jun 1 12:39:49 Infra-ISE Audit NOTICE OpenAPI: Response={\n" + - " \"version\" : \"1.0.0\"\n" + - "}, HttpCode=200\n" + - "<181>Jun 1 12:40:15 Infra-ISE Audit NOTICE Config-Change: added\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); + "entry 1 line 1\n" + + "entry 1 line 2\n" + + "---\n" + + "entry 2 line 1\n" + + "---\n"; - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(3)).accept(captor.capture()); + final List> events = parseContent(codec, input); - final List> records = captor.getAllValues(); - assertThat(records.get(0).getData().get("message", String.class), - containsString("Admin-Login: success")); - final String event2 = records.get(1).getData().get("message", String.class); - assertThat(event2, containsString("OpenAPI: Response=")); - assertThat(event2, containsString("\"version\" : \"1.0.0\"")); - assertThat(event2, containsString("HttpCode=200")); - assertThat(records.get(2).getData().get("message", String.class), - containsString("Config-Change: added")); + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("entry 1 line 1\nentry 1 line 2\n---")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("entry 2 line 1\n---")); } @Test - void 
parse_with_continuation_line_start_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\s+(at |\\.\\.\\.|Caused by:)")); - lenient().when(config.getContinuationLineStartPattern()).thenReturn("^\\s+(at |\\.\\.\\.|Caused by:)"); + void parse_with_continuation_line_start_pattern( + @PluginConfigurationFile("continuation-line-start-pattern.yaml") final InputCodec codec) throws IOException { final String input = "java.lang.RuntimeException: error\n" + @@ -184,48 +91,38 @@ void parse_with_continuation_line_start_pattern() throws IOException { " at com.example.C.read(C.java:3)\n" + "Application recovered\n"; - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); + final List> events = parseContent(codec, input); - final List> records = captor.getAllValues(); - final String event1 = records.get(0).getData().get("message", String.class); + assertThat(events.size(), equalTo(2)); + final String event1 = events.get(0).getData().get("message", String.class); assertThat(event1, containsString("RuntimeException: error")); assertThat(event1, containsString("at com.example.A.method")); assertThat(event1, containsString("Caused by: java.io.IOException")); - assertThat(records.get(1).getData().get("message", String.class), + assertThat(events.get(1).getData().get("message", String.class), equalTo("Application recovered")); } @Test - void parse_with_event_end_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^---$")); - lenient().when(config.getEventEndPattern()).thenReturn("^---$"); + void parse_with_omit_matched_section( + @PluginConfigurationFile("omit-matched-section.yaml") final InputCodec codec) throws IOException { final String input = - "entry 1 line 1\n" + - "entry 1 line 2\n" + - "---\n" + - "entry 2 line 1\n" + - 
"---\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); + "2024-01-01 ERROR something bad\n" + + " stack trace\n" + + "2024-01-02 INFO recovered\n"; - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); + final List> events = parseContent(codec, input); - final List> records = captor.getAllValues(); - assertThat(records.get(0).getData().get("message", String.class), - equalTo("entry 1 line 1\nentry 1 line 2\n---")); - assertThat(records.get(1).getData().get("message", String.class), - equalTo("entry 2 line 1\n---")); + assertThat(events.size(), equalTo(2)); + assertThat(events.get(0).getData().get("message", String.class), + equalTo("ERROR something bad\n stack trace")); + assertThat(events.get(1).getData().get("message", String.class), + equalTo("INFO recovered")); } @Test - void parse_with_continuation_end_pattern() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\s")); - lenient().when(config.getContinuationLineEndPattern()).thenReturn("^\\s"); + void parse_with_continuation_line_end_pattern( + @PluginConfigurationFile("continuation-line-end-pattern.yaml") final InputCodec codec) throws IOException { final String input = " context-line-1\n" + @@ -234,50 +131,15 @@ void parse_with_continuation_end_pattern() throws IOException { " context-line-3\n" + "MAIN EVENT B\n"; - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); + final List> events = parseContent(codec, input); - final List> records = captor.getAllValues(); - final String event1 = records.get(0).getData().get("message", String.class); + assertThat(events.size(), equalTo(2)); + final String event1 = events.get(0).getData().get("message", String.class); assertThat(event1, containsString("context-line-1")); + 
assertThat(event1, containsString("context-line-2")); assertThat(event1, containsString("MAIN EVENT A")); - final String event2 = records.get(1).getData().get("message", String.class); + final String event2 = events.get(1).getData().get("message", String.class); assertThat(event2, containsString("context-line-3")); assertThat(event2, containsString("MAIN EVENT B")); } - - @Test - void parse_with_omit_matched_section() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}-\\d{2}-\\d{2}\\s+")); - lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}-\\d{2}-\\d{2}\\s+"); - lenient().when(config.getOmitMatchedSection()).thenReturn(true); - - final String input = - "2024-01-01 ERROR something bad\n" + - " stack trace\n" + - "2024-01-02 INFO recovered\n"; - - createObjectUnderTest().parse(toInputStream(input), eventConsumer); - - final ArgumentCaptor> captor = ArgumentCaptor.forClass(Record.class); - verify(eventConsumer, times(2)).accept(captor.capture()); - - final List> records = captor.getAllValues(); - assertThat(records.get(0).getData().get("message", String.class), - equalTo("ERROR something bad\n stack trace")); - assertThat(records.get(1).getData().get("message", String.class), - equalTo("INFO recovered")); - } - - @Test - void parse_empty_input_produces_no_events() throws IOException { - lenient().when(config.getCompiledPattern()).thenReturn(Pattern.compile("^\\d{4}")); - lenient().when(config.getEventStartPattern()).thenReturn("^\\d{4}"); - - createObjectUnderTest().parse(toInputStream(""), eventConsumer); - - verify(eventConsumer, times(0)).accept(ArgumentCaptor.forClass(Record.class).capture()); - } } diff --git a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java index 
a7f66c8306..ea8dc07d4d 100644 --- a/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java +++ b/data-prepper-plugins/multiline-codecs/src/test/java/org/opensearch/dataprepper/plugins/codec/multiline/MultilineInputCodecTest.java @@ -73,13 +73,6 @@ void constructor_throws_if_no_pattern_configured() { assertThrows(IllegalArgumentException.class, this::createObjectUnderTest); } - @Test - void constructor_throws_if_pattern_is_invalid() { - when(config.getCompiledPattern()).thenReturn(null); - - assertThrows(IllegalArgumentException.class, this::createObjectUnderTest); - } - private void setupConfig(final String patternStr) { when(config.getCompiledPattern()).thenReturn(Pattern.compile(patternStr)); when(config.getMaxLines()).thenReturn(500); @@ -381,9 +374,9 @@ void event_end_pattern_omits_matched_section() throws IOException { assertThat(events.size(), equalTo(2)); assertThat(events.get(0).getData().get("message", String.class), - equalTo("line 1\nline 2\n")); + equalTo("line 1\nline 2")); assertThat(events.get(1).getData().get("message", String.class), - equalTo("line 3\n")); + equalTo("line 3")); } @Test diff --git a/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-end-pattern.yaml b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-end-pattern.yaml new file mode 100644 index 0000000000..2beb48c08a --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-end-pattern.yaml @@ -0,0 +1,15 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +test-pipeline: + source: + unused: + processor: + - multiline: + continuation_line_end_pattern: "^\\s" + sink: + - unused: diff --git a/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-start-pattern.yaml b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-start-pattern.yaml new file mode 100644 index 0000000000..7fbb62d7cc --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/continuation-line-start-pattern.yaml @@ -0,0 +1,15 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +test-pipeline: + source: + unused: + processor: + - multiline: + continuation_line_start_pattern: "^\\s+(at |\\.\\.\\.|Caused by:)" + sink: + - unused: diff --git a/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-end-pattern.yaml b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-end-pattern.yaml new file mode 100644 index 0000000000..06b9577b18 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-end-pattern.yaml @@ -0,0 +1,15 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +test-pipeline: + source: + unused: + processor: + - multiline: + event_end_pattern: "^---$" + sink: + - unused: diff --git a/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-start-pattern.yaml b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-start-pattern.yaml new file mode 100644 index 0000000000..c95b3b7be9 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/event-start-pattern.yaml @@ -0,0 +1,15 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +test-pipeline: + source: + unused: + processor: + - multiline: + event_start_pattern: "^\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2}" + sink: + - unused: diff --git a/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/omit-matched-section.yaml b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/omit-matched-section.yaml new file mode 100644 index 0000000000..ec7b990b13 --- /dev/null +++ b/data-prepper-plugins/multiline-codecs/src/test/resources/org/opensearch/dataprepper/plugins/codec/multiline/omit-matched-section.yaml @@ -0,0 +1,16 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +test-pipeline: + source: + unused: + processor: + - multiline: + event_start_pattern: "^\\d{4}-\\d{2}-\\d{2}\\s+" + omit_matched_section: true + sink: + - unused: