Skip to content

Commit 54f8e29

Browse files
authored
Chunk queries for duplicate documents, add metrics, track duplicate documents found in queries (#5847)
Signed-off-by: Taylor Gray <tylgry@amazon.com>
1 parent 48d5135 commit 54f8e29

3 files changed

Lines changed: 137 additions & 21 deletions

File tree

data-prepper-plugins/opensearch/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ dependencies {
4646
testImplementation 'net.bytebuddy:byte-buddy:1.15.11'
4747
testImplementation 'net.bytebuddy:byte-buddy-agent:1.15.11'
4848
testImplementation testLibs.slf4j.simple
49+
testImplementation project(path: ':data-prepper-test:test-common')
4950
}
5051

5152
sourceSets {

data-prepper-plugins/opensearch/src/main/java/org/opensearch/dataprepper/plugins/sink/opensearch/index/ExistingDocumentQueryManager.java

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import com.fasterxml.jackson.databind.node.ObjectNode;
44
import com.google.common.annotations.VisibleForTesting;
55
import io.micrometer.core.instrument.Counter;
6+
import io.micrometer.core.instrument.Timer;
67
import org.opensearch.client.opensearch.OpenSearchClient;
78
import org.opensearch.client.opensearch._types.FieldValue;
89
import org.opensearch.client.opensearch._types.query_dsl.Query;
@@ -43,14 +44,24 @@ public class ExistingDocumentQueryManager implements Runnable {
4344

4445
static final String DOCUMENTS_CURRENTLY_BEING_QUERIED = "documentsBeingQueried";
4546

47+
static final String DUPLICATE_EVENTS_IN_QUERY_MANAGER = "duplicateEventsInQueryManager";
4648

49+
static final String QUERY_TIME = "queryDuplicatesTime";
50+
51+
static final String POTENTIAL_DUPLICATES = "potentialDuplicates";
4752

4853
private final Counter eventsDroppedAndReleasedCounter;
4954

5055
private final Counter eventsAddedForQuerying;
5156

5257
private final Counter eventsReturnedForIndexing;
5358

59+
private final Counter duplicateEventsInQueryManager;
60+
61+
private final Counter potentialDuplicatesDeleted;
62+
63+
private final Timer queryTimePerLoop;
64+
5465
private final AtomicInteger documentsCurrentlyBeingQueried = new AtomicInteger(0);
5566

5667
private final AtomicInteger documentsCurrentlyBeingQueriedGauge;
@@ -88,6 +99,9 @@ public ExistingDocumentQueryManager(final IndexConfiguration indexConfiguration,
8899
this.eventsDroppedAndReleasedCounter = pluginMetrics.counter(EVENTS_DROPPED_AND_RELEASED);
89100
this.eventsReturnedForIndexing = pluginMetrics.counter(EVENTS_RETURNED_FOR_INDEXING);
90101
this.documentsCurrentlyBeingQueriedGauge = pluginMetrics.gauge(DOCUMENTS_CURRENTLY_BEING_QUERIED, documentsCurrentlyBeingQueried, AtomicInteger::get);
102+
this.duplicateEventsInQueryManager = pluginMetrics.counter(DUPLICATE_EVENTS_IN_QUERY_MANAGER);
103+
this.queryTimePerLoop = pluginMetrics.timer(QUERY_TIME);
104+
this.potentialDuplicatesDeleted = pluginMetrics.counter(POTENTIAL_DUPLICATES);
91105
this.lockReadyToIngest = new ReentrantLock();
92106
this.lockWaitingForQuery = new ReentrantLock();
93107
}
@@ -96,7 +110,7 @@ public ExistingDocumentQueryManager(final IndexConfiguration indexConfiguration,
96110
public void run() {
97111
while (!Thread.currentThread().isInterrupted() && !shouldStop) {
98112
try {
99-
runQueryLoop();
113+
queryTimePerLoop.record(this::runQueryLoop);
100114
} catch (final Exception e) {
101115
LOG.error("Exception in primary loop responsible for querying for existing documents, retrying", e);
102116
} finally {
@@ -116,13 +130,14 @@ void runQueryLoop() {
116130
// Query for existing documents
117131
final MsearchRequest msearchRequest = buildMultiSearchRequest();
118132
final MsearchResponse<ObjectNode> msearchResponse = queryForTermValues(msearchRequest);
119-
lastQueryTime = Instant.now();
120133

121134
// Drop and Release Existing Documents
122135
dropAndReleaseFoundEvents(msearchResponse);
123136

124137
// Move non-existing documents past query_duration to bulkOperationsReadyForIndex
125138
moveBulkRequestsThatHaveReachedQueryDuration();
139+
140+
lastQueryTime = Instant.now();
126141
}
127142
}
128143

@@ -134,12 +149,17 @@ public void addBulkOperation(final BulkOperationWrapper bulkOperationWrapper) {
134149
lockWaitingForQuery.lock();
135150
final String termValue = bulkOperationWrapper.getTermValue();
136151
try {
137-
bulkOperationsWaitingForQuery.computeIfAbsent(bulkOperationWrapper.getIndex(),
152+
final QueryManagerBulkOperation queryManagerBulkOperation = bulkOperationsWaitingForQuery.computeIfAbsent(bulkOperationWrapper.getIndex(),
138153
k -> new ConcurrentHashMap<>()).put(termValue, new QueryManagerBulkOperation(bulkOperationWrapper, Instant.now(), termValue));
154+
// Only increment if this is a new document
155+
if (queryManagerBulkOperation == null) {
156+
documentsCurrentlyBeingQueriedGauge.incrementAndGet();
157+
} else {
158+
duplicateEventsInQueryManager.increment();
159+
}
139160
} finally {
140161
lockWaitingForQuery.unlock();
141162
}
142-
documentsCurrentlyBeingQueriedGauge.incrementAndGet();
143163
eventsAddedForQuerying.increment();
144164
}
145165

@@ -168,19 +188,25 @@ private MsearchRequest buildMultiSearchRequest() {
168188
for (final Map.Entry<String, Map<String, QueryManagerBulkOperation>> entry : bulkOperationsWaitingForQuery.entrySet()) {
169189
final String index = entry.getKey();
170190
final List<FieldValue> values = getTermValues(entry.getValue().values());
171-
172-
m.searches(s -> s
173-
.header(h -> h.index(index))
174-
.body(b -> b
175-
.size(values.size())
176-
.source(source -> source.filter(f -> f.includes(queryTerm)))
177-
.query(Query.of(q -> q
178-
.terms(TermsQuery.of(t -> t
179-
.field(queryTerm)
180-
.terms(TermsQueryField.of(tf -> tf.value(values)))
181-
))
182-
))
183-
));
191+
final int batchSize = 1000;
192+
193+
LOG.info("Creating search requests for {} query term values in batches of {}", values.size(), batchSize);
194+
for (int i = 0; i < values.size(); i += batchSize) {
195+
final List<FieldValue> chunk = values.subList(i, Math.min(i + batchSize, values.size()));
196+
197+
m.searches(s -> s
198+
.header(h -> h.index(index))
199+
.body(b -> b
200+
.size(chunk.size())
201+
.source(source -> source.filter(f -> f.includes(queryTerm)))
202+
.query(Query.of(q -> q
203+
.terms(TermsQuery.of(t -> t
204+
.field(queryTerm)
205+
.terms(TermsQueryField.of(tf -> tf.value(chunk)))
206+
))
207+
))
208+
));
209+
}
184210
}
185211
return m;
186212
});
@@ -214,7 +240,7 @@ private void moveBulkRequestsThatHaveReachedQueryDuration() {
214240
while (bulkOperationIterator.hasNext()) {
215241
final Map.Entry<String, QueryManagerBulkOperation> entry = bulkOperationIterator.next();
216242
final QueryManagerBulkOperation bulkOperation = entry.getValue();
217-
if (bulkOperation.getStartTime().plus(indexConfiguration.getQueryDuration()).isBefore(lastQueryTime)) {
243+
if (lastQueryTime != null && bulkOperation.getStartTime().plus(indexConfiguration.getQueryDuration()).isBefore(lastQueryTime)) {
218244
lockReadyToIngest.lock();
219245
try {
220246
LOG.debug("Moving bulk operation for index {} and term value {} to be ingested after querying and finding no existing document",
@@ -234,7 +260,7 @@ private void moveBulkRequestsThatHaveReachedQueryDuration() {
234260
private void dropAndReleaseFoundEvents(final MsearchResponse<ObjectNode> msearchResponse) {
235261
msearchResponse.responses().forEach(response -> {
236262
if (response.isFailure()) {
237-
LOG.error("Search response failed: {}", response.failure().error().reason());
263+
LOG.error("Search response failed, potential for duplicate documents: {}", response.failure().error().toString());
238264
} else {
239265
response.result().hits().hits().forEach(hit -> {
240266
final String indexForHit = hit.index();
@@ -246,11 +272,14 @@ private void dropAndReleaseFoundEvents(final MsearchResponse<ObjectNode> msearch
246272
final Map<String, QueryManagerBulkOperation> bulkOperationsForIndex = bulkOperationsWaitingForQuery.get(indexForHit);
247273
final QueryManagerBulkOperation bulkOperationToRelease = bulkOperationsForIndex.get(queryTermValue);
248274
if (bulkOperationToRelease == null) {
249-
LOG.error("bulk operation for term value {} is null", queryTermValue);
275+
// Means two documents with the same query term value were found
276+
LOG.warn("Bulk operation for term value {} with id {} is null, potentially a duplicate document", queryTermValue, hit.id());
277+
potentialDuplicatesDeleted.increment();
250278
} else {
251279
LOG.debug("Found document with query term {}, dropping and releasing Event handle", queryTermValue);
252280
bulkOperationToRelease.getBulkOperationWrapper().releaseEventHandle(true);
253281
eventsDroppedAndReleasedCounter.increment();
282+
documentsCurrentlyBeingQueriedGauge.decrementAndGet();
254283
bulkOperationsForIndex.remove(queryTermValue);
255284
}
256285
} finally {

data-prepper-plugins/opensearch/src/test/java/org/opensearch/dataprepper/plugins/sink/opensearch/index/ExistingDocumentQueryManagerTest.java

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
import org.opensearch.client.opensearch.core.search.HitsMetadata;
1919
import org.opensearch.dataprepper.metrics.PluginMetrics;
2020
import org.opensearch.dataprepper.plugins.sink.opensearch.BulkOperationWrapper;
21+
import org.opensearch.dataprepper.test.helper.ReflectivelySetField;
2122

2223
import java.io.IOException;
2324
import java.time.Duration;
25+
import java.time.Instant;
2426
import java.util.List;
2527
import java.util.Set;
2628
import java.util.UUID;
@@ -36,9 +38,11 @@
3638
import static org.mockito.Mockito.verifyNoMoreInteractions;
3739
import static org.mockito.Mockito.when;
3840
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.DOCUMENTS_CURRENTLY_BEING_QUERIED;
41+
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.DUPLICATE_EVENTS_IN_QUERY_MANAGER;
3942
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.EVENTS_ADDED_FOR_QUERYING;
4043
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.EVENTS_DROPPED_AND_RELEASED;
4144
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.EVENTS_RETURNED_FOR_INDEXING;
45+
import static org.opensearch.dataprepper.plugins.sink.opensearch.index.ExistingDocumentQueryManager.POTENTIAL_DUPLICATES;
4246

4347
@ExtendWith(MockitoExtension.class)
4448
public class ExistingDocumentQueryManagerTest {
@@ -61,6 +65,12 @@ public class ExistingDocumentQueryManagerTest {
6165
@Mock
6266
private Counter eventsReturnedForIndexing;
6367

68+
@Mock
69+
private Counter duplicateEventsAddedToQueryManager;
70+
71+
@Mock
72+
private Counter potentialDuplicates;
73+
6474
@Mock
6575
private AtomicInteger documentsCurrentlyQueried;
6676

@@ -71,7 +81,9 @@ void setup() {
7181
when(pluginMetrics.counter(EVENTS_ADDED_FOR_QUERYING)).thenReturn(eventsAddedForQuerying);
7282
when(pluginMetrics.counter(EVENTS_DROPPED_AND_RELEASED)).thenReturn(eventsDroppedAndReleased);
7383
when(pluginMetrics.counter(EVENTS_RETURNED_FOR_INDEXING)).thenReturn(eventsReturnedForIndexing);
84+
when(pluginMetrics.counter(DUPLICATE_EVENTS_IN_QUERY_MANAGER)).thenReturn(duplicateEventsAddedToQueryManager);
7485
when(pluginMetrics.gauge(eq(DOCUMENTS_CURRENTLY_BEING_QUERIED), any(AtomicInteger.class), any())).thenReturn(documentsCurrentlyQueried);
86+
when(pluginMetrics.counter(POTENTIAL_DUPLICATES)).thenReturn(potentialDuplicates);
7587
queryTerm = UUID.randomUUID().toString();
7688
when(indexConfiguration.getQueryTerm()).thenReturn(queryTerm);
7789
}
@@ -122,13 +134,14 @@ void add_bulk_operation_and_found_in_query_drops_and_releases_event() throws IOE
122134
verify(eventsDroppedAndReleased).increment();
123135
verify(eventsAddedForQuerying).increment();
124136
verify(documentsCurrentlyQueried).incrementAndGet();
137+
verify(documentsCurrentlyQueried).decrementAndGet();
125138
verify(bulkOperationWrapper).releaseEventHandle(true);
126139

127140
verifyNoMoreInteractions(indexConfiguration);
128141
}
129142

130143
@Test
131-
void add_bulk_operation_and_not_found_in_query_returns_as_ready_to_ingest() throws IOException, InterruptedException {
144+
void add_bulk_operation_and_not_found_in_query_returns_as_ready_to_ingest() throws IOException, InterruptedException, NoSuchFieldException, IllegalAccessException {
132145
when(indexConfiguration.getQueryDuration()).thenReturn(Duration.ofMillis(1));
133146

134147
final BulkOperationWrapper bulkOperationWrapper = mock(BulkOperationWrapper.class);
@@ -164,6 +177,8 @@ void add_bulk_operation_and_not_found_in_query_returns_as_ready_to_ingest() thro
164177

165178
final ExistingDocumentQueryManager objectUnderTest = createObjectUnderTest();
166179

180+
ReflectivelySetField.setField(ExistingDocumentQueryManager.class, objectUnderTest, "lastQueryTime", Instant.now().plusMillis(100));
181+
167182
objectUnderTest.addBulkOperation(bulkOperationWrapper);
168183

169184
Thread.sleep(20);
@@ -209,4 +224,75 @@ void add_bulk_operation_and_not_found_in_query_returns_as_ready_to_ingest() thro
209224
assertThat(msearchRequest.searches().get(0).body().query().terms().terms().value().get(0), notNullValue());
210225
assertThat(msearchRequest.searches().get(0).body().query().terms().terms().value().get(0).stringValue(), equalTo(termValue));
211226
}
227+
228+
@Test
229+
void add_bulk_operation_for_same_term_value_increments_duplicate_events_metric() {
230+
final BulkOperationWrapper bulkOperationWrapper = mock(BulkOperationWrapper.class);
231+
final String index = UUID.randomUUID().toString();
232+
final String termValue = UUID.randomUUID().toString();
233+
when(bulkOperationWrapper.getTermValue()).thenReturn(termValue);
234+
when(bulkOperationWrapper.getIndex()).thenReturn(index);
235+
236+
final ExistingDocumentQueryManager objectUnderTest = createObjectUnderTest();
237+
238+
objectUnderTest.addBulkOperation(bulkOperationWrapper);
239+
240+
objectUnderTest.addBulkOperation(bulkOperationWrapper);
241+
242+
verify(duplicateEventsAddedToQueryManager).increment();
243+
verify(documentsCurrentlyQueried).incrementAndGet();
244+
}
245+
246+
@Test
247+
void query_response_with_two_documents_with_same_term_value_tracks_duplicate_document() throws IOException {
248+
final BulkOperationWrapper bulkOperationWrapper = mock(BulkOperationWrapper.class);
249+
final String index = UUID.randomUUID().toString();
250+
final String termValue = UUID.randomUUID().toString();
251+
when(bulkOperationWrapper.getTermValue()).thenReturn(termValue);
252+
when(bulkOperationWrapper.getIndex()).thenReturn(index);
253+
254+
final MsearchResponse<ObjectNode> msearchResponse = mock(MsearchResponse.class);
255+
final MultiSearchResponseItem<ObjectNode> responseItem = mock(MultiSearchResponseItem.class);
256+
when(responseItem.isFailure()).thenReturn(false);
257+
258+
final MultiSearchItem<ObjectNode> multiSearchItem = mock(MultiSearchItem.class);
259+
final HitsMetadata<ObjectNode> hitsMetadata = mock(HitsMetadata.class);
260+
final Hit<ObjectNode> hit = mock(Hit.class);
261+
when(hit.index()).thenReturn(index);
262+
263+
final ObjectNode objectNode = mock(ObjectNode.class);
264+
final JsonNode jsonNode = mock(JsonNode.class);
265+
when(jsonNode.textValue()).thenReturn(termValue);
266+
when(objectNode.findValue(queryTerm)).thenReturn(jsonNode);
267+
when(hit.source()).thenReturn(objectNode);
268+
269+
final Hit<ObjectNode> duplicateHit = mock(Hit.class);
270+
when(duplicateHit.index()).thenReturn(index);
271+
when(duplicateHit.id()).thenReturn(UUID.randomUUID().toString());
272+
273+
when(jsonNode.textValue()).thenReturn(termValue);
274+
when(objectNode.findValue(queryTerm)).thenReturn(jsonNode);
275+
when(duplicateHit.source()).thenReturn(objectNode);
276+
277+
when(multiSearchItem.hits()).thenReturn(hitsMetadata);
278+
when(hitsMetadata.hits()).thenReturn(List.of(hit, duplicateHit));
279+
280+
when(responseItem.result()).thenReturn(multiSearchItem);
281+
282+
when(msearchResponse.responses()).thenReturn(List.of(responseItem));
283+
284+
when(openSearchClient.msearch(any(MsearchRequest.class), eq(ObjectNode.class)))
285+
.thenReturn(msearchResponse);
286+
287+
final ExistingDocumentQueryManager objectUnderTest = createObjectUnderTest();
288+
289+
objectUnderTest.addBulkOperation(bulkOperationWrapper);
290+
291+
objectUnderTest.runQueryLoop();
292+
293+
verify(openSearchClient).msearch(any(MsearchRequest.class), eq(ObjectNode.class));
294+
verify(potentialDuplicates).increment();
295+
verifyNoMoreInteractions(openSearchClient);
296+
297+
}
212298
}

0 commit comments

Comments
 (0)