Skip to content

Commit 792e5bb

Browse files
committed
Optimize MongoDBExportPartitionSupplier for uniform _id type collections
For collections with uniform _id types, replace the 8-clause $or query with a simple Filters.gt("_id", value) for finding partition boundaries. This allows DocumentDB to use a single B-tree index seek instead of a multi-index scan. Changes: - Add isUniformIdType() that checks first/last doc _id types - Add buildNextStartFilter() with simple $gt for uniform types, falling back to $or-based query for mixed types - Use fresh Filters.gte() + skip() per iteration for partition end - Extract addPartition() helper to reduce duplication - Make BsonHelper.isClassNumber() public for numeric type grouping Performance: 14M docs (10GB) partitioned in ~30 seconds. Signed-off-by: Dinu John <86094133+dinujoh@users.noreply.github.com>
1 parent 287321c commit 792e5bb

4 files changed

Lines changed: 274 additions & 82 deletions

File tree

data-prepper-plugins/mongodb/src/main/java/org/opensearch/dataprepper/plugins/mongo/client/BsonHelper.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ private static Bson buildQuery(final Function<Object, Bson> function, final Stri
186186
}
187187
}
188188

189-
private static boolean isClassNumber(final String className) {
189+
public static boolean isClassNumber(final String className) {
190190
return className.equals("java.lang.Integer") || className.equals("java.lang.Long") || className.equals("java.lang.Double")
191191
|| className.equals("org.bson.types.Decimal128");
192192
}

data-prepper-plugins/mongodb/src/main/java/org/opensearch/dataprepper/plugins/mongo/export/MongoDBExportPartitionSupplier.java

Lines changed: 112 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66
package org.opensearch.dataprepper.plugins.mongo.export;
77

88
import com.mongodb.MongoClientException;
9-
import com.mongodb.client.FindIterable;
109
import com.mongodb.client.MongoClient;
1110
import com.mongodb.client.MongoCollection;
12-
import com.mongodb.client.MongoCursor;
1311
import com.mongodb.client.MongoDatabase;
1412
import com.mongodb.client.model.Filters;
13+
import com.mongodb.client.model.Projections;
14+
import com.mongodb.client.model.Sorts;
1515
import org.bson.Document;
16+
import org.bson.conversions.Bson;
1617
import org.opensearch.dataprepper.model.source.coordinator.PartitionIdentifier;
1718
import org.opensearch.dataprepper.model.source.coordinator.enhanced.EnhancedSourceCoordinator;
1819
import org.opensearch.dataprepper.plugins.mongo.client.BsonHelper;
@@ -37,6 +38,9 @@ public class MongoDBExportPartitionSupplier implements Function<ExportPartition,
3738
private static final Logger LOG = LoggerFactory.getLogger(MongoDBExportPartitionSupplier.class);
3839
private static final String MONGODB_PARTITION_KEY_FORMAT = "%s|%s|%s|%s|%s"; // partition format: <db.collection>|<gte>|<lte>|<gteClassName>|<lteClassName>
3940
private static final String COLLECTION_SPLITTER = "\\.";
41+
private static final Bson ID_PROJECTION = Projections.include("_id");
42+
private static final Bson ID_ASC = Sorts.ascending("_id");
43+
private static final Bson ID_DESC = Sorts.descending("_id");
4044

4145
private final MongoDBSourceConfig sourceConfig;
4246
private final EnhancedSourceCoordinator enhancedSourceCoordinator;
@@ -50,95 +54,143 @@ public MongoDBExportPartitionSupplier(final MongoDBSourceConfig sourceConfig,
5054
this.documentDBAggregateMetrics = documentDBAggregateMetrics;
5155
}
5256

57+
/**
58+
* Detects whether the collection has a uniform _id type by checking the first and last documents.
59+
* If uniform, we can use a simple Filters.gt() instead of the complex $or query across all BSON types.
60+
*/
61+
boolean isUniformIdType(final MongoCollection<Document> col) {
62+
final Document first = col.find().projection(ID_PROJECTION).sort(ID_ASC).limit(1).first();
63+
final Document last = col.find().projection(ID_PROJECTION).sort(ID_DESC).limit(1).first();
64+
if (first == null || last == null) {
65+
return true;
66+
}
67+
final String firstType = first.get("_id").getClass().getName();
68+
final String lastType = last.get("_id").getClass().getName();
69+
if (BsonHelper.isClassNumber(firstType) && BsonHelper.isClassNumber(lastType)) {
70+
return true;
71+
}
72+
return firstType.equals(lastType);
73+
}
74+
75+
private Bson buildNextStartFilter(final Object lastLteValue, final String lteClassName, final boolean uniformType) {
76+
if (uniformType) {
77+
return Filters.gt("_id", lastLteValue);
78+
}
79+
final String lteValueString = BsonHelper.getPartitionStringFromMongoDBId(lastLteValue, lteClassName);
80+
return buildGtQuery(lteValueString, lteClassName, MAX_KEY);
81+
}
82+
83+
private void addPartition(final List<PartitionIdentifier> partitions, final String collectionDbName,
84+
final Object gteValue, final String gteClassName,
85+
final Object lteValue, final String lteClassName) {
86+
final String gteValueString = BsonHelper.getPartitionStringFromMongoDBId(gteValue, gteClassName);
87+
final String lteValueString = BsonHelper.getPartitionStringFromMongoDBId(lteValue, lteClassName);
88+
LOG.debug("Partition of {} : { gte: {} class: {}, lte: {} class {} }",
89+
collectionDbName, gteValueString, gteClassName, lteValueString, lteClassName);
90+
partitions.add(PartitionIdentifier.builder()
91+
.withPartitionKey(String.format(MONGODB_PARTITION_KEY_FORMAT,
92+
collectionDbName, gteValueString, lteValueString, gteClassName, lteClassName))
93+
.build());
94+
}
95+
5396
private PartitionIdentifierBatch buildPartitions(final ExportPartition exportPartition) {
5497
documentDBAggregateMetrics.getExportApiInvocations().increment();
5598
final List<PartitionIdentifier> collectionPartitions = new ArrayList<>();
5699
final String collectionDbName = exportPartition.getCollection();
57-
List<String> collection = List.of(collectionDbName.split(COLLECTION_SPLITTER));
100+
final List<String> collection = List.of(collectionDbName.split(COLLECTION_SPLITTER));
58101
if (collection.size() < 2) {
59102
documentDBAggregateMetrics.getExport4xxErrors().increment();
60103
throw new IllegalArgumentException("Invalid Collection Name. Must be in db.collection format");
61104
}
62-
final Optional<ExportProgressState> exportProgressStateOptional = exportPartition
63-
.getProgressState();
64-
final Object lastEndDocId = exportProgressStateOptional.map(
65-
ExportProgressState::getLastEndDocId).orElse(null);
105+
106+
final Optional<ExportProgressState> exportProgressStateOptional = exportPartition.getProgressState();
107+
final Object lastEndDocId = exportProgressStateOptional.map(ExportProgressState::getLastEndDocId).orElse(null);
66108
boolean isLastBatch = false;
67109
Object endDocId = lastEndDocId;
110+
68111
try (MongoClient mongoClient = MongoDBConnection.getMongoClient(sourceConfig)) {
69112
final MongoDatabase db = mongoClient.getDatabase(collection.get(0));
70-
final MongoCollection<Document> col = db.getCollection(collectionDbName.substring(collection.get(0).length()+1));
113+
final MongoCollection<Document> col = db.getCollection(
114+
collectionDbName.substring(collection.get(0).length() + 1));
71115
final int partitionSize = exportPartition.getPartitionSize();
72-
FindIterable<Document> startIterable;
116+
117+
final boolean uniformType = isUniformIdType(col);
118+
LOG.info("Collection {} has {} _id type. Using {} partition query strategy.",
119+
collectionDbName, uniformType ? "uniform" : "mixed", uniformType ? "simple $gt" : "$or-based");
120+
121+
Bson startFilter;
73122
if (lastEndDocId != null) {
74-
startIterable = col.find(Filters.gt("_id", lastEndDocId))
75-
.projection(new Document("_id", 1))
76-
.sort(new Document("_id", 1))
77-
.limit(1);
123+
startFilter = Filters.gt("_id", lastEndDocId);
78124
} else {
79-
startIterable = col.find()
80-
.projection(new Document("_id", 1))
81-
.sort(new Document("_id", 1))
82-
.limit(1);
125+
startFilter = new Document();
83126
}
127+
84128
while (!Thread.currentThread().isInterrupted()) {
85-
try (final MongoCursor<Document> startCursor = startIterable.iterator()) {
86-
if (!startCursor.hasNext()) {
87-
LOG.info("No records to process or has reached end of the export partition.");
88-
isLastBatch = true;
89-
break;
90-
}
91-
final Document startDoc = startCursor.next();
92-
final Object gteValue = startDoc.get("_id");
93-
final String gteClassName = gteValue.getClass().getName();
94-
95-
// Get end doc
96-
Document endDoc = startIterable.skip(partitionSize - 1).limit(1).first();
97-
if (endDoc == null) {
98-
// this means we have reached the end of the doc
99-
endDoc = col.find()
100-
.projection(new Document("_id", 1))
101-
.sort(new Document("_id", -1))
102-
.limit(1)
103-
.first();
104-
isLastBatch = true;
105-
}
129+
final Document startDoc = col.find(startFilter)
130+
.projection(ID_PROJECTION)
131+
.sort(ID_ASC)
132+
.limit(1)
133+
.first();
134+
135+
if (startDoc == null) {
136+
LOG.info("No records to process or has reached end of the export partition.");
137+
isLastBatch = true;
138+
break;
139+
}
106140

107-
final Object lteValue = endDoc.get("_id");
108-
final String lteClassName = lteValue.getClass().getName();
109-
endDocId = lteValue;
110-
final String gteValueString = BsonHelper.getPartitionStringFromMongoDBId(gteValue, gteClassName);
111-
final String lteValueString = BsonHelper.getPartitionStringFromMongoDBId(lteValue, lteClassName);
112-
LOG.debug("Partition of {} : { gte: {} class: {}, lte: {} class {} }", collectionDbName, gteValueString, gteClassName, lteValueString, lteClassName);
113-
collectionPartitions.add(
114-
PartitionIdentifier
115-
.builder()
116-
.withPartitionKey(String.format(MONGODB_PARTITION_KEY_FORMAT, collectionDbName, gteValueString, lteValueString, gteClassName, lteClassName))
117-
.build());
118-
documentDBAggregateMetrics.getExportPartitionQueryCount().increment();
119-
120-
if (isLastBatch) {
141+
final Object gteValue = startDoc.get("_id");
142+
final String gteClassName = gteValue.getClass().getName();
143+
144+
final Document endDoc = col.find(Filters.gte("_id", gteValue))
145+
.projection(ID_PROJECTION)
146+
.sort(ID_ASC)
147+
.skip(partitionSize - 1)
148+
.limit(1)
149+
.first();
150+
151+
final Object lteValue;
152+
final String lteClassName;
153+
154+
if (endDoc == null) {
155+
final Document lastDoc = col.find()
156+
.projection(ID_PROJECTION)
157+
.sort(ID_DESC)
158+
.limit(1)
159+
.first();
160+
if (lastDoc == null) {
161+
isLastBatch = true;
121162
break;
122163
}
164+
lteValue = lastDoc.get("_id");
165+
lteClassName = lteValue.getClass().getName();
166+
isLastBatch = true;
167+
} else {
168+
lteValue = endDoc.get("_id");
169+
lteClassName = lteValue.getClass().getName();
170+
}
123171

124-
// extend the ownership of the partition
125-
enhancedSourceCoordinator.saveProgressStateForPartition(exportPartition, null);
172+
endDocId = lteValue;
173+
addPartition(collectionPartitions, collectionDbName, gteValue, gteClassName, lteValue, lteClassName);
174+
documentDBAggregateMetrics.getExportPartitionQueryCount().increment();
126175

127-
startIterable = col.find(buildGtQuery(lteValueString, lteClassName, MAX_KEY))
128-
.projection(new Document("_id", 1))
129-
.sort(new Document("_id", 1))
130-
.limit(1);
176+
if (isLastBatch) {
177+
break;
131178
}
179+
180+
// extend the ownership of the partition
181+
enhancedSourceCoordinator.saveProgressStateForPartition(exportPartition, null);
182+
183+
startFilter = buildNextStartFilter(lteValue, lteClassName, uniformType);
132184
}
133185
} catch (final IllegalArgumentException | MongoClientException e) {
134186
// IllegalArgumentException is thrown when database or collection name is not valid
135187
// MongoClientException is thrown for exceptions indicating a failure condition with the MongoClient
136188
documentDBAggregateMetrics.getExport4xxErrors().increment();
137-
LOG.error("Client side exception while build partitions.", e);
189+
LOG.error("Client side exception while building partitions.", e);
138190
throw new RuntimeException(e);
139191
} catch (final Exception e) {
140192
documentDBAggregateMetrics.getExport5xxErrors().increment();
141-
LOG.error("Server side exception while build partitions.", e);
193+
LOG.error("Server side exception while building partitions.", e);
142194
throw new RuntimeException(e);
143195
}
144196

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* The OpenSearch Contributors require contributions made to
6+
* this file be licensed under the Apache-2.0 license or a
7+
* compatible open source license.
8+
*/
9+
10+
package org.opensearch.dataprepper.plugins.mongo.export;
11+
12+
import com.mongodb.client.FindIterable;
13+
import com.mongodb.client.MongoCollection;
14+
import org.bson.Document;
15+
import org.bson.types.Decimal128;
16+
import org.bson.types.ObjectId;
17+
import org.junit.jupiter.api.BeforeEach;
18+
import org.junit.jupiter.api.Test;
19+
import org.junit.jupiter.api.extension.ExtendWith;
20+
import org.mockito.Mock;
21+
import org.mockito.junit.jupiter.MockitoExtension;
22+
import org.opensearch.dataprepper.model.source.coordinator.enhanced.EnhancedSourceCoordinator;
23+
import org.opensearch.dataprepper.plugins.mongo.configuration.MongoDBSourceConfig;
24+
import org.opensearch.dataprepper.plugins.mongo.utils.DocumentDBSourceAggregateMetrics;
25+
26+
import static org.hamcrest.MatcherAssert.assertThat;
27+
import static org.hamcrest.Matchers.is;
28+
import static org.mockito.ArgumentMatchers.any;
29+
import static org.mockito.Mockito.when;
30+
31+
@ExtendWith(MockitoExtension.class)
32+
public class MongoDBExportPartitionSupplierIsUniformIdTypeTest {
33+
34+
@Mock
35+
private MongoDBSourceConfig sourceConfig;
36+
@Mock
37+
private EnhancedSourceCoordinator sourceCoordinator;
38+
@Mock
39+
private DocumentDBSourceAggregateMetrics aggregateMetrics;
40+
@Mock
41+
private MongoCollection<Document> collection;
42+
@Mock
43+
private FindIterable<Document> findIterable;
44+
45+
private MongoDBExportPartitionSupplier supplier;
46+
47+
@BeforeEach
48+
void setUp() {
49+
supplier = new MongoDBExportPartitionSupplier(sourceConfig, sourceCoordinator, aggregateMetrics);
50+
when(collection.find()).thenReturn(findIterable);
51+
when(findIterable.projection(any())).thenReturn(findIterable);
52+
when(findIterable.sort(any())).thenReturn(findIterable);
53+
when(findIterable.limit(1)).thenReturn(findIterable);
54+
}
55+
56+
@Test
57+
void isUniformIdType_emptyCollection_returnsTrue() {
58+
when(findIterable.first()).thenReturn(null);
59+
assertThat(supplier.isUniformIdType(collection), is(true));
60+
}
61+
62+
@Test
63+
void isUniformIdType_uniformObjectId_returnsTrue() {
64+
when(findIterable.first())
65+
.thenReturn(new Document("_id", new ObjectId()))
66+
.thenReturn(new Document("_id", new ObjectId()));
67+
assertThat(supplier.isUniformIdType(collection), is(true));
68+
}
69+
70+
@Test
71+
void isUniformIdType_uniformString_returnsTrue() {
72+
when(findIterable.first())
73+
.thenReturn(new Document("_id", "abc"))
74+
.thenReturn(new Document("_id", "xyz"));
75+
assertThat(supplier.isUniformIdType(collection), is(true));
76+
}
77+
78+
@Test
79+
void isUniformIdType_mixedTypes_returnsFalse() {
80+
when(findIterable.first())
81+
.thenReturn(new Document("_id", 1))
82+
.thenReturn(new Document("_id", new ObjectId()));
83+
assertThat(supplier.isUniformIdType(collection), is(false));
84+
}
85+
86+
@Test
87+
void isUniformIdType_integerAndLong_returnsTrue() {
88+
when(findIterable.first())
89+
.thenReturn(new Document("_id", 42))
90+
.thenReturn(new Document("_id", 999999999999L));
91+
assertThat(supplier.isUniformIdType(collection), is(true));
92+
}
93+
94+
@Test
95+
void isUniformIdType_doubleAndDecimal128_returnsTrue() {
96+
when(findIterable.first())
97+
.thenReturn(new Document("_id", 3.14))
98+
.thenReturn(new Document("_id", Decimal128.parse("99.99")));
99+
assertThat(supplier.isUniformIdType(collection), is(true));
100+
}
101+
102+
@Test
103+
void isUniformIdType_stringAndObjectId_returnsFalse() {
104+
when(findIterable.first())
105+
.thenReturn(new Document("_id", "abc"))
106+
.thenReturn(new Document("_id", new ObjectId()));
107+
assertThat(supplier.isUniformIdType(collection), is(false));
108+
}
109+
}

0 commit comments

Comments
 (0)