Skip to content

Commit c10c656

Browse files
committed
Cosmos: Fix 410/1002 PartitionKeyRangeGone surfacing on query path during split/merge
The query-path PartitionKeyRangeGoneRetryPolicy retried 410/1002 only once and ignored 410/1007 (CompletingSplitOrMerge) and 410/1008 (CompletingPartitionMigration), surfacing transient 410s to query callers during a partition split/merge. It now refreshes the routing map and retries those sub-statuses up to 10 times (using an AtomicInteger counter), matching the bulk/transactional-batch retry policies. Mirrors the .NET SDK fix (Azure/azure-cosmos-dotnet-v3 PR #5941). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 7a6c9d9 commit c10c656

2 files changed

Lines changed: 10 additions & 7 deletions

File tree

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#### Breaking Changes
88

99
#### Bugs Fixed
10+
* Fixed transient `410/1002` (`PartitionKeyRangeGone`) errors surfacing to query callers during a partition split or merge. The query-path `PartitionKeyRangeGoneRetryPolicy` previously retried only once and ignored the in-progress `410/1007` (`CompletingSplitOrMerge`) and `410/1008` (`CompletingPartitionMigration`) sub-status codes; it now refreshes the routing map and retries those sub-statuses up to 10 times before surfacing the error.
1011

1112
#### Other Changes
1213

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneRetryPolicy.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import java.time.Duration;
1313
import java.util.Map;
14+
import java.util.concurrent.atomic.AtomicInteger;
1415

1516
// TODO: this needs testing
1617
/**
@@ -25,7 +26,8 @@ public class PartitionKeyRangeGoneRetryPolicy extends DocumentClientRetryPolicy
2526
private final IPartitionKeyRangeCache partitionKeyRangeCache;
2627
private final String collectionLink;
2728
private final Map<String, Object> requestOptionProperties;
28-
private volatile boolean retried;
29+
private static final int MAX_RETRY_COUNT = 10;
30+
private final AtomicInteger retryCount = new AtomicInteger(0);
2931
private RxDocumentServiceRequest request;
3032

3133
public PartitionKeyRangeGoneRetryPolicy(DiagnosticsClientContext diagnosticsClientContext,
@@ -53,9 +55,11 @@ public Mono<ShouldRetryResult> shouldRetry(Exception exception) {
5355
CosmosException clientException = Utils.as(exception, CosmosException.class);
5456
if (clientException != null &&
5557
Exceptions.isStatusCode(clientException, HttpConstants.StatusCodes.GONE) &&
56-
Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)) {
58+
(Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)
59+
|| Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.COMPLETING_SPLIT_OR_MERGE)
60+
|| Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.COMPLETING_PARTITION_MIGRATION))) {
5761

58-
if (this.retried){
62+
if (this.retryCount.getAndIncrement() >= MAX_RETRY_COUNT) {
5963
return Mono.just(ShouldRetryResult.error(clientException));
6064
}
6165

@@ -96,10 +100,8 @@ public Mono<ShouldRetryResult> shouldRetry(Exception exception) {
96100
});
97101

98102
// TODO: Check if this behavior can be replaced by doOnSubscribe
99-
return refreshedRoutingMapObs.flatMap(rm -> {
100-
this.retried = true;
101-
return Mono.just(ShouldRetryResult.retryAfter(Duration.ZERO));
102-
});
103+
return refreshedRoutingMapObs.flatMap(rm ->
104+
Mono.just(ShouldRetryResult.retryAfter(Duration.ZERO)));
103105

104106
});
105107

0 commit comments

Comments
 (0)