diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 5146e18db2e8..c8137802a4ea 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -7,6 +7,7 @@ #### Breaking Changes #### Bugs Fixed +* Fixed transient `410/1002` (`PartitionKeyRangeGone`) errors surfacing to callers during a partition split or merge. The `PartitionKeyRangeGoneRetryPolicy` (used on the query and change-feed paths) previously retried only once and ignored the in-progress `410/1007` (`CompletingSplitOrMerge`) and `410/1008` (`CompletingPartitionMigration`) sub-status codes; it now refreshes the routing map and retries all three sub-status codes (`410/1002`, `410/1007`, `410/1008`) up to 10 times in total before surfacing the error. #### Other Changes diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneRetryPolicy.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneRetryPolicy.java index 9344eaf5e2a7..f9faa0453c5a 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneRetryPolicy.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/PartitionKeyRangeGoneRetryPolicy.java @@ -11,6 +11,7 @@ import java.time.Duration; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; // TODO: this need testing /** @@ -25,7 +26,8 @@ public class PartitionKeyRangeGoneRetryPolicy extends DocumentClientRetryPolicy private final IPartitionKeyRangeCache partitionKeyRangeCache; private final String collectionLink; private final Map requestOptionProperties; - private volatile boolean retried; + private static final int MAX_RETRY_COUNT = 10; + private final AtomicInteger retryCount = new AtomicInteger(0); private RxDocumentServiceRequest request; public PartitionKeyRangeGoneRetryPolicy(DiagnosticsClientContext diagnosticsClientContext, @@ -53,9 +55,11 @@ public Mono shouldRetry(Exception exception) { 
CosmosException clientException = Utils.as(exception, CosmosException.class); if (clientException != null && Exceptions.isStatusCode(clientException, HttpConstants.StatusCodes.GONE) && - Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE)) { + (Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.PARTITION_KEY_RANGE_GONE) + || Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.COMPLETING_SPLIT_OR_MERGE) + || Exceptions.isSubStatusCode(clientException, HttpConstants.SubStatusCodes.COMPLETING_PARTITION_MIGRATION))) { - if (this.retried){ + if (this.retryCount.getAndIncrement() >= MAX_RETRY_COUNT) { return Mono.just(ShouldRetryResult.error(clientException)); } @@ -96,10 +100,8 @@ public Mono shouldRetry(Exception exception) { }); // TODO: Check if this behavior can be replaced by doOnSubscribe - return refreshedRoutingMapObs.flatMap(rm -> { - this.retried = true; - return Mono.just(ShouldRetryResult.retryAfter(Duration.ZERO)); - }); + return refreshedRoutingMapObs.flatMap(rm -> + Mono.just(ShouldRetryResult.retryAfter(Duration.ZERO))); });