Skip to content

Commit 2048abe

Browse files
jeet1995Copilot
andcommitted
Fix background database account refresh stopping in multi-writer accounts
In multi-writer accounts, refreshLocationPrivateAsync() stops the background refresh timer when shouldRefreshEndpoints() returns false. This means topology changes (e.g., multi-write to single-write transitions) go undetected until the next explicit refresh trigger. The .NET SDK (azure-cosmos-dotnet-v3) correctly continues the background refresh loop unconditionally - the loop only stops when canRefreshInBackground is explicitly false, not when shouldRefreshEndpoints returns false. This fix adds startRefreshLocationTimerAsync() to the else-branch of refreshLocationPrivateAsync(), ensuring the background timer always reschedules itself regardless of whether endpoints currently need refreshing. Without this fix, after a multi-write -> single-write -> multi-write transition, reads remain stuck on the primary region because the SDK never re-reads account metadata to learn about the restored multi-write topology. Unit tests updated: - backgroundRefreshForMultiMaster: assertTrue (timer must keep running) - backgroundRefreshDetectsTopologyChangeForMultiMaster: new test proving MW->SW transition detection via mock Related: PR #6139 (point #4 in description acknowledged this bug) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent a8e02c1 commit 2048abe

File tree

2 files changed

+61
-2
lines changed

2 files changed

+61
-2
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GlobalEndPointManagerTest.java

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ public void refreshLocationAsyncForWriteForbidden() throws Exception {
223223
}
224224

225225
/**
226-
* Test for background refresh disable for multimaster
226+
* Test for background refresh in multi-master: timer must keep running
227227
*/
228228
@Test(groups = {"unit"}, timeOut = TIMEOUT)
229229
public void backgroundRefreshForMultiMaster() throws Exception {
@@ -236,8 +236,57 @@ public void backgroundRefreshForMultiMaster() throws Exception {
236236
GlobalEndpointManager globalEndPointManager = new GlobalEndpointManager(databaseAccountManagerInternal, connectionPolicy, new Configs());
237237
globalEndPointManager.init();
238238

239+
// Background refresh timer must keep running even for multi-master accounts where
240+
// shouldRefreshEndpoints() returns false. This ensures topology changes (e.g.,
241+
// multi-write <-> single-write transitions) are detected.
239242
AtomicBoolean isRefreshInBackground = getRefreshInBackground(globalEndPointManager);
240-
Assert.assertFalse(isRefreshInBackground.get());
243+
Assert.assertTrue(isRefreshInBackground.get());
244+
LifeCycleUtils.closeQuietly(globalEndPointManager);
245+
}
246+
247+
/**
248+
* Validates that a multi-master account's background refresh timer detects a topology
249+
* change from multi-write to single-write. Without the fix in refreshLocationPrivateAsync,
250+
* the timer stops after init and the transition is never detected.
251+
*/
252+
@Test(groups = {"unit"}, timeOut = TIMEOUT)
253+
public void backgroundRefreshDetectsTopologyChangeForMultiMaster() throws Exception {
254+
// Start with a multi-writer account (dbAccountJson4: MW, East US + East Asia)
255+
ConnectionPolicy connectionPolicy = new ConnectionPolicy(DirectConnectionConfig.getDefaultConfig());
256+
connectionPolicy.setEndpointDiscoveryEnabled(true);
257+
connectionPolicy.setMultipleWriteRegionsEnabled(true);
258+
DatabaseAccount multiWriterAccount = new DatabaseAccount(dbAccountJson4);
259+
Mockito.when(databaseAccountManagerInternal.getDatabaseAccountFromEndpoint(ArgumentMatchers.any()))
260+
.thenReturn(Flux.just(multiWriterAccount));
261+
Mockito.when(databaseAccountManagerInternal.getServiceEndpoint())
262+
.thenReturn(new URI("https://testaccount.documents.azure.com:443"));
263+
264+
GlobalEndpointManager globalEndPointManager = new GlobalEndpointManager(
265+
databaseAccountManagerInternal, connectionPolicy, new Configs());
266+
setBackgroundRefreshLocationTimeIntervalInMS(globalEndPointManager, 500);
267+
globalEndPointManager.init();
268+
269+
// Verify multi-writer state: 2 write regions available
270+
LocationCache locationCache = this.getLocationCache(globalEndPointManager);
271+
Map<String, RegionalRoutingContext> availableWriteEndpoints = this.getAvailableWriteEndpointByLocation(locationCache);
272+
Assert.assertEquals(availableWriteEndpoints.size(), 2, "Expected 2 write regions for multi-writer account");
273+
Assert.assertTrue(availableWriteEndpoints.containsKey("East US"));
274+
Assert.assertTrue(availableWriteEndpoints.containsKey("East Asia"));
275+
276+
// Transition to single-writer account (dbAccountJson1: SW, East US only for writes)
277+
DatabaseAccount singleWriterAccount = new DatabaseAccount(dbAccountJson1);
278+
Mockito.when(databaseAccountManagerInternal.getDatabaseAccountFromEndpoint(ArgumentMatchers.any()))
279+
.thenReturn(Flux.just(singleWriterAccount));
280+
281+
// Wait for background refresh to detect the topology change
282+
Thread.sleep(2000);
283+
284+
// Verify single-writer state: write endpoints updated to reflect single-writer topology
285+
locationCache = this.getLocationCache(globalEndPointManager);
286+
availableWriteEndpoints = this.getAvailableWriteEndpointByLocation(locationCache);
287+
Assert.assertEquals(availableWriteEndpoints.size(), 1, "Expected 1 write region after transition to single-writer");
288+
Assert.assertTrue(availableWriteEndpoints.containsKey("East US"));
289+
241290
LifeCycleUtils.closeQuietly(globalEndPointManager);
242291
}
243292

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,16 @@ private Mono<Void> refreshLocationPrivateAsync(DatabaseAccount databaseAccount)
302302
return Mono.empty();
303303
} else {
304304
logger.debug("shouldRefreshEndpoints: false, nothing to do.");
305+
306+
// Even when no endpoint refresh is needed right now, we must keep the
307+
// background refresh timer running so that future topology changes
308+
// (e.g., multi-write <-> single-write transitions) are detected.
309+
// This aligns with the .NET SDK behavior where the background loop
310+
// continues unconditionally as long as the client is alive.
311+
if (!this.refreshInBackground.get()) {
312+
this.startRefreshLocationTimerAsync();
313+
}
314+
305315
this.isRefreshing.set(false);
306316
return Mono.empty();
307317
}

0 commit comments

Comments
 (0)