Skip to content

Commit d436d70

Browse files
committed
fix(spanner): enforce READY-only location-aware routing and add endpoint lifecycle management
Location-aware routing previously treated IDLE and CONNECTING channels as healthy, which could send traffic to stale replicas after cache updates. This change tightens endpoint readiness to READY-only, adds state-aware skipped_tablets reporting (TRANSIENT_FAILURE only), and introduces a background lifecycle manager that probes endpoints with GetSession to keep channels warm and evicts idle endpoints after 30 minutes without real traffic.
1 parent 4cb170d commit d436d70

File tree

13 files changed

+1541
-92
lines changed

13 files changed

+1541
-92
lines changed

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/ChannelEndpoint.java

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,27 @@ public interface ChannelEndpoint {
4141
String getAddress();
4242

4343
/**
44-
* Returns whether this server is ready to accept RPCs.
44+
* Returns whether this server's channel is in {@code READY} state and can accept location-aware
45+
* RPCs.
4546
*
46-
* <p>A server is considered unhealthy if:
47+
* <p>Only endpoints in {@code READY} state are eligible for location-aware routing. Endpoints in
48+
* {@code IDLE}, {@code CONNECTING}, {@code TRANSIENT_FAILURE}, or {@code SHUTDOWN} are not
49+
* considered healthy for location-aware routing purposes.
4750
*
48-
* <ul>
49-
* <li>The underlying channel is shutdown or terminated
50-
* <li>The channel is in a transient failure state
51-
* </ul>
52-
*
53-
* @return true if the server is healthy and ready to accept RPCs
51+
* @return true if the channel is in READY state
5452
*/
5553
boolean isHealthy();
5654

55+
/**
56+
* Returns whether this server's channel is in {@code TRANSIENT_FAILURE} state.
57+
*
58+
* <p>When an endpoint is in transient failure, it should be reported as a skipped tablet in
59+
* routing hints so the server can refresh the client cache.
60+
*
61+
* @return true if the channel is in TRANSIENT_FAILURE state
62+
*/
63+
boolean isTransientFailure();
64+
5765
/**
5866
* Returns the gRPC channel for making RPCs to this server.
5967
*

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/ChannelEndpointCache.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,19 @@ public interface ChannelEndpointCache {
5454
*/
5555
ChannelEndpoint get(String address);
5656

57+
/**
58+
* Returns a cached channel for the given address without creating it.
59+
*
60+
* <p>Unlike {@link #get(String)}, this method does not create a new endpoint if one does not
61+
* already exist in the cache. This is used by location-aware routing to avoid foreground endpoint
62+
* creation on the request path.
63+
*
64+
* @param address the server address in "host:port" format
65+
* @return the cached channel instance, or null if no endpoint exists for this address
66+
*/
67+
@javax.annotation.Nullable
68+
ChannelEndpoint getIfPresent(String address);
69+
5770
/**
5871
* Evicts a server connection from the cache and gracefully shuts down its channel.
5972
*

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/ChannelFinder.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,19 @@
2222
import com.google.spanner.v1.CommitRequest;
2323
import com.google.spanner.v1.DirectedReadOptions;
2424
import com.google.spanner.v1.ExecuteSqlRequest;
25+
import com.google.spanner.v1.Group;
2526
import com.google.spanner.v1.Mutation;
2627
import com.google.spanner.v1.ReadRequest;
2728
import com.google.spanner.v1.RoutingHint;
29+
import com.google.spanner.v1.Tablet;
2830
import com.google.spanner.v1.TransactionOptions;
2931
import com.google.spanner.v1.TransactionSelector;
3032
import java.util.ArrayList;
3133
import java.util.List;
3234
import java.util.Objects;
3335
import java.util.concurrent.ThreadLocalRandom;
3436
import java.util.concurrent.atomic.AtomicLong;
37+
import javax.annotation.Nullable;
3538

3639
/**
3740
* Finds a server for a request using location-aware routing metadata.
@@ -44,9 +47,16 @@ public final class ChannelFinder {
4447
private final AtomicLong databaseId = new AtomicLong();
4548
private final KeyRecipeCache recipeCache = new KeyRecipeCache();
4649
private final KeyRangeCache rangeCache;
50+
@Nullable private final EndpointLifecycleManager lifecycleManager;
4751

4852
public ChannelFinder(ChannelEndpointCache endpointCache) {
49-
this.rangeCache = new KeyRangeCache(Objects.requireNonNull(endpointCache));
53+
this(endpointCache, null);
54+
}
55+
56+
public ChannelFinder(
57+
ChannelEndpointCache endpointCache, @Nullable EndpointLifecycleManager lifecycleManager) {
58+
this.rangeCache = new KeyRangeCache(Objects.requireNonNull(endpointCache), lifecycleManager);
59+
this.lifecycleManager = lifecycleManager;
5060
}
5161

5262
void useDeterministicRandom() {
@@ -67,6 +77,19 @@ public void update(CacheUpdate update) {
6777
recipeCache.addRecipes(update.getKeyRecipes());
6878
}
6979
rangeCache.addRanges(update);
80+
81+
// Notify the lifecycle manager about server addresses so it can create endpoints
82+
// in the background and start probing.
83+
if (lifecycleManager != null) {
84+
for (Group group : update.getGroupList()) {
85+
for (Tablet tablet : group.getTabletsList()) {
86+
String addr = tablet.getServerAddress();
87+
if (!addr.isEmpty()) {
88+
lifecycleManager.ensureEndpointExists(addr);
89+
}
90+
}
91+
}
92+
}
7093
}
7194
}
7295

0 commit comments

Comments
 (0)