Skip to content

Commit 62bf3c1

Browse files
committed
fix: extend NodeDiagnostics with node state, distance, datacenter and pool size (DRIVER-540)
Add four additional fields to DriverTimeoutException.NodeDiagnostics, captured at timeout time:

- nodeState: UP/DOWN/FORCED_DOWN/UNKNOWN — immediately explains timeouts to downed nodes
- nodeDistance: LOCAL/REMOTE/IGNORED — contextualizes latency expectations
- datacenter: node DC — helps diagnose cross-DC routing issues
- poolSize: active connection count — reveals degraded pools (fewer connections than expected)

All four are available from Node and ChannelPool already in scope at each buildNodeDiagnostics() call site. No new infrastructure required.

Updated toString() example:

/10.0.0.1:9042 [state: UP, distance: LOCAL, dc: dc1, channel in-flight: 5, pool size: 3, pool in-flight: 12, pool available ids: 988, pool orphaned ids: 2]
1 parent 441eb42 commit 62bf3c1

5 files changed

Lines changed: 144 additions & 10 deletions

File tree

core/src/main/java/com/datastax/dse/driver/internal/core/cql/continuous/ContinuousRequestHandlerBase.java

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -414,7 +414,11 @@ private NodeDiagnostics buildNodeDiagnostics() {
414414
ChannelPool pool = session.getPools().get(cb.node);
415415
return NodeDiagnostics.of(
416416
cb.node.getEndPoint(),
417+
cb.node.getState(),
418+
cb.node.getDistance(),
419+
cb.node.getDatacenter(),
417420
channelInFlight,
421+
pool != null ? pool.size() : UNAVAILABLE,
418422
pool != null ? pool.getInFlight() : UNAVAILABLE,
419423
pool != null ? pool.getAvailableIds() : UNAVAILABLE,
420424
pool != null ? pool.getOrphanedIds() : UNAVAILABLE);
@@ -748,7 +752,11 @@ private void onPageTimeout(int expectedPage) {
748752
"Timed out waiting for page " + expectedPage,
749753
NodeDiagnostics.of(
750754
node.getEndPoint(),
755+
node.getState(),
756+
node.getDistance(),
757+
node.getDatacenter(),
751758
channelInFlight,
759+
pool != null ? pool.size() : UNAVAILABLE,
752760
pool != null ? pool.getInFlight() : UNAVAILABLE,
753761
pool != null ? pool.getAvailableIds() : UNAVAILABLE,
754762
pool != null ? pool.getOrphanedIds() : UNAVAILABLE)),

core/src/main/java/com/datastax/dse/driver/internal/core/graph/GraphRequestHandler.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,11 @@ private NodeDiagnostics buildNodeDiagnostics() {
248248
ChannelPool pool = session.getPools().get(cb.node);
249249
return NodeDiagnostics.of(
250250
cb.node.getEndPoint(),
251+
cb.node.getState(),
252+
cb.node.getDistance(),
253+
cb.node.getDatacenter(),
251254
channelInFlight,
255+
pool != null ? pool.size() : UNAVAILABLE,
252256
pool != null ? pool.getInFlight() : UNAVAILABLE,
253257
pool != null ? pool.getAvailableIds() : UNAVAILABLE,
254258
pool != null ? pool.getOrphanedIds() : UNAVAILABLE);

core/src/main/java/com/datastax/oss/driver/api/core/DriverTimeoutException.java

Lines changed: 124 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,9 @@
1818
package com.datastax.oss.driver.api.core;
1919

2020
import com.datastax.oss.driver.api.core.cql.ExecutionInfo;
21+
import com.datastax.oss.driver.api.core.loadbalancing.NodeDistance;
2122
import com.datastax.oss.driver.api.core.metadata.EndPoint;
23+
import com.datastax.oss.driver.api.core.metadata.NodeState;
2224
import edu.umd.cs.findbugs.annotations.NonNull;
2325
import edu.umd.cs.findbugs.annotations.Nullable;
2426

@@ -43,8 +45,14 @@ public class DriverTimeoutException extends DriverException {
4345
* <p>Fields:
4446
*
4547
* <ul>
48+
* <li>{@link #getNodeState()}: the state of the node (UP, DOWN, etc.) at timeout time.
49+
* <li>{@link #getNodeDistance()}: the distance assigned to the node by the load-balancing
50+
* policy (LOCAL, REMOTE, or IGNORED).
51+
* <li>{@link #getDatacenter()}: the datacenter the node belongs to.
4652
* <li>{@link #getChannelInFlight()}: requests currently awaiting a response on the specific
4753
* connection used for this request.
54+
* <li>{@link #getPoolSize()}: number of active connections in the pool ({@link #UNAVAILABLE} if
55+
* the pool was already removed).
4856
* <li>{@link #getPoolInFlight()}: total in-flight across all connections to this host ({@link
4957
* #UNAVAILABLE} if the pool was already removed).
5058
* <li>{@link #getPoolAvailableIds()}: remaining stream IDs available to send new requests; a
@@ -63,12 +71,18 @@ public class DriverTimeoutException extends DriverException {
6371
* were sent but not answered within the timeout.
6472
* <li>High {@code poolOrphanedIds} → previous timeouts consumed stream IDs that the driver is
6573
* still waiting to reclaim.
74+
* <li>{@code poolSize} below expected → pool is degraded; some connections have been lost.
75+
* <li>{@code nodeState} DOWN or FORCED_DOWN → node is known to be unavailable.
6676
* </ul>
6777
*/
6878
public static final class NodeDiagnostics {
6979

7080
@NonNull private final EndPoint endPoint;
81+
@Nullable private final NodeState nodeState;
82+
@Nullable private final NodeDistance nodeDistance;
83+
@Nullable private final String datacenter;
7184
private final int channelInFlight;
85+
private final int poolSize;
7286
private final int poolInFlight;
7387
private final int poolAvailableIds;
7488
private final int poolOrphanedIds;
@@ -77,34 +91,64 @@ public static final class NodeDiagnostics {
7791
* Creates a full diagnostic snapshot (pool was available at timeout time).
7892
*
7993
* @param endPoint the endpoint of the node.
94+
* @param nodeState the state of the node at timeout time.
95+
* @param nodeDistance the distance assigned to the node by the load-balancing policy.
96+
* @param datacenter the datacenter the node belongs to.
8097
* @param channelInFlight in-flight count on the specific channel.
98+
* @param poolSize number of active connections in the pool.
8199
* @param poolInFlight total in-flight across the pool for this host.
82100
* @param poolAvailableIds remaining stream IDs available in the pool.
83101
* @param poolOrphanedIds orphaned stream IDs in the pool.
84102
*/
85103
public NodeDiagnostics(
86104
@NonNull EndPoint endPoint,
105+
@Nullable NodeState nodeState,
106+
@Nullable NodeDistance nodeDistance,
107+
@Nullable String datacenter,
87108
int channelInFlight,
109+
int poolSize,
88110
int poolInFlight,
89111
int poolAvailableIds,
90112
int poolOrphanedIds) {
91113
this.endPoint = endPoint;
114+
this.nodeState = nodeState;
115+
this.nodeDistance = nodeDistance;
116+
this.datacenter = datacenter;
92117
this.channelInFlight = channelInFlight;
118+
this.poolSize = poolSize;
93119
this.poolInFlight = poolInFlight;
94120
this.poolAvailableIds = poolAvailableIds;
95121
this.poolOrphanedIds = poolOrphanedIds;
96122
}
97123

98124
/**
99125
* Creates a partial diagnostic snapshot for when the pool was unavailable at timeout time. The
100-
* pool-related fields ({@link #getPoolInFlight()}, {@link #getPoolAvailableIds()}, {@link
101-
* #getPoolOrphanedIds()}) will be {@link DriverTimeoutException#UNAVAILABLE}.
126+
* pool-related fields ({@link #getPoolSize()}, {@link #getPoolInFlight()}, {@link
127+
* #getPoolAvailableIds()}, {@link #getPoolOrphanedIds()}) will be {@link
128+
* DriverTimeoutException#UNAVAILABLE}.
102129
*
103130
* @param endPoint the endpoint of the node.
131+
* @param nodeState the state of the node at timeout time.
132+
* @param nodeDistance the distance assigned to the node by the load-balancing policy.
133+
* @param datacenter the datacenter the node belongs to.
104134
* @param channelInFlight in-flight count on the specific channel.
105135
*/
106-
public NodeDiagnostics(@NonNull EndPoint endPoint, int channelInFlight) {
107-
this(endPoint, channelInFlight, UNAVAILABLE, UNAVAILABLE, UNAVAILABLE);
136+
public NodeDiagnostics(
137+
@NonNull EndPoint endPoint,
138+
@Nullable NodeState nodeState,
139+
@Nullable NodeDistance nodeDistance,
140+
@Nullable String datacenter,
141+
int channelInFlight) {
142+
this(
143+
endPoint,
144+
nodeState,
145+
nodeDistance,
146+
datacenter,
147+
channelInFlight,
148+
UNAVAILABLE,
149+
UNAVAILABLE,
150+
UNAVAILABLE,
151+
UNAVAILABLE);
108152
}
109153

110154
/**
@@ -113,7 +157,12 @@ public NodeDiagnostics(@NonNull EndPoint endPoint, int channelInFlight) {
113157
* timeout time.
114158
*
115159
* @param endPoint the endpoint of the node.
160+
* @param nodeState the state of the node at timeout time.
161+
* @param nodeDistance the distance assigned to the node by the load-balancing policy.
162+
* @param datacenter the datacenter the node belongs to.
116163
* @param channelInFlight in-flight count on the specific channel.
164+
* @param poolSize number of active connections in the pool, or {@link
165+
* DriverTimeoutException#UNAVAILABLE}.
117166
* @param poolInFlight total in-flight across the pool, or {@link
118167
* DriverTimeoutException#UNAVAILABLE}.
119168
* @param poolAvailableIds remaining stream IDs in the pool, or {@link
@@ -124,12 +173,24 @@ public NodeDiagnostics(@NonNull EndPoint endPoint, int channelInFlight) {
124173
@NonNull
125174
public static NodeDiagnostics of(
126175
@NonNull EndPoint endPoint,
176+
@Nullable NodeState nodeState,
177+
@Nullable NodeDistance nodeDistance,
178+
@Nullable String datacenter,
127179
int channelInFlight,
180+
int poolSize,
128181
int poolInFlight,
129182
int poolAvailableIds,
130183
int poolOrphanedIds) {
131184
return new NodeDiagnostics(
132-
endPoint, channelInFlight, poolInFlight, poolAvailableIds, poolOrphanedIds);
185+
endPoint,
186+
nodeState,
187+
nodeDistance,
188+
datacenter,
189+
channelInFlight,
190+
poolSize,
191+
poolInFlight,
192+
poolAvailableIds,
193+
poolOrphanedIds);
133194
}
134195

135196
/** Returns the endpoint of the node that had in-flight requests at timeout time. */
@@ -138,6 +199,30 @@ public EndPoint getEndPoint() {
138199
return endPoint;
139200
}
140201

202+
/**
203+
* Returns the state of the node at timeout time (e.g. UP, DOWN, FORCED_DOWN), or {@code null}
204+
* if not available.
205+
*/
206+
@Nullable
207+
public NodeState getNodeState() {
208+
return nodeState;
209+
}
210+
211+
/**
212+
* Returns the distance assigned to this node by the load-balancing policy at timeout time (e.g.
213+
* LOCAL, REMOTE, IGNORED), or {@code null} if not available.
214+
*/
215+
@Nullable
216+
public NodeDistance getNodeDistance() {
217+
return nodeDistance;
218+
}
219+
220+
/** Returns the datacenter this node belongs to, or {@code null} if not available. */
221+
@Nullable
222+
public String getDatacenter() {
223+
return datacenter;
224+
}
225+
141226
/**
142227
* Returns the number of in-flight requests on the specific connection at timeout time, or
143228
* {@link DriverTimeoutException#UNAVAILABLE} if not available.
@@ -146,6 +231,14 @@ public int getChannelInFlight() {
146231
return channelInFlight;
147232
}
148233

234+
/**
235+
* Returns the number of active connections in the pool at timeout time, or {@link
236+
* DriverTimeoutException#UNAVAILABLE} if the pool was no longer available.
237+
*/
238+
public int getPoolSize() {
239+
return poolSize;
240+
}
241+
149242
/**
150243
* Returns the total number of in-flight requests across all connections to this host at timeout
151244
* time, or {@link DriverTimeoutException#UNAVAILABLE} if the pool was no longer available.
@@ -174,12 +267,33 @@ public int getPoolOrphanedIds() {
174267

175268
@Override
176269
public String toString() {
270+
StringBuilder sb = new StringBuilder();
271+
sb.append(endPoint);
272+
sb.append(" [");
273+
if (nodeState != null) {
274+
sb.append("state: ").append(nodeState).append(", ");
275+
}
276+
if (nodeDistance != null) {
277+
sb.append("distance: ").append(nodeDistance).append(", ");
278+
}
279+
if (datacenter != null) {
280+
sb.append("dc: ").append(datacenter).append(", ");
281+
}
282+
sb.append("channel in-flight: ").append(channelInFlight).append(", ");
177283
if (poolInFlight == UNAVAILABLE) {
178-
return String.format("%s [channel in-flight: %d, pool: n/a]", endPoint, channelInFlight);
284+
sb.append("pool: n/a");
285+
} else {
286+
sb.append("pool size: ")
287+
.append(poolSize)
288+
.append(", pool in-flight: ")
289+
.append(poolInFlight)
290+
.append(", pool available ids: ")
291+
.append(poolAvailableIds)
292+
.append(", pool orphaned ids: ")
293+
.append(poolOrphanedIds);
179294
}
180-
return String.format(
181-
"%s [channel in-flight: %d, pool in-flight: %d, pool available ids: %d, pool orphaned ids: %d]",
182-
endPoint, channelInFlight, poolInFlight, poolAvailableIds, poolOrphanedIds);
295+
sb.append("]");
296+
return sb.toString();
183297
}
184298
}
185299

@@ -234,6 +348,6 @@ private static String buildMessage(
234348
if (nodeDiagnostics == null) {
235349
return baseMessage;
236350
}
237-
return baseMessage + " node in flight: " + nodeDiagnostics;
351+
return baseMessage + " \u2014 node in flight: " + nodeDiagnostics;
238352
}
239353
}

core/src/main/java/com/datastax/oss/driver/internal/core/cql/CqlPrepareHandler.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,11 @@ private NodeDiagnostics buildNodeDiagnostics() {
195195
ChannelPool pool = session.getPools().get(cb.node);
196196
return NodeDiagnostics.of(
197197
cb.node.getEndPoint(),
198+
cb.node.getState(),
199+
cb.node.getDistance(),
200+
cb.node.getDatacenter(),
198201
channelInFlight,
202+
pool != null ? pool.size() : UNAVAILABLE,
199203
pool != null ? pool.getInFlight() : UNAVAILABLE,
200204
pool != null ? pool.getAvailableIds() : UNAVAILABLE,
201205
pool != null ? pool.getOrphanedIds() : UNAVAILABLE);

core/src/main/java/com/datastax/oss/driver/internal/core/cql/CqlRequestHandler.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,11 @@ private NodeDiagnostics buildNodeDiagnostics() {
263263
ChannelPool pool = session.getPools().get(cb.node);
264264
return NodeDiagnostics.of(
265265
cb.node.getEndPoint(),
266+
cb.node.getState(),
267+
cb.node.getDistance(),
268+
cb.node.getDatacenter(),
266269
channelInFlight,
270+
pool != null ? pool.size() : UNAVAILABLE,
267271
pool != null ? pool.getInFlight() : UNAVAILABLE,
268272
pool != null ? pool.getAvailableIds() : UNAVAILABLE,
269273
pool != null ? pool.getOrphanedIds() : UNAVAILABLE);

0 commit comments

Comments
 (0)