|
19 | 19 | package org.apache.skywalking.apm.agent.core.remote; |
20 | 20 |
|
21 | 21 | import io.grpc.Channel; |
| 22 | +import io.grpc.ConnectivityState; |
22 | 23 | import io.grpc.Status; |
23 | 24 | import io.grpc.StatusRuntimeException; |
24 | 25 |
|
@@ -57,6 +58,8 @@ public class GRPCChannelManager implements BootService, Runnable { |
57 | 58 | private volatile List<String> grpcServers; |
58 | 59 | private volatile int selectedIdx = -1; |
59 | 60 | private volatile int reconnectCount = 0; |
| 61 | + private volatile int transientFailureCount = 0; |
| 62 | + private final Object statusLock = new Object(); |
60 | 63 |
|
61 | 64 | @Override |
62 | 65 | public void prepare() { |
@@ -99,7 +102,15 @@ public void shutdown() { |
99 | 102 |
|
100 | 103 | @Override |
101 | 104 | public void run() { |
102 | | - LOGGER.debug("Selected collector grpc service running, reconnect:{}.", reconnect); |
| 105 | + if (reconnect) { |
| 106 | + LOGGER.warn("Selected collector grpc service running, reconnect:{}.", reconnect); |
| 107 | + } else { |
| 108 | + LOGGER.debug("Selected collector grpc service running, reconnect:{}.", reconnect); |
| 109 | + } |
| 110 | + |
| 111 | + // Check channel state even when reconnect is false to detect prolonged failures |
| 112 | + checkChannelStateAndTriggerReconnectIfNeeded(); |
| 113 | + |
103 | 114 | if (IS_RESOLVE_DNS_PERIODICALLY && reconnect) { |
104 | 115 | grpcServers = Arrays.stream(Config.Collector.BACKEND_SERVICE.split(",")) |
105 | 116 | .filter(StringUtil::isNotBlank) |
@@ -130,26 +141,34 @@ public void run() { |
130 | 141 | String server = ""; |
131 | 142 | try { |
132 | 143 | int index = Math.abs(random.nextInt()) % grpcServers.size(); |
133 | | - selectedIdx = index; |
134 | 144 |
|
135 | 145 | server = grpcServers.get(index); |
136 | 146 | String[] ipAndPort = server.split(":"); |
137 | 147 |
|
138 | | - LOGGER.debug("Attempting to reconnect to gRPC server {}. Shutting down existing channel if any.", server); |
139 | | - if (managedChannel != null) { |
140 | | - managedChannel.shutdownNow(); |
141 | | - } |
| 148 | + if (index != selectedIdx) { |
| 149 | + selectedIdx = index; |
| 150 | + LOGGER.debug("Connecting to different gRPC server {}. Shutting down existing channel if any.", server); |
| 151 | + createNewChannel(ipAndPort[0], Integer.parseInt(ipAndPort[1])); |
| 152 | + } else { |
| 153 | + // Same server, increment reconnectCount and check state |
| 154 | + reconnectCount++; |
142 | 155 |
|
143 | | - managedChannel = GRPCChannel.newBuilder(ipAndPort[0], Integer.parseInt(ipAndPort[1])) |
144 | | - .addManagedChannelBuilder(new StandardChannelBuilder()) |
145 | | - .addManagedChannelBuilder(new TLSChannelBuilder()) |
146 | | - .addChannelDecorator(new AgentIDDecorator()) |
147 | | - .addChannelDecorator(new AuthenticationDecorator()) |
148 | | - .build(); |
149 | | - LOGGER.debug("Successfully reconnected to gRPC server {}.", server); |
150 | | - reconnectCount = 0; |
151 | | - reconnect = false; |
152 | | - notify(GRPCChannelStatus.CONNECTED); |
| 156 | + // Force reconnect if reconnectCount or transientFailureCount exceeds threshold |
| 157 | + boolean forceReconnect = reconnectCount > Config.Agent.FORCE_RECONNECTION_PERIOD |
| 158 | + || transientFailureCount > Config.Agent.FORCE_RECONNECTION_PERIOD; |
| 159 | + |
| 160 | + if (forceReconnect) { |
| 161 | + // Failed to reconnect after multiple attempts, force rebuild channel |
| 162 | + LOGGER.warn("Force rebuild channel to {} (reconnectCount={}, transientFailureCount={})", |
| 163 | + server, reconnectCount, transientFailureCount); |
| 164 | + createNewChannel(ipAndPort[0], Integer.parseInt(ipAndPort[1])); |
| 165 | + } else if (managedChannel.isConnected(false)) { |
| 166 | + // Reconnect to the same server is automatically done by GRPC, |
| 167 | + // therefore we are responsible to check the connectivity and |
| 168 | + // set the state and notify listeners |
| 169 | + markAsConnected(); |
| 170 | + } |
| 171 | + } |
153 | 172 |
|
154 | 173 | return; |
155 | 174 | } catch (Throwable t) { |
@@ -177,17 +196,85 @@ public Channel getChannel() { |
177 | 196 | */ |
178 | 197 | public void reportError(Throwable throwable) { |
179 | 198 | if (isNetworkError(throwable)) { |
| 199 | + triggerReconnect(); |
| 200 | + } |
| 201 | + } |
| 202 | + |
| 203 | + private void notify(GRPCChannelStatus status) { |
| 204 | + synchronized (listeners) { |
| 205 | + for (GRPCChannelListener listener : listeners) { |
| 206 | + try { |
| 207 | + listener.statusChanged(status); |
| 208 | + } catch (Throwable t) { |
| 209 | + LOGGER.error(t, "Fail to notify {} about channel connected.", listener.getClass().getName()); |
| 210 | + } |
| 211 | + } |
| 212 | + } |
| 213 | + } |
| 214 | + |
| 215 | + /** |
| 216 | + * Create a new gRPC channel to the specified server and reset connection state. |
| 217 | + */ |
| 218 | + private void createNewChannel(String host, int port) throws Exception { |
| 219 | + if (managedChannel != null) { |
| 220 | + managedChannel.shutdownNow(); |
| 221 | + } |
| 222 | + |
| 223 | + managedChannel = GRPCChannel.newBuilder(host, port) |
| 224 | + .addManagedChannelBuilder(new StandardChannelBuilder()) |
| 225 | + .addManagedChannelBuilder(new TLSChannelBuilder()) |
| 226 | + .addChannelDecorator(new AgentIDDecorator()) |
| 227 | + .addChannelDecorator(new AuthenticationDecorator()) |
| 228 | + .build(); |
| 229 | + |
| 230 | + markAsConnected(); |
| 231 | + } |
| 232 | + |
| 233 | + /** |
| 234 | + * Trigger reconnection by setting reconnect flag and notifying listeners. |
| 235 | + */ |
| 236 | + private void triggerReconnect() { |
| 237 | + synchronized (statusLock) { |
180 | 238 | reconnect = true; |
181 | 239 | notify(GRPCChannelStatus.DISCONNECT); |
182 | 240 | } |
183 | 241 | } |
184 | 242 |
|
185 | | - private void notify(GRPCChannelStatus status) { |
186 | | - for (GRPCChannelListener listener : listeners) { |
| 243 | + /** |
| 244 | + * Mark connection as successful and reset connection state. |
| 245 | + */ |
| 246 | + private void markAsConnected() { |
| 247 | + synchronized (statusLock) { |
| 248 | + reconnectCount = 0; |
| 249 | + reconnect = false; |
| 250 | + notify(GRPCChannelStatus.CONNECTED); |
| 251 | + } |
| 252 | + } |
| 253 | + |
| 254 | + /** |
| 255 | + * Check the connectivity state of existing channel and trigger reconnect if needed. |
| 256 | + * This method monitors TRANSIENT_FAILURE state and triggers reconnect if the failure persists too long. |
| 257 | + */ |
| 258 | + private void checkChannelStateAndTriggerReconnectIfNeeded() { |
| 259 | + if (managedChannel != null) { |
187 | 260 | try { |
188 | | - listener.statusChanged(status); |
| 261 | + ConnectivityState state = managedChannel.getState(false); |
| 262 | + LOGGER.debug("Current channel state: {}", state); |
| 263 | + |
| 264 | + if (state == ConnectivityState.TRANSIENT_FAILURE) { |
| 265 | + transientFailureCount++; |
| 266 | + LOGGER.warn("Channel in TRANSIENT_FAILURE state, count: {}", transientFailureCount); |
| 267 | + } else if (state == ConnectivityState.SHUTDOWN) { |
| 268 | + LOGGER.warn("Channel is SHUTDOWN"); |
| 269 | + if (!reconnect) { |
| 270 | + triggerReconnect(); |
| 271 | + } |
| 272 | + } else { |
| 273 | + // IDLE, READY, CONNECTING are all normal states |
| 274 | + transientFailureCount = 0; |
| 275 | + } |
189 | 276 | } catch (Throwable t) { |
190 | | - LOGGER.error(t, "Fail to notify {} about channel connected.", listener.getClass().getName()); |
| 277 | + LOGGER.error(t, "Error checking channel state"); |
191 | 278 | } |
192 | 279 | } |
193 | 280 | } |
|
0 commit comments