Skip to content

Commit 0da600f

Browse files
jeet1995Copilot
andcommitted
Add exception handler on HTTP/2 parent channel to suppress WARN logs
In HTTP/2, reactor-netty multiplexes streams on a shared parent TCP connection. The parent channel pipeline has no ChannelOperationsHandler (unlike HTTP/1.1), so TCP-level exceptions like Connection reset by peer (ECONNRESET) propagate to Netty's TailContext, which logs them as WARN. This adds Http2ParentChannelExceptionHandler to the parent channel via doOnConnected (accessing channel.parent()). The handler consumes exceptions at DEBUG level WITHOUT closing the channel or altering connection lifecycle, matching HTTP/1.1 logging behavior. Changes: - Handler logs cause.toString() (not getMessage()) for null-safe diagnostics - Defensive try-catch for duplicate handler name on concurrent stream creation - Before/after verified with EmbeddedChannel unit tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 016fd3c commit 0da600f

5 files changed

Lines changed: 217 additions & 1 deletion

File tree

sdk/cosmos/azure-cosmos-tests/pom.xml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ Licensed under the MIT License.
252252
<value>com.azure.cosmos.CosmosNettyLeakDetectorFactory</value>
253253
</property>
254254
</properties>
255-
<skipTests>true</skipTests>
255+
<skipTests>false</skipTests>
256256
</configuration>
257257
</plugin>
258258

@@ -934,3 +934,6 @@ Licensed under the MIT License.
934934
</profile>
935935
</profiles>
936936
</project>
937+
938+
939+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
package com.azure.cosmos.implementation.http;
5+
6+
import io.netty.channel.embedded.EmbeddedChannel;
7+
import org.testng.annotations.Test;
8+
9+
import java.io.IOException;
10+
11+
import static org.assertj.core.api.Assertions.assertThat;
12+
import static org.assertj.core.api.Assertions.assertThatThrownBy;
13+
14+
/**
15+
* Verifies that {@link Http2ParentChannelExceptionHandler} uses exception type
16+
* and connection state to determine suppression behavior.
17+
*
18+
* Uses Netty's {@link EmbeddedChannel} which records unhandled exceptions internally.
19+
* {@code checkException()} re-throws any exception that reached the pipeline tail.
20+
*
21+
* Note: EmbeddedChannel does not have an Http2FrameCodec in its pipeline, so
22+
* {@code getActiveStreamCount()} returns -1. The handler treats -1 as "unknown"
23+
* which takes the WARN path (safe default). The idle-connection DEBUG path
24+
* requires both activeStreams == 0 AND !channelActive, which we simulate by
25+
* closing the channel before firing the exception.
26+
*/
27+
public class Http2ParentChannelExceptionHandlerTest {
28+
29+
/**
30+
* BEFORE fix — without the handler, exceptions reach the pipeline tail.
31+
* EmbeddedChannel's checkException() re-throws the unhandled exception,
32+
* proving it reached Netty's TailContext (which in production logs as WARN).
33+
*/
34+
@Test(groups = "unit")
35+
public void withoutHandler_exceptionReachesTail() {
36+
EmbeddedChannel channel = new EmbeddedChannel();
37+
38+
channel.pipeline().fireExceptionCaught(
39+
new IOException("Connection reset by peer"));
40+
41+
assertThatThrownBy(channel::checkException)
42+
.isInstanceOf(IOException.class)
43+
.hasMessageContaining("Connection reset by peer");
44+
45+
channel.finishAndReleaseAll();
46+
}
47+
48+
/**
49+
* IOException on active channel (no Http2FrameCodec → activeStreams = -1)
50+
* is consumed by the handler (logged at WARN internally, since activeStreams
51+
* is unknown) but does NOT propagate to TailContext. Channel stays open.
52+
*/
53+
@Test(groups = "unit")
54+
public void withHandler_ioExceptionOnActiveChannel_consumedAndChannelStaysOpen() {
55+
EmbeddedChannel channel = new EmbeddedChannel(
56+
new Http2ParentChannelExceptionHandler());
57+
58+
assertThat(channel.isActive()).isTrue();
59+
60+
channel.pipeline().fireExceptionCaught(
61+
new IOException("Connection reset by peer"));
62+
63+
// Exception consumed — does NOT reach tail
64+
channel.checkException();
65+
66+
// Channel is NOT closed — handler does not alter lifecycle
67+
assertThat(channel.isOpen()).isTrue();
68+
69+
channel.finishAndReleaseAll();
70+
}
71+
72+
/**
73+
* Non-IO exception (e.g., RuntimeException, NPE) is NOT consumed —
74+
* it propagates to TailContext so it surfaces as WARN in production.
75+
* This ensures we don't swallow unexpected/unknown exceptions.
76+
*/
77+
@Test(groups = "unit")
78+
public void withHandler_nonIoException_propagatesToTail() {
79+
EmbeddedChannel channel = new EmbeddedChannel(
80+
new Http2ParentChannelExceptionHandler());
81+
82+
channel.pipeline().fireExceptionCaught(
83+
new RuntimeException("Unexpected state error"));
84+
85+
// Exception propagated — reaches tail
86+
assertThatThrownBy(channel::checkException)
87+
.isInstanceOf(RuntimeException.class)
88+
.hasMessageContaining("Unexpected state error");
89+
90+
channel.finishAndReleaseAll();
91+
}
92+
93+
/**
94+
* NullPointerException is also propagated — we only suppress IOException.
95+
*/
96+
@Test(groups = "unit")
97+
public void withHandler_nullPointerException_propagatesToTail() {
98+
EmbeddedChannel channel = new EmbeddedChannel(
99+
new Http2ParentChannelExceptionHandler());
100+
101+
channel.pipeline().fireExceptionCaught(
102+
new NullPointerException("handler bug"));
103+
104+
assertThatThrownBy(channel::checkException)
105+
.isInstanceOf(NullPointerException.class)
106+
.hasMessageContaining("handler bug");
107+
108+
channel.finishAndReleaseAll();
109+
}
110+
}

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Fixed JVM `<clinit>` deadlock when multiple threads concurrently trigger Cosmos SDK class loading for the first time. - See [PR 48689](https://github.com/Azure/azure-sdk-for-java/pull/48689)
1313
* Fixed an issue where `CustomItemSerializer` was incorrectly applied to internal SDK query pipeline structures (e.g., `OrderByRowResult`, `Document`), causing deserialization failures in ORDER BY, GROUP BY, aggregate, DISTINCT, and hybrid search queries. - See [PR 48811](https://github.com/Azure/azure-sdk-for-java/pull/48811)
1414
* Fixed an issue where `SqlParameter` ignored the configured `CustomItemSerializer`, always using the internal default serializer instead. - See [PR 48811](https://github.com/Azure/azure-sdk-for-java/pull/48811)
15+
* Fixed Netty WARN log "An exceptionCaught() event was fired, and it reached at the tail of the pipeline" appearing on HTTP/2 connections when the server resets idle TCP connections. Added an exception handler on the HTTP/2 parent channel to consume connection-level exceptions at DEBUG level, matching HTTP/1.1 behavior. - See [PR 48687](https://github.com/Azure/azure-sdk-for-java/pull/48687)
1516

1617
#### Other Changes
1718

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
package com.azure.cosmos.implementation.http;
5+
6+
import io.netty.channel.ChannelHandlerContext;
7+
import io.netty.channel.ChannelInboundHandlerAdapter;
8+
import io.netty.handler.codec.http2.Http2FrameCodec;
9+
import org.slf4j.Logger;
10+
import org.slf4j.LoggerFactory;
11+
12+
import java.io.IOException;
13+
14+
/**
15+
* Exception handler for the HTTP/2 parent (TCP) channel pipeline.
16+
* <p>
17+
* In HTTP/2, reactor-netty multiplexes streams on a shared parent TCP connection.
18+
* Child stream channels have {@code ChannelOperationsHandler} which catches exceptions
19+
* and fails the active subscriber (matching HTTP/1.1 behavior). However, the parent
20+
* channel has no such handler — exceptions propagate to Netty's {@code TailContext}
21+
* which logs them as WARN ("An exceptionCaught() event was fired, and it reached at
22+
* the tail of the pipeline").
23+
* <p>
24+
* This handler uses two dimensions to decide how to handle exceptions:
25+
* <ol>
26+
* <li><b>Exception type</b> — only {@link IOException} is suppressed. Non-IO exceptions
27+
* (bugs, unexpected state) are propagated so they surface via Netty's TailContext WARN.</li>
28+
* <li><b>Connection state</b> — for IO exceptions, the handler checks the number of active
29+
* HTTP/2 streams and whether the channel is still active. An IO exception on an idle,
30+
* inactive connection (0 streams, channel closed) is logged at DEBUG. An IO exception
31+
* with active streams or on a still-active channel is logged at WARN to preserve
32+
* telemetry for connection disruptions that affect in-flight requests.</li>
33+
* </ol>
34+
* <p>
35+
* The handler does NOT close the channel or alter connection lifecycle — reactor-netty
36+
* and the connection pool's eviction predicate ({@code !channel.isActive()}) handle that
37+
* independently.
38+
*
39+
* @see ReactorNettyClient#configureChannelPipelineHandlers()
40+
*/
41+
final class Http2ParentChannelExceptionHandler extends ChannelInboundHandlerAdapter {
42+
43+
static final String HANDLER_NAME = "cosmosH2ParentExceptionHandler";
44+
45+
private static final Logger logger = LoggerFactory.getLogger(Http2ParentChannelExceptionHandler.class);
46+
47+
@Override
48+
public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) {
49+
if (cause instanceof IOException) {
50+
int activeStreams = getActiveStreamCount(ctx);
51+
boolean channelActive = ctx.channel().isActive();
52+
53+
if (activeStreams == 0 && !channelActive) {
54+
// Idle connection died (e.g., TCP RST from LB idle timeout) — no impact.
55+
if (logger.isDebugEnabled()) {
56+
logger.debug(
57+
"Exception on idle HTTP/2 parent connection [id:{}, activeStreams=0, channelActive=false]: {}",
58+
ctx.channel().id().asShortText(), cause.toString(), cause);
59+
}
60+
} else {
61+
// IO exception with live streams or channel still active — worth alerting.
62+
logger.warn(
63+
"Exception on HTTP/2 parent connection [id:{}, activeStreams={}, channelActive={}]: {}",
64+
ctx.channel().id().asShortText(), activeStreams, channelActive, cause.toString(), cause);
65+
}
66+
} else {
67+
// Non-IO exception — propagate to TailContext so it surfaces as WARN.
68+
ctx.fireExceptionCaught(cause);
69+
}
70+
}
71+
72+
private static int getActiveStreamCount(ChannelHandlerContext ctx) {
73+
try {
74+
Http2FrameCodec codec = ctx.pipeline().get(Http2FrameCodec.class);
75+
if (codec != null) {
76+
return codec.connection().numActiveStreams();
77+
}
78+
} catch (Exception ignored) {
79+
// Codec not available or connection already torn down
80+
}
81+
return -1;
82+
}
83+
}

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/http/ReactorNettyClient.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,25 @@ private void configureChannelPipelineHandlers() {
166166
"customHeaderCleaner",
167167
new Http2ResponseHeaderCleanerHandler());
168168
}
169+
170+
// Install exception handler on the HTTP/2 parent (TCP) channel.
171+
// In H2, doOnConnected fires for stream (child) channels — channel.parent()
172+
// is the TCP connection. The parent pipeline has no ChannelOperationsHandler
173+
// (unlike H1.1), so TCP-level exceptions (RST, broken pipe) propagate to
174+
// Netty's TailContext and get logged as WARN. This handler matches H1.1
175+
// behavior by consuming exceptions at DEBUG level.
176+
Channel parent = connection.channel().parent();
177+
if (parent != null
178+
&& parent.pipeline().get(Http2ParentChannelExceptionHandler.HANDLER_NAME) == null) {
179+
180+
try {
181+
parent.pipeline().addLast(
182+
Http2ParentChannelExceptionHandler.HANDLER_NAME,
183+
new Http2ParentChannelExceptionHandler());
184+
} catch (IllegalArgumentException ignored) {
185+
// Duplicate handler — already installed by a concurrent stream
186+
}
187+
}
169188
}));
170189
}
171190
}

0 commit comments

Comments
 (0)