Skip to content

Commit 55ed09e

Browse files
jeet1995Copilot
andcommitted
fix(cosmos): HTTP/2 PING handler must observe child-stream read activity, not just parent-pipeline frames
Follow-up to #49095. Root cause: Http2PingHandler is installed on the parent (TCP) channel and only updated lastReadActivityNanos from parent-pipeline channelRead -- which sees PING ACK / SETTINGS / GOAWAY but NOT HEADERS / DATA frames. With reactor-netty's Http2MultiplexHandler in the parent pipeline, all application H2 traffic is demuxed onto per-stream child channels, so a connection serving thousands of QPS still looked idle to the PING handler, causing unnecessary PINGs and measurable P95 overhead under load. Fix: - New package-private AttributeKey<AtomicLong> LAST_CHILD_STREAM_READ_ACTIVITY_NANOS seeded on the parent channel in handlerAdded. - Http2PingCloseRewrapHandler (already installed in every child stream's pipeline) overrides channelReadComplete to stamp the parent attribute via accumulateAndGet(now, Math::max) -- once per read cycle, monotonic. - Http2PingHandler.maybeSendPing now uses effectiveLastReadActivityNanos = max(field, child attr) for both the idle-threshold check AND the outstanding-PING early-return: if child-stream reads arrived after an outstanding PING, clear pingOutstandingSinceNanos and reset consecutiveFailures (defense against middleboxes that drop PING-ACK but pass application H2 frames). - Names use lastReadActivity / read activity -- both signals are inbound reads, never writes. Tests: two new unit tests in Http2PingHandlerTest covering the suppression path and the outstanding-PING reset path; both bypass real time via reflection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent c46043e commit 55ed09e

4 files changed

Lines changed: 270 additions & 15 deletions

File tree

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/http/Http2PingHandlerTest.java

Lines changed: 116 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import java.lang.reflect.Method;
1717
import java.util.concurrent.ScheduledFuture;
1818
import java.util.concurrent.TimeUnit;
19+
import java.util.concurrent.atomic.AtomicLong;
1920

2021
import static org.assertj.core.api.Assertions.assertThat;
2122

@@ -25,7 +26,7 @@
2526
* Covers state transitions that do not require advancing real time -- the handler's
2627
* timeout / threshold logic reads {@code System.nanoTime()}, so the tests sidestep
2728
* the clock by pre-setting {@code pingOutstandingSinceNanos} (or
28-
* {@code lastActivityNanos}) via reflection:
29+
* {@code lastReadActivityNanos}) via reflection:
2930
* <ul>
3031
* <li>PING ACK with matching payload resets the failure counter (RFC 9113 §6.7 payload echo).</li>
3132
* <li>PING ACK with mismatched payload is ignored (late ACK after timeout cannot mask degradation).</li>
@@ -209,10 +210,14 @@ public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise)
209210

210211
ChannelHandlerContext ctx = channel.pipeline().context(handler);
211212
long longAgo = System.nanoTime() - TimeUnit.SECONDS.toNanos(60);
213+
AtomicLong childRead = channel.attr(Http2PingHandler.LAST_CHILD_STREAM_READ_ACTIVITY_NANOS).get();
212214

213215
// First tick: force idle -> PING send attempt -> write fails.
214216
// Expect consecutiveFailures=1, channel still open, task still scheduled.
215-
setField(handler, "lastActivityNanos", longAgo);
217+
// Rewind both lastReadActivityNanos AND the child-stream-read-activity attribute
218+
// (seeded in handlerAdded to now()) so effectiveLastReadActivityNanos is stale.
219+
setField(handler, "lastReadActivityNanos", longAgo);
220+
childRead.set(longAgo);
216221
invokeMaybeSendPing(handler, ctx);
217222
channel.runPendingTasks(); // run the write + listener on the embedded event loop
218223

@@ -222,7 +227,12 @@ public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise)
222227
assertThat((Object) getField(handler, "pingTask")).isNotNull();
223228

224229
// Second tick: write fails again -> threshold reached -> task cancelled, channel closed.
225-
setField(handler, "lastActivityNanos", longAgo);
230+
// Also rewind lastPingSendNanos so the send-throttle (added with the child-stream
231+
// activity fix to prevent rapid-fire PING storms after write failure) does not
232+
// block the second send within the same pingInterval.
233+
setField(handler, "lastReadActivityNanos", longAgo);
234+
setField(handler, "lastPingSendNanos", longAgo);
235+
childRead.set(longAgo);
226236
invokeMaybeSendPing(handler, ctx);
227237
channel.runPendingTasks();
228238

@@ -247,10 +257,17 @@ public void ackTimeout_incrementsConsecutiveFailuresAndClosesAtThreshold() throw
247257
ChannelHandlerContext ctx = channel.pipeline().context(handler);
248258

249259
long pastTimeout = System.nanoTime() - TimeUnit.SECONDS.toNanos(5);
260+
long evenOlder = pastTimeout - TimeUnit.SECONDS.toNanos(1);
250261

251262
// First tick: outstanding PING has aged past timeout -> failures=1, channel still open.
263+
// Also rewind lastReadActivityNanos AND the child-stream-read-activity attribute to
264+
// BEFORE the outstanding PING so the new "activity after outstanding PING" early-return
265+
// (which would otherwise clear the outstanding state) does not fire.
252266
setField(handler, "pingsSent", 1);
253267
setField(handler, "pingOutstandingSinceNanos", pastTimeout);
268+
setField(handler, "lastReadActivityNanos", evenOlder);
269+
AtomicLong childRead = channel.attr(Http2PingHandler.LAST_CHILD_STREAM_READ_ACTIVITY_NANOS).get();
270+
childRead.set(evenOlder);
254271
invokeMaybeSendPing(handler, ctx);
255272
channel.runPendingTasks();
256273

@@ -263,6 +280,8 @@ public void ackTimeout_incrementsConsecutiveFailuresAndClosesAtThreshold() throw
263280
// -> failures=threshold -> task cancelled, channel closed.
264281
setField(handler, "pingsSent", 2);
265282
setField(handler, "pingOutstandingSinceNanos", pastTimeout);
283+
setField(handler, "lastReadActivityNanos", evenOlder);
284+
childRead.set(evenOlder);
266285
invokeMaybeSendPing(handler, ctx);
267286
channel.runPendingTasks();
268287

@@ -274,6 +293,100 @@ public void ackTimeout_incrementsConsecutiveFailuresAndClosesAtThreshold() throw
274293
channel.finishAndReleaseAll();
275294
}
276295

296+
@Test(groups = "unit")
297+
public void childStreamReadActivity_suppressesIdlePing() throws Exception {
298+
// Regression test for the v4.81.0-beta.1 fix: with parent-pipeline reads
299+
// stale (would breach the idle threshold) but child-stream reads recent
300+
// (via the LAST_CHILD_STREAM_READ_ACTIVITY_NANOS attribute updated by
301+
// Http2PingCloseRewrapHandler.channelReadComplete on each H2 child stream),
302+
// the handler must NOT send a PING.
303+
//
304+
// Without the fix (Http2PingHandler reading only lastReadActivityNanos), a
305+
// saturated connection serving thousands of QPS via H2 child streams would
306+
// still look "idle" to the handler -- because HEADERS/DATA frames are
307+
// demuxed by Http2MultiplexHandler and never surface on the parent's
308+
// channelRead -- and PINGs would fire on every interval, adding measurable
309+
// latency overhead under load (the symptom that motivated this PR).
310+
Http2PingHandler handler = new Http2PingHandler(1, 1, 3);
311+
EmbeddedChannel channel = new EmbeddedChannel(handler);
312+
ChannelHandlerContext ctx = channel.pipeline().context(handler);
313+
314+
// Parent-pipeline read-timestamp is stale (60s ago).
315+
long longAgo = System.nanoTime() - TimeUnit.SECONDS.toNanos(60);
316+
setField(handler, "lastReadActivityNanos", longAgo);
317+
318+
// Child-stream read-activity attribute is fresh: simulate Http2PingCloseRewrapHandler
319+
// having just stamped the parent attribute after demuxing an inbound HEADERS/DATA batch.
320+
AtomicLong childReadActivity = channel.attr(Http2PingHandler.LAST_CHILD_STREAM_READ_ACTIVITY_NANOS).get();
321+
assertThat(childReadActivity).as("handlerAdded should have seeded the child-read-activity attribute").isNotNull();
322+
childReadActivity.set(System.nanoTime());
323+
324+
invokeMaybeSendPing(handler, ctx);
325+
channel.runPendingTasks();
326+
327+
// PING must NOT have fired: effectiveLastReadActivityNanos = max(stale, fresh) = fresh.
328+
assertThat((int) getField(handler, "pingsSent"))
329+
.as("recent child-stream reads must suppress idle-PING")
330+
.isZero();
331+
assertThat((long) getField(handler, "pingOutstandingSinceNanos")).isZero();
332+
assertThat(channel.isOpen()).isTrue();
333+
334+
channel.finishAndReleaseAll();
335+
}
336+
337+
@Test(groups = "unit")
338+
public void childStreamReadActivityAfterOutstandingPing_resetsFailureState() throws Exception {
339+
// Regression test for the v4.81.0-beta.1 fix: when a PING is outstanding
340+
// (pingOutstandingSinceNanos != 0) AND child-stream reads flow AFTER the
341+
// PING was sent, the handler must clear the outstanding state and reset
342+
// consecutiveFailures -- the connection is demonstrably healthy (H2 codec is
343+
// still delivering application data), even though the PING ACK itself was
344+
// never observed on the parent pipeline.
345+
//
346+
// Without the fix, a middlebox that drops PING-ACK frames (but lets
347+
// application H2 frames through) would accumulate consecutiveFailures up to
348+
// the threshold and close a healthy, actively-serving connection.
349+
Http2PingHandler handler = new Http2PingHandler(1, 1, 3);
350+
EmbeddedChannel channel = new EmbeddedChannel(handler);
351+
ChannelHandlerContext ctx = channel.pipeline().context(handler);
352+
353+
// PING sent 5s ago is still outstanding; 2 prior consecutive failures recorded.
354+
long pingSentAt = System.nanoTime() - TimeUnit.SECONDS.toNanos(5);
355+
setField(handler, "pingsSent", 7);
356+
setField(handler, "pingOutstandingSinceNanos", pingSentAt);
357+
setField(handler, "consecutiveFailures", 2);
358+
359+
// Parent-pipeline read-timestamp is even OLDER than the outstanding PING so
360+
// it cannot itself trigger the early-return -- proving the reset is driven
361+
// by the child-stream attribute specifically.
362+
long parentLastRead = System.nanoTime() - TimeUnit.SECONDS.toNanos(10);
363+
setField(handler, "lastReadActivityNanos", parentLastRead);
364+
365+
// Child-stream read-activity stamped AFTER the PING was sent (Http2PingCloseRewrapHandler
366+
// saw inbound HEADERS/DATA on a child stream just now).
367+
AtomicLong childReadActivity = channel.attr(Http2PingHandler.LAST_CHILD_STREAM_READ_ACTIVITY_NANOS).get();
368+
assertThat(childReadActivity).isNotNull();
369+
childReadActivity.set(System.nanoTime());
370+
371+
invokeMaybeSendPing(handler, ctx);
372+
channel.runPendingTasks();
373+
374+
// Early-return cleared outstanding state and the failure counter; no new
375+
// PING was sent; channel is still healthy and open.
376+
assertThat((long) getField(handler, "pingOutstandingSinceNanos"))
377+
.as("child-stream reads after outstanding PING must clear pingOutstandingSinceNanos")
378+
.isZero();
379+
assertThat((int) getField(handler, "consecutiveFailures"))
380+
.as("child-stream reads after outstanding PING must reset consecutiveFailures")
381+
.isZero();
382+
assertThat((int) getField(handler, "pingsSent"))
383+
.as("early-return must NOT send a new PING")
384+
.isEqualTo(7);
385+
assertThat(channel.isOpen()).isTrue();
386+
387+
channel.finishAndReleaseAll();
388+
}
389+
277390
private static void invokeMaybeSendPing(Http2PingHandler handler, ChannelHandlerContext ctx) throws Exception {
278391
Method m = Http2PingHandler.class.getDeclaredMethod("maybeSendPing", ChannelHandlerContext.class);
279392
m.setAccessible(true);

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#### Bugs Fixed
1313
* Fixed region name normalization for preferred and excluded regions — non-canonical inputs (e.g., `"westus3"`, `"WEST US 3"`) are now mapped to the canonical form. Also fixed a case-sensitive exclude-region check in PPCB reevaluate logic. - See [PR 49090](https://github.com/Azure/azure-sdk-for-java/pull/49090)
1414
* Fixed `UnsupportedOperationException` when using `readManyByPartitionKeys` for empty pages. - See [PR 49311](https://github.com/Azure/azure-sdk-for-java/pull/49311)
15+
* Fixed HTTP/2 PING keepalive handler (introduced in [PR 49095](https://github.com/Azure/azure-sdk-for-java/pull/49095)) so it observes child-stream HEADERS/DATA reads via `Http2PingCloseRewrapHandler.channelReadComplete`, preventing spurious PINGs (and spurious closes) on connections actively serving requests through `Http2MultiplexHandler`.
1516

1617
#### Other Changes
1718
* Added HTTP/2 PING keepalive (default ON) for Gateway service endpoints to detect silently-broken connections. - See [PR 49095](https://github.com/Azure/azure-sdk-for-java/pull/49095)

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/http/Http2PingCloseRewrapHandler.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import io.netty.channel.ChannelHandlerContext;
88
import io.netty.channel.ChannelInboundHandlerAdapter;
99

10+
import java.util.concurrent.atomic.AtomicLong;
11+
1012
/**
1113
* Per-request HTTP/2 child-stream handler that translates a parent-TCP-channel close
1214
* driven by {@link Http2PingHandler} into a typed {@link Http2PingTimeoutChannelClosedException}.
@@ -40,6 +42,46 @@ final class Http2PingCloseRewrapHandler extends ChannelInboundHandlerAdapter {
4042

4143
private Http2PingCloseRewrapHandler() {}
4244

45+
/**
46+
* Records inbound H2 stream reads on the parent channel so the PING handler
47+
* can recognize a busy connection as non-idle and skip the PING send.
48+
* <p>
49+
* {@code channelReadComplete} fires at the end of each inbound read cycle on
50+
* the child channel -- so this method stamps the parent attribute exactly once
51+
* per batch of HEADERS/DATA frames demuxed by {@link io.netty.handler.codec.http2.Http2MultiplexHandler},
52+
* not once per frame. Under normal load this is plenty of granularity to
53+
* suppress unnecessary PINGs without becoming a hot-path allocator.
54+
* <p>
55+
* Edge cases:
56+
* <ul>
57+
* <li><b>Backpressure / autoRead off</b>: {@link io.netty.handler.codec.http2.Http2MultiplexHandler} may
58+
* buffer frames in the child's inbound queue and defer dispatch. In that
59+
* case {@code channelReadComplete} may fire later than the wire-arrival
60+
* time, but it WILL fire when the read cycle ends. Worst case: one benign
61+
* extra PING during the brief buffering window.</li>
62+
* <li><b>Non-H2 channels</b>: this handler is only installed on H2 child
63+
* streams (where {@code ch.parent() != null}), so the parent reference
64+
* above is non-null in normal operation. Defensive null-check kept for
65+
* safety if the install topology ever changes.</li>
66+
* </ul>
67+
* Uses {@code accumulateAndGet(now, Math::max)} so concurrent updates from
68+
* sibling child streams remain monotonic (in practice child streams sharing a
69+
* parent run on the same event loop, so contention is nil; the monotonic
70+
* semantics document intent and future-proof against multi-event-loop
71+
* scenarios).
72+
*/
73+
@Override
74+
public void channelReadComplete(ChannelHandlerContext ctx) throws Exception {
75+
Channel parent = ctx.channel().parent();
76+
if (parent != null) {
77+
AtomicLong holder = parent.attr(Http2PingHandler.LAST_CHILD_STREAM_READ_ACTIVITY_NANOS).get();
78+
if (holder != null) {
79+
holder.accumulateAndGet(System.nanoTime(), Math::max);
80+
}
81+
}
82+
super.channelReadComplete(ctx);
83+
}
84+
4385
@Override
4486
public void channelInactive(ChannelHandlerContext ctx) throws Exception {
4587
Channel parent = ctx.channel().parent();

0 commit comments

Comments
 (0)