From b2cf8a90748744818e06f5664a7fd638eaff4ca7 Mon Sep 17 00:00:00 2001 From: Simon Mavi Stewart Date: Wed, 27 May 2026 11:08:53 +0100 Subject: [PATCH] [grid] Skip the TCP tunnel read-idle close when reads are paused The read-idle close that TcpUpgradeTunnelHandler installs after the tunnel is established (introduced by db9b07aa5a, 2026-03-11, "[grid] Router WebSocket handle dropped close frames, idle disconnects, high-latency proxying", #17197) tears down both tunnel channels when no bytes have been received for 120 seconds. That is the right behaviour when an intermediate load balancer has silently dropped the TCP connection, but it interacts badly with the backpressure mirroring added in 5cf2e2a871 (2026-05-26, "[grid] Apply TCP backpressure across the WebSocket tunnel handler", #17543): when the peer's outbound buffer crosses its high-water mark, TcpTunnelHandler sets autoRead=false on this side, no channelRead events fire, and the read-idle timer reaches its threshold even though the stall is by design. A sustained slow consumer for more than two minutes will therefore have the tunnel torn down underneath it. Gate the close in IdleCloseHandler.userEventTriggered on the channel's autoRead flag: while reads are paused by backpressure, log at FINE and ignore the event. As soon as the peer drains and TcpTunnelHandler restores autoRead=true the read-idle clock starts again from a fresh read, so a legitimately dropped connection is still detected within the same window once traffic resumes. IdleCloseHandler is promoted from a private nested class to package-private so a focused EmbeddedChannel unit test can exercise both branches: the close-both-channels behaviour on a normal idle event, and the ignored-while-paused behaviour with autoRead=false. --- .../netty/server/TcpUpgradeTunnelHandler.java | 14 ++++- .../openqa/selenium/netty/server/BUILD.bazel | 2 + .../netty/server/IdleCloseHandlerTest.java | 56 +++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 java/test/org/openqa/selenium/netty/server/IdleCloseHandlerTest.java diff --git a/java/src/org/openqa/selenium/netty/server/TcpUpgradeTunnelHandler.java b/java/src/org/openqa/selenium/netty/server/TcpUpgradeTunnelHandler.java index e1c290e4c782a..fc0910a42a15a 100644 --- a/java/src/org/openqa/selenium/netty/server/TcpUpgradeTunnelHandler.java +++ b/java/src/org/openqa/selenium/netty/server/TcpUpgradeTunnelHandler.java @@ -379,7 +379,7 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) { * the TCP connection without sending a FIN or RST (common with AWS ALB, k8s ingress-nginx at * their default 60 s idle timeout). */ - private static final class IdleCloseHandler extends ChannelInboundHandlerAdapter { + static final class IdleCloseHandler extends ChannelInboundHandlerAdapter { private final Channel peer; @@ -390,6 +390,18 @@ private static final class IdleCloseHandler extends ChannelInboundHandlerAdapter @Override public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception { if (evt instanceof IdleStateEvent) { + // Read-idle while backpressure has paused this channel is expected, not a sign of a + // dropped connection: the peer's outbound buffer crossed its high-water mark, the + // TcpTunnelHandler set autoRead=false on this side, and bytes will not be read again + // until the peer drains. Skip the close so a sustained slow consumer does not get the + // tunnel torn down underneath it. + if (!ctx.channel().config().isAutoRead()) { + LOG.log( + Level.FINE, + "TCP tunnel read-idle on {0} ignored: reads paused by backpressure", + ctx.channel()); + return; + } LOG.log( Level.FINE, "TCP tunnel read-idle timeout on {0}, closing both channels", diff --git a/java/test/org/openqa/selenium/netty/server/BUILD.bazel b/java/test/org/openqa/selenium/netty/server/BUILD.bazel index 893d8bfb7ecb2..2d30564771552 100644 --- a/java/test/org/openqa/selenium/netty/server/BUILD.bazel +++ b/java/test/org/openqa/selenium/netty/server/BUILD.bazel @@ -2,6 +2,7 @@ load("@rules_jvm_external//:defs.bzl", "artifact") load("//java:defs.bzl", "JUNIT5_DEPS", "java_library", "java_test_suite") SMALL_TEST_SRCS = [ + "IdleCloseHandlerTest.java", "MessageInboundConverterTest.java", "MessageOutboundConverterTest.java", "RequestConverterTest.java", @@ -20,6 +21,7 @@ java_test_suite( artifact("io.netty:netty-buffer"), artifact("io.netty:netty-codec-http"), artifact("io.netty:netty-common"), + artifact("io.netty:netty-handler"), artifact("io.netty:netty-transport"), artifact("org.junit.jupiter:junit-jupiter-api"), artifact("org.assertj:assertj-core"), diff --git a/java/test/org/openqa/selenium/netty/server/IdleCloseHandlerTest.java b/java/test/org/openqa/selenium/netty/server/IdleCloseHandlerTest.java new file mode 100644 index 0000000000000..4ae4ec2c90c72 --- /dev/null +++ b/java/test/org/openqa/selenium/netty/server/IdleCloseHandlerTest.java @@ -0,0 +1,56 @@ +// Licensed to the Software Freedom Conservancy (SFC) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The SFC licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.openqa.selenium.netty.server; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.netty.channel.embedded.EmbeddedChannel; +import io.netty.handler.timeout.IdleStateEvent; +import org.junit.jupiter.api.Test; + +class IdleCloseHandlerTest { + + @Test + void idleEventClosesBothChannels() { + EmbeddedChannel peer = new EmbeddedChannel(); + EmbeddedChannel self = new EmbeddedChannel(new TcpUpgradeTunnelHandler.IdleCloseHandler(peer)); + + assertThat(self.config().isAutoRead()).isTrue(); + self.pipeline().fireUserEventTriggered(IdleStateEvent.READER_IDLE_STATE_EVENT); + + assertThat(self.isOpen()).isFalse(); + assertThat(peer.isOpen()).isFalse(); + } + + @Test + void idleEventIsIgnoredWhileBackpressureHasPausedReads() { + EmbeddedChannel peer = new EmbeddedChannel(); + EmbeddedChannel self = new EmbeddedChannel(new TcpUpgradeTunnelHandler.IdleCloseHandler(peer)); + + // Simulate backpressure: TcpTunnelHandler would have set autoRead=false on this channel + // because the peer's outbound buffer was full. + self.config().setAutoRead(false); + + // The read-idle event must NOT tear down the tunnel — no bytes arriving is the expected + // consequence of pausing reads, not a sign of a dropped connection. + self.pipeline().fireUserEventTriggered(IdleStateEvent.READER_IDLE_STATE_EVENT); + + assertThat(self.isOpen()).isTrue(); + assertThat(peer.isOpen()).isTrue(); + } +}