Skip to content

Commit 417c3c1

Browse files
committed
Fix that NewClientHandler() could hang indefinitely, preventing new connection attempts
There is some race condition when the `async_write()`/`async_flush()` operation for the `icinga::Hello` message fails (connection reset by peer for example) around the same time the connect timeout fires and calls `cancel()` on the stream, the following call to `async_shutdown()` may block indefinitely. If that happens, the endpoint remains in the connecting state and no new connection attemps are initiated. This commit fixes the issue by removing the `Defer` containing the `async_shutdown()`. The purpose of `async_shutdown()` is to signal a clean termination of the connection to the peer, which really isn't something that makes sense to to in a `Defer` block that is also executed in case of errors. For the one situation where doing a clean TLS shutdown makes some sense (closing anonymous client connections), a call to GracefulShutdown() is added to that specific code path. A large part of the change is just changing the indentation of the code, given that a now unnecessary `try`/`catch` block is removed. The following Go code creates a TLS server that can be used to demonstrate the issue. Note that given that a race condition is involved, this is not reliable and the sleep duration may need some fine-tuning. For this to work, `ApiListener.tls_handshake_timeout` needs to be set to a large-enough value like 60s to disable the timeout for `async_handshake()` itself so that the overall connect timeout is the one that fires. However, changing the timeout is not a prerequisite for the problem, it just makes it easier to reproduce. The error can also happen with the default timeouts if the TCP connect takes long enough so that the handshake is started late enough that its timeout expires after the connect timeout. package main import ( "crypto/tls" "log" "net" "time" ) func main() { cert, err := tls.LoadX509KeyPair("bad-agent.crt", "bad-agent.key") if err != nil { panic(err) } listener, err := tls.Listen("tcp", ":1337", &tls.Config{ Certificates: []tls.Certificate{cert}, }) if err != nil { panic(err) } log.Println("Listening on", listener.Addr()) for { conn, err := listener.Accept() if err != nil { panic(err) } go handle(conn.(*tls.Conn)) } } func handle(conn *tls.Conn) { addr := conn.RemoteAddr().String() log.Println(addr, "new connection") time.Sleep(15*time.Second - 10*time.Millisecond) log.Println(addr, "SetLinger(0)", conn.NetConn().(*net.TCPConn).SetLinger(0)) log.Println(addr, "Handshake()", conn.Handshake()) log.Println(addr, "conn.NetConn().Close()", conn.NetConn().Close()) } With additional logging in the `catch` block for `boost::system::system_error` and `Defer shutdownSslConn` (both removed by this commit), this showed the following. Note that in particular, `async_shutdown()` never returned, indicating that it hangs in there. [2026-04-24 17:32:56 +0200] information/ApiListener: Reconnecting to endpoint 'bad-agent' via host 'host.docker.internal' and port '1337' [2026-04-24 17:33:11 +0200] critical/ApiListener: Timeout while reconnecting to endpoint 'bad-agent' via host 'host.docker.internal' and port '1337', cancelling attempt [2026-04-24 17:33:11 +0200] information/ApiListener: New client connection for identity 'bad-agent' to [172.17.0.1]:1337 [2026-04-24 17:33:12 +0200] information/ApiListener: rethrowing for bad-agent: Error: Connection reset by peer [system:104 at /usr/include/boost/asio/detail/reactive_socket_send_op.hpp:137 in function 'do_complete'] [2026-04-24 17:33:12 +0200] information/ApiListener: doing async_shutdown for bad-agent
1 parent cf699b2 commit 417c3c1

1 file changed

Lines changed: 47 additions & 65 deletions

File tree

lib/remote/apilistener.cpp

Lines changed: 47 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -748,16 +748,6 @@ void ApiListener::NewClientHandlerInternal(
748748
return;
749749
}
750750

751-
Defer shutdownSslConn ([&sslConn, &yc]() {
752-
// Ignore the error, but do not throw an exception being swallowed at all cost.
753-
// https://github.com/Icinga/icinga2/issues/7351
754-
boost::system::error_code ec;
755-
756-
// Using async_shutdown() instead of AsioTlsStream::GracefulDisconnect() as this whole function
757-
// is already guarded by a timeout based on the connect timeout.
758-
sslConn.async_shutdown(yc[ec]);
759-
});
760-
761751
std::shared_ptr<X509> cert (sslConn.GetPeerCertificate());
762752
bool verify_ok = false;
763753
String identity;
@@ -809,8 +799,46 @@ void ApiListener::NewClientHandlerInternal(
809799

810800
ClientType ctype;
811801

812-
try {
813-
if (role == RoleClient) {
802+
if (role == RoleClient) {
803+
JsonRpc::SendMessage(client, new Dictionary({
804+
{ "jsonrpc", "2.0" },
805+
{ "method", "icinga::Hello" },
806+
{ "params", new Dictionary({
807+
{ "version", (double)l_AppVersionInt },
808+
{ "capabilities", (double)ApiCapabilities::MyCapabilities }
809+
}) }
810+
}), yc);
811+
812+
client->async_flush(yc);
813+
814+
ctype = ClientJsonRpc;
815+
} else {
816+
{
817+
boost::system::error_code ec;
818+
819+
if (client->async_fill(yc[ec]) == 0u) {
820+
if (identity.IsEmpty()) {
821+
Log(LogInformation, "ApiListener")
822+
<< "No data received on new API connection " << conninfo << ": " << ec.message()
823+
<< ". Ensure that the remote endpoints are properly configured in a cluster setup.";
824+
} else {
825+
Log(LogWarning, "ApiListener")
826+
<< "No data received on new API connection " << conninfo << " for identity '" << identity << "': " << ec.message()
827+
<< ". Ensure that the remote endpoints are properly configured in a cluster setup.";
828+
}
829+
830+
return;
831+
}
832+
}
833+
834+
char firstByte = 0;
835+
836+
{
837+
asio::mutable_buffer firstByteBuf (&firstByte, 1);
838+
client->peek(firstByteBuf);
839+
}
840+
841+
if (firstByte >= '0' && firstByte <= '9') {
814842
JsonRpc::SendMessage(client, new Dictionary({
815843
{ "jsonrpc", "2.0" },
816844
{ "method", "icinga::Hello" },
@@ -824,54 +852,8 @@ void ApiListener::NewClientHandlerInternal(
824852

825853
ctype = ClientJsonRpc;
826854
} else {
827-
{
828-
boost::system::error_code ec;
829-
830-
if (client->async_fill(yc[ec]) == 0u) {
831-
if (identity.IsEmpty()) {
832-
Log(LogInformation, "ApiListener")
833-
<< "No data received on new API connection " << conninfo << ": " << ec.message()
834-
<< ". Ensure that the remote endpoints are properly configured in a cluster setup.";
835-
} else {
836-
Log(LogWarning, "ApiListener")
837-
<< "No data received on new API connection " << conninfo << " for identity '" << identity << "': " << ec.message()
838-
<< ". Ensure that the remote endpoints are properly configured in a cluster setup.";
839-
}
840-
841-
return;
842-
}
843-
}
844-
845-
char firstByte = 0;
846-
847-
{
848-
asio::mutable_buffer firstByteBuf (&firstByte, 1);
849-
client->peek(firstByteBuf);
850-
}
851-
852-
if (firstByte >= '0' && firstByte <= '9') {
853-
JsonRpc::SendMessage(client, new Dictionary({
854-
{ "jsonrpc", "2.0" },
855-
{ "method", "icinga::Hello" },
856-
{ "params", new Dictionary({
857-
{ "version", (double)l_AppVersionInt },
858-
{ "capabilities", (double)ApiCapabilities::MyCapabilities }
859-
}) }
860-
}), yc);
861-
862-
client->async_flush(yc);
863-
864-
ctype = ClientJsonRpc;
865-
} else {
866-
ctype = ClientHttp;
867-
}
868-
}
869-
} catch (const boost::system::system_error& systemError) {
870-
if (systemError.code() == boost::asio::error::operation_aborted) {
871-
shutdownSslConn.Cancel();
855+
ctype = ClientHttp;
872856
}
873-
874-
throw;
875857
}
876858

877859
std::shared_lock wgLock(*m_ListenerWaitGroup, std::try_to_lock);
@@ -910,20 +892,20 @@ void ApiListener::NewClientHandlerInternal(
910892
<< "Ignoring anonymous JSON-RPC connection " << conninfo
911893
<< ". Max connections (" << GetMaxAnonymousClients() << ") exceeded.";
912894

913-
aclient = nullptr;
914-
}
895+
// Close the connection cleanly, signals the other endpoint
896+
// that the connection didn't break but was closed intentionally.
897+
client->GracefulDisconnect(*strand, yc);
915898

916-
if (aclient) {
917-
aclient->Start();
918-
shutdownSslConn.Cancel();
899+
return;
919900
}
901+
902+
aclient->Start();
920903
} else {
921904
Log(LogNotice, "ApiListener", "New HTTP client");
922905

923906
HttpServerConnection::Ptr aclient = new HttpServerConnection(m_WaitGroup, identity, verify_ok, client);
924907
AddHttpClient(aclient);
925908
aclient->Start();
926-
shutdownSslConn.Cancel();
927909
}
928910
}
929911

0 commit comments

Comments
 (0)