Skip to content

Commit d583078

Browse files
authored
Fix server goroutine leak: add idle timeout to bridged streams (skycoin#372)
Bridged streams (bidirectional copy between two clients through the server) blocked forever on io.Copy Read when one side disconnected without cleanly closing the connection. This caused goroutines to accumulate — observed as 55K+ stuck goroutines in production. Added idleTimeoutConn wrapper that resets a per-operation deadline on each Read/Write. If no data flows for 5 minutes, the deadline fires, io.Copy returns an error, CopyReadWriteCloser closes both streams, and the goroutine exits. The timeout resets on each successful read/write, so active streams are not affected. Only truly idle/dead streams are cleaned up.
1 parent 28ba3f1 commit d583078

1 file changed

Lines changed: 37 additions & 4 deletions

File tree

pkg/dmsg/server_session.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,17 +265,50 @@ func (ss *ServerSession) bridgeStream(log logrus.FieldLogger, yStr io.ReadWriteC
265265
}
266266
log.Debug("Forwarded stream response.")
267267

268-
// Clear the read deadline before the long-lived bidirectional copy.
269-
if conn, ok := yStr.(net.Conn); ok {
270-
conn.SetReadDeadline(time.Time{}) //nolint:errcheck,gosec
271-
}
268+
// Set an idle timeout on both sides of the bridge. If no data flows
269+
// in either direction for this duration, both streams are closed.
270+
// Without this, half-dead connections (client disconnected without
271+
// sending FIN) cause goroutines to leak indefinitely — observed as
272+
// 55K+ stuck goroutines in production dmsg servers.
273+
const streamIdleTimeout = 5 * time.Minute
274+
275+
// Wrap both streams with idle-timeout deadlines.
276+
yStr = &idleTimeoutConn{rwc: yStr, timeout: streamIdleTimeout}
277+
yStr2 = &idleTimeoutConn{rwc: yStr2, timeout: streamIdleTimeout}
272278

273279
log.Info("Serving stream.")
274280
ss.m.RecordStream(metrics.DeltaConnect)
275281
defer ss.m.RecordStream(metrics.DeltaDisconnect)
276282
return netutil.CopyReadWriteCloser(yStr, yStr2)
277283
}
278284

285+
// idleTimeoutConn wraps a ReadWriteCloser with per-operation deadlines.
286+
// If the underlying connection supports SetReadDeadline/SetWriteDeadline,
287+
// each Read/Write resets the deadline. If the connection goes idle (no data
288+
// in either direction), the deadline fires and the blocked io.Copy returns.
289+
type idleTimeoutConn struct {
290+
rwc io.ReadWriteCloser
291+
timeout time.Duration
292+
}
293+
294+
func (c *idleTimeoutConn) Read(p []byte) (int, error) {
295+
if conn, ok := c.rwc.(net.Conn); ok {
296+
conn.SetReadDeadline(time.Now().Add(c.timeout)) //nolint:errcheck,gosec
297+
}
298+
return c.rwc.Read(p)
299+
}
300+
301+
func (c *idleTimeoutConn) Write(p []byte) (int, error) {
302+
if conn, ok := c.rwc.(net.Conn); ok {
303+
conn.SetWriteDeadline(time.Now().Add(c.timeout)) //nolint:errcheck,gosec
304+
}
305+
return c.rwc.Write(p)
306+
}
307+
308+
func (c *idleTimeoutConn) Close() error {
309+
return c.rwc.Close()
310+
}
311+
279312
// forwardViaPeer tries to forward a stream request through peer server sessions.
280313
// This is only called for client-originated requests (not peer-originated, enforcing 1-hop max).
281314
func (ss *ServerSession) forwardViaPeer(log logrus.FieldLogger, yStr io.ReadWriteCloser, req StreamRequest) error {

0 commit comments

Comments
 (0)