Skip to content

Commit d2ed749

Browse files
jackcclaude
andcommitted
Fix data race when context cancelled during connect
The application-supplied BuildContextWatcherHandler was armed during the auth handshake. If the context was cancelled mid-connect, handlers that read *PgConn fields (notably CancelRequestContextWatcherHandler, which reads pgConn.pid and pgConn.secretKey via CancelRequest) would race with the connect goroutine's writes to those fields when handling BackendKeyData. CancelRequest semantics also don't make sense before the connection is established. Use the deadline-only handler during connect and swap in the application-supplied handler at ReadyForQuery, before any queries ValidateConnect might run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent cfd733d commit d2ed749

2 files changed

Lines changed: 98 additions & 8 deletions

File tree

pgconn/pgconn.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,13 @@ func connectOne(ctx context.Context, config *Config, connectConfig *connectOneCo
362362
}
363363
}
364364

365-
pgConn.contextWatcher = ctxwatch.NewContextWatcher(config.BuildContextWatcherHandler(pgConn))
365+
// Use a deadline-only watcher during connect. The application-supplied
366+
// BuildContextWatcherHandler may read *PgConn fields (e.g.
367+
// CancelRequestContextWatcherHandler reads pgConn.pid and
368+
// pgConn.secretKey), which would race with this function's writes to
369+
// those fields when handling BackendKeyData. The application handler is
370+
// installed below, after the connection reaches connStatusIdle.
371+
pgConn.contextWatcher = ctxwatch.NewContextWatcher(&DeadlineContextWatcherHandler{Conn: pgConn.conn})
366372
pgConn.contextWatcher.Watch(ctx)
367373
defer pgConn.contextWatcher.Unwatch()
368374

@@ -454,14 +460,13 @@ func connectOne(ctx context.Context, config *Config, connectConfig *connectOneCo
454460
}
455461
case *pgproto3.ReadyForQuery:
456462
pgConn.status = connStatusIdle
457-
if config.ValidateConnect != nil {
458-
// ValidateConnect may execute commands that cause the context to be watched again. Unwatch first to avoid
459-
// the watch already in progress panic. This is that last thing done by this method so there is no need to
460-
// restart the watch after ValidateConnect returns.
461-
//
462-
// See https://github.com/jackc/pgconn/issues/40.
463-
pgConn.contextWatcher.Unwatch()
463+
// The connect-phase deadline-only watcher is no longer needed; replace
464+
// it with the application-supplied watcher so subsequent operations
465+
// (including any queries run by ValidateConnect) use it.
466+
pgConn.contextWatcher.Unwatch()
467+
pgConn.contextWatcher = ctxwatch.NewContextWatcher(config.BuildContextWatcherHandler(pgConn))
464468

469+
if config.ValidateConnect != nil {
465470
err := config.ValidateConnect(ctx, pgConn)
466471
if err != nil {
467472
if _, ok := err.(*NotPreferredError); ignoreNotPreferredErr && ok {

pgconn/pgconn_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4841,6 +4841,91 @@ func TestCancelRequestContextWatcherHandler(t *testing.T) {
48414841
}
48424842
}
48434843

4844+
// raceProbeHandler implements ctxwatch.Handler. On HandleCancel it synchronously reads pgConn.PID() and
4845+
// pgConn.SecretKey() — the same fields that CancelRequestContextWatcherHandler's spawned goroutine reads via
4846+
// CancelRequest. Reading synchronously inside HandleCancel guarantees the read happens before HandleUnwatchAfterCancel
4847+
// can cancel anything, eliminating the timing-window flakiness of the real handler and making this a reliable
4848+
// race-detector reproducer for the underlying issue: during connect, *PgConn-reading handlers race with the auth loop's
4849+
// writes to pgConn.pid and pgConn.secretKey when BackendKeyData arrives.
4850+
type raceProbeHandler struct {
4851+
conn *pgconn.PgConn
4852+
pid uint32
4853+
key []byte
4854+
}
4855+
4856+
func (h *raceProbeHandler) HandleCancel(context.Context) {
4857+
h.pid = h.conn.PID()
4858+
h.key = h.conn.SecretKey()
4859+
}
4860+
4861+
func (h *raceProbeHandler) HandleUnwatchAfterCancel() {}
4862+
4863+
// TestConnectContextCancelHandlerRace reproduces the data race that occurred when the application-supplied
4864+
// BuildContextWatcherHandler was armed during the connect handshake. If the context was cancelled while connectOne was
4865+
// still running, a handler that reads *PgConn fields (such as CancelRequestContextWatcherHandler, which calls
4866+
// CancelRequest reading pgConn.pid and pgConn.secretKey) would race with the auth loop's writes to those fields when
4867+
// handling BackendKeyData.
4868+
//
4869+
// The fix is to use the deadline-only handler during connect and only swap in the application-supplied handler after
4870+
// the connection is established.
4871+
//
4872+
// Run with -race to detect the race.
4873+
//
4874+
// Race originally reported in https://github.com/jackc/pgx/pull/2534.
4875+
func TestConnectContextCancelHandlerRace(t *testing.T) {
4876+
t.Parallel()
4877+
4878+
for i := range 10 {
4879+
t.Run(fmt.Sprintf("Stress %d", i), func(t *testing.T) {
4880+
t.Parallel()
4881+
4882+
script := &pgmock.Script{
4883+
Steps: []pgmock.Step{
4884+
pgmock.ExpectAnyMessage(&pgproto3.StartupMessage{ProtocolVersion: pgproto3.ProtocolVersion30, Parameters: map[string]string{}}),
4885+
pgmock.SendMessage(&pgproto3.AuthenticationOk{}),
4886+
pgmock.SendMessage(&pgproto3.BackendKeyData{ProcessID: 12345, SecretKey: []byte{1, 2, 3, 4}}),
4887+
pgmockWaitStep(200 * time.Millisecond),
4888+
pgmock.SendMessage(&pgproto3.ReadyForQuery{TxStatus: 'I'}),
4889+
},
4890+
}
4891+
4892+
ln, err := net.Listen("tcp", "127.0.0.1:")
4893+
require.NoError(t, err)
4894+
defer ln.Close()
4895+
4896+
go func() {
4897+
conn, err := ln.Accept()
4898+
if err != nil {
4899+
return
4900+
}
4901+
defer conn.Close()
4902+
_ = conn.SetDeadline(time.Now().Add(5 * time.Second))
4903+
_ = script.Run(pgproto3.NewBackend(conn, conn))
4904+
}()
4905+
4906+
host, port, _ := strings.Cut(ln.Addr().String(), ":")
4907+
connStr := fmt.Sprintf("sslmode=disable host=%s port=%s user=test database=test", host, port)
4908+
config, err := pgconn.ParseConfig(connStr)
4909+
require.NoError(t, err)
4910+
config.BuildContextWatcherHandler = func(conn *pgconn.PgConn) ctxwatch.Handler {
4911+
return &raceProbeHandler{conn: conn}
4912+
}
4913+
config.ConnectTimeout = 5 * time.Second
4914+
4915+
ctx, cancel := context.WithCancel(context.Background())
4916+
// Cancel during the 200ms wait between BackendKeyData and ReadyForQuery,
4917+
// after the auth loop has written pid/secretKey.
4918+
time.AfterFunc(50*time.Millisecond, cancel)
4919+
defer cancel()
4920+
4921+
pgConn, err := pgconn.ConnectConfig(ctx, config)
4922+
if err == nil {
4923+
closeConn(t, pgConn)
4924+
}
4925+
})
4926+
}
4927+
}
4928+
48444929
func TestConnectProtocolVersion32(t *testing.T) {
48454930
t.Parallel()
48464931

0 commit comments

Comments
 (0)