Skip to content

Commit 379f4b5

Browse files
0x46616c6bclaude
andcommitted
🐛 Keep SASL connection alive across Postfix auth-client idle gap
The AUTH loop in sasl.go refreshed SetReadDeadline with ReadTimeout (10s) before each idle wait. Postfix' xsasl_dovecot client, however, caches the auth-client connection per smtpd worker for up to smtpd_timeout (default 300s) and reuses it across SMTP sessions. As soon as the gap between two sessions exceeded 10s, the adapter closed the socket; on the next AUTH Postfix wrote into a half-open connection, read EOF, and reported `454 4.7.0 Temporary authentication failure: Connection lost to authentication server` instead of a clean FAIL. The symptom was most visible for ROLE_SPAM users: their auth permanently fails, so the mail client keeps retrying, and every fresh smtpd worker hits the stale-socket race. Successful authentications hid the bug because the client's automatic retry succeeded on the second attempt. Introduce saslIdleTimeout (360s, longer than smtpd_timeout) and use it only for the AUTH-loop idle wait between requests. Per-operation deadlines for the client handshake (sasl.go) and LOGIN CONT roundtrips stay at ReadTimeout (10s) so a hung client cannot pin a worker. Reproduced locally with boky/postfix + a stub Userli upstream: pre-fix `spam@…` after ≥35s idle → 454; post-fix → 535 with `reason=user disabled due to spam role`. Add TestSASL_PersistentConnection_SurvivesIdleGap covering the new behaviour and adjust TestSASL_IdleConnection_ClosesCleanly to override saslIdleTimeout so it remains fast. Co-Authored-By: Claude <claude@anthropic.com>
1 parent 81d075d commit 379f4b5

2 files changed

Lines changed: 61 additions & 9 deletions

File tree

sasl.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ import (
1717
"go.uber.org/zap"
1818
)
1919

20+
// saslIdleTimeout bounds how long the AUTH loop waits for the next request
21+
// on a persistent connection. Postfix' xsasl_dovecot client caches the
22+
// auth socket per smtpd worker up to smtpd_timeout (default 300s); closing
23+
// earlier leaves Postfix with a half-open socket and produces
24+
// `454 Connection lost to authentication server` on the next AUTH instead
25+
// of a clean FAIL. It is a var (not const) so tests can shorten it.
26+
var saslIdleTimeout = 360 * time.Second
27+
2028
// SASLServer implements the Dovecot SASL authentication protocol.
2129
// It acts as a Dovecot auth server so Postfix can authenticate SMTP
2230
// clients via smtpd_sasl_type=dovecot without requiring Dovecot itself.
@@ -73,13 +81,17 @@ func (s *SASLServer) HandleConnection(ctx context.Context, conn net.Conn) {
7381
return
7482
}
7583

76-
// Handle AUTH requests in a loop (persistent connection)
84+
// Handle AUTH requests in a loop (persistent connection). Use the
85+
// longer saslIdleTimeout here so Postfix' xsasl_dovecot client can
86+
// reuse the cached auth socket across smtpd sessions without us
87+
// closing it mid-cache (which produces 454 Connection lost on the
88+
// next AUTH instead of a clean FAIL).
7789
for {
7890
if ctx.Err() != nil {
7991
return
8092
}
8193

82-
_ = conn.SetReadDeadline(time.Now().Add(ReadTimeout))
94+
_ = conn.SetReadDeadline(time.Now().Add(saslIdleTimeout))
8395

8496
line, err := reader.ReadString('\n')
8597
if err != nil {

sasl_test.go

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -485,22 +485,62 @@ func TestSASL_FailResponse_SanitizesReason(t *testing.T) {
485485
}
486486

487487
// TestSASL_IdleConnection_ClosesCleanly verifies that the server cleanly
488-
// closes idle connections via the per-iteration ReadDeadline rather than
489-
// holding onto them under the global ConnectionTimeout (which previously
490-
// caused `reader.ReadString` to time out mid-AUTH after 60s, closing the
491-
// connection without sending a FAIL — Postfix then mapped that EOF to
492-
// 454 "Connection lost to authentication server" instead of 535).
488+
// closes truly idle connections via saslIdleTimeout rather than holding
489+
// onto them forever. Uses a short override so the test stays fast.
493490
func TestSASL_IdleConnection_ClosesCleanly(t *testing.T) {
494491
logger = zap.NewNop()
495492
mockService := &MockUserliService{}
493+
494+
prev := saslIdleTimeout
495+
saslIdleTimeout = 500 * time.Millisecond
496+
t.Cleanup(func() { saslIdleTimeout = prev })
497+
496498
addr, _ := startSASLTestServer(t, mockService)
497499

498500
h := newSASLTestHelper(t, addr)
499501
defer h.close()
500502
h.doHandshake()
501503

502-
// Allow more than ReadTimeout (10s) of idle so the server closes us.
503-
h.conn.SetReadDeadline(time.Now().Add(ReadTimeout + 5*time.Second))
504+
// Allow more than saslIdleTimeout so the server closes us.
505+
h.conn.SetReadDeadline(time.Now().Add(saslIdleTimeout + 2*time.Second))
504506
_, err := h.reader.ReadString('\n')
505507
assert.ErrorIs(t, err, io.EOF, "server must close idle connections cleanly")
506508
}
509+
510+
// TestSASL_PersistentConnection_SurvivesIdleGap verifies that the AUTH
511+
// loop holds the connection open across an idle gap longer than the
512+
// per-operation ReadTimeout. Postfix' xsasl_dovecot caches the auth
513+
// socket per smtpd worker; closing it earlier caused
514+
// `454 Connection lost to authentication server` on the next AUTH.
515+
func TestSASL_PersistentConnection_SurvivesIdleGap(t *testing.T) {
516+
logger = zap.NewNop()
517+
mockService := &MockUserliService{}
518+
mockService.On("Authenticate", mock.Anything, "user@example.org", "secret").
519+
Return(true, "success", nil)
520+
mockService.On("Authenticate", mock.Anything, "user@example.org", "wrong").
521+
Return(false, "authentication failed", nil)
522+
523+
prev := saslIdleTimeout
524+
saslIdleTimeout = 5 * time.Second
525+
t.Cleanup(func() { saslIdleTimeout = prev })
526+
527+
addr, _ := startSASLTestServer(t, mockService)
528+
529+
h := newSASLTestHelper(t, addr)
530+
defer h.close()
531+
h.doHandshake()
532+
533+
resp := h.sendPlainAuth("1", "user@example.org", "secret")
534+
assert.Equal(t, "OK\t1\tuser=user@example.org", resp)
535+
536+
// Idle longer than the per-operation ReadTimeout (10s would be too
537+
// long for a unit test; we just need to outlast the previous broken
538+
// timeout). The server must keep the connection alive.
539+
time.Sleep(2 * time.Second)
540+
541+
resp = h.sendPlainAuth("2", "user@example.org", "wrong")
542+
assert.True(t, strings.HasPrefix(resp, "FAIL\t2\t"),
543+
"server must accept AUTH after idle gap, got %q", resp)
544+
assert.Contains(t, resp, "reason=authentication failed")
545+
mockService.AssertExpectations(t)
546+
}

0 commit comments

Comments
 (0)