Skip to content

Commit c48d17a

Browse files
fix(sns): collapse concurrent cert-cache misses via singleflight (bug-bash #1/#9) (#231)
snsVerifier.getCert read the cert cache under RLock and, on a miss, fetched the cert and wrote it under Lock — with nothing between the read-miss and the write. A burst of concurrent SNS deliveries that all miss the cache (cold start, or the instant after the 24h TTL expires) each fire their own fetchCert over the network and then race to overwrite the map: a thundering herd against the AWS cert endpoint. Collapse the miss path through a singleflight.Group keyed by certURL (same primitive already used in internal/cache/redis.go, team_summary.go) so a concurrent burst issues exactly ONE fetch between them; followers receive the leader's result. The fast-path RLock cache hit is unchanged. Test: TestSNSVerifyFinal2_GetCert_SingleflightCollapsesConcurrentMisses — 20 concurrent missing-cache getCert calls, fetch held open on a channel so all collapse onto one in-flight call; asserts exactly 1 fetchCert. Passes under -race; getCert 100% covered. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 3f0f87c commit c48d17a

2 files changed

Lines changed: 74 additions & 6 deletions

File tree

internal/handlers/sns_verify.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ import (
4343
"strings"
4444
"sync"
4545
"time"
46+
47+
"golang.org/x/sync/singleflight"
4648
)
4749

4850
// snsSigningCertHostRegex enforces "sns.<region>.amazonaws.com" hostnames
@@ -92,6 +94,13 @@ type snsVerifier struct {
9294

9395
mu sync.RWMutex
9496
certCache map[string]snsCertCacheEntry
97+
98+
// sfGroup collapses concurrent cache-miss fetches for the same certURL
99+
// into a single in-flight network call. Without it a burst of SNS
100+
// deliveries that all miss the cache (cold start, or just after a TTL
101+
// expiry) each fire their own fetchCert and then race to overwrite the
102+
// map — a thundering herd against the AWS cert endpoint. See getCert.
103+
sfGroup singleflight.Group
95104
}
96105

97106
// newSNSVerifier returns a verifier with production defaults.
@@ -216,6 +225,14 @@ func (v *snsVerifier) verify(msg snsMessage) error {
216225
}
217226

218227
// getCert returns a cached certificate or fetches it via fetchCert.
228+
//
229+
// The miss path is collapsed through a singleflight group keyed by certURL so
230+
// a burst of concurrent SNS deliveries that all miss the cache (cold start, or
231+
// the instant after a TTL expiry) issue exactly ONE fetchCert call between
232+
// them rather than a thundering herd against the AWS cert endpoint. The fetch
233+
// closure re-checks the cache under the lock (double-checked locking) before
234+
// fetching, so a request that joins just after the leader populated the cache
235+
// returns the fresh entry without a redundant network call.
219236
func (v *snsVerifier) getCert(certURL string) (*x509.Certificate, error) {
220237
v.mu.RLock()
221238
entry, ok := v.certCache[certURL]
@@ -224,15 +241,20 @@ func (v *snsVerifier) getCert(certURL string) (*x509.Certificate, error) {
224241
return entry.cert, nil
225242
}
226243

227-
cert, err := v.fetchCert("sns", certURL)
244+
cert, err, _ := v.sfGroup.Do(certURL, func() (any, error) {
245+
c, err := v.fetchCert("sns", certURL)
246+
if err != nil {
247+
return nil, err
248+
}
249+
v.mu.Lock()
250+
v.certCache[certURL] = snsCertCacheEntry{cert: c, fetched: time.Now()}
251+
v.mu.Unlock()
252+
return c, nil
253+
})
228254
if err != nil {
229255
return nil, err
230256
}
231-
232-
v.mu.Lock()
233-
v.certCache[certURL] = snsCertCacheEntry{cert: cert, fetched: time.Now()}
234-
v.mu.Unlock()
235-
return cert, nil
257+
return cert.(*x509.Certificate), nil
236258
}
237259

238260
// defaultFetchCert fetches the PEM cert at certURL and returns the first

internal/handlers/sns_verify_final2_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ import (
2525
"encoding/base64"
2626
"errors"
2727
"math/big"
28+
"sync"
29+
"sync/atomic"
2830
"testing"
2931
"time"
3032
)
@@ -183,3 +185,47 @@ func TestSNSVerifyFinal2_GetCert_CacheHitAndMiss(t *testing.T) {
183185
t.Error("getCert must propagate fetch error")
184186
}
185187
}
188+
189+
// TestSNSVerifyFinal2_GetCert_SingleflightCollapsesConcurrentMisses proves the
190+
// bug-bash #1/#9 fix: a burst of concurrent cache-miss getCert calls for the
191+
// same certURL collapses into exactly ONE fetchCert call (no thundering herd
192+
// against the AWS cert endpoint). The fetch is held open on a channel so every
193+
// goroutine joins the same in-flight singleflight call before it completes.
194+
func TestSNSVerifyFinal2_GetCert_SingleflightCollapsesConcurrentMisses(t *testing.T) {
195+
cert, _ := final2GenCertKey(t)
196+
v := newSNSVerifier()
197+
198+
var calls int32
199+
proceed := make(chan struct{})
200+
v.fetchCert = func(_ string, _ string) (*x509.Certificate, error) {
201+
atomic.AddInt32(&calls, 1)
202+
<-proceed // hold the flight open so concurrent callers collapse into it
203+
return cert, nil
204+
}
205+
206+
const n = 20
207+
var wg sync.WaitGroup
208+
errs := make([]error, n)
209+
for i := 0; i < n; i++ {
210+
wg.Add(1)
211+
go func(i int) {
212+
defer wg.Done()
213+
_, errs[i] = v.getCert(final2AWSCertURL)
214+
}(i)
215+
}
216+
217+
// Give all goroutines time to enter the singleflight Do and collapse onto
218+
// the leader's still-open flight, then release the fetch.
219+
time.Sleep(50 * time.Millisecond)
220+
close(proceed)
221+
wg.Wait()
222+
223+
for i, err := range errs {
224+
if err != nil {
225+
t.Fatalf("getCert[%d] returned error: %v", i, err)
226+
}
227+
}
228+
if got := atomic.LoadInt32(&calls); got != 1 {
229+
t.Errorf("singleflight should collapse %d concurrent misses into 1 fetch, got %d", n, got)
230+
}
231+
}

0 commit comments

Comments
 (0)