55 "errors"
66 "fmt"
77 "log/slog"
8+ "strings"
89 "sync"
910 "time"
1011
@@ -25,8 +26,30 @@ const (
2526 // contention bounded; this is only tolerable in modes where the script write
2627 // is targeting the non-authoritative backend.
2728 maxScriptWriteGoroutines = 64
29+
30+ // maxCompactedRetries caps retries when the secondary returns
31+ // "read timestamp has been compacted". Each attempt re-sends the command so
32+ // the secondary re-selects a fresh read snapshot; a small bound is enough
33+ // because the compaction waterline advances slowly relative to SecondaryTimeout.
34+ maxCompactedRetries = 3
35+ // compactedRetryInitialBackoff is the first delay before retrying a secondary
36+ // command that failed with a compacted-read error.
37+ compactedRetryInitialBackoff = 10 * time .Millisecond
2838)
2939
// readTSCompactedMarker is the substring produced by
// store.ErrReadTSCompacted as it flows through gRPC (wrapped as
// FailedPrecondition) and Lua PCall. Matching on substring is necessary
// because both layers erase the typed error.
const readTSCompactedMarker = "read timestamp has been compacted"

// isReadTSCompactedError reports whether err's message contains
// readTSCompactedMarker. The match is a case-sensitive substring test on
// err.Error(), so it also recognises the marker when it is embedded in a
// longer, wrapped message. A nil err is never a compacted-read error.
func isReadTSCompactedError(err error) bool {
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), readTSCompactedMarker)
}
52+
3053// DualWriter routes commands to primary and secondary backends based on mode.
3154type DualWriter struct {
3255 primary Backend
@@ -225,14 +248,7 @@ func (d *DualWriter) writeSecondary(cmd string, iArgs []any) {
225248 defer cancel ()
226249
227250 start := time .Now ()
228- result := d .secondary .Do (sCtx , iArgs ... )
229- _ , sErr := result .Result ()
230- if isNoScriptError (sErr ) {
231- if fallbackArgs , ok := d .evalFallbackArgs (cmd , iArgs ); ok {
232- result = d .secondary .Do (sCtx , fallbackArgs ... )
233- _ , sErr = result .Result ()
234- }
235- }
251+ sErr := d .executeSecondary (sCtx , cmd , iArgs )
236252 d .metrics .CommandDuration .WithLabelValues (cmd , d .secondary .Name ()).Observe (time .Since (start ).Seconds ())
237253
238254 if sErr != nil && ! errors .Is (sErr , redis .Nil ) {
@@ -248,6 +264,38 @@ func (d *DualWriter) writeSecondary(cmd string, iArgs []any) {
248264 d .metrics .CommandTotal .WithLabelValues (cmd , d .secondary .Name (), "ok" ).Inc ()
249265}
250266
267+ // executeSecondary sends the command to the secondary, handling the NOSCRIPT
268+ // → EVAL fallback and transparently retrying when the secondary reports that
269+ // the read snapshot has been compacted. A re-sent command causes the backend
270+ // to re-select a fresh read timestamp, which is the only way to recover once
271+ // the original startTS has fallen behind MinRetainedTS on a peer node.
272+ func (d * DualWriter ) executeSecondary (sCtx context.Context , cmd string , iArgs []any ) error {
273+ backoff := compactedRetryInitialBackoff
274+ var sErr error
275+ for attempt := 0 ; ; attempt ++ {
276+ result := d .secondary .Do (sCtx , iArgs ... )
277+ _ , sErr = result .Result ()
278+ if isNoScriptError (sErr ) {
279+ if fallbackArgs , ok := d .evalFallbackArgs (cmd , iArgs ); ok {
280+ result = d .secondary .Do (sCtx , fallbackArgs ... )
281+ _ , sErr = result .Result ()
282+ }
283+ }
284+ if ! isReadTSCompactedError (sErr ) {
285+ return sErr
286+ }
287+ if attempt >= maxCompactedRetries {
288+ return sErr
289+ }
290+ select {
291+ case <- sCtx .Done ():
292+ return sErr
293+ case <- time .After (backoff ):
294+ }
295+ backoff *= 2
296+ }
297+ }
298+
251299// goWrite launches fn in a bounded write goroutine.
252300func (d * DualWriter ) goWrite (fn func ()) {
253301 d .goAsyncWithSem (d .writeSem , fn )
0 commit comments