Skip to content

Commit 09b21f8

Browse files
[PublicationAwaiter] Don't fail awaiting on a single error (#925)
This changes the Await loop so that it no longer fails on a single transient error when fetching/parsing the checkpoint. For now this is set to fail on two consecutive errors, but perhaps this should be changed to keep trying until the context is cancelled for maximum stickiness. For now, this seems like a pragmatic compromise between these worlds. This is important because it will make the write path for logs more likely to return happy status codes if there are intermittent problems reading checkpoint storage. Co-authored-by: Al Cutter <alcutter@google.com>
1 parent db87c51 commit 09b21f8

1 file changed

Lines changed: 17 additions & 10 deletions

File tree

await.go

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -90,17 +90,24 @@ func (a *PublicationAwaiter) Await(ctx context.Context, future IndexFuture) (Ind
9090
if a.preWaitSignaller != nil {
9191
a.preWaitSignaller <- struct{}{}
9292
}
93-
for (a.size <= i.Index && a.err == nil) && ctx.Err() == nil {
94-
a.c.Wait()
95-
}
9693

97-
// Make sure we report any errors that caused us to stop early
98-
err = a.err
99-
if err == nil {
100-
err = ctx.Err()
94+
// Await the tree growing to include the new leaf, or for two consecutive errors to be reported.
95+
errorObserved := false
96+
for ctx.Err() == nil {
97+
if a.size > i.Index {
98+
return i, a.checkpoint, nil // Success
99+
}
100+
if a.err != nil {
101+
if errorObserved {
102+
return i, a.checkpoint, a.err // Second consecutive error
103+
}
104+
}
105+
errorObserved = a.err != nil
106+
a.c.Wait()
101107
}
102108

103-
return i, a.checkpoint, err
109+
// The loop only exits if the context was cancelled or expired.
110+
return i, nil, ctx.Err()
104111
})
105112
}
106113

@@ -148,8 +155,8 @@ func (a *PublicationAwaiter) pollLoop(ctx context.Context, readCheckpoint func(c
148155
a.checkpoint = cp
149156
a.size = cpSize
150157
a.err = cpErr
151-
// Note that for now, this releases all clients in the event of a single failure.
152-
// If this causes problems, this could be changed to attempt retries.
158+
// Note that this releases all clients in the event of any failure.
159+
// However individual clients (via Await) can decide whether to ignore or fail.
153160
a.c.Broadcast()
154161
span.AddEvent("Broadcast Sent")
155162
a.c.L.Unlock()

0 commit comments

Comments
 (0)