Skip to content

Commit e1debcf

Browse files
vault: poll for DKG result in NewReportingPlugin (~12s startup speedup) (#21710)
vault: poll for DKG result in NewReportingPlugin instead of failing immediately.

When the vault OCR3.1 oracle starts before DKG completes, NewReportingPlugin fails because the DKG result package is not yet in the database. This triggers libocr exponential backoff (1s, 2s, 4s, 8s, ... up to 2min), wasting ~12s even after the DKG result becomes available.

Add pollForKeyMaterial(), which polls the DB every 2s within the MaxDurationInitialization context window (10s). The vault oracle now starts within seconds of DKG completion instead of waiting for the next retry cycle.

Made-with: Cursor
1 parent 41586b4 commit e1debcf

1 file changed

Lines changed: 27 additions & 1 deletion

File tree

core/services/ocr2/plugins/vault/plugin.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"regexp"
1313
"slices"
1414
"sort"
15+
"time"
1516

1617
"golang.org/x/crypto/curve25519"
1718
"golang.org/x/crypto/nacl/box"
@@ -136,6 +137,31 @@ func (r *ReportingPluginFactory) getKeyMaterial(ctx context.Context, instanceID
136137
return publicKey, privateKeyShare, nil
137138
}
138139

140+
// dkgPollInterval is the delay pollForKeyMaterial waits between successive
// attempts to read the DKG key material.
const dkgPollInterval = 2 * time.Second
141+
142+
// pollForKeyMaterial polls the DKG result package database until the key
143+
// material for the given instance ID is available or the context is cancelled.
144+
// This avoids returning an immediate error when the DKG protocol hasn't
145+
// completed yet, which would trigger libocr's exponential backoff (up to 2
146+
// minutes between retries). By polling here within the MaxDurationInitialization
147+
// window, the vault oracle can start as soon as the DKG result is written.
148+
func (r *ReportingPluginFactory) pollForKeyMaterial(ctx context.Context, instanceID string) (publicKey *tdh2easy.PublicKey, privateKeyShare *tdh2easy.PrivateShare, err error) {
149+
for {
150+
publicKey, privateKeyShare, err = r.getKeyMaterial(ctx, instanceID)
151+
if err == nil {
152+
return publicKey, privateKeyShare, nil
153+
}
154+
155+
r.lggr.Debugw("DKG result package not yet available, will retry", "instanceID", instanceID, "error", err)
156+
157+
select {
158+
case <-ctx.Done():
159+
return nil, nil, fmt.Errorf("context cancelled while waiting for DKG key material (instanceID=%s): %w", instanceID, err)
160+
case <-time.After(dkgPollInterval):
161+
}
162+
}
163+
}
164+
139165
func initializePluginLimits(ctx context.Context, limitsFactory limits.Factory) (ocr3_1types.ReportingPluginLimits, error) {
140166
maxQueryBytes, err := cresettings.Default.VaultMaxQuerySizeLimit.GetOrDefault(ctx, limitsFactory.Settings)
141167
if err != nil {
@@ -277,7 +303,7 @@ func (r *ReportingPluginFactory) NewReportingPlugin(ctx context.Context, config
277303
}
278304

279305
r.lggr.Debugw("fetching key material for instance id", "instanceID", *configProto.DKGInstanceID)
280-
publicKey, privateKeyShare, err := r.getKeyMaterial(ctx, *configProto.DKGInstanceID)
306+
publicKey, privateKeyShare, err := r.pollForKeyMaterial(ctx, *configProto.DKGInstanceID)
281307
if err != nil {
282308
return nil, ocr3_1types.ReportingPluginInfo1{}, fmt.Errorf("could not get key material from DB: %w", err)
283309
}

0 commit comments

Comments
 (0)