From 616ce0082610f33c7109eac06c1ace42fc178ec7 Mon Sep 17 00:00:00 2001
From: Prashant Yadav
Date: Wed, 25 Mar 2026 16:09:45 -0700
Subject: [PATCH] vault: poll for DKG result in NewReportingPlugin instead of failing immediately

When the vault OCR3.1 oracle starts before DKG completes,
NewReportingPlugin fails because the DKG result package is not yet in
the database. This triggers libocr exponential backoff (1s, 2s, 4s,
8s... up to 2min), wasting ~12s even after the DKG result becomes
available.

Add pollForKeyMaterial() that polls the DB every 2s within the
MaxDurationInitialization context window (10s). The vault oracle now
starts within seconds of DKG completion instead of waiting for the
next retry cycle.

Made-with: Cursor
---
 core/services/ocr2/plugins/vault/plugin.go | 28 +++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/core/services/ocr2/plugins/vault/plugin.go b/core/services/ocr2/plugins/vault/plugin.go
index d6b30f26be0..92c8e83b8af 100644
--- a/core/services/ocr2/plugins/vault/plugin.go
+++ b/core/services/ocr2/plugins/vault/plugin.go
@@ -12,6 +12,7 @@ import (
 	"regexp"
 	"slices"
 	"sort"
+	"time"
 
 	"golang.org/x/crypto/curve25519"
 	"golang.org/x/crypto/nacl/box"
@@ -136,6 +137,31 @@ func (r *ReportingPluginFactory) getKeyMaterial(ctx context.Context, instanceID
 	return publicKey, privateKeyShare, nil
 }
 
+const dkgPollInterval = 2 * time.Second
+
+// pollForKeyMaterial polls the DKG result package database until the key
+// material for the given instance ID is available or the context is cancelled.
+// This avoids returning an immediate error when the DKG protocol hasn't
+// completed yet, which would trigger libocr's exponential backoff (up to 2
+// minutes between retries). By polling here within the MaxDurationInitialization
+// window, the vault oracle can start as soon as the DKG result is written.
+func (r *ReportingPluginFactory) pollForKeyMaterial(ctx context.Context, instanceID string) (publicKey *tdh2easy.PublicKey, privateKeyShare *tdh2easy.PrivateShare, err error) {
+	for {
+		publicKey, privateKeyShare, err = r.getKeyMaterial(ctx, instanceID)
+		if err == nil {
+			return publicKey, privateKeyShare, nil
+		}
+		// Any error is assumed to mean "result not written yet", so every failure is retried.
+		r.lggr.Debugw("DKG result package not yet available, will retry", "instanceID", instanceID, "error", err)
+		// Wait one interval; if ctx ends first, return ctx.Err() together with the last fetch error.
+		select {
+		case <-ctx.Done():
+			return nil, nil, fmt.Errorf("context done while waiting for DKG key material (instanceID=%s): %w; last error: %w", instanceID, ctx.Err(), err)
+		case <-time.After(dkgPollInterval):
+		}
+	}
+}
+
 func initializePluginLimits(ctx context.Context, limitsFactory limits.Factory) (ocr3_1types.ReportingPluginLimits, error) {
 	maxQueryBytes, err := cresettings.Default.VaultMaxQuerySizeLimit.GetOrDefault(ctx, limitsFactory.Settings)
 	if err != nil {
@@ -277,7 +303,7 @@ func (r *ReportingPluginFactory) NewReportingPlugin(ctx context.Context, config
 	}
 
 	r.lggr.Debugw("fetching key material for instance id", "instanceID", *configProto.DKGInstanceID)
-	publicKey, privateKeyShare, err := r.getKeyMaterial(ctx, *configProto.DKGInstanceID)
+	publicKey, privateKeyShare, err := r.pollForKeyMaterial(ctx, *configProto.DKGInstanceID)
 	if err != nil {
 		return nil, ocr3_1types.ReportingPluginInfo1{}, fmt.Errorf("could not get key material from DB: %w", err)
 	}