From 0671d99a3c855e201075da90cc5ca61a06c1a321 Mon Sep 17 00:00:00 2001 From: maclane Date: Fri, 19 Jun 2026 17:30:53 -0400 Subject: [PATCH] feat(frost): conclude the oversized signing done check by signature quorum (7.3 t-of-included PR3/3) Under RFC-21 Phase 7.3 t-of-included finalize, an interactive signing attempt is signed by the first t responsive committers of an (optionally) oversized included set; the non-subset and offline members never broadcast a signing done check, so the outer signingDoneCheck - which required a confirmation from EVERY attempt member - would hang an otherwise-successful attempt to its timeout and force a needless retry. Conclude the done check on a deterministic threshold quorum instead, keeping the legacy (non-oversized) path byte-for-byte unchanged: - non-oversized (included == honestThreshold; today's selector output and the whole coarse path): the legacy all-members rule is UNCHANGED - wait for every attempt member, require all signatures equal, return max(endBlock). - oversized (included > honestThreshold): bucket done checks by signature and conclude once a bucket holds >= honestThreshold distinct senders (the minimum that proves a valid threshold signature). Minority buckets (divergent or adversarial signatures) are IGNORED, never fatal, so one bad done message cannot fracture the group. The end block is the DETERMINISTIC attempt timeout block, not a network-order-dependent max over done messages. The deterministic end block is the crux: batch scheduling derives the next signature's start as signingStartBlock = prev endBlock + interlude, so a per-node max over a network-order-dependent subset would desync the batch. Returning attemptTimeoutBlock (which every honest node computes identically) fixes it. With honest majority (t > groupSize/2) at most one signature bucket can reach t - even under coordinator equivocation - so the quorum is unique and no body-hash / proto change is needed; the >1-quorum branch is unreachable and intentionally non-fatal. Design locked via a Codex+Gemini consult after the initial first-t-arrivals approach was shown to be cross-node non-deterministic (it fed an order-dependent end block into batch scheduling). Tests: legacy done-check tests unchanged; added oversized conclude-on-quorum (with a deterministic end block), minority-divergent-ignored, and split-below-quorum-times-out. Co-Authored-By: Claude Opus 4.8 --- pkg/tbtc/signing.go | 1 + pkg/tbtc/signing_done.go | 204 +++++++++++++++++++++++++--------- pkg/tbtc/signing_done_test.go | 185 +++++++++++++++++++++++++++++- 3 files changed, 338 insertions(+), 52 deletions(-) diff --git a/pkg/tbtc/signing.go b/pkg/tbtc/signing.go index c609afe73e..1c5e9437a2 100644 --- a/pkg/tbtc/signing.go +++ b/pkg/tbtc/signing.go @@ -373,6 +373,7 @@ func (se *signingExecutor) signWithTaprootMerkleRoot( doneCheck := newSigningDoneCheck( se.groupParameters.GroupSize, + se.groupParameters.HonestThreshold, se.broadcastChannel, se.membershipValidator, ) diff --git a/pkg/tbtc/signing_done.go b/pkg/tbtc/signing_done.go index f14426d87f..7b3b717054 100644 --- a/pkg/tbtc/signing_done.go +++ b/pkg/tbtc/signing_done.go @@ -47,23 +47,42 @@ func (sdm *signingDoneMessage) Type() string { // successful signature calculation across all signing group members. type signingDoneCheck struct { groupSize int + honestThreshold int broadcastChannel net.BroadcastChannel membershipValidator *group.MembershipValidator - receiveCtx context.Context - cancelReceiveCtx context.CancelFunc - expectedSignersCount int - doneSigners map[group.MemberIndex]*signingDoneMessage - doneSignersMutex sync.RWMutex + receiveCtx context.Context + cancelReceiveCtx context.CancelFunc + // attemptMemberCount is len(attemptMembersIndexes) for the live attempt - the + // number of confirmations the legacy (non-oversized) rule waits for. + attemptMemberCount int + // oversized is true when the attempt's included set is larger than the honest + // threshold: the RFC-21 Phase 7.3 t-of-included case, where the attempt is + // signed by a t-subset so the non-subset / offline members never report done + // and the legacy all-members rule would hang. It selects the + // quorum-by-signature completion rule. When false (included == threshold, the + // pre-oversizing selector output and the whole coarse path) the legacy rule is + // used UNCHANGED. + oversized bool + // attemptTimeoutBlock is the deterministic block the attempt concludes by + // (announcementEndBlock + signingAttemptMaximumProtocolBlocks). On the oversized + // path it is returned as the result end block instead of a network-order- + // dependent max over done messages, so every honest node feeds signBatch the + // same next-signature start block (signingStartBlock = prev endBlock + interlude). + attemptTimeoutBlock uint64 + doneSigners map[group.MemberIndex]*signingDoneMessage + doneSignersMutex sync.RWMutex } func newSigningDoneCheck( groupSize int, + honestThreshold int, broadcastChannel net.BroadcastChannel, membershipValidator *group.MembershipValidator, ) *signingDoneCheck { return &signingDoneCheck{ groupSize: groupSize, + honestThreshold: honestThreshold, broadcastChannel: broadcastChannel, membershipValidator: membershipValidator, } @@ -91,7 +110,17 @@ func (sdc *signingDoneCheck) listen( sdc.receiveCtx, sdc.cancelReceiveCtx = context.WithCancel(ctx) sdc.doneSignersMutex.Lock() - sdc.expectedSignersCount = len(attemptMembersIndexes) + sdc.attemptMemberCount = len(attemptMembersIndexes) + // An included set larger than the honest threshold is the RFC-21 Phase 7.3 + // t-of-included case: the attempt is signed by a t-subset and the non-subset / + // offline members never report done, so the all-members rule would hang. The + // oversized path concludes on a quorum of >= honestThreshold matching + // signatures instead, with a deterministic end block. included == threshold + // (today's selector output and the whole coarse path) keeps the legacy rule + // byte-for-byte, so behavior is unchanged until participant selection oversizes + // the set. + sdc.oversized = len(attemptMembersIndexes) > sdc.honestThreshold + sdc.attemptTimeoutBlock = attemptTimeoutBlock sdc.doneSigners = make(map[group.MemberIndex]*signingDoneMessage) sdc.doneSignersMutex.Unlock() @@ -149,12 +178,12 @@ func (sdc *signingDoneCheck) signalDone( }, net.BackoffRetransmissionStrategy) } -// waitUntilAllDone blocks until it receives all the required done checks from -// members or until the passed context is done. In the first case, it returns -// the signature computed by the signing members and the block at which the -// slowest signer completed the signature computation process. If the expected -// done checks are not received on time, the function returns an error. If at -// least one signature is different from others, the function returns an error. +// waitUntilAllDone blocks until the attempt's completion rule is met or the +// passed context is done. On success it returns the agreed signature and a +// deterministic end block (the same value on every honest node): on the legacy +// path the block at which the slowest attempt member completed, on the oversized +// path the attempt timeout block. It returns errWaitDoneTimedOut if the rule is +// not met on time, and a non-nil error on a fatal divergence (legacy path only). func (sdc *signingDoneCheck) waitUntilAllDone(ctx context.Context) ( *signing.Result, uint64, @@ -171,35 +200,123 @@ func (sdc *signingDoneCheck) waitUntilAllDone(ctx context.Context) ( return nil, 0, errWaitDoneTimedOut case <-ticker.C: - expectedSignersCount, doneSigners := sdc.snapshotDoneSigners() - if expectedSignersCount == len(doneSigners) { - var signature *frost.Signature - var latestEndBlock uint64 - - for _, doneMessage := range doneSigners { - if signature == nil { - signature = doneMessage.signature - } else { - if !signature.Equals(doneMessage.signature) { - return nil, 0, fmt.Errorf( - "not matching signatures detected: [%v] and [%v]", - signature, - doneMessage.signature, - ) - } - } - - if doneMessage.endBlock > latestEndBlock { - latestEndBlock = doneMessage.endBlock - } - } - - return &signing.Result{Signature: signature}, latestEndBlock, nil + result, endBlock, concluded, err := sdc.evaluateDone() + if err != nil { + return nil, 0, err + } + if concluded { + return result, endBlock, nil } } } } +// evaluateDone snapshots the done checks collected so far and applies the +// attempt's completion rule. It returns (result, endBlock, true, nil) once the +// attempt can conclude, (nil, 0, false, nil) while still waiting, or +// (nil, 0, false, err) on a fatal divergence. The legacy (non-oversized) rule +// and the oversized t-of-included rule are kept fully separate so the legacy / +// coarse path is byte-for-byte unchanged. +func (sdc *signingDoneCheck) evaluateDone() (*signing.Result, uint64, bool, error) { + sdc.doneSignersMutex.RLock() + oversized := sdc.oversized + attemptMemberCount := sdc.attemptMemberCount + attemptTimeoutBlock := sdc.attemptTimeoutBlock + honestThreshold := sdc.honestThreshold + doneSigners := make([]*signingDoneMessage, 0, len(sdc.doneSigners)) + for _, doneMessage := range sdc.doneSigners { + doneSigners = append(doneSigners, doneMessage.clone()) + } + sdc.doneSignersMutex.RUnlock() + + if oversized { + return concludeOversizedDone(doneSigners, honestThreshold, attemptTimeoutBlock) + } + return concludeLegacyDone(doneSigners, attemptMemberCount) +} + +// concludeLegacyDone is the pre-7.3 rule, UNCHANGED: conclude once every attempt +// member confirmed, require all signatures equal, and return the max end block. +// The attemptMemberCount > 0 guard rejects the pre-listen state (no attempt +// configured) so an empty done set is never read as success. +func concludeLegacyDone( + doneSigners []*signingDoneMessage, + attemptMemberCount int, +) (*signing.Result, uint64, bool, error) { + if attemptMemberCount == 0 || len(doneSigners) != attemptMemberCount { + return nil, 0, false, nil + } + + var signature *frost.Signature + var latestEndBlock uint64 + for _, doneMessage := range doneSigners { + if signature == nil { + signature = doneMessage.signature + } else if !signature.Equals(doneMessage.signature) { + return nil, 0, false, fmt.Errorf( + "not matching signatures detected: [%v] and [%v]", + signature, + doneMessage.signature, + ) + } + + if doneMessage.endBlock > latestEndBlock { + latestEndBlock = doneMessage.endBlock + } + } + + return &signing.Result{Signature: signature}, latestEndBlock, true, nil +} + +// concludeOversizedDone is the RFC-21 Phase 7.3 t-of-included rule: bucket the +// done checks by signature (one done message per sender, so each member is in +// exactly one bucket) and conclude once a bucket holds >= honestThreshold +// distinct senders - the minimum that proves a valid threshold signature. +// Minority buckets (divergent or adversarial signatures) are IGNORED, never +// fatal, so a single bad done message cannot fracture the group. The end block +// is the deterministic attempt timeout block, not a network-order-dependent max, +// so every honest node returns the same value for batch scheduling. +func concludeOversizedDone( + doneSigners []*signingDoneMessage, + honestThreshold int, + attemptTimeoutBlock uint64, +) (*signing.Result, uint64, bool, error) { + if honestThreshold <= 0 { + return nil, 0, false, nil + } + + bucketSig := map[string]*frost.Signature{} + bucketCount := map[string]int{} + for _, doneMessage := range doneSigners { + serialized := doneMessage.signature.Serialize() + key := string(serialized[:]) + if _, ok := bucketSig[key]; !ok { + bucketSig[key] = doneMessage.signature + } + bucketCount[key]++ + } + + var quorumSig *frost.Signature + quorums := 0 + for key, count := range bucketCount { + if count >= honestThreshold { + quorums++ + quorumSig = bucketSig[key] + } + } + + // Exactly one >= t bucket is the only reachable outcome under honest majority + // (honestThreshold > groupSize/2 means two disjoint >= t buckets cannot + // coexist). It carries the one valid signature and concludes deterministically. + // quorums == 0 keeps waiting; quorums > 1 is unreachable and intentionally NOT + // concluded (the attempt fails via the ctx timeout rather than picking a bucket + // nondeterministically - a bare done-message split is not coordinator blame). + if quorums == 1 { + return &signing.Result{Signature: quorumSig}, attemptTimeoutBlock, true, nil + } + return nil, 0, false, nil +} + // isValidDoneMessage validates the given signingDoneMessage in the context // of the given signing attempt. func (sdc *signingDoneCheck) isValidDoneMessage( @@ -250,21 +367,6 @@ func (sdc *signingDoneCheck) recordDoneMessage( return true } -func (sdc *signingDoneCheck) snapshotDoneSigners() ( - int, - []*signingDoneMessage, -) { - sdc.doneSignersMutex.RLock() - defer sdc.doneSignersMutex.RUnlock() - - result := make([]*signingDoneMessage, 0, len(sdc.doneSigners)) - for _, doneMessage := range sdc.doneSigners { - result = append(result, doneMessage.clone()) - } - - return sdc.expectedSignersCount, result -} - func (sdm *signingDoneMessage) clone() *signingDoneMessage { if sdm == nil { return nil diff --git a/pkg/tbtc/signing_done_test.go b/pkg/tbtc/signing_done_test.go index 911cf9c69d..6a08dece7c 100644 --- a/pkg/tbtc/signing_done_test.go +++ b/pkg/tbtc/signing_done_test.go @@ -281,16 +281,198 @@ func TestSigningDoneCheck_AnotherSignature(t *testing.T) { } } +// TestSigningDoneCheck_ThresholdSubsetConcludes covers RFC-21 Phase 7.3 +// t-of-included finalize: the included set is oversized (larger than the honest +// threshold) and only the t-subset that actually signed reports done - the rest +// are offline. The check must conclude on a quorum of t matching signatures +// rather than hanging for the absent members, and must return the DETERMINISTIC +// attempt timeout block as the end block (not a network-order-dependent max over +// the done messages, which would desync batch scheduling across nodes). +func TestSigningDoneCheck_ThresholdSubsetConcludes(t *testing.T) { + groupParameters := &GroupParameters{ + GroupSize: 5, + GroupQuorum: 4, + HonestThreshold: 3, + } + + doneCheck := setupSigningDoneCheck(t, groupParameters) + + memberIndexes := make([]group.MemberIndex, doneCheck.groupSize) + for i := range memberIndexes { + memberIndexes[i] = group.MemberIndex(i + 1) + } + + ctx, cancelCtx := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelCtx() + + message := big.NewInt(100) + attemptNumber := uint64(1) + attemptTimeoutBlock := uint64(1000) + // Oversized included set: ALL five members are included, but only the first + // three (the chosen signing subset) report done; members 4 and 5 are offline. + attemptMemberIndexes := memberIndexes + result := &signing.Result{ + Signature: mustFrostSignatureFromBigInts(big.NewInt(200), big.NewInt(300)), + } + + doneCheck.listen( + ctx, + message, + attemptNumber, + attemptTimeoutBlock, + attemptMemberIndexes, + ) + + // The three reporters carry DIFFERENT end blocks (501, 502, 503); the result + // must still be the deterministic attempt timeout block, proving the returned + // end block does not depend on which/how-many done messages arrived. + for i := 1; i <= groupParameters.HonestThreshold; i++ { + err := doneCheck.signalDone( + ctx, + uint8(i), + message, + attemptNumber, + result, + 500+uint64(i), + ) + if err != nil { + t.Fatal(err) + } + } + + returnedResult, endBlock, err := doneCheck.waitUntilAllDone(ctx) + if err != nil { + t.Fatalf("expected conclusion on the t-subset, got error: [%v]", err) + } + if returnedResult == nil || !result.Signature.Equals(returnedResult.Signature) { + t.Fatalf("unexpected result: [%v]", returnedResult) + } + testutils.AssertIntsEqual(t, "end block", int(attemptTimeoutBlock), int(endBlock)) +} + +// TestSigningDoneCheck_OversizedIgnoresMinorityDivergentSignature proves the +// oversized rule is robust to a minority adversary: a single done message +// carrying a DIFFERENT signature must not fail the check or fracture the group. +// t members report the correct signature (a quorum) while one reports a +// divergent one; the check concludes on the quorum and ignores the minority, +// rather than erroring the way the legacy all-members match-all rule would. +func TestSigningDoneCheck_OversizedIgnoresMinorityDivergentSignature(t *testing.T) { + groupParameters := &GroupParameters{ + GroupSize: 5, + GroupQuorum: 4, + HonestThreshold: 3, + } + + doneCheck := setupSigningDoneCheck(t, groupParameters) + + memberIndexes := make([]group.MemberIndex, doneCheck.groupSize) + for i := range memberIndexes { + memberIndexes[i] = group.MemberIndex(i + 1) + } + + ctx, cancelCtx := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelCtx() + + message := big.NewInt(100) + attemptNumber := uint64(1) + attemptTimeoutBlock := uint64(1000) + attemptMemberIndexes := memberIndexes // oversized (5 > threshold 3) + correctResult := &signing.Result{ + Signature: mustFrostSignatureFromBigInts(big.NewInt(200), big.NewInt(300)), + } + divergentResult := &signing.Result{ + Signature: mustFrostSignatureFromBigInts(big.NewInt(201), big.NewInt(300)), + } + + doneCheck.listen(ctx, message, attemptNumber, attemptTimeoutBlock, attemptMemberIndexes) + + // Members 1..3 report the correct signature (a quorum); member 4 reports a + // divergent one. + for i := 1; i <= groupParameters.HonestThreshold; i++ { + if err := doneCheck.signalDone(ctx, uint8(i), message, attemptNumber, correctResult, 100); err != nil { + t.Fatal(err) + } + } + if err := doneCheck.signalDone(ctx, uint8(4), message, attemptNumber, divergentResult, 100); err != nil { + t.Fatal(err) + } + + returnedResult, endBlock, err := doneCheck.waitUntilAllDone(ctx) + if err != nil { + t.Fatalf("a minority divergent signature must be ignored, not fatal; got error: [%v]", err) + } + if returnedResult == nil || !correctResult.Signature.Equals(returnedResult.Signature) { + t.Fatalf("expected the quorum signature, got: [%v]", returnedResult) + } + testutils.AssertIntsEqual(t, "end block", int(attemptTimeoutBlock), int(endBlock)) +} + +// TestSigningDoneCheck_OversizedSplitBelowQuorumTimesOut covers the no-quorum +// case (e.g. a coordinator equivocation splitting honest nodes across two +// signatures, plus offline members): with the reporters split below the +// threshold in every signature bucket, no bucket reaches t, so the check must +// time out rather than conclude on a non-quorum subset. +func TestSigningDoneCheck_OversizedSplitBelowQuorumTimesOut(t *testing.T) { + groupParameters := &GroupParameters{ + GroupSize: 5, + GroupQuorum: 4, + HonestThreshold: 3, + } + + doneCheck := setupSigningDoneCheck(t, groupParameters) + + memberIndexes := make([]group.MemberIndex, doneCheck.groupSize) + for i := range memberIndexes { + memberIndexes[i] = group.MemberIndex(i + 1) + } + + ctx, cancelCtx := context.WithTimeout(context.Background(), 1*time.Second) + defer cancelCtx() + + message := big.NewInt(100) + attemptNumber := uint64(1) + attemptTimeoutBlock := uint64(1000) + attemptMemberIndexes := memberIndexes // oversized (5 > threshold 3) + resultA := &signing.Result{ + Signature: mustFrostSignatureFromBigInts(big.NewInt(200), big.NewInt(300)), + } + resultB := &signing.Result{ + Signature: mustFrostSignatureFromBigInts(big.NewInt(201), big.NewInt(300)), + } + + doneCheck.listen(ctx, message, attemptNumber, attemptTimeoutBlock, attemptMemberIndexes) + + // 2 report signature A, 2 report signature B, 1 offline: no bucket reaches t=3. + for _, i := range []uint8{1, 2} { + if err := doneCheck.signalDone(ctx, i, message, attemptNumber, resultA, 100); err != nil { + t.Fatal(err) + } + } + for _, i := range []uint8{3, 4} { + if err := doneCheck.signalDone(ctx, i, message, attemptNumber, resultB, 100); err != nil { + t.Fatal(err) + } + } + + returnedResult, endBlock, err := doneCheck.waitUntilAllDone(ctx) + if returnedResult != nil { + t.Errorf("expected nil result on a below-quorum split, has [%v]", returnedResult) + } + testutils.AssertIntsEqual(t, "end block", 0, int(endBlock)) + testutils.AssertErrorsSame(t, errWaitDoneTimedOut, err) +} + // signingDoneCheckComponents holds the shared state used to construct one or // more signingDoneCheck instances that communicate over the same channel. type signingDoneCheckComponents struct { groupSize int + honestThreshold int broadcastChannel net.BroadcastChannel membershipValidator *group.MembershipValidator } func (c *signingDoneCheckComponents) newCheck() *signingDoneCheck { - return newSigningDoneCheck(c.groupSize, c.broadcastChannel, c.membershipValidator) + return newSigningDoneCheck(c.groupSize, c.honestThreshold, c.broadcastChannel, c.membershipValidator) } // setupSigningDoneCheckComponents builds the shared channel and validator @@ -341,6 +523,7 @@ func setupSigningDoneCheckComponents( return &signingDoneCheckComponents{ groupSize: groupParameters.GroupSize, + honestThreshold: groupParameters.HonestThreshold, broadcastChannel: broadcastChannel, membershipValidator: membershipValidator, }