Skip to content

Commit 0671d99

Browse files
mswilkisonclaude
andcommitted
feat(frost): conclude the oversized signing done check by signature quorum (7.3 t-of-included PR3/3)
Under RFC-21 Phase 7.3 t-of-included finalize, an interactive signing attempt is signed by the first t responsive committers of an (optionally) oversized included set; the non-subset and offline members never broadcast a signing done check, so the outer signingDoneCheck - which required a confirmation from EVERY attempt member - would hang an otherwise-successful attempt to its timeout and force a needless retry. Conclude the done check on a deterministic threshold quorum instead, keeping the legacy (non-oversized) path byte-for-byte unchanged: - non-oversized (included == honestThreshold; today's selector output and the whole coarse path): the legacy all-members rule is UNCHANGED - wait for every attempt member, require all signatures equal, return max(endBlock). - oversized (included > honestThreshold): bucket done checks by signature and conclude once a bucket holds >= honestThreshold distinct senders (the minimum that proves a valid threshold signature). Minority buckets (divergent or adversarial signatures) are IGNORED, never fatal, so one bad done message cannot fracture the group. The end block is the DETERMINISTIC attempt timeout block, not a network-order-dependent max over done messages. The deterministic end block is the crux: batch scheduling derives the next signature's start as signingStartBlock = prev endBlock + interlude, so a per-node max over a network-order-dependent subset would desync the batch. Returning attemptTimeoutBlock (which every honest node computes identically) fixes it. With honest majority (t > groupSize/2) at most one signature bucket can reach t - even under coordinator equivocation - so the quorum is unique and no body-hash / proto change is needed; the >1-quorum branch is unreachable and intentionally non-fatal. Design locked via a Codex+Gemini consult after the initial first-t-arrivals approach was shown to be cross-node non-deterministic (it fed an order-dependent end block into batch scheduling). Tests: legacy done-check tests unchanged; added oversized conclude-on-quorum (with a deterministic end block), minority-divergent-ignored, and split-below-quorum-times-out. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 9a2d113 commit 0671d99

3 files changed

Lines changed: 338 additions & 52 deletions

File tree

pkg/tbtc/signing.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ func (se *signingExecutor) signWithTaprootMerkleRoot(
373373

374374
doneCheck := newSigningDoneCheck(
375375
se.groupParameters.GroupSize,
376+
se.groupParameters.HonestThreshold,
376377
se.broadcastChannel,
377378
se.membershipValidator,
378379
)

pkg/tbtc/signing_done.go

Lines changed: 153 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -47,23 +47,42 @@ func (sdm *signingDoneMessage) Type() string {
4747
// successful signature calculation across all signing group members.
4848
type signingDoneCheck struct {
4949
groupSize int
50+
honestThreshold int
5051
broadcastChannel net.BroadcastChannel
5152
membershipValidator *group.MembershipValidator
5253

53-
receiveCtx context.Context
54-
cancelReceiveCtx context.CancelFunc
55-
expectedSignersCount int
56-
doneSigners map[group.MemberIndex]*signingDoneMessage
57-
doneSignersMutex sync.RWMutex
54+
receiveCtx context.Context
55+
cancelReceiveCtx context.CancelFunc
56+
// attemptMemberCount is len(attemptMembersIndexes) for the live attempt - the
57+
// number of confirmations the legacy (non-oversized) rule waits for.
58+
attemptMemberCount int
59+
// oversized is true when the attempt's included set is larger than the honest
60+
// threshold: the RFC-21 Phase 7.3 t-of-included case, where the attempt is
61+
// signed by a t-subset so the non-subset / offline members never report done
62+
// and the legacy all-members rule would hang. It selects the
63+
// quorum-by-signature completion rule. When false (included == threshold, the
64+
// pre-oversizing selector output and the whole coarse path) the legacy rule is
65+
// used UNCHANGED.
66+
oversized bool
67+
// attemptTimeoutBlock is the deterministic block the attempt concludes by
68+
// (announcementEndBlock + signingAttemptMaximumProtocolBlocks). On the oversized
69+
// path it is returned as the result end block instead of a network-order-
70+
// dependent max over done messages, so every honest node feeds signBatch the
71+
// same next-signature start block (signingStartBlock = prev endBlock + interlude).
72+
attemptTimeoutBlock uint64
73+
doneSigners map[group.MemberIndex]*signingDoneMessage
74+
doneSignersMutex sync.RWMutex
5875
}
5976

6077
func newSigningDoneCheck(
6178
groupSize int,
79+
honestThreshold int,
6280
broadcastChannel net.BroadcastChannel,
6381
membershipValidator *group.MembershipValidator,
6482
) *signingDoneCheck {
6583
return &signingDoneCheck{
6684
groupSize: groupSize,
85+
honestThreshold: honestThreshold,
6786
broadcastChannel: broadcastChannel,
6887
membershipValidator: membershipValidator,
6988
}
@@ -91,7 +110,17 @@ func (sdc *signingDoneCheck) listen(
91110
sdc.receiveCtx, sdc.cancelReceiveCtx = context.WithCancel(ctx)
92111

93112
sdc.doneSignersMutex.Lock()
94-
sdc.expectedSignersCount = len(attemptMembersIndexes)
113+
sdc.attemptMemberCount = len(attemptMembersIndexes)
114+
// An included set larger than the honest threshold is the RFC-21 Phase 7.3
115+
// t-of-included case: the attempt is signed by a t-subset and the non-subset /
116+
// offline members never report done, so the all-members rule would hang. The
117+
// oversized path concludes on a quorum of >= honestThreshold matching
118+
// signatures instead, with a deterministic end block. included == threshold
119+
// (today's selector output and the whole coarse path) keeps the legacy rule
120+
// byte-for-byte, so behavior is unchanged until participant selection oversizes
121+
// the set.
122+
sdc.oversized = len(attemptMembersIndexes) > sdc.honestThreshold
123+
sdc.attemptTimeoutBlock = attemptTimeoutBlock
95124
sdc.doneSigners = make(map[group.MemberIndex]*signingDoneMessage)
96125
sdc.doneSignersMutex.Unlock()
97126

@@ -149,12 +178,12 @@ func (sdc *signingDoneCheck) signalDone(
149178
}, net.BackoffRetransmissionStrategy)
150179
}
151180

152-
// waitUntilAllDone blocks until it receives all the required done checks from
153-
// members or until the passed context is done. In the first case, it returns
154-
// the signature computed by the signing members and the block at which the
155-
// slowest signer completed the signature computation process. If the expected
156-
// done checks are not received on time, the function returns an error. If at
157-
// least one signature is different from others, the function returns an error.
181+
// waitUntilAllDone blocks until the attempt's completion rule is met or the
182+
// passed context is done. On success it returns the agreed signature and a
183+
// deterministic end block (the same value on every honest node): on the legacy
184+
// path the block at which the slowest attempt member completed, on the oversized
185+
// path the attempt timeout block. It returns errWaitDoneTimedOut if the rule is
186+
// not met on time, and a non-nil error on a fatal divergence (legacy path only).
158187
func (sdc *signingDoneCheck) waitUntilAllDone(ctx context.Context) (
159188
*signing.Result,
160189
uint64,
@@ -171,35 +200,123 @@ func (sdc *signingDoneCheck) waitUntilAllDone(ctx context.Context) (
171200
return nil, 0, errWaitDoneTimedOut
172201

173202
case <-ticker.C:
174-
expectedSignersCount, doneSigners := sdc.snapshotDoneSigners()
175-
if expectedSignersCount == len(doneSigners) {
176-
var signature *frost.Signature
177-
var latestEndBlock uint64
178-
179-
for _, doneMessage := range doneSigners {
180-
if signature == nil {
181-
signature = doneMessage.signature
182-
} else {
183-
if !signature.Equals(doneMessage.signature) {
184-
return nil, 0, fmt.Errorf(
185-
"not matching signatures detected: [%v] and [%v]",
186-
signature,
187-
doneMessage.signature,
188-
)
189-
}
190-
}
191-
192-
if doneMessage.endBlock > latestEndBlock {
193-
latestEndBlock = doneMessage.endBlock
194-
}
195-
}
196-
197-
return &signing.Result{Signature: signature}, latestEndBlock, nil
203+
result, endBlock, concluded, err := sdc.evaluateDone()
204+
if err != nil {
205+
return nil, 0, err
206+
}
207+
if concluded {
208+
return result, endBlock, nil
198209
}
199210
}
200211
}
201212
}
202213

214+
// evaluateDone snapshots the done checks collected so far and applies the
215+
// attempt's completion rule. It returns (result, endBlock, true, nil) once the
216+
// attempt can conclude, (nil, 0, false, nil) while still waiting, or
217+
// (nil, 0, false, err) on a fatal divergence. The legacy (non-oversized) rule
218+
// and the oversized t-of-included rule are kept fully separate so the legacy /
219+
// coarse path is byte-for-byte unchanged.
220+
func (sdc *signingDoneCheck) evaluateDone() (*signing.Result, uint64, bool, error) {
221+
sdc.doneSignersMutex.RLock()
222+
oversized := sdc.oversized
223+
attemptMemberCount := sdc.attemptMemberCount
224+
attemptTimeoutBlock := sdc.attemptTimeoutBlock
225+
honestThreshold := sdc.honestThreshold
226+
doneSigners := make([]*signingDoneMessage, 0, len(sdc.doneSigners))
227+
for _, doneMessage := range sdc.doneSigners {
228+
doneSigners = append(doneSigners, doneMessage.clone())
229+
}
230+
sdc.doneSignersMutex.RUnlock()
231+
232+
if oversized {
233+
return concludeOversizedDone(doneSigners, honestThreshold, attemptTimeoutBlock)
234+
}
235+
return concludeLegacyDone(doneSigners, attemptMemberCount)
236+
}
237+
238+
// concludeLegacyDone is the pre-7.3 rule, UNCHANGED: conclude once every attempt
239+
// member confirmed, require all signatures equal, and return the max end block.
240+
// The attemptMemberCount > 0 guard rejects the pre-listen state (no attempt
241+
// configured) so an empty done set is never read as success.
242+
func concludeLegacyDone(
243+
doneSigners []*signingDoneMessage,
244+
attemptMemberCount int,
245+
) (*signing.Result, uint64, bool, error) {
246+
if attemptMemberCount == 0 || len(doneSigners) != attemptMemberCount {
247+
return nil, 0, false, nil
248+
}
249+
250+
var signature *frost.Signature
251+
var latestEndBlock uint64
252+
for _, doneMessage := range doneSigners {
253+
if signature == nil {
254+
signature = doneMessage.signature
255+
} else if !signature.Equals(doneMessage.signature) {
256+
return nil, 0, false, fmt.Errorf(
257+
"not matching signatures detected: [%v] and [%v]",
258+
signature,
259+
doneMessage.signature,
260+
)
261+
}
262+
263+
if doneMessage.endBlock > latestEndBlock {
264+
latestEndBlock = doneMessage.endBlock
265+
}
266+
}
267+
268+
return &signing.Result{Signature: signature}, latestEndBlock, true, nil
269+
}
270+
271+
// concludeOversizedDone is the RFC-21 Phase 7.3 t-of-included rule: bucket the
272+
// done checks by signature (one done message per sender, so each member is in
273+
// exactly one bucket) and conclude once a bucket holds >= honestThreshold
274+
// distinct senders - the minimum that proves a valid threshold signature.
275+
// Minority buckets (divergent or adversarial signatures) are IGNORED, never
276+
// fatal, so a single bad done message cannot fracture the group. The end block
277+
// is the deterministic attempt timeout block, not a network-order-dependent max,
278+
// so every honest node returns the same value for batch scheduling.
279+
func concludeOversizedDone(
280+
doneSigners []*signingDoneMessage,
281+
honestThreshold int,
282+
attemptTimeoutBlock uint64,
283+
) (*signing.Result, uint64, bool, error) {
284+
if honestThreshold <= 0 {
285+
return nil, 0, false, nil
286+
}
287+
288+
bucketSig := map[string]*frost.Signature{}
289+
bucketCount := map[string]int{}
290+
for _, doneMessage := range doneSigners {
291+
serialized := doneMessage.signature.Serialize()
292+
key := string(serialized[:])
293+
if _, ok := bucketSig[key]; !ok {
294+
bucketSig[key] = doneMessage.signature
295+
}
296+
bucketCount[key]++
297+
}
298+
299+
var quorumSig *frost.Signature
300+
quorums := 0
301+
for key, count := range bucketCount {
302+
if count >= honestThreshold {
303+
quorums++
304+
quorumSig = bucketSig[key]
305+
}
306+
}
307+
308+
// Exactly one >= t bucket is the only reachable outcome under honest majority
309+
// (honestThreshold > groupSize/2 means two disjoint >= t buckets cannot
310+
// coexist). It carries the one valid signature and concludes deterministically.
311+
// quorums == 0 keeps waiting; quorums > 1 is unreachable and intentionally NOT
312+
// concluded (the attempt fails via the ctx timeout rather than picking a bucket
313+
// nondeterministically - a bare done-message split is not coordinator blame).
314+
if quorums == 1 {
315+
return &signing.Result{Signature: quorumSig}, attemptTimeoutBlock, true, nil
316+
}
317+
return nil, 0, false, nil
318+
}
319+
203320
// isValidDoneMessage validates the given signingDoneMessage in the context
204321
// of the given signing attempt.
205322
func (sdc *signingDoneCheck) isValidDoneMessage(
@@ -250,21 +367,6 @@ func (sdc *signingDoneCheck) recordDoneMessage(
250367
return true
251368
}
252369

253-
func (sdc *signingDoneCheck) snapshotDoneSigners() (
254-
int,
255-
[]*signingDoneMessage,
256-
) {
257-
sdc.doneSignersMutex.RLock()
258-
defer sdc.doneSignersMutex.RUnlock()
259-
260-
result := make([]*signingDoneMessage, 0, len(sdc.doneSigners))
261-
for _, doneMessage := range sdc.doneSigners {
262-
result = append(result, doneMessage.clone())
263-
}
264-
265-
return sdc.expectedSignersCount, result
266-
}
267-
268370
func (sdm *signingDoneMessage) clone() *signingDoneMessage {
269371
if sdm == nil {
270372
return nil

0 commit comments

Comments
 (0)