From b3ff29a6176bcbe1540ed89f6ce7e5533acccce3 Mon Sep 17 00:00:00 2001 From: maclane Date: Fri, 19 Jun 2026 12:37:26 -0400 Subject: [PATCH] feat(frost): route coarse ROAST evidence into the transition bundle (7.3 PR2b-2 step 2) The transition exchange aggregated FORCED-EMPTY proof-of-attendance snapshots, so NextAttempt's f+1 accuser tally never saw the real rejects/overflows/conflicts the coarse receive loop collects -- blame/exclusion could not fire. submitSnapshotIfActive recorded that evidence to the DRIVE handle, which is never aggregated (the exchange is the sole bundle producer, keyed by the OBSERVE handle), so it was a write-only dead end. Bridge the two: a member-keyed pending-evidence stash keyed by (RoastSessionID, member, attemptHash) -- the namespace-independent coordinate shared by the drive and observe worlds -- carries the raw recorder snapshot. submitSnapshotIfActive stashes it (dropping the dead drive-handle RecordEvidence); BroadcastForcedSnapshot consumes it and builds the ONE signed snapshot it broadcasts, so the elected coordinator's AggregateBundle includes the real evidence and computeNextAttempt can establish + exclude an f+1-accused member. Lifecycle mirrors the observe registry: consume on broadcast, clear on local success and session end, TTL sweep as a backstop; the Evidence is deep-copied on store. The stale selector comment is retired (the ROAST selector is already wired under frost_roast_retry). Design locked via Codex + Gemini consult (both PROCEED, no holes). Scoped to the coarse f+1 path; coordinator-equivocation proofs (interactive Round2Collector) are step 2b. Co-Authored-By: Claude Opus 4.8 --- ..._retry_attempt_handle_frost_roast_retry.go | 4 + ...oast_retry_evidence_stash_default_build.go | 20 + ..._retry_evidence_stash_frost_roast_retry.go | 204 +++++++++ ...y_evidence_stash_frost_roast_retry_test.go | 165 ++++++++ pkg/frost/signing/roast_retry_submit.go | 135 ++---- ...ast_retry_submit_frost_roast_retry_test.go | 386 +++++------------- ...ition_exchange_frost_native_roast_retry.go | 34 +- ..._exchange_frost_native_roast_retry_test.go | 96 +++++ pkg/tbtc/signing_loop.go | 10 +- ...ion_controller_frost_native_roast_retry.go | 9 + 10 files changed, 666 insertions(+), 397 deletions(-) create mode 100644 pkg/frost/signing/roast_retry_evidence_stash_default_build.go create mode 100644 pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry.go create mode 100644 pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry_test.go diff --git a/pkg/frost/signing/roast_retry_attempt_handle_frost_roast_retry.go b/pkg/frost/signing/roast_retry_attempt_handle_frost_roast_retry.go index 1651d1d9e1..3921ffe9d0 100644 --- a/pkg/frost/signing/roast_retry_attempt_handle_frost_roast_retry.go +++ b/pkg/frost/signing/roast_retry_attempt_handle_frost_roast_retry.go @@ -161,6 +161,10 @@ func sessionHandleSweepLoop(stop <-chan struct{}) { // anything past the TTL. evictStaleObservedAttempts(ObservedAttemptRegistryTTL) evictStaleRoastTransitions(RoastTransitionRegistryTTL) + // Stashed coarse-path evidence is normally consumed by + // BroadcastForcedSnapshot or cleared on success/session-end (RFC-21 + // Phase 7.3 PR2b-2 step 2); sweep any orphaned by an abnormal end. + evictStalePendingEvidence(PendingEvidenceRegistryTTL) } } } diff --git a/pkg/frost/signing/roast_retry_evidence_stash_default_build.go b/pkg/frost/signing/roast_retry_evidence_stash_default_build.go new file mode 100644 index 0000000000..61c10087d2 --- /dev/null +++ b/pkg/frost/signing/roast_retry_evidence_stash_default_build.go @@ -0,0 +1,20 @@ +//go:build !frost_roast_retry + +package signing + +import ( + "github.com/keep-network/keep-core/pkg/frost/roast/attempt" + "github.com/keep-network/keep-core/pkg/protocol/group" +) + +// stashPendingEvidence is a no-op in the default build: with no ROAST-retry +// orchestration there is never a session-handle binding, so submitSnapshotIfActive +// returns before reaching this call. The build-tagged implementation does the real +// stashing the transition exchange's BroadcastForcedSnapshot consumes. +func stashPendingEvidence( + _ string, + _ group.MemberIndex, + _ [attempt.MessageDigestLength]byte, + _ attempt.Evidence, +) { +} diff --git a/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry.go b/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry.go new file mode 100644 index 0000000000..ae77698b96 --- /dev/null +++ b/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry.go @@ -0,0 +1,204 @@ +//go:build frost_roast_retry + +package signing + +import ( + "sync" + "time" + + "github.com/keep-network/keep-core/pkg/frost/roast/attempt" + "github.com/keep-network/keep-core/pkg/protocol/group" +) + +// PendingEvidenceRegistryTTL is how long a stashed evidence entry is retained +// before the background sweeper evicts it. Matches the observe-binding TTL: a +// stashed snapshot is useless once the session it tracks is archived. +const PendingEvidenceRegistryTTL = SessionHandleBindingTTL + +// pendingEvidenceKey scopes one local seat's captured coarse-path evidence to a +// specific attempt of a ROAST session. +// +// RFC-21 Phase 7.3 PR2b-2 step 2 (the blame bridge): the coarse receive loop +// captures real rejects/overflows/conflicts into an EvidenceRecorder and, at +// end-of-collect, stashes the raw snapshot here (submitSnapshotIfActive). The +// transition exchange's BroadcastForcedSnapshot then reads it so the seat's +// broadcast snapshot -- and therefore the elected coordinator's aggregated bundle +// -- carries real evidence instead of a forced-empty proof-of-attendance one, so +// NextAttempt's f+1 accuser tally can finally fire. +// +// The key is (RoastSessionID, member, attemptHash): the same coordinate space the +// observe binding uses (attemptHash == ctx.Hash() is namespace-independent, and +// the stashing ctx.SessionID is the STABLE RoastSessionID == the exchange's +// e.roastSessionID). The member component isolates a multi-seat operator's sibling +// seats: they share a RoastSessionID and attemptHash but each stashes -- and +// broadcasts -- its OWN evidence, so neither overwrites the other. +type pendingEvidenceKey struct { + sessionID string + member group.MemberIndex + attemptHash [attempt.MessageDigestLength]byte +} + +type pendingEvidenceEntry struct { + evidence attempt.Evidence + createdAt time.Time +} + +var ( + pendingEvidenceMu sync.Mutex + pendingEvidenceRegistry = map[pendingEvidenceKey]pendingEvidenceEntry{} +) + +// stashPendingEvidence stores a deep COPY of the captured evidence for +// (sessionID, member, attemptHash). A later call for the same key overwrites the +// earlier one (a re-driven attempt re-stashes). The deep copy means a later +// mutation of the caller's recorder snapshot cannot race the exchange's read. +func stashPendingEvidence( + sessionID string, + member group.MemberIndex, + attemptHash [attempt.MessageDigestLength]byte, + evidence attempt.Evidence, +) { + key := pendingEvidenceKey{sessionID, member, attemptHash} + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + pendingEvidenceRegistry[key] = pendingEvidenceEntry{ + evidence: copyEvidence(evidence), + createdAt: time.Now(), + } +} + +// takePendingEvidence returns the stashed evidence for (sessionID, member, +// attemptHash) and a presence flag, REMOVING it (consume-on-read). The returned +// value is the sole reference -- the entry is deleted from the registry -- so the +// caller owns it exclusively without a further copy. BroadcastForcedSnapshot calls +// it: a present entry means the coarse receive loop observed real evidence for the +// attempt; absent means none was captured (the broadcast then carries an empty +// proof-of-attendance snapshot). +func takePendingEvidence( + sessionID string, + member group.MemberIndex, + attemptHash [attempt.MessageDigestLength]byte, +) (attempt.Evidence, bool) { + key := pendingEvidenceKey{sessionID, member, attemptHash} + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + entry, ok := pendingEvidenceRegistry[key] + if !ok { + return attempt.Evidence{}, false + } + delete(pendingEvidenceRegistry, key) + return entry.evidence, true +} + +// clearPendingEvidence removes any stashed evidence for (sessionID, member, +// attemptHash). +func clearPendingEvidence( + sessionID string, + member group.MemberIndex, + attemptHash [attempt.MessageDigestLength]byte, +) { + key := pendingEvidenceKey{sessionID, member, attemptHash} + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + delete(pendingEvidenceRegistry, key) +} + +// ClearPendingEvidenceOnLocalSuccess removes the stashed evidence for an attempt +// this seat completed successfully. A succeeded attempt produces no transition +// bundle, so BroadcastForcedSnapshot never consumes the stash; clearing here +// prevents a leak until the TTL backstop. Exported so the pkg/tbtc transition +// controller can call it alongside ClearObservedAttemptOnLocalSuccess (RFC-21 +// Phase 7.3 PR2b-2). +func ClearPendingEvidenceOnLocalSuccess( + sessionID string, + member group.MemberIndex, + attemptHash [attempt.MessageDigestLength]byte, +) { + clearPendingEvidence(sessionID, member, attemptHash) +} + +// clearPendingEvidenceForSession removes every stashed evidence entry for +// (sessionID, member), regardless of attempt hash. The transition exchange calls +// it when the session ends (its listener context is done), mirroring +// clearObservedAttemptsForSession, so a signing whose attempts succeeded -- and +// therefore never consumed the stash via BroadcastForcedSnapshot -- does not leave +// entries behind. +func clearPendingEvidenceForSession(sessionID string, member group.MemberIndex) { + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + for key := range pendingEvidenceRegistry { + if key.sessionID == sessionID && key.member == member { + delete(pendingEvidenceRegistry, key) + } + } +} + +// evictStalePendingEvidence sweeps the registry and removes entries older than +// maxAge. Exposed at the package level so tests can invoke it directly with small +// maxAge values and so the shared session-handle sweeper can fold it into one +// background goroutine: a session that ends abnormally must not orphan a stash +// entry past its backstop TTL. +func evictStalePendingEvidence(maxAge time.Duration) int { + cutoff := time.Now().Add(-maxAge) + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + evicted := 0 + for key, entry := range pendingEvidenceRegistry { + if entry.createdAt.Before(cutoff) { + delete(pendingEvidenceRegistry, key) + evicted++ + } + } + return evicted +} + +// ResetPendingEvidenceRegistryForTest clears every stashed entry. Test-only seam; +// not for production code paths. +func ResetPendingEvidenceRegistryForTest() { + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + pendingEvidenceRegistry = map[pendingEvidenceKey]pendingEvidenceEntry{} +} + +// PendingEvidenceStashedForTest reports whether any stash entry exists for +// (sessionID, member), regardless of attempt hash. Exported test seam so +// downstream-package tests can assert the stash without reaching into the +// unexported registry. +func PendingEvidenceStashedForTest(sessionID string, member group.MemberIndex) bool { + pendingEvidenceMu.Lock() + defer pendingEvidenceMu.Unlock() + for key := range pendingEvidenceRegistry { + if key.sessionID == sessionID && key.member == member { + return true + } + } + return false +} + +// copyEvidence returns a deep copy of an attempt.Evidence: the three maps are +// re-allocated and the per-sender RejectEntry slices are cloned. RejectEntry is a +// flat value type (Reason string, Count uint), so a slice copy is a full deep +// copy. nil maps stay nil (a nil category means "did not fire", which +// NewLocalEvidenceSnapshot and NextAttempt both treat as empty). +func copyEvidence(e attempt.Evidence) attempt.Evidence { + out := attempt.Evidence{} + if e.Overflows != nil { + out.Overflows = make(map[group.MemberIndex]uint, len(e.Overflows)) + for k, v := range e.Overflows { + out.Overflows[k] = v + } + } + if e.Rejects != nil { + out.Rejects = make(map[group.MemberIndex][]attempt.RejectEntry, len(e.Rejects)) + for k, v := range e.Rejects { + out.Rejects[k] = append([]attempt.RejectEntry(nil), v...) + } + } + if e.Conflicts != nil { + out.Conflicts = make(map[group.MemberIndex]uint, len(e.Conflicts)) + for k, v := range e.Conflicts { + out.Conflicts[k] = v + } + } + return out +} diff --git a/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry_test.go b/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry_test.go new file mode 100644 index 0000000000..c88a54aad1 --- /dev/null +++ b/pkg/frost/signing/roast_retry_evidence_stash_frost_roast_retry_test.go @@ -0,0 +1,165 @@ +//go:build frost_roast_retry + +package signing + +import ( + "testing" + "time" + + "github.com/keep-network/keep-core/pkg/frost/roast/attempt" + "github.com/keep-network/keep-core/pkg/protocol/group" +) + +func stashTestHash(b byte) [attempt.MessageDigestLength]byte { + var h [attempt.MessageDigestLength]byte + h[0] = b + return h +} + +func stashTestEvidence() attempt.Evidence { + return attempt.Evidence{ + Overflows: map[group.MemberIndex]uint{2: 1}, + Rejects: map[group.MemberIndex][]attempt.RejectEntry{ + 3: {{Reason: "r", Count: 1}}, + }, + Conflicts: map[group.MemberIndex]uint{4: 1}, + } +} + +func TestPendingEvidenceStash_StoreTakeConsumes(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + hash := stashTestHash(0x11) + stashPendingEvidence("s", 1, hash, stashTestEvidence()) + + if !PendingEvidenceStashedForTest("s", 1) { + t.Fatal("entry must be present after store") + } + got, ok := takePendingEvidence("s", 1, hash) + if !ok { + t.Fatal("take must find the stored entry") + } + if got.Overflows[2] != 1 || got.Conflicts[4] != 1 || len(got.Rejects[3]) != 1 { + t.Fatalf("taken evidence does not match stored: %+v", got) + } + if _, ok := takePendingEvidence("s", 1, hash); ok { + t.Fatal("take must consume: a second take finds nothing") + } + if PendingEvidenceStashedForTest("s", 1) { + t.Fatal("entry must be gone after consume") + } +} + +// TestPendingEvidenceStash_DeepCopyIsolatesCallerMutation proves copyEvidence +// deep-copies on store: mutating the caller's Evidence (maps AND the per-sender +// reject slice) after stashing must not change what the exchange later reads. +func TestPendingEvidenceStash_DeepCopyIsolatesCallerMutation(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + hash := stashTestHash(0x22) + src := stashTestEvidence() + stashPendingEvidence("s", 1, hash, src) + + // Mutate every layer of the source after the store. + src.Overflows[2] = 99 + src.Overflows[5] = 7 // new key + src.Rejects[3][0].Count = 99 + src.Conflicts[4] = 99 + + got, ok := takePendingEvidence("s", 1, hash) + if !ok { + t.Fatal("take must find the stored entry") + } + if got.Overflows[2] != 1 { + t.Fatalf("overflow count must reflect store-time value 1; got %d", got.Overflows[2]) + } + if _, present := got.Overflows[5]; present { + t.Fatal("a key added to the source after store must not appear in the stash") + } + if got.Rejects[3][0].Count != 1 { + t.Fatalf("reject slice element must reflect store-time value 1; got %d", got.Rejects[3][0].Count) + } + if got.Conflicts[4] != 1 { + t.Fatalf("conflict count must reflect store-time value 1; got %d", got.Conflicts[4]) + } +} + +// TestPendingEvidenceStash_MemberKeyedIsolation asserts two seats sharing the same +// (sessionID, attemptHash) keep separate entries -- the member-keying that prevents +// a sibling seat from overwriting another's evidence. +func TestPendingEvidenceStash_MemberKeyedIsolation(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + hash := stashTestHash(0x33) + stashPendingEvidence("s", 1, hash, attempt.Evidence{Overflows: map[group.MemberIndex]uint{7: 1}}) + stashPendingEvidence("s", 2, hash, attempt.Evidence{Overflows: map[group.MemberIndex]uint{8: 1}}) + + got1, ok1 := takePendingEvidence("s", 1, hash) + got2, ok2 := takePendingEvidence("s", 2, hash) + if !ok1 || !ok2 { + t.Fatalf("both member entries must exist; ok1=%v ok2=%v", ok1, ok2) + } + if got1.Overflows[7] != 1 || got1.Overflows[8] != 0 { + t.Fatalf("seat 1 entry bled; got %+v", got1.Overflows) + } + if got2.Overflows[8] != 1 || got2.Overflows[7] != 0 { + t.Fatalf("seat 2 entry bled; got %+v", got2.Overflows) + } +} + +func TestPendingEvidenceStash_ClearForSession(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + stashPendingEvidence("s", 1, stashTestHash(0x01), stashTestEvidence()) + stashPendingEvidence("s", 1, stashTestHash(0x02), stashTestEvidence()) + stashPendingEvidence("other", 1, stashTestHash(0x03), stashTestEvidence()) + + clearPendingEvidenceForSession("s", 1) + + if PendingEvidenceStashedForTest("s", 1) { + t.Fatal("clearForSession must remove every attempt of (s,1)") + } + if !PendingEvidenceStashedForTest("other", 1) { + t.Fatal("clearForSession must not touch a different session") + } +} + +func TestClearPendingEvidenceOnLocalSuccess(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + hash := stashTestHash(0x44) + stashPendingEvidence("s", 1, hash, stashTestEvidence()) + ClearPendingEvidenceOnLocalSuccess("s", 1, hash) + if PendingEvidenceStashedForTest("s", 1) { + t.Fatal("success clear must remove the attempt's stash entry") + } +} + +// TestEvictStalePendingEvidence asserts the TTL backstop: a fresh entry survives a +// long TTL and is swept once the cutoff passes its creation time. A negative +// max-age sets the cutoff in the future, so the entry is deterministically stale +// without sleeping. +func TestEvictStalePendingEvidence(t *testing.T) { + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + stashPendingEvidence("s", 1, stashTestHash(0x55), stashTestEvidence()) + + if n := evictStalePendingEvidence(time.Hour); n != 0 { + t.Fatalf("a fresh entry must survive a long TTL; evicted %d", n) + } + if !PendingEvidenceStashedForTest("s", 1) { + t.Fatal("entry must remain after a no-op sweep") + } + if n := evictStalePendingEvidence(-time.Hour); n != 1 { + t.Fatalf("a future cutoff must evict the entry; evicted %d", n) + } + if PendingEvidenceStashedForTest("s", 1) { + t.Fatal("entry must be gone after the sweep") + } +} diff --git a/pkg/frost/signing/roast_retry_submit.go b/pkg/frost/signing/roast_retry_submit.go index f4d7157b84..bdd659f9c9 100644 --- a/pkg/frost/signing/roast_retry_submit.go +++ b/pkg/frost/signing/roast_retry_submit.go @@ -1,43 +1,39 @@ package signing import ( - "github.com/ipfs/go-log/v2" - "github.com/keep-network/keep-core/pkg/frost/roast" "github.com/keep-network/keep-core/pkg/frost/roast/attempt" "github.com/keep-network/keep-core/pkg/protocol/group" ) -// roastRetryLogger is the logger the snapshot-submission path uses -// for non-fatal diagnostics (submission failures, signature errors). -// A submission failure does not propagate to the signing flow: -// Phase 4 ships the submission code path unused in production, and -// even when wired (Phase 5+) a transient submission failure is -// recoverable by the next attempt's evidence flow. -var roastRetryLogger = log.Logger("keep-frost-roast-retry") - -// submitSnapshotIfActive is invoked at end-of-collect to push the -// receive loop's accumulated evidence into the ROAST coordinator's -// RecordEvidence pipeline. member is the local seat whose receive loop -// is submitting (request.MemberIndex). The path is fully member-aware -// (RFC-21 Phase 7.3 PR2b-2): a multi-seat operator's sibling seats each -// submit their own evidence against their own coordinator and attempt -// handle, so they no longer mis-attribute or collide (which is why the -// PR2b-1.5 multi-seat no-op guard is gone). The function is a no-op when -// any of the following is true: +// submitSnapshotIfActive is invoked at end-of-collect to capture the receive +// loop's accumulated evidence for the ROAST blame pipeline. member is the local +// seat whose receive loop is submitting (request.MemberIndex). The path is fully +// member-aware (RFC-21 Phase 7.3 PR2b-2): a multi-seat operator's sibling seats +// each capture their own evidence against their own attempt binding, so they never +// mis-attribute or collide. +// +// RFC-21 Phase 7.3 PR2b-2 step 2 (the blame bridge): the captured evidence is +// STASHED keyed by the attempt's (RoastSessionID, member, attemptHash), NOT +// recorded against the drive handle. The drive handle is never aggregated -- the +// transition exchange is the sole bundle producer and aggregates the OBSERVE +// handle -- so a RecordEvidence here was a write-only dead end. Stashing instead +// lets the exchange's BroadcastForcedSnapshot build + sign ONE snapshot carrying +// this evidence, so the elected coordinator's AggregateBundle includes it and +// NextAttempt's f+1 accuser tally can finally fire. // -// - this member has no registered ROAST-retry coordinator (default -// build where RegisterRoastRetryCoordinator is a no-op, or a -// partially-registered operator where only a sibling seat is wired); -// - no session-handle binding exists for (sessionID, member) (the -// typical Phase-4 state, where the orchestration layer that calls -// SetCurrentAttemptHandleForSession is not yet implemented); -// - the recorder is nil / a NoOp (no events were captured). +// The function is a no-op when any of the following holds: // -// Otherwise the function builds a LocalEvidenceSnapshot, signs it with -// this member's registered Signer, and submits it via -// Coordinator.RecordEvidence. Errors at any step are logged at WARN -// level and otherwise swallowed -- snapshot submission must not break -// the receive loop's primary signing behaviour. +// - no session-handle binding exists for (sessionID, member): the default build +// (where currentAttemptHandleForCollect always returns ok=false), or a state +// where the orchestration layer that calls SetCurrentAttemptHandleForSession +// has not run; +// - the recorder is nil / a NoOp (no events were captured); +// - the captured evidence is empty across all three categories: the exchange +// still broadcasts an empty proof-of-attendance snapshot for the attempt, so +// skipping the stash here does not silence-park the seat. +// +// Capturing must never break the receive loop's primary signing behaviour, so the +// function returns silently on every skip condition. func submitSnapshotIfActive( sessionID string, member group.MemberIndex, @@ -46,14 +42,12 @@ func submitSnapshotIfActive( if recorder == nil { return } - // Member-aware lookup: a multi-seat operator's elected seat aggregates with - // its OWN coordinator and the snapshot is attributed to deps.SelfMember == - // member. A seat with no coordinator (partial registration) submits nothing. - deps, ok := RegisteredRoastRetryCoordinatorForMember(member) - if !ok { - return - } - handle, ctx, ok := currentAttemptHandleForCollect(sessionID, member) + // The drive binding (set by BeginOrchestrationForSession) signals this seat is + // driving a ROAST attempt and carries its AttemptContext. ctx.SessionID is the + // STABLE RoastSessionID and ctx.Hash() the attempt hash -- the + // namespace-independent coordinate the transition exchange keys its observe + // binding (and this stash) by, so the broadcast resolves the same entry. + _, ctx, ok := currentAttemptHandleForCollect(sessionID, member) if !ok { return } @@ -61,61 +55,14 @@ func submitSnapshotIfActive( if len(evidence.Overflows) == 0 && len(evidence.Rejects) == 0 && len(evidence.Conflicts) == 0 { - // Truly nothing observed worth submitting; emitting an empty - // snapshot is still meaningful in the ROAST protocol - // (proof-of-attendance) but adds noise to the bundle, so we - // skip it. The emptiness test MUST consider all three evidence - // categories, not just overflows: a validation-blamable Reject - // (e.g. an attempt-context-hash mismatch) or a first-write-wins - // Conflict populates Rejects/Conflicts WITHOUT any Overflow, and - // NextAttempt's exclusion path consumes snapshot.Rejects - // (next_attempt.go). Dropping a reject/conflict-only snapshot - // here would silently starve the blame pipeline of exactly the - // validation evidence it needs. + // Truly nothing observed worth carrying. The emptiness test MUST consider + // all three categories, not just overflows: a validation-blamable Reject + // (e.g. an attempt-context-hash mismatch) or a first-write-wins Conflict + // populates Rejects/Conflicts WITHOUT any Overflow, and NextAttempt's + // exclusion path consumes snapshot.Rejects (next_attempt.go). Dropping a + // reject/conflict-only snapshot here would silently starve the blame + // pipeline of exactly the validation evidence it needs. return } - snap := buildSignedSnapshot(deps, ctx, evidence) - if snap == nil { - return - } - if err := deps.Coordinator.RecordEvidence(handle, snap); err != nil { - roastRetryLogger.Warnf( - "roast-retry: RecordEvidence failed for session %q: %v", - sessionID, - err, - ) - } -} - -// buildSignedSnapshot constructs and signs a LocalEvidenceSnapshot -// from the captured evidence. Returns nil and logs on signature -// failure; callers treat nil as "skip submission" and continue. -func buildSignedSnapshot( - deps RoastRetryDeps, - ctx attempt.AttemptContext, - evidence attempt.Evidence, -) *roast.LocalEvidenceSnapshot { - snap := roast.NewLocalEvidenceSnapshot( - group.MemberIndex(deps.SelfMember), - ctx.Hash(), - evidence, - ) - payload, err := snap.SignableBytes() - if err != nil { - roastRetryLogger.Warnf( - "roast-retry: canonicalising snapshot failed: %v", - err, - ) - return nil - } - sig, err := deps.Signer.Sign(payload) - if err != nil { - roastRetryLogger.Warnf( - "roast-retry: signing snapshot failed: %v", - err, - ) - return nil - } - snap.OperatorSignature = sig - return snap + stashPendingEvidence(ctx.SessionID, member, ctx.Hash(), evidence) } diff --git a/pkg/frost/signing/roast_retry_submit_frost_roast_retry_test.go b/pkg/frost/signing/roast_retry_submit_frost_roast_retry_test.go index b813f02000..3c97c7111d 100644 --- a/pkg/frost/signing/roast_retry_submit_frost_roast_retry_test.go +++ b/pkg/frost/signing/roast_retry_submit_frost_roast_retry_test.go @@ -3,8 +3,6 @@ package signing import ( - "errors" - "sync" "testing" "github.com/keep-network/keep-core/pkg/frost/roast" @@ -12,87 +10,6 @@ import ( "github.com/keep-network/keep-core/pkg/protocol/group" ) -// captureCoordinator is a roast.Coordinator wrapper that records -// every RecordEvidence call so tests can assert what was submitted. -// It delegates everything else to an embedded real coordinator. -type captureCoordinator struct { - inner roast.Coordinator - mu sync.Mutex - recordedFor []roast.AttemptHandle - recordedSnp []*roast.LocalEvidenceSnapshot - recordErr error -} - -func newCaptureCoordinator(inner roast.Coordinator) *captureCoordinator { - return &captureCoordinator{inner: inner} -} - -func (c *captureCoordinator) BeginAttempt(ctx attempt.AttemptContext) (roast.AttemptHandle, error) { - return c.inner.BeginAttempt(ctx) -} -func (c *captureCoordinator) State(h roast.AttemptHandle) (roast.AttemptState, error) { - return c.inner.State(h) -} -func (c *captureCoordinator) SelectedCoordinator(h roast.AttemptHandle) (group.MemberIndex, error) { - return c.inner.SelectedCoordinator(h) -} -func (c *captureCoordinator) RecordEvidence(h roast.AttemptHandle, s *roast.LocalEvidenceSnapshot) error { - c.mu.Lock() - defer c.mu.Unlock() - if c.recordErr != nil { - return c.recordErr - } - c.recordedFor = append(c.recordedFor, h) - c.recordedSnp = append(c.recordedSnp, s) - return c.inner.RecordEvidence(h, s) -} -func (c *captureCoordinator) AggregateBundle(h roast.AttemptHandle) (*roast.TransitionMessage, error) { - return c.inner.AggregateBundle(h) -} -func (c *captureCoordinator) MarkSucceeded(h roast.AttemptHandle) error { - return c.inner.MarkSucceeded(h) -} -func (c *captureCoordinator) VerifyBundle(h roast.AttemptHandle, m *roast.TransitionMessage) error { - return c.inner.VerifyBundle(h, m) -} -func (c *captureCoordinator) NextAttempt( - h roast.AttemptHandle, m *roast.TransitionMessage, t uint, pk []byte, -) (attempt.AttemptContext, error) { - return c.inner.NextAttempt(h, m, t, pk) -} - -// deterministicSigner produces SHA256(memberID || payload)-style -// signatures the captureSignatureVerifier accepts. -type deterministicSigner struct { - id group.MemberIndex -} - -func (d *deterministicSigner) Sign(payload []byte) ([]byte, error) { - out := make([]byte, len(payload)+1) - out[0] = byte(d.id) - copy(out[1:], payload) - return out, nil -} - -type deterministicVerifier struct{} - -func (deterministicVerifier) Verify( - payload []byte, signature []byte, signer group.MemberIndex, -) error { - if len(signature) != len(payload)+1 { - return errors.New("deterministicVerifier: length mismatch") - } - if signature[0] != byte(signer) { - return errors.New("deterministicVerifier: signer byte mismatch") - } - for i, b := range payload { - if signature[i+1] != b { - return errors.New("deterministicVerifier: payload byte mismatch") - } - } - return nil -} - func newTestContextForSubmit(t *testing.T, sessionID string) attempt.AttemptContext { t.Helper() ctx, err := attempt.NewAttemptContext( @@ -110,111 +27,76 @@ func newTestContextForSubmit(t *testing.T, sessionID string) attempt.AttemptCont return ctx } -func TestSubmitSnapshotIfActive_NoOpWhenRegistryEmpty(t *testing.T) { - ResetRoastRetryRegistrationForTest() +// TestSubmitSnapshotIfActive_NilRecorderIsNoOp guards the cheap nil guard: the +// receive loop falls back to a nil/NoOp recorder when ROAST retry is inactive, and +// submit must not panic or stash. +func TestSubmitSnapshotIfActive_NilRecorderIsNoOp(t *testing.T) { ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) - // No registration, no binding. submit should be a no-op. - recorder := attempt.NewBoundedRecorder() - recorder.RecordOverflow(7) - submitSnapshotIfActive("session-x", 1, recorder) - // Nothing to assert observably: success is the absence of a - // panic and no calls to a non-existent coordinator. + submitSnapshotIfActive("session-nil", 1, nil) + + if PendingEvidenceStashedForTest("session-nil", 1) { + t.Fatal("nil recorder must not stash") + } } +// TestSubmitSnapshotIfActive_NoOpWhenSessionUnbound asserts that without a +// session-handle binding (the orchestration layer has not run, or the default +// build), submit stashes nothing -- there is no attempt to attribute evidence to. func TestSubmitSnapshotIfActive_NoOpWhenSessionUnbound(t *testing.T) { - ResetRoastRetryRegistrationForTest() ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) - - innerCoord := roast.NewInMemoryCoordinator() - cap := newCaptureCoordinator(innerCoord) - RegisterRoastRetryCoordinator(RoastRetryDeps{ - Coordinator: cap, - Signer: &deterministicSigner{id: 1}, - Verifier: deterministicVerifier{}, - SelfMember: 1, - }) + t.Cleanup(ResetPendingEvidenceRegistryForTest) recorder := attempt.NewBoundedRecorder() recorder.RecordOverflow(7) submitSnapshotIfActive("session-with-no-binding", 1, recorder) - if len(cap.recordedFor) != 0 { - t.Fatalf( - "expected no RecordEvidence calls when session unbound; got %d", - len(cap.recordedFor), - ) + if PendingEvidenceStashedForTest("session-with-no-binding", 1) { + t.Fatal("expected no stash when session unbound") } } +// TestSubmitSnapshotIfActive_NoOpWhenRecorderEmpty asserts a bound attempt whose +// recorder captured zero events stashes nothing: the exchange still broadcasts an +// empty proof-of-attendance snapshot, so skipping the stash does not silence-park +// the seat. func TestSubmitSnapshotIfActive_NoOpWhenRecorderEmpty(t *testing.T) { - ResetRoastRetryRegistrationForTest() ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) - innerCoord := roast.NewInMemoryCoordinatorWithSigning( - 1, - &deterministicSigner{id: 1}, - deterministicVerifier{}, - ) - cap := newCaptureCoordinator(innerCoord) - RegisterRoastRetryCoordinator(RoastRetryDeps{ - Coordinator: cap, - Signer: &deterministicSigner{id: 1}, - Verifier: deterministicVerifier{}, - SelfMember: 1, - }) - + const selfMember group.MemberIndex = 1 ctx := newTestContextForSubmit(t, "session-empty") - handle, err := cap.BeginAttempt(ctx) - if err != nil { - t.Fatalf("begin: %v", err) - } - SetCurrentAttemptHandleForSession("session-empty", 1, handle, ctx) + SetCurrentAttemptHandleForSession("session-empty", selfMember, roast.AttemptHandle{}, ctx) - // Recorder is bounded but has captured zero events. recorder := attempt.NewBoundedRecorder() - submitSnapshotIfActive("session-empty", 1, recorder) + submitSnapshotIfActive("session-empty", selfMember, recorder) - if len(cap.recordedFor) != 0 { - t.Fatalf( - "expected no RecordEvidence for empty snapshot; got %d", - len(cap.recordedFor), - ) + if PendingEvidenceStashedForTest("session-empty", selfMember) { + t.Fatal("expected no stash for an empty snapshot") } } -func TestSubmitSnapshotIfActive_SubmitsSignedSnapshotWhenBoundAndPopulated(t *testing.T) { - ResetRoastRetryRegistrationForTest() +// TestSubmitSnapshotIfActive_StashesEvidenceWhenBoundAndPopulated is the core +// blame-bridge wiring (RFC-21 Phase 7.3 PR2b-2 step 2): a bound attempt with +// captured evidence stashes the RAW evidence keyed by the attempt's +// (RoastSessionID==ctx.SessionID, member, attemptHash==ctx.Hash()) so the +// transition exchange's BroadcastForcedSnapshot can carry it into the bundle. +func TestSubmitSnapshotIfActive_StashesEvidenceWhenBoundAndPopulated(t *testing.T) { ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) const selfMember group.MemberIndex = 1 - innerCoord := roast.NewInMemoryCoordinatorWithSigning( - selfMember, - &deterministicSigner{id: selfMember}, - deterministicVerifier{}, - ) - cap := newCaptureCoordinator(innerCoord) - RegisterRoastRetryCoordinator(RoastRetryDeps{ - Coordinator: cap, - Signer: &deterministicSigner{id: selfMember}, - Verifier: deterministicVerifier{}, - SelfMember: uint32(selfMember), - }) - ctx := newTestContextForSubmit(t, "session-real") - handle, err := cap.BeginAttempt(ctx) - if err != nil { - t.Fatalf("begin: %v", err) - } - SetCurrentAttemptHandleForSession("session-real", selfMember, handle, ctx) + SetCurrentAttemptHandleForSession("session-real", selfMember, roast.AttemptHandle{}, ctx) recorder := attempt.NewBoundedRecorder() recorder.RecordOverflow(3) @@ -222,135 +104,86 @@ func TestSubmitSnapshotIfActive_SubmitsSignedSnapshotWhenBoundAndPopulated(t *te recorder.RecordOverflow(5) submitSnapshotIfActive("session-real", selfMember, recorder) - if len(cap.recordedFor) != 1 { - t.Fatalf("expected 1 RecordEvidence; got %d", len(cap.recordedFor)) - } - if cap.recordedFor[0] != handle { - t.Fatal("RecordEvidence handle mismatch") + evidence, ok := takePendingEvidence(ctx.SessionID, selfMember, ctx.Hash()) + if !ok { + t.Fatal("expected stashed evidence after a populated submit") } - snap := cap.recordedSnp[0] - if snap.SenderID() != selfMember { - t.Fatalf("snapshot sender: got %d want %d", snap.SenderID(), selfMember) + // 2 distinct senders observed (3 twice, 5 once). + if len(evidence.Overflows) != 2 { + t.Fatalf("expected 2 overflow senders stashed; got %d", len(evidence.Overflows)) } - if len(snap.OperatorSignature) == 0 { - t.Fatal("snapshot must be signed") + if evidence.Overflows[3] != 2 || evidence.Overflows[5] != 1 { + t.Fatalf("stashed overflow counts wrong: %+v", evidence.Overflows) } - // 2 distinct senders observed. - if len(snap.Overflows) != 2 { - t.Fatalf("expected 2 overflow entries; got %d", len(snap.Overflows)) + // take consumes: a second take finds nothing. + if _, ok := takePendingEvidence(ctx.SessionID, selfMember, ctx.Hash()); ok { + t.Fatal("take must consume the stash entry") } } -// TestSubmitSnapshotIfActive_MultiSeatSubmitsPerSeat asserts the PR2b-2 member- -// aware submit path: with two local seats registered, each seat submits its own -// evidence against ITS OWN coordinator and attempt handle. The PR2b-1.5 count>1 -// no-op guard is gone, so both submissions land -- and neither lands on the -// sibling's coordinator. -func TestSubmitSnapshotIfActive_MultiSeatSubmitsPerSeat(t *testing.T) { - ResetRoastRetryRegistrationForTest() +// TestSubmitSnapshotIfActive_StashesRejectOnlySnapshot guards the all-categories +// emptiness test: a snapshot carrying ONLY reject evidence -- no overflow, no +// conflict -- must still be stashed. A validation-blamable Reject populates +// Evidence.Rejects without any Overflow, and NextAttempt's exclusion path consumes +// snapshot.Rejects; an overflow-only emptiness check would starve the blame +// pipeline. +func TestSubmitSnapshotIfActive_StashesRejectOnlySnapshot(t *testing.T) { ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) - cap1 := newCaptureCoordinator(roast.NewInMemoryCoordinatorWithSigning( - 1, &deterministicSigner{id: 1}, deterministicVerifier{}, - )) - cap2 := newCaptureCoordinator(roast.NewInMemoryCoordinatorWithSigning( - 2, &deterministicSigner{id: 2}, deterministicVerifier{}, - )) - RegisterRoastRetryCoordinatorForMember(1, RoastRetryDeps{ - Coordinator: cap1, - Signer: &deterministicSigner{id: 1}, - Verifier: deterministicVerifier{}, - SelfMember: 1, - }) - RegisterRoastRetryCoordinatorForMember(2, RoastRetryDeps{ - Coordinator: cap2, - Signer: &deterministicSigner{id: 2}, - Verifier: deterministicVerifier{}, - SelfMember: 2, - }) - - ctx := newTestContextForSubmit(t, "session-ms") - handle1, err := cap1.BeginAttempt(ctx) - if err != nil { - t.Fatalf("seat 1 begin: %v", err) - } - handle2, err := cap2.BeginAttempt(ctx) - if err != nil { - t.Fatalf("seat 2 begin: %v", err) - } - SetCurrentAttemptHandleForSession("session-ms", 1, handle1, ctx) - SetCurrentAttemptHandleForSession("session-ms", 2, handle2, ctx) - - rec1 := attempt.NewBoundedRecorder() - rec1.RecordOverflow(3) - rec2 := attempt.NewBoundedRecorder() - rec2.RecordOverflow(4) + const selfMember group.MemberIndex = 1 + ctx := newTestContextForSubmit(t, "session-reject-only") + SetCurrentAttemptHandleForSession("session-reject-only", selfMember, roast.AttemptHandle{}, ctx) - submitSnapshotIfActive("session-ms", 1, rec1) - submitSnapshotIfActive("session-ms", 2, rec2) + recorder := attempt.NewBoundedRecorder() + recorder.RecordReject(2, "attempt_context_hash_mismatch") + submitSnapshotIfActive("session-reject-only", selfMember, recorder) - // Each seat's evidence landed on ITS OWN coordinator (no mis-attribution), - // each against its own handle and signed as its own member. - if len(cap1.recordedFor) != 1 || cap1.recordedFor[0] != handle1 { - t.Fatalf("seat 1 must record once against its own handle; got %+v", cap1.recordedFor) - } - if len(cap2.recordedFor) != 1 || cap2.recordedFor[0] != handle2 { - t.Fatalf("seat 2 must record once against its own handle; got %+v", cap2.recordedFor) - } - if cap1.recordedSnp[0].SenderID() != 1 { - t.Fatalf("seat 1 snapshot sender: got %d want 1", cap1.recordedSnp[0].SenderID()) + evidence, ok := takePendingEvidence(ctx.SessionID, selfMember, ctx.Hash()) + if !ok { + t.Fatal("reject-only snapshot must be stashed") } - if cap2.recordedSnp[0].SenderID() != 2 { - t.Fatalf("seat 2 snapshot sender: got %d want 2", cap2.recordedSnp[0].SenderID()) + if len(evidence.Rejects[2]) == 0 { + t.Fatalf("stashed evidence must carry the reject for sender 2; got %+v", evidence.Rejects) } } -// TestSubmitSnapshotIfActive_SubmitsRejectOnlySnapshot guards the fold of Codex's -// PR2b-2 P2: a snapshot carrying ONLY reject evidence -- no overflow, no conflict -// -- must still be signed and submitted. A validation-blamable Reject (e.g. an -// attempt-context-hash mismatch) populates Evidence.Rejects without any Overflow, -// and NextAttempt's exclusion path consumes snapshot.Rejects; the old overflow-only -// emptiness check silently dropped such snapshots, starving the blame pipeline. -func TestSubmitSnapshotIfActive_SubmitsRejectOnlySnapshot(t *testing.T) { - ResetRoastRetryRegistrationForTest() +// TestSubmitSnapshotIfActive_MultiSeatStashesPerSeat asserts the PR2b-2 +// member-aware path: with two local seats bound to the SAME attempt (same +// RoastSessionID + attemptHash), each seat stashes its OWN evidence under its own +// member key -- neither overwrites the other (the member-keying that fixes the +// sibling-collision hazard). +func TestSubmitSnapshotIfActive_MultiSeatStashesPerSeat(t *testing.T) { ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) + ResetPendingEvidenceRegistryForTest() t.Cleanup(ResetSessionHandleRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) - const selfMember group.MemberIndex = 1 - cap := newCaptureCoordinator(roast.NewInMemoryCoordinatorWithSigning( - selfMember, &deterministicSigner{id: selfMember}, deterministicVerifier{}, - )) - RegisterRoastRetryCoordinator(RoastRetryDeps{ - Coordinator: cap, - Signer: &deterministicSigner{id: selfMember}, - Verifier: deterministicVerifier{}, - SelfMember: uint32(selfMember), - }) + ctx := newTestContextForSubmit(t, "session-ms") + SetCurrentAttemptHandleForSession("session-ms", 1, roast.AttemptHandle{}, ctx) + SetCurrentAttemptHandleForSession("session-ms", 2, roast.AttemptHandle{}, ctx) - ctx := newTestContextForSubmit(t, "session-reject-only") - handle, err := cap.BeginAttempt(ctx) - if err != nil { - t.Fatalf("begin: %v", err) - } - SetCurrentAttemptHandleForSession("session-reject-only", selfMember, handle, ctx) + rec1 := attempt.NewBoundedRecorder() + rec1.RecordOverflow(3) + rec2 := attempt.NewBoundedRecorder() + rec2.RecordOverflow(4) - // Only a reject -- no overflow, no conflict. - recorder := attempt.NewBoundedRecorder() - recorder.RecordReject(2, "attempt_context_hash_mismatch") - submitSnapshotIfActive("session-reject-only", selfMember, recorder) + submitSnapshotIfActive("session-ms", 1, rec1) + submitSnapshotIfActive("session-ms", 2, rec2) - if len(cap.recordedFor) != 1 { - t.Fatalf("reject-only snapshot must be submitted; got %d RecordEvidence calls", len(cap.recordedFor)) + ev1, ok1 := takePendingEvidence(ctx.SessionID, 1, ctx.Hash()) + ev2, ok2 := takePendingEvidence(ctx.SessionID, 2, ctx.Hash()) + if !ok1 || !ok2 { + t.Fatalf("both seats must stash their own evidence; got ok1=%v ok2=%v", ok1, ok2) } - snap := cap.recordedSnp[0] - if len(snap.Rejects) == 0 { - t.Fatal("submitted snapshot must carry the reject evidence") + // Seat 1 observed sender 3 only; seat 2 observed sender 4 only -- no bleed. + if ev1.Overflows[3] != 1 || ev1.Overflows[4] != 0 { + t.Fatalf("seat 1 stash must isolate its own evidence; got %+v", ev1.Overflows) } - if len(snap.OperatorSignature) == 0 { - t.Fatal("reject-only snapshot must be signed") + if ev2.Overflows[4] != 1 || ev2.Overflows[3] != 0 { + t.Fatalf("seat 2 stash must isolate its own evidence; got %+v", ev2.Overflows) } } @@ -403,32 +236,3 @@ func TestClearCurrentAttemptHandleForSession_RemovesBinding(t *testing.T) { t.Fatal("binding must be cleared") } } - -func TestSubmitSnapshotIfActive_RecordEvidenceFailureIsLoggedNotPropagated(t *testing.T) { - ResetRoastRetryRegistrationForTest() - ResetSessionHandleRegistryForTest() - t.Cleanup(ResetRoastRetryRegistrationForTest) - t.Cleanup(ResetSessionHandleRegistryForTest) - - innerCoord := roast.NewInMemoryCoordinatorWithSigning( - 1, &deterministicSigner{id: 1}, deterministicVerifier{}, - ) - cap := newCaptureCoordinator(innerCoord) - cap.recordErr = errors.New("synthetic RecordEvidence failure") - RegisterRoastRetryCoordinator(RoastRetryDeps{ - Coordinator: cap, - Signer: &deterministicSigner{id: 1}, - Verifier: deterministicVerifier{}, - SelfMember: 1, - }) - - ctx := newTestContextForSubmit(t, "session-failure") - handle, _ := cap.BeginAttempt(ctx) - SetCurrentAttemptHandleForSession("session-failure", 1, handle, ctx) - - recorder := attempt.NewBoundedRecorder() - recorder.RecordOverflow(3) - - // Must not panic. Caller is unaffected. - submitSnapshotIfActive("session-failure", 1, recorder) -} diff --git a/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry.go b/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry.go index 43171150fb..61908b980f 100644 --- a/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry.go +++ b/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry.go @@ -22,9 +22,10 @@ import ( // coordinator's local observe handle (only the elected seat collects, for // aggregation); peers' transition bundles are verified against this seat's // own observe handle and, when valid, stored as the next-attempt record. -// - BroadcastForcedSnapshot: a participating seat publishes a forced (empty) -// proof-of-attendance snapshot so it appears in the aggregated bundle and is -// not silence-parked by NextAttempt. It records its OWN snapshot before +// - BroadcastForcedSnapshot: a participating seat publishes its proof-of- +// attendance snapshot -- carrying the real evidence the coarse receive loop +// stashed for the attempt, if any -- so it appears in the aggregated bundle +// and is not silence-parked by NextAttempt. It records its OWN snapshot before // broadcasting, so VerifyBundle's censorship check is meaningful. // - AggregateAndBroadcast: the elected seat aggregates the collected snapshots // into a coordinator-signed bundle, stores it locally via the same @@ -83,6 +84,10 @@ func (e *RoastTransitionExchange) listen() { // produced a transition record to clear, so its bindings would otherwise // linger until the TTL sweep. defer clearObservedAttemptsForSession(e.roastSessionID, e.member) + // Likewise drop any stashed coarse-path evidence this seat captured but never + // broadcast -- a succeeded attempt never reaches BroadcastForcedSnapshot, which + // is what consumes the stash (RFC-21 Phase 7.3 PR2b-2 step 2). + defer clearPendingEvidenceForSession(e.roastSessionID, e.member) for { select { case <-e.ctx.Done(): @@ -180,10 +185,19 @@ func (e *RoastTransitionExchange) HasLostSync() bool { return e.lostSync.Load() } -// BroadcastForcedSnapshot publishes this seat's forced (empty) proof-of-attendance -// snapshot for the attempt, recording it locally BEFORE the broadcast so the -// censorship check on the returned bundle is meaningful. A no-op when the seat -// has no observe binding for the attempt or signing fails. +// BroadcastForcedSnapshot publishes this seat's proof-of-attendance snapshot for +// the attempt, recording it locally BEFORE the broadcast so the censorship check +// on the returned bundle is meaningful. A no-op when the seat has no observe +// binding for the attempt or signing fails. +// +// RFC-21 Phase 7.3 PR2b-2 step 2 (the blame bridge): the snapshot carries the REAL +// evidence the coarse receive loop captured for this attempt, if any. The coarse +// path stashes its recorder snapshot keyed by the same (RoastSessionID, member, +// attemptHash); this consumes it so the seat's broadcast -- and therefore the +// elected coordinator's aggregated bundle -- carries real rejects/overflows/ +// conflicts and NextAttempt's f+1 tally can fire. When the stash is empty (the +// seat observed nothing) the snapshot is the empty proof-of-attendance one, which +// must still be broadcast so the seat is not silence-parked. func (e *RoastTransitionExchange) BroadcastForcedSnapshot( attemptHash [attempt.MessageDigestLength]byte, ) { @@ -191,7 +205,11 @@ func (e *RoastTransitionExchange) BroadcastForcedSnapshot( if !ok { return } - snapshot := roast.NewLocalEvidenceSnapshot(e.member, attemptHash, attempt.Evidence{}) + // takePendingEvidence returns the zero Evidence on a miss, which + // NewLocalEvidenceSnapshot renders as the empty proof-of-attendance snapshot -- + // still broadcast so the seat is not silence-parked. + evidence, _ := takePendingEvidence(e.roastSessionID, e.member, attemptHash) + snapshot := roast.NewLocalEvidenceSnapshot(e.member, attemptHash, evidence) payload, err := snapshot.SignableBytes() if err != nil { e.logger.Warnf("roast transition: forced snapshot signable bytes: [%v]", err) diff --git a/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry_test.go b/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry_test.go index 5a73a209fc..9e74d3283b 100644 --- a/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry_test.go +++ b/pkg/frost/signing/roast_transition_exchange_frost_native_roast_retry_test.go @@ -633,3 +633,99 @@ func TestRoastTransitionExchange_NonElectedDoesNotAggregate(t *testing.T) { t.Fatal("non-elected seat must not store a transition record from AggregateAndBroadcast") } } + +// TestRoastTransitionExchange_StashedEvidenceDrivesExclusion is the end-to-end +// blame-bridge proof (RFC-21 Phase 7.3 PR2b-2 step 2): coarse-path evidence stashed +// by the receive loop is carried by BroadcastForcedSnapshot into the aggregated +// bundle, so NextAttempt's f+1 accuser tally excludes the blamed member -- the +// exclusion the pre-bridge forced-EMPTY snapshots could never trigger. +func TestRoastTransitionExchange_StashedEvidenceDrivesExclusion(t *testing.T) { + ResetObservedAttemptRegistryForTest() + ResetRoastTransitionRegistryForTest() + ResetPendingEvidenceRegistryForTest() + t.Cleanup(ResetObservedAttemptRegistryForTest) + t.Cleanup(ResetRoastTransitionRegistryForTest) + t.Cleanup(ResetPendingEvidenceRegistryForTest) + + roastSessionID := "exchange-blame-bridge-session" + included := []group.MemberIndex{1, 2, 3} + dkgKey := []byte{0x09, 0x0a} + // original group size 3; quorum = ExclusionAccuserQuorum(3, threshold) = + // 3 - 2 + 1 = 2, so two accusers establish a permanent reject exclusion. + const threshold uint = 2 + + ctx := newExchangeTestContext(t, roastSessionID, included, dkgKey) + hash := ctx.Hash() + nodes := newExchangeTestNodes(t, roastSessionID, ctx, dkgKey) + + // Members 1 and 2 each observed a validation reject against member 3 during the + // coarse receive loop; their submit stashed it. Member 3 observed nothing and + // broadcasts only an empty proof-of-attendance snapshot. + rejectAgainst3 := attempt.Evidence{ + Rejects: map[group.MemberIndex][]attempt.RejectEntry{ + 3: {{Reason: "attempt_context_hash_mismatch", Count: 1}}, + }, + } + stashPendingEvidence(roastSessionID, 1, hash, rejectAgainst3) + stashPendingEvidence(roastSessionID, 2, hash, rejectAgainst3) + + bundleMsg, elected := produceTransitionBundleForTest(t, roastSessionID, ctx, nodes) + + // The aggregated bundle must carry real reject evidence from >=2 accusers, not + // the forced-empty snapshots of the pre-bridge design. + bundle := &roast.TransitionMessage{} + if err := bundle.Unmarshal(bundleMsg.Payload); err != nil { + t.Fatalf("unmarshal bundle: %v", err) + } + accusers := 0 + for i := range bundle.Bundle { + if len(bundle.Bundle[i].Rejects) > 0 { + accusers++ + } + } + if accusers < 2 { + t.Fatalf("bundle must carry reject evidence from >=2 accusers; got %d", accusers) + } + + // BroadcastForcedSnapshot consumed the stash. + if PendingEvidenceStashedForTest(roastSessionID, 1) || + PendingEvidenceStashedForTest(roastSessionID, 2) { + t.Fatal("BroadcastForcedSnapshot must consume the stash") + } + + // A non-elected accuser verifies the bundle against its own observe handle and + // computes the next attempt: member 3 meets the f+1 reject quorum -> excluded. + var verifier group.MemberIndex + for _, m := range included { + if m != elected { + verifier = m + break + } + } + binding, ok := observedAttempt(roastSessionID, verifier, hash) + if !ok { + t.Fatalf("non-elected seat %d must still hold its observe binding", verifier) + } + if err := nodes[verifier].coord.VerifyBundle(binding.handle, bundle); err != nil { + t.Fatalf("verify bundle: %v", err) + } + next, err := nodes[verifier].coord.NextAttempt(binding.handle, bundle, threshold, dkgKey) + if err != nil { + t.Fatalf("next attempt: %v", err) + } + + excluded3 := false + for _, m := range next.ExcludedSet { + if m == 3 { + excluded3 = true + } + } + if !excluded3 { + t.Fatalf("member 3 must be excluded by the f+1 reject quorum; excluded=%v", next.ExcludedSet) + } + for _, m := range next.IncludedSet { + if m == 3 { + t.Fatalf("excluded member 3 must not be in the next included set; included=%v", next.IncludedSet) + } + } +} diff --git a/pkg/tbtc/signing_loop.go b/pkg/tbtc/signing_loop.go index 57e7feb1ce..754f932424 100644 --- a/pkg/tbtc/signing_loop.go +++ b/pkg/tbtc/signing_loop.go @@ -119,10 +119,12 @@ type signingRetryLoop struct { doneCheck signingDoneCheckStrategy - // participantSelector dispatches qualified-operator selection. - // Default: legacy retry shuffle. Phase 7 may install a - // ROAST-driven implementation behind the frost_roast_retry - // build tag once AggregateBundle production is wired upstream. + // participantSelector dispatches qualified-operator selection. The default + // build uses the legacy retry shuffle; the frost_roast_retry build installs the + // ROAST-driven selector (signing_loop_selector_frost_roast_retry.go), which + // consumes the transition record the exchange produces -- now carrying the real + // blame evidence the coarse path captures (RFC-21 Phase 7.3 PR2b-2 step 2) -- + // and falls back to the shuffle only on the uniform initial/inactive condition. participantSelector signingParticipantSelector // transitionController, when non-nil, owns the session-scoped ROAST diff --git a/pkg/tbtc/signing_loop_roast_transition_controller_frost_native_roast_retry.go b/pkg/tbtc/signing_loop_roast_transition_controller_frost_native_roast_retry.go index 5ad533407e..c0c2380974 100644 --- a/pkg/tbtc/signing_loop_roast_transition_controller_frost_native_roast_retry.go +++ b/pkg/tbtc/signing_loop_roast_transition_controller_frost_native_roast_retry.go @@ -186,6 +186,15 @@ func (c *roastTransitionControllerImpl) OnAttemptSucceeded() { c.requestTemplate.MemberIndex, hash, ) + // Also drop any coarse-path evidence stashed for this attempt: a succeeded + // attempt never reaches OnAttemptFailed -> BroadcastForcedSnapshot (which is + // what consumes the stash), so without this the entry would leak until the TTL + // sweep (RFC-21 Phase 7.3 PR2b-2 step 2, the blame bridge). + signing.ClearPendingEvidenceOnLocalSuccess( + c.requestTemplate.RoastSessionID, + c.requestTemplate.MemberIndex, + hash, + ) } func (c *roastTransitionControllerImpl) HasLostSync() bool {