Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func driveInteractiveRoastSigningIfEnabled(
// SelectedCoordinator(handle) return ErrUnknownAttempt and hard-fail below --
// safe (no wrong signature), and only reachable by reconfiguring the
// coordinator mid-session. An absent registration falls back to coarse.
deps, ok := RegisteredRoastRetryCoordinator()
deps, ok := RegisteredRoastRetryCoordinatorForMember(request.MemberIndex)
if !ok || deps.Coordinator == nil {
return nil, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,19 @@ func ObserveAttemptForTransition(
return zeroHash, fmt.Errorf("observe attempt: request is nil")
}

// Respect the readiness opt-in gate, exactly as BeginOrchestrationForSession
// does: when ROAST retry is opted out, observing is pointless (nothing
// consumes the binding) and must stay inert. Opt-out is a deterministic
// static condition every honest node sees identically.
if err := EnsureRoastRetryReadinessOptIn(); err != nil {
// Respect the per-seat readiness + registration gate, exactly as the selector
// and BeginOrchestrationForSession do: when THIS seat has no registered
// coordinator (or readiness is opted out), observing is pointless (nothing
// consumes the binding) and must stay inert. A deterministic static condition
// every honest node sees identically per seat. RFC-21 Phase 7.3 PR2b-1.5: a
// multi-seat operator observes per local seat with that seat's coordinator.
if !RoastRetryActiveForMember(request.MemberIndex) {
return zeroHash, nil
}

deps, ok := RegisteredRoastRetryCoordinator()
deps, ok := RegisteredRoastRetryCoordinatorForMember(request.MemberIndex)
if !ok || deps.Coordinator == nil {
// No orchestration registered -- static fallback, every honest node
// observes the same empty registry.
// No coordinator registered for this seat -- static fallback.
return zeroHash, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func attemptRoastRetryOrchestrationFromRequest(
// The cross-attempt transition record is produced + keyed (by the stable
// RoastSessionID) entirely in the transition exchange now, not here.
handle, cleanup, err := BeginOrchestrationForSession(
request.SessionID, attemptCtx,
request.SessionID, request.MemberIndex, attemptCtx,
)
if err != nil {
switch {
Expand Down
108 changes: 91 additions & 17 deletions pkg/frost/signing/roast_retry_orchestration.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ package signing
// The orchestration layer in this file participates in a load-bearing
// decision that prevents split-brain group fracture in the ROAST retry
// path. Errors returned through the orchestration boundary are
// classified into one of two categories, and the consumer (the
// classified into one of three categories, and the consumer (the
// signing-loop dispatcher) routes them accordingly:
//
// STATIC errors -> safe to fall back to the legacy retry path.
Expand All @@ -18,19 +18,39 @@ package signing
// Detected via errors.Is in
// signing_loop_roast_dispatcher.go.
//
// RUNTIME errors -> HARD FAIL. No fallback. Any error that arises
// from per-attempt protocol state (BeginAttempt
// internals, AttemptContext binding mismatches,
// transition-bundle verification failures, etc.)
// can be observed by some participants and not
// others within the same attempt. Falling back to
// legacy under those conditions would leave some
// operators running the new code path and others
// running legacy on the same attempt -- the canonical
// definition of split-brain fracture. The
// orchestration layer therefore returns these as
// bare (non-sentinel) errors that the dispatcher
// treats as terminal.
// RUNTIME errors -> NO FALLBACK, RETRY THE NEXT ATTEMPT. Any error
// that arises from per-attempt protocol state
// (BeginAttempt internals, AttemptContext binding
// mismatches, transition-bundle verification
// failures, etc.) can be observed by some
// participants and not others within the same
// attempt. Falling back to legacy under those
// conditions would leave some operators running the
// new code path and others running legacy on the
// same attempt -- the canonical definition of
// split-brain fracture. The orchestration layer
// therefore returns these as bare (non-sentinel)
// errors; the signingRetryLoop does NOT fall back to
// coarse, but it DOES retry on the next attempt
// because the fault may be transient and clear.
//
// TERMINAL errors -> ABORT THE RETRY LOOP. An attempt-invariant condition
// (not to be confused with the STATIC fallback category
// above) that no future attempt can resolve, e.g. multi-seat
// interactive ROAST orchestration, which is not yet
// member-safe (the session handle binding is keyed
// by sessionID alone, so sibling seats collide;
// member-keyed handles land in a later PR). Unlike a
// RUNTIME error, retrying is FUTILE: every attempt
// re-derives the same static outcome, so the loop
// would spin until timeout AND synthesize garbage
// failed-attempt transitions (OnAttemptFailed).
// Coarse fallback is also unsafe (interactive<->coarse
// mixing fractures the group), so terminating is the
// only non-fracturing option. The orchestration layer
// wraps ErrTerminalSigningFailure; the signingRetryLoop
// matches it via errors.Is and exits immediately
// (return nil, err) BEFORE the retry/transition
// machinery.
//
// The classification is enforced at this file's boundary: any error
// surfaced from this package that is intended to permit fallback MUST
Expand All @@ -55,6 +75,7 @@ import (

"github.com/keep-network/keep-core/pkg/frost/roast"
"github.com/keep-network/keep-core/pkg/frost/roast/attempt"
"github.com/keep-network/keep-core/pkg/protocol/group"
)

// ErrNoRoastRetryCoordinatorRegistered is returned by
Expand All @@ -71,6 +92,24 @@ var ErrNoRoastRetryCoordinatorRegistered = errors.New(
"roast orchestration: no coordinator registered",
)

// ErrTerminalSigningFailure is the sentinel marking an orchestration error as
// TERMINAL -- the third disposition of the taxonomy at the top of this file.
//
// A TERMINAL error describes a condition that no later attempt can resolve.
// Retrying is therefore futile and falling back to the coarse path is unsafe,
// which leaves stopping as the only non-fracturing choice: the signingRetryLoop
// must ABORT (return nil, err) instead of moving on to the next attempt.
//
// It differs from both other dispositions:
//   - ErrNoRoastRetryCoordinatorRegistered is STATIC and permits the coarse
//     fallback;
//   - bare RUNTIME errors forbid the fallback but are still retried.
//
// Producers wrap it, e.g. fmt.Errorf("%w: ...", ErrTerminalSigningFailure), and
// consumers detect it with errors.Is. At present the only producer is
// BeginOrchestrationForSession, for a multi-seat operator whose interactive
// ROAST orchestration is not yet member-safe.
var ErrTerminalSigningFailure = errors.New("terminal signing failure")

// BeginOrchestrationForSession encapsulates the per-session
// BeginAttempt + binding-population step the RFC-21 Phase 5
// orchestration layer performs. Callers in the layer above the
Expand All @@ -94,6 +133,7 @@ var ErrNoRoastRetryCoordinatorRegistered = errors.New(
// this no longer takes the DKG group public key.
func BeginOrchestrationForSession(
sessionID string,
member group.MemberIndex,
ctx attempt.AttemptContext,
) (roast.AttemptHandle, func(), error) {
if err := EnsureRoastRetryReadinessOptIn(); err != nil {
Expand All @@ -102,11 +142,45 @@ func BeginOrchestrationForSession(
err,
)
}
deps, ok := RegisteredRoastRetryCoordinator()
// RFC-21 Phase 7.3 PR2b-1.5: mint the handle from THIS seat's coordinator, so a
// multi-seat operator's elected seat aggregates with its own binding.
deps, ok := RegisteredRoastRetryCoordinatorForMember(member)
memberCount := registeredRoastRetryMemberCount()
// Multi-seat is not yet member-safe here: the session handle binding below
// (SetCurrentAttemptHandleForSession) is keyed by sessionID alone, so two local
// seats in the same attempt would collide. Fail CLOSED for any multi-seat operator
// with a TERMINAL error -- one wrapping ErrTerminalSigningFailure, so the retry
// loop aborts rather than retrying a condition no attempt can resolve -- and NEVER
// with the legacy-fallback sentinel, until PR2b-2 wires member-keyed handles.
// Returning the legacy-fallback sentinel here would let this seat run the
// coarse/legacy path while sibling seats drive bound ROAST messages, splitting the
// attempt into mixed bound/unbound. This mirrors the coarse evidence path's
// multi-seat guard (submitSnapshotIfActive) in this same PR.
if memberCount > 1 {
return roast.AttemptHandle{}, nil, fmt.Errorf(
"%w: multi-seat orchestration is not yet member-aware; "+
"fail closed for session %q until PR2b-2",
ErrTerminalSigningFailure,
sessionID,
)
}
if !ok {
// memberCount is 0 or 1 here. count==0: no seat is registered anywhere, so ROAST
// is not active for the process -- a uniform, group-wide condition every honest
// node decides identically -> safe legacy fallback (the sentinel). count==1: a
// sibling seat IS registered but not THIS one (a partially-registered operator),
// so advertising the legacy fallback for this seat while the sibling drives bound
// ROAST would fracture the attempt -> fail CLOSED instead (Codex re-review).
if memberCount == 0 {
return roast.AttemptHandle{}, nil, fmt.Errorf(
"%w: caller should fall back to legacy behaviour",
ErrNoRoastRetryCoordinatorRegistered,
)
}
return roast.AttemptHandle{}, nil, fmt.Errorf(
"%w: caller should fall back to legacy behaviour",
ErrNoRoastRetryCoordinatorRegistered,
"%w: seat %d has no registered coordinator while a sibling "+
"seat is ROAST-active; fail closed",
ErrTerminalSigningFailure,
member,
)
}
if deps.Coordinator == nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/frost/signing/roast_retry_orchestration_bundle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func TestCleanup_ClearsBindingAndProducesNoTransitionRecord(t *testing.T) {
})

const sessionID = "cleanup-no-record-session"
handle, cleanup, err := BeginOrchestrationForSession(sessionID, ctx)
handle, cleanup, err := BeginOrchestrationForSession(sessionID, elected, ctx)
if err != nil {
t.Fatalf("begin: %v", err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func TestBeginOrchestrationForSession_HappyPath(t *testing.T) {
})

ctx := newOrchestrationTestContext(t)
handle, cleanup, err := BeginOrchestrationForSession("session-A", ctx)
handle, cleanup, err := BeginOrchestrationForSession("session-A", 1, ctx)
if err != nil {
t.Fatalf("begin: %v", err)
}
Expand Down Expand Up @@ -80,7 +80,7 @@ func TestBeginOrchestrationForSession_ErrorsWhenRegistryEmpty(t *testing.T) {

// Readiness env var is set; the registry is empty -- we expect
// the registry-empty error, not the env-var error.
_, _, err := BeginOrchestrationForSession("session-X", newOrchestrationTestContext(t))
_, _, err := BeginOrchestrationForSession("session-X", 1, newOrchestrationTestContext(t))
if err == nil {
t.Fatal("expected error when registry is empty")
}
Expand Down Expand Up @@ -110,7 +110,7 @@ func TestBeginOrchestrationForSession_ErrorsWhenReadinessOptInUnset(t *testing.T
SelfMember: 1,
})

_, _, err := BeginOrchestrationForSession("session-no-optin", newOrchestrationTestContext(t))
_, _, err := BeginOrchestrationForSession("session-no-optin", 1, newOrchestrationTestContext(t))
if !errors.Is(err, ErrRoastRetryReadinessOptOut) {
t.Fatalf("expected ErrRoastRetryReadinessOptOut, got %v", err)
}
Expand All @@ -130,7 +130,7 @@ func TestBeginOrchestrationForSession_ErrorsWhenCoordinatorNil(t *testing.T) {
SelfMember: 1,
})

_, _, err := BeginOrchestrationForSession("session-Y", newOrchestrationTestContext(t))
_, _, err := BeginOrchestrationForSession("session-Y", 1, newOrchestrationTestContext(t))
if err == nil {
t.Fatal("expected error when Coordinator is nil")
}
Expand All @@ -154,7 +154,7 @@ func TestBeginOrchestrationForSession_PropagatesBeginAttemptError(t *testing.T)
SelfMember: 1,
})

_, _, err := BeginOrchestrationForSession("session-Z", newOrchestrationTestContext(t))
_, _, err := BeginOrchestrationForSession("session-Z", 1, newOrchestrationTestContext(t))
if err == nil {
t.Fatal("expected error from coordinator")
}
Expand All @@ -163,6 +163,85 @@ func TestBeginOrchestrationForSession_PropagatesBeginAttemptError(t *testing.T)
}
}

// assertOrchestrationFailedClosed verifies that a BeginOrchestrationForSession
// call failed CLOSED. It fails the test unless all of the following hold:
//   - err is non-nil;
//   - err matches neither fallback-permitting sentinel
//     (ErrNoRoastRetryCoordinatorRegistered, ErrRoastRetryReadinessOptOut);
//   - err is classified TERMINAL (matches ErrTerminalSigningFailure), so the
//     signingRetryLoop aborts instead of retrying a never-resolving condition;
//   - no cleanup function was returned;
//   - no session binding exists for sessionID.
func assertOrchestrationFailedClosed(t *testing.T, sessionID string, cleanup func(), err error) {
	t.Helper()

	// The cases are ordered: the first violated expectation is the one reported.
	switch {
	case err == nil:
		t.Fatal("expected a fail-closed error, got nil")
	case errors.Is(err, ErrNoRoastRetryCoordinatorRegistered):
		t.Fatalf("must NOT return the legacy-fallback sentinel; got %v", err)
	case errors.Is(err, ErrRoastRetryReadinessOptOut):
		t.Fatalf("must NOT return the readiness sentinel; got %v", err)
	case !errors.Is(err, ErrTerminalSigningFailure):
		t.Fatalf("multi-seat fail-closed must be classified terminal (ErrTerminalSigningFailure); got %v", err)
	}

	if cleanup != nil {
		t.Fatal("a failed begin must not return a cleanup")
	}

	_, _, bound := currentAttemptHandleForCollect(sessionID)
	if bound {
		t.Fatal("fail-closed must not create a session binding")
	}
}

// TestBeginOrchestrationForSession_FailsClosedPartialMultiSeat covers a
// partially-registered operator (the Codex re-review case): some local seat has
// a coordinator registered, but the seat Begin is called for does not. The
// member-aware lookup misses; Begin must then fail CLOSED rather than hand back
// the legacy-fallback sentinel, which would let this seat run coarse/legacy
// while the registered sibling drives bound ROAST and so fracture the attempt.
func TestBeginOrchestrationForSession_FailsClosedPartialMultiSeat(t *testing.T) {
	const sessionID = "session-partial"

	t.Setenv(RoastRetryReadinessOptInEnvVar, "true")
	ResetRoastRetryRegistrationForTest()
	ResetSessionHandleRegistryForTest()
	t.Cleanup(ResetRoastRetryRegistrationForTest)
	t.Cleanup(ResetSessionHandleRegistryForTest)

	// Seat 1 is the only registered seat ...
	seatOneDeps := RoastRetryDeps{
		Coordinator: roast.NewInMemoryCoordinatorWithSigning(
			1, roast.NoOpSigner(), roast.NoOpSignatureVerifier(),
		),
		SelfMember: 1,
	}
	RegisterRoastRetryCoordinatorForMember(1, seatOneDeps)

	// ... while Begin runs for the unregistered seat 2.
	_, cleanup, beginErr := BeginOrchestrationForSession(
		sessionID, 2, newOrchestrationTestContext(t),
	)
	assertOrchestrationFailedClosed(t, sessionID, cleanup, beginErr)

	if msg := beginErr.Error(); !strings.Contains(msg, "fail closed") {
		t.Fatalf("error must explain the fail-closed; got %v", beginErr)
	}
}

// TestBeginOrchestrationForSession_FailsClosedFullMultiSeat covers a
// fully-registered multi-seat operator: both local seats have a coordinator.
// Because the session-handle binding is still keyed by sessionID alone, two
// local seats would collide, so Begin must fail CLOSED for every multi-seat
// operator (member-keyed handles are deferred to PR2b-2) rather than mis-bind.
func TestBeginOrchestrationForSession_FailsClosedFullMultiSeat(t *testing.T) {
	const sessionID = "session-multiseat"

	t.Setenv(RoastRetryReadinessOptInEnvVar, "true")
	ResetRoastRetryRegistrationForTest()
	ResetSessionHandleRegistryForTest()
	t.Cleanup(ResetRoastRetryRegistrationForTest)
	t.Cleanup(ResetSessionHandleRegistryForTest)

	// Register seats 1 and 2, in that order, to make the operator multi-seat.
	seatOneDeps := RoastRetryDeps{
		Coordinator: roast.NewInMemoryCoordinatorWithSigning(
			1, roast.NoOpSigner(), roast.NoOpSignatureVerifier(),
		),
		SelfMember: 1,
	}
	RegisterRoastRetryCoordinatorForMember(1, seatOneDeps)

	seatTwoDeps := RoastRetryDeps{
		Coordinator: roast.NewInMemoryCoordinatorWithSigning(
			2, roast.NoOpSigner(), roast.NoOpSignatureVerifier(),
		),
		SelfMember: 2,
	}
	RegisterRoastRetryCoordinatorForMember(2, seatTwoDeps)

	// Begin for a seat that IS registered: the multi-seat guard must still trip.
	_, cleanup, beginErr := BeginOrchestrationForSession(
		sessionID, 1, newOrchestrationTestContext(t),
	)
	assertOrchestrationFailedClosed(t, sessionID, cleanup, beginErr)

	if msg := beginErr.Error(); !strings.Contains(msg, "multi-seat") {
		t.Fatalf("error must explain the multi-seat fail-closed; got %v", beginErr)
	}
}

func TestEndOrchestrationForSession_RemovesBinding(t *testing.T) {
ResetSessionHandleRegistryForTest()
t.Cleanup(ResetSessionHandleRegistryForTest)
Expand Down
2 changes: 1 addition & 1 deletion pkg/frost/signing/roast_retry_orchestration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func TestBeginOrchestrationForSession_DefaultBuildReturnsError(t *testing.T) {
t.Fatalf("ctx: %v", err)
}

_, _, err = BeginOrchestrationForSession("session-default-build", ctx)
_, _, err = BeginOrchestrationForSession("session-default-build", 1, ctx)
if err == nil {
t.Fatal("default build must return error from BeginOrchestrationForSession")
}
Expand Down
29 changes: 26 additions & 3 deletions pkg/frost/signing/roast_retry_readiness.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"fmt"
"os"
"strings"

"github.com/keep-network/keep-core/pkg/protocol/group"
)

// RoastRetryReadinessOptInEnvVar is the environment variable name
Expand Down Expand Up @@ -75,13 +77,34 @@ func RoastRetryReadinessOptInEnabled() bool {
// can never be created instead of using the uniform legacy shuffle (Codex P2-1).
// Always false in builds without the frost_roast_retry tag (the registration and
// producer default stubs both report unavailable).
// readinessAndProducerReady reports whether the two preconditions common to
// RoastRetryActive and RoastRetryActiveForMember hold: the readiness opt-in is
// enabled AND the transition producer is available (built in; frost_native).
// It deliberately does not consult the coordinator registry -- each caller adds
// its own registration check (any entry / the specific member's) on top.
func readinessAndProducerReady() bool {
	// The opt-in is checked first; the producer probe runs only when it passes.
	if !RoastRetryReadinessOptInEnabled() {
		return false
	}
	return roastTransitionProducerAvailable()
}

func RoastRetryActive() bool {
if !RoastRetryReadinessOptInEnabled() {
if !readinessAndProducerReady() {
return false
}
if !roastTransitionProducerAvailable() {
_, ok := RegisteredRoastRetryCoordinator()
return ok
}

// RoastRetryActiveForMember reports whether ROAST retry is runtime-active for
// one SPECIFIC local seat. It returns true only when both shared preconditions
// pass (readinessAndProducerReady: readiness opt-in enabled and the transition
// producer available) AND a coordinator is registered for the given member.
//
// Member-aware paths use it so that activation is decided per seat rather than
// per process. The registry consulted is the per-member one only; whether some
// OTHER seat is registered does not affect the result.
//
// NOTE(review): in builds without the frost_roast_retry tag the per-member
// registration stub is expected to report not-registered, which would make this
// always false there -- the stub is not visible here; confirm.
// RFC-21 Phase 7.3 PR2b-1.5.
func RoastRetryActiveForMember(member group.MemberIndex) bool {
	if !readinessAndProducerReady() {
		return false
	}
	// Per-member lookup only: a process-wide RegisteredRoastRetryCoordinator
	// check would report another seat's registration as this seat's.
	_, ok := RegisteredRoastRetryCoordinatorForMember(member)
	return ok
}
Loading
Loading