|
| 1 | +package sync |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "time" |
| 6 | +) |
| 7 | + |
// poisonedLock is the sentinel lock placed in the Manager's syncLockMap when
// Initialize is unable to re-establish a holder that the status of a Running
// workflow says it holds.
//
// Soundness invariant: whenever a Workflow's status records a held lock, the
// in-memory lock map has to show that hold once Initialize has finished. If it
// did not, a racing workflow's TryAcquire would see no lock, create a fresh
// one, and acquire something that persisted state says is already held. For a
// mutex, that is two workflows running concurrently under the same mutex.
//
// The previous behaviour was to drop the holder silently. Installing this lock
// instead means every acquire is refused and a poisoned-state message is
// reported. The message appears on the waiting node's synchronization status,
// marking the node/workflow as blocked by a poisoned lock so that an operator
// can intervene.
//
// Poison exists only in memory and disappears on the next controller restart,
// when Initialize evaluates the situation again: a lock whose offending
// workflow is no longer Running is recreated clean, while one whose workflow
// is still Running and still unresolvable is poisoned again.
type poisonedLock struct {
	// name identifies the lock and reason says why it is poisoned; both are
	// interpolated into the text produced by message.
	name, reason string
}
| 32 | + |
| 33 | +var _ semaphore = &poisonedLock{} |
| 34 | + |
| 35 | +func newPoisonedLock(name, reason string) *poisonedLock { |
| 36 | + return &poisonedLock{name: name, reason: reason} |
| 37 | +} |
| 38 | + |
| 39 | +func (p *poisonedLock) message() string { |
| 40 | + return fmt.Sprintf("lock %s is in a poisoned state: %s; manual intervention required", p.name, p.reason) |
| 41 | +} |
| 42 | + |
| 43 | +func (p *poisonedLock) acquire(_ string, _ *transaction) (bool, error) { |
| 44 | + return false, nil |
| 45 | +} |
| 46 | + |
| 47 | +// reacquire is a no-op: a poisoned lock refuses all holds until restart. It |
| 48 | +// returns nil because the poison already protects the recorded hold; failing |
| 49 | +// the holding workflow on top of that would punish it for an unrelated |
| 50 | +// holder's poisoning. |
| 51 | +func (p *poisonedLock) reacquire(_ string, _ *transaction) error { |
| 52 | + return nil |
| 53 | +} |
| 54 | + |
| 55 | +func (p *poisonedLock) checkAcquire(_ string, _ *transaction) (bool, bool, string) { |
| 56 | + return false, false, p.message() |
| 57 | +} |
| 58 | + |
| 59 | +func (p *poisonedLock) tryAcquire(_ string, _ *transaction) (bool, string, error) { |
| 60 | + return false, p.message(), nil |
| 61 | +} |
| 62 | + |
| 63 | +func (p *poisonedLock) release(_ string) bool { return false } |
| 64 | + |
| 65 | +func (p *poisonedLock) addToQueue(_ string, _ int32, _ time.Time) error { |
| 66 | + return nil |
| 67 | +} |
| 68 | + |
| 69 | +func (p *poisonedLock) removeFromQueue(_ string) error { return nil } |
| 70 | + |
| 71 | +func (p *poisonedLock) getCurrentHolders() ([]string, error) { return nil, nil } |
| 72 | + |
| 73 | +func (p *poisonedLock) getCurrentPending() ([]string, error) { return nil, nil } |
| 74 | + |
| 75 | +func (p *poisonedLock) getName() string { return p.name } |
| 76 | + |
| 77 | +func (p *poisonedLock) getLimit() int { return 0 } |
| 78 | + |
| 79 | +func (p *poisonedLock) probeWaiting() {} |
| 80 | + |
| 81 | +// lock returns true so that tryAcquireImpl proceeds to checkAcquire, which |
| 82 | +// returns the poisoned-state message rather than a generic "failed to lock()". |
| 83 | +func (p *poisonedLock) lock() bool { return true } |
| 84 | + |
| 85 | +func (p *poisonedLock) unlock() {} |
0 commit comments