Skip to content

Commit efe992c

Browse files
committed
Review fixes
1 parent 0a74505 commit efe992c

2 files changed

Lines changed: 100 additions & 156 deletions

File tree

src/Pulsar.Client/Api/AutoClusterFailover.fs

Lines changed: 41 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ type internal AutoClusterMode =
2323

2424
type internal AutoClusterState = {
2525
Mode: AutoClusterMode
26-
FailedTimestamp: DateTime option
27-
RecoveredTimestamp: DateTime option
26+
PrimaryFailedTimestamp: DateTime option
27+
PrimaryRecoveredTimestamp: DateTime option
2828
}
2929

3030
[<RequireQualifiedAccess>]
3131
type internal AutoClusterDecision =
32-
| Noop
32+
| NoAction
3333
| SwitchToSecondary of index: int
3434
| SwitchToPrimary
3535

@@ -44,74 +44,56 @@ module internal AutoClusterFailoverLogic =
4444

4545
let initialState = {
4646
Mode = AutoClusterMode.Primary
47-
FailedTimestamp = None
48-
RecoveredTimestamp = None
47+
PrimaryFailedTimestamp = None
48+
PrimaryRecoveredTimestamp = None
4949
}
5050

51-
/// Determines whether secondary endpoints need to be probed this tick.
52-
/// This avoids unnecessary network calls when the primary is healthy or the
53-
/// failover delay hasn't elapsed yet.
54-
let shouldProbeSecondaries
55-
(now: DateTime)
56-
(config: AutoClusterConfig)
57-
(primaryAvailable: bool)
58-
(state: AutoClusterState)
59-
: bool =
60-
match state.Mode with
61-
| AutoClusterMode.Primary ->
62-
not primaryAvailable &&
63-
match state.FailedTimestamp with
64-
| Some ts -> now - ts >= config.FailoverDelay
65-
| None -> false
66-
| AutoClusterMode.Secondary _ ->
67-
false
68-
6951
/// Single pure state transition covering both primary and secondary modes.
7052
/// - primaryAvailable: result of probing the primary endpoint.
71-
/// - availableSecondaryIndex: index of the first available secondary
53+
/// - findFirstAvailableSecondary: get index of the first available secondary
7254
/// (None means no secondary was probed or none was available).
7355
let step
7456
(now: DateTime)
7557
(config: AutoClusterConfig)
7658
(primaryAvailable: bool)
77-
(availableSecondaryIndex: int option)
78-
(state: AutoClusterState)
79-
: AutoClusterState * AutoClusterDecision =
80-
81-
match state.Mode with
82-
| AutoClusterMode.Primary ->
83-
if primaryAvailable then
84-
{ state with FailedTimestamp = None }, AutoClusterDecision.Noop
85-
else
86-
match state.FailedTimestamp with
59+
(findFirstAvailableSecondary: unit -> Task<int option>)
60+
(state: AutoClusterState) =
61+
backgroundTask {
62+
match state.Mode, primaryAvailable with
63+
| AutoClusterMode.Primary, true ->
64+
return { state with PrimaryFailedTimestamp = None }, AutoClusterDecision.NoAction
65+
| AutoClusterMode.Primary, false ->
66+
match state.PrimaryFailedTimestamp with
8767
| None ->
88-
{ state with FailedTimestamp = Some now }, AutoClusterDecision.Noop
68+
return { state with PrimaryFailedTimestamp = Some now }, AutoClusterDecision.NoAction
8969
| Some ts when now - ts >= config.FailoverDelay ->
90-
match availableSecondaryIndex with
70+
match! findFirstAvailableSecondary() with
9171
| Some idx ->
92-
{ Mode = AutoClusterMode.Secondary idx
93-
FailedTimestamp = None
94-
RecoveredTimestamp = None },
95-
AutoClusterDecision.SwitchToSecondary idx
72+
return {
73+
Mode = AutoClusterMode.Secondary idx
74+
PrimaryFailedTimestamp = None
75+
PrimaryRecoveredTimestamp = None
76+
}, AutoClusterDecision.SwitchToSecondary idx
9677
| None ->
97-
state, AutoClusterDecision.Noop
78+
Log.Logger.LogWarning("Secondary cluster is not available yet after failover delay")
79+
return state, AutoClusterDecision.NoAction
9880
| _ ->
99-
state, AutoClusterDecision.Noop
100-
101-
| AutoClusterMode.Secondary _ ->
102-
if primaryAvailable then
103-
match state.RecoveredTimestamp with
81+
return state, AutoClusterDecision.NoAction
82+
| AutoClusterMode.Secondary _, true ->
83+
match state.PrimaryRecoveredTimestamp with
10484
| None ->
105-
{ state with RecoveredTimestamp = Some now }, AutoClusterDecision.Noop
85+
return { state with PrimaryRecoveredTimestamp = Some now }, AutoClusterDecision.NoAction
10686
| Some ts when now - ts >= config.SwitchBackDelay ->
107-
{ Mode = AutoClusterMode.Primary
108-
FailedTimestamp = None
109-
RecoveredTimestamp = None },
110-
AutoClusterDecision.SwitchToPrimary
87+
return {
88+
Mode = AutoClusterMode.Primary
89+
PrimaryFailedTimestamp = None
90+
PrimaryRecoveredTimestamp = None
91+
}, AutoClusterDecision.SwitchToPrimary
11192
| _ ->
112-
state, AutoClusterDecision.Noop
113-
else
114-
{ state with RecoveredTimestamp = None }, AutoClusterDecision.Noop
93+
return state, AutoClusterDecision.NoAction
94+
| AutoClusterMode.Secondary _, false ->
95+
return { state with PrimaryRecoveredTimestamp = None }, AutoClusterDecision.NoAction
96+
}
11597

11698
// Orchestrator
11799

@@ -172,25 +154,19 @@ type AutoClusterFailover
172154
match context with
173155
| Some ctx -> do! ctx.UpdateServiceInfo(primary)
174156
| None -> ()
175-
| AutoClusterDecision.Noop -> ()
157+
| AutoClusterDecision.NoAction -> ()
176158
}
177159

178160
let tick () =
179161
backgroundTask {
180162
try
181163
let! primaryAvailable = probeAvailable primaryServiceInfo.EndPointResolver
182164
let now = getCurrentTime()
183-
let! availableSecondaryIndex =
184-
if AutoClusterFailoverLogic.shouldProbeSecondaries now config primaryAvailable state then
185-
findFirstAvailableSecondary()
186-
else
187-
Task.FromResult(None)
188-
let newState, decision =
189-
AutoClusterFailoverLogic.step now config primaryAvailable availableSecondaryIndex state
165+
let! newState, decision =
166+
AutoClusterFailoverLogic.step now config primaryAvailable findFirstAvailableSecondary state
190167
state <- newState
191168
do! applyDecision decision
192-
with
193-
| Flatten ex ->
169+
with Flatten ex ->
194170
Log.Logger.LogError(ex, "Error checking cluster")
195171
}
196172

@@ -219,7 +195,7 @@ type AutoClusterFailover
219195
use cts = new CancellationTokenSource(30_000)
220196
do! client.ConnectAsync(endpoint.Host, endpoint.Port, cts.Token)
221197
return true
222-
with ex ->
198+
with Flatten ex ->
223199
Log.Logger.LogWarning(ex, "Failed to probe available, url: {0}", endpoint)
224200
return false
225201
}

tests/UnitTests/Api/AutoClusterFailoverTests.fs

Lines changed: 59 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -18,150 +18,118 @@ module AutoClusterFailoverTests =
1818
SwitchBackDelay = TimeSpan.FromSeconds(60.0)
1919
SecondaryCount = 2
2020
}
21+
let noSecondaryAvailable = fun () -> Task.FromResult(None)
22+
let secondaryAvailable index = fun () -> Task.FromResult(Some index)
2123

2224
// ---- Pure state-machine tests ----
2325

2426
[<Tests>]
25-
let pureTests =
27+
let stepTests =
2628

2729
testList "AutoClusterFailoverLogic" [
2830

29-
// -- shouldProbeSecondaries --
30-
31-
test "shouldProbeSecondaries: false when primary is available" {
32-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
33-
AutoClusterFailoverLogic.shouldProbeSecondaries t0 config true state
34-
|> Expect.isFalse "should not probe when primary is up"
35-
}
36-
37-
test "shouldProbeSecondaries: false when no failedTimestamp yet" {
38-
let state = AutoClusterFailoverLogic.initialState
39-
AutoClusterFailoverLogic.shouldProbeSecondaries t0 config false state
40-
|> Expect.isFalse "should not probe before first failure recorded"
41-
}
42-
43-
test "shouldProbeSecondaries: false when within failover delay" {
44-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
45-
let now = t0 + TimeSpan.FromSeconds(29.0)
46-
AutoClusterFailoverLogic.shouldProbeSecondaries now config false state
47-
|> Expect.isFalse "should not probe before delay elapsed"
48-
}
49-
50-
test "shouldProbeSecondaries: true when primary down and delay elapsed" {
51-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
52-
let now = t0 + TimeSpan.FromSeconds(30.0)
53-
AutoClusterFailoverLogic.shouldProbeSecondaries now config false state
54-
|> Expect.isTrue "should probe after delay elapsed"
55-
}
56-
57-
test "shouldProbeSecondaries: false when on secondary" {
58-
let state = { AutoClusterFailoverLogic.initialState with Mode = AutoClusterMode.Secondary 0 }
59-
AutoClusterFailoverLogic.shouldProbeSecondaries t0 config false state
60-
|> Expect.isFalse "should not probe secondaries when already on secondary"
61-
}
62-
6331
// -- step: on primary --
6432

65-
test "step: primary available clears failedTimestamp" {
66-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
67-
let newState, decision =
68-
AutoClusterFailoverLogic.step t0 config true None state
69-
newState.FailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
70-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
33+
testTask "step: primary available clears failedTimestamp" {
34+
let state = { AutoClusterFailoverLogic.initialState with PrimaryFailedTimestamp = Some t0 }
35+
let! newState, decision =
36+
AutoClusterFailoverLogic.step t0 config true noSecondaryAvailable state
37+
newState.PrimaryFailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
38+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
7139
}
7240

73-
test "step: primary unavailable records failedTimestamp on first failure" {
41+
testTask "step: primary unavailable records failedTimestamp on first failure" {
7442
let state = AutoClusterFailoverLogic.initialState
75-
let newState, decision =
76-
AutoClusterFailoverLogic.step t0 config false None state
77-
newState.FailedTimestamp |> Expect.equal "should record timestamp" (Some t0)
78-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
43+
let! newState, decision =
44+
AutoClusterFailoverLogic.step t0 config false noSecondaryAvailable state
45+
newState.PrimaryFailedTimestamp |> Expect.equal "should record timestamp" (Some t0)
46+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
7947
}
8048

81-
test "step: does not switch before failover delay elapses" {
82-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
49+
testTask "step: does not switch before failover delay elapses" {
50+
let state = { AutoClusterFailoverLogic.initialState with PrimaryFailedTimestamp = Some t0 }
8351
let now = t0 + TimeSpan.FromSeconds(29.0)
84-
let newState, decision =
85-
AutoClusterFailoverLogic.step now config false (Some 0) state
52+
let! newState, decision =
53+
AutoClusterFailoverLogic.step now config false (secondaryAvailable 0) state
8654
newState.Mode |> Expect.equal "should stay Primary" AutoClusterMode.Primary
87-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
55+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
8856
}
8957

90-
test "step: switches to secondary after failover delay" {
91-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
58+
testTask "step: switches to secondary after failover delay" {
59+
let state = { AutoClusterFailoverLogic.initialState with PrimaryFailedTimestamp = Some t0 }
9260
let now = t0 + TimeSpan.FromSeconds(30.0)
93-
let newState, decision =
94-
AutoClusterFailoverLogic.step now config false (Some 1) state
61+
let! newState, decision =
62+
AutoClusterFailoverLogic.step now config false (secondaryAvailable 1) state
9563
newState.Mode |> Expect.equal "should switch to Secondary 1" (AutoClusterMode.Secondary 1)
96-
newState.FailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
64+
newState.PrimaryFailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
9765
decision |> Expect.equal "should switch" (AutoClusterDecision.SwitchToSecondary 1)
9866
}
9967

100-
test "step: no available secondary keeps state unchanged" {
101-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
68+
testTask "step: no available secondary keeps state unchanged" {
69+
let state = { AutoClusterFailoverLogic.initialState with PrimaryFailedTimestamp = Some t0 }
10270
let now = t0 + TimeSpan.FromSeconds(30.0)
103-
let newState, decision =
104-
AutoClusterFailoverLogic.step now config false None state
105-
newState.FailedTimestamp |> Expect.equal "should keep timestamp" (Some t0)
106-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
71+
let! newState, decision =
72+
AutoClusterFailoverLogic.step now config false noSecondaryAvailable state
73+
newState.PrimaryFailedTimestamp |> Expect.equal "should keep timestamp" (Some t0)
74+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
10775
}
10876

109-
test "step: primary becomes available resets failedTimestamp" {
110-
let state = { AutoClusterFailoverLogic.initialState with FailedTimestamp = Some t0 }
77+
testTask "step: primary becomes available resets failedTimestamp" {
78+
let state = { AutoClusterFailoverLogic.initialState with PrimaryFailedTimestamp = Some t0 }
11179
let now = t0 + TimeSpan.FromSeconds(10.0)
112-
let newState, _ =
113-
AutoClusterFailoverLogic.step now config true None state
114-
newState.FailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
80+
let! newState, _ =
81+
AutoClusterFailoverLogic.step now config true noSecondaryAvailable state
82+
newState.PrimaryFailedTimestamp |> Expect.isNone "FailedTimestamp should be cleared"
11583
}
11684

11785
// -- step: on secondary --
11886

119-
test "step: on secondary, primary available records recoveredTimestamp" {
87+
testTask "step: on secondary, primary available records recoveredTimestamp" {
12088
let state = { AutoClusterFailoverLogic.initialState with Mode = AutoClusterMode.Secondary 0 }
121-
let newState, decision =
122-
AutoClusterFailoverLogic.step t0 config true None state
123-
newState.RecoveredTimestamp |> Expect.equal "should record timestamp" (Some t0)
124-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
89+
let! newState, decision =
90+
AutoClusterFailoverLogic.step t0 config true noSecondaryAvailable state
91+
newState.PrimaryRecoveredTimestamp |> Expect.equal "should record timestamp" (Some t0)
92+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
12593
}
12694

127-
test "step: on secondary, does not switch back before switchBackDelay" {
95+
testTask "step: on secondary, does not switch back before switchBackDelay" {
12896
let state = {
12997
Mode = AutoClusterMode.Secondary 0
130-
FailedTimestamp = None
131-
RecoveredTimestamp = Some t0
98+
PrimaryFailedTimestamp = None
99+
PrimaryRecoveredTimestamp = Some t0
132100
}
133101
let now = t0 + TimeSpan.FromSeconds(59.0)
134-
let newState, decision =
135-
AutoClusterFailoverLogic.step now config true None state
102+
let! newState, decision =
103+
AutoClusterFailoverLogic.step now config true noSecondaryAvailable state
136104
newState.Mode |> Expect.equal "should stay Secondary" (AutoClusterMode.Secondary 0)
137-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
105+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
138106
}
139107

140-
test "step: on secondary, switches back to primary after switchBackDelay" {
108+
testTask "step: on secondary, switches back to primary after switchBackDelay" {
141109
let state = {
142110
Mode = AutoClusterMode.Secondary 0
143-
FailedTimestamp = None
144-
RecoveredTimestamp = Some t0
111+
PrimaryFailedTimestamp = None
112+
PrimaryRecoveredTimestamp = Some t0
145113
}
146114
let now = t0 + TimeSpan.FromSeconds(60.0)
147-
let newState, decision =
148-
AutoClusterFailoverLogic.step now config true None state
115+
let! newState, decision =
116+
AutoClusterFailoverLogic.step now config true noSecondaryAvailable state
149117
newState.Mode |> Expect.equal "should be Primary" AutoClusterMode.Primary
150-
newState.RecoveredTimestamp |> Expect.isNone "RecoveredTimestamp should be cleared"
118+
newState.PrimaryRecoveredTimestamp |> Expect.isNone "RecoveredTimestamp should be cleared"
151119
decision |> Expect.equal "should switch back" AutoClusterDecision.SwitchToPrimary
152120
}
153121

154-
test "step: on secondary, primary goes down again clears recoveredTimestamp" {
122+
testTask "step: on secondary, primary goes down again clears recoveredTimestamp" {
155123
let state = {
156124
Mode = AutoClusterMode.Secondary 0
157-
FailedTimestamp = None
158-
RecoveredTimestamp = Some t0
125+
PrimaryFailedTimestamp = None
126+
PrimaryRecoveredTimestamp = Some t0
159127
}
160128
let now = t0 + TimeSpan.FromSeconds(10.0)
161-
let newState, decision =
162-
AutoClusterFailoverLogic.step now config false None state
163-
newState.RecoveredTimestamp |> Expect.isNone "RecoveredTimestamp should be cleared"
164-
decision |> Expect.equal "should be Noop" AutoClusterDecision.Noop
129+
let! newState, decision =
130+
AutoClusterFailoverLogic.step now config false noSecondaryAvailable state
131+
newState.PrimaryRecoveredTimestamp |> Expect.isNone "RecoveredTimestamp should be cleared"
132+
decision |> Expect.equal "should be Noop" AutoClusterDecision.NoAction
165133
}
166134
]
167135

0 commit comments

Comments
 (0)