@@ -123,61 +123,77 @@ enum class HostDBType : uint8_t {
123123};
124124
125125/* * Information about a single target.
126+ *
127+ * Each instance tracks the health state of one upstream address. The state is derived from @c _last_failure and the caller-supplied
128+ * @a fail_window:
129+ *
130+ * | State | Description |
131+ * |---------|-----------------------------------------------------------------------------------|
132+ * | Up | No known failure; eligible for normal selection. |
133+ * | Down | Blocked; no connections permitted until @c _last_failure + @a fail_window elapses |
134+ * | Suspect | Fail window has elapsed; connections are permitted. |
135+ * | | On success transitions to Up (@c mark_up); on failure returns to Down. |
136+ *
137+ * State transition diagram:
138+ *
139+ * @startuml
140+ * hide empty description
141+ *
142+ * [*] --> Up
143+ * Up --> Down : connect failure\n(mark_down)
144+ * Down --> Suspect : fail_window elapses
145+ * Suspect --> Up : connect success\n(mark_up)
146+ * Suspect --> Down : connect failure\n(mark_down)
147+ * @enduml
148+ *
149+ * State transition and `fail_window` time chart:
150+ *
151+ * @code
152+ * |<-- fail_window -->|
153+ * -+----------+--------------------+--------------------+----------+----> time
154+ * | Up | Down | Suspect | Up |
155+ * -+----------+--------------------+--------------------+----------+---->
156+ * ^ ^ ^
157+ * \ \ \
158+ * (_last_failure) (_last_failure + fail_window) (connect success)
159+ * @endcode
126160 */
127- struct HostDBInfo {
161+ class HostDBInfo
162+ {
163+ public:
128164 using self_type = HostDBInfo; // /< Self reference type.
129165
166+ // / Health state of this target.
167+ enum class State {
168+ UP,
169+ DOWN,
170+ SUSPECT,
171+ };
172+
130173 // / Default constructor.
131174 HostDBInfo () = default ;
132175
133176 HostDBInfo &operator =(HostDBInfo const &that);
134177
135178 // / Absolute time of when this target failed.
136179 // / A value of zero (@c TS_TIME_ZERO ) indicates no failure.
137- ts_time last_fail_time () const ;
138-
139- // / Target is alive - no known failure.
140- bool is_alive ();
141-
142- // / Target has failed and is still in the blocked time window.
143- bool is_down (ts_time now, ts_seconds fail_window);
144-
145- /* * Select this target.
146- *
147- * @param now Current time.
148- * @param fail_window Failure window.
149- * @return Status of the selection.
150- *
151- * If a zombie is selected the failure time is updated to make it appear down to other threads in a thread safe
152- * manner. The caller should check @c last_fail_time to see if a zombie was selected.
153- */
154- bool select (ts_time now, ts_seconds fail_window) const ;
180+ ts_time last_fail_time () const ;
181+ uint8_t fail_count () const ;
182+ char const *srvname () const ;
155183
156- /* * Mark the entry as down.
157- *
158- * @param now Time of the failure.
159- * @return @c true if @a this was marked down, @c false if not.
160- *
161- * This can return @c false if the entry is already marked down, in which case the failure time is not updated.
162- */
163- bool mark_down (ts_time now);
184+ // / Return the current health state of this target.
185+ State state (ts_time now, ts_seconds fail_window) const ;
164186
165- std::pair<bool , uint8_t > increment_fail_count (ts_time now, uint8_t max_retries);
187+ // Sugars of checking state
188+ bool is_up () const ;
189+ bool is_down (ts_time now, ts_seconds fail_window) const ;
190+ bool is_suspect (ts_time now, ts_seconds fail_window) const ;
166191
167- /* * Mark the target as up / alive.
168- *
169- * @return Previous alive state of the target.
170- */
171- bool mark_up ();
172-
173- char const *srvname () const ;
192+ // State controllers
193+ bool mark_up ();
194+ bool mark_down (ts_time now, ts_seconds fail_window);
195+ std::pair<bool , uint8_t > increment_fail_count (ts_time now, uint8_t max_retries, ts_seconds fail_window);
174196
175- /* * Migrate data after a DNS update.
176- *
177- * @param that Source item.
178- *
179- * This moves only specific state information, it is not a generic copy.
180- */
181197 void migrate_from (self_type const &that);
182198
183199 // / A target is either an IP address or an SRV record.
@@ -187,16 +203,8 @@ struct HostDBInfo {
187203 SRVInfo srv; // /< SRV record.
188204 } data{IpAddr{}};
189205
190- // / Data that migrates after updated DNS records are processed.
191- // / @see migrate_from
192- // / @{
193- // / Last time a failure was recorded.
194- std::atomic<ts_time> last_failure{TS_TIME_ZERO};
195- // / Count of connection failures
196- std::atomic<uint8_t > fail_count{0 };
197206 // / Expected HTTP version of the target based on earlier transactions.
198207 HTTPVersion http_version = HTTP_INVALID;
199- // / @}
200208
201209 self_type &assign (IpAddr const &addr);
202210
@@ -207,96 +215,11 @@ struct HostDBInfo {
207215 HostDBType type = HostDBType::UNSPEC; // /< Invalid data.
208216
209217 friend HostDBContinuation;
210- };
211218
212- inline HostDBInfo &
213- HostDBInfo::operator =(HostDBInfo const &that)
214- {
215- if (this != &that) {
216- memcpy (static_cast <void *>(this ), static_cast <const void *>(&that), sizeof (*this ));
217- }
218- return *this ;
219- }
220-
221- inline ts_time
222- HostDBInfo::last_fail_time () const
223- {
224- return last_failure;
225- }
226-
227- inline bool
228- HostDBInfo::is_alive ()
229- {
230- return this ->last_fail_time () == TS_TIME_ZERO;
231- }
232-
233- /* *
234- Check if this HostDBInfo is currently marked DOWN (true) or UP (false). Returns true while within the `fail_window` period after
235- `last_failure`. Once `fail_window` expires, the host is treated as UP and this function returns false.
236-
237- |<-- fail_window -->|
238- ----------------+-------------------+-----------------> time
239- UP | DOWN | UP
240- (is_down=false) | (is_down=true) | (is_down=false)
241- | |
242- ^ ^
243- \ \
244- last_failure last_failure + fail_window
245- */
246- inline bool
247- HostDBInfo::is_down (ts_time now, ts_seconds fail_window)
248- {
249- auto last_fail = this ->last_fail_time ();
250- return (last_fail != TS_TIME_ZERO) && (now <= last_fail + fail_window);
251- }
252-
253- inline bool
254- HostDBInfo::mark_up ()
255- {
256- auto t = last_failure.exchange (TS_TIME_ZERO);
257- bool was_down = t != TS_TIME_ZERO;
258- if (was_down) {
259- fail_count.store (0 );
260- }
261- return was_down;
262- }
263-
264- inline bool
265- HostDBInfo::mark_down (ts_time now)
266- {
267- auto t0{TS_TIME_ZERO};
268- return last_failure.compare_exchange_strong (t0, now);
269- }
270-
271- inline std::pair<bool , uint8_t >
272- HostDBInfo::increment_fail_count (ts_time now, uint8_t max_retries)
273- {
274- auto fcount = ++fail_count;
275- bool marked_down = false ;
276- if (fcount >= max_retries) {
277- marked_down = mark_down (now);
278- }
279- return std::make_pair (marked_down, fcount);
280- }
281-
282- inline bool
283- HostDBInfo::select (ts_time now, ts_seconds fail_window) const
284- {
285- auto t0 = this ->last_fail_time ();
286- if (t0 == TS_TIME_ZERO) {
287- return true ; // it's alive and so is valid for selection.
288- }
289- // Return true and give it a try if enough time is elapsed since the last failure
290- return (t0 + fail_window < now);
291- }
292-
293- inline void
294- HostDBInfo::migrate_from (HostDBInfo::self_type const &that)
295- {
296- this ->last_failure = that.last_failure .load ();
297- this ->fail_count = that.fail_count .load ();
298- this ->http_version = that.http_version ;
299- }
219+ private:
220+ std::atomic<ts_time> _last_failure{TS_TIME_ZERO}; // /< Last time a failure was recorded
221+ std::atomic<uint8_t > _fail_count{0 }; // /< Count of connection failures
222+ };
300223
301224// ----
302225/* * Root item for HostDB.
@@ -371,15 +294,12 @@ class HostDBRecord : public RefCountObj
371294
372295 /* * Pick the next round robin and update the record atomically.
373296 *
374- * @note This may select a zombie server and reserve it for the caller, therefore the caller must
375- * attempt to connect to the selected target if possible.
376- *
377- * @param now Current time to use for aliveness calculations.
378- * @param fail_window Blackout time for down servers.
379- * @return Status of the updated target.
297+ * @note This may select a suspect server. The caller must attempt to connect to the selected
298+ * target if possible.
380299 *
381- * If the return value is @c HostDBInfo::Status::DOWN this means all targets are down and there is
382- * no valid upstream.
300+ * @param[in] now Current time to use for HostDBInfo state calculations.
301+ * @param[in] fail_window Blackout time for down servers.
302+ * @return The selected target, or @c nullptr if all targets are down.
383303 *
384304 * @note Concurrency - this is not done under lock and depends on the caller for correct use.
385305 * For strict round robin, it is a feature that every call will get a distinct index. For
@@ -434,9 +354,9 @@ class HostDBRecord : public RefCountObj
434354 * This accounts for the round robin setting. The default is to use "client affinity" in
435355 * which case @a hash_addr is as a hash seed to select the target.
436356 *
437- * This may select a zombie target, which can be detected by checking the target's last
438- * failure time. If it is not @c TS_TIME_ZERO the target is a zombie. Other transactions will
439- * be blocked from selecting that target until @a fail_window time has passed .
357+ * This may select a suspect target (fail window elapsed, connections permitted again), which can
358+ * be detected by checking the target's last failure time. If it is not @c TS_TIME_ZERO the target
359+ * is a suspect. Multiple threads may concurrently select the same suspect target .
440360 *
441361 * In cases other than strict round robin, a base target is selected. If valid, that is returned,
442362 * but if not then the targets in this record are searched until a valid one is found. The result
@@ -588,7 +508,7 @@ struct ResolveInfo {
588508
589509 // / Keep a reference to the base HostDB object, so it doesn't get GC'd.
590510 Ptr<HostDBRecord> record;
591- HostDBInfo *active = nullptr ; // /< Active host record.
511+ HostDBInfo *active = nullptr ; // /< Active HostDBInfo
592512
593513 // / Working address. The meaning / source of the value depends on other elements.
594514 // / This is the "resolved" address if @a resolved_p is @c true.
@@ -646,19 +566,20 @@ struct ResolveInfo {
646566 */
647567 bool resolve_immediate ();
648568
649- /* * Mark the active target as down .
569+ /* * Mark the active target as DOWN .
650570 *
651- * @param now Time of failure.
571+ * @param[in] now Time of failure.
572+ * @param[in] fail_window The fail window duration (proxy.config.http.down_server.cache_time).
652573 * @return @c true if the server was marked as down, @c false if not.
653574 *
654575 */
655- bool mark_active_server_down (ts_time now);
576+ bool mark_active_server_down (ts_time now, ts_seconds fail_window );
656577
657- /* * Mark the active target as alive .
578+ /* * Mark the active target as UP .
658579 *
659580 * @return @c true if the target changed state.
660581 */
661- bool mark_active_server_alive ();
582+ bool mark_active_server_up ();
662583
663584 // / Select / resolve to the next RR entry for the record.
664585 bool select_next_rr ();
@@ -863,15 +784,15 @@ ResolveInfo::set_active(sockaddr const *s)
863784}
864785
865786inline bool
866- ResolveInfo::mark_active_server_alive ()
787+ ResolveInfo::mark_active_server_up ()
867788{
868789 return active->mark_up ();
869790}
870791
871792inline bool
872- ResolveInfo::mark_active_server_down (ts_time now)
793+ ResolveInfo::mark_active_server_down (ts_time now, ts_seconds fail_window )
873794{
874- return active != nullptr && active->mark_down (now);
795+ return active != nullptr && active->mark_down (now, fail_window );
875796}
876797
877798inline bool
0 commit comments