Skip to content

Commit ab4427d

Browse files
committed
Fix connect attempt retries
1 parent 992584c commit ab4427d

13 files changed

Lines changed: 346 additions & 149 deletions

include/iocore/hostdb/HostDBProcessor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ struct ResolveInfo {
582582
bool mark_active_server_up();
583583

584584
/// Select / resolve to the next RR entry for the record.
585-
bool select_next_rr();
585+
bool select_next_rr(ts_time now, ts_seconds fail_window);
586586

587587
bool is_srv() const;
588588
};

include/proxy/http/HttpTransact.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,7 @@ class HttpTransact
10341034
static void handle_response_from_parent_plugin(State *s);
10351035
static void handle_response_from_server(State *s);
10361036
static void delete_server_rr_entry(State *s, int max_retries);
1037-
static void retry_server_connection_not_open(State *s, ServerState_t conn_state, unsigned max_retries);
1037+
static void retry_server_connection_not_open(State *s, unsigned max_retries);
10381038
static void error_log_connection_failure(State *s, ServerState_t conn_state);
10391039
static void handle_server_connection_not_open(State *s);
10401040
static void handle_forward_server_connection_open(State *s);
@@ -1078,6 +1078,8 @@ class HttpTransact
10781078
static bool handle_trace_and_options_requests(State *s, HTTPHdr *incoming_hdr);
10791079
static void bootstrap_state_variables_from_request(State *s, HTTPHdr *incoming_request);
10801080

1081+
static uint8_t origin_server_connect_attempts_max_retries(State *s);
1082+
10811083
// WARNING: this function may be called multiple times for the same transaction.
10821084
//
10831085
static void initialize_state_variables_from_request(State *s, HTTPHdr *obsolete_incoming_request);

src/iocore/hostdb/HostDB.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,13 +1706,19 @@ ResolveInfo::set_active(HostDBInfo *info)
17061706
}
17071707

17081708
bool
1709-
ResolveInfo::select_next_rr()
1709+
ResolveInfo::select_next_rr(ts_time now, ts_seconds fail_window)
17101710
{
17111711
if (active) {
17121712
if (auto rr_info{this->record->rr_info()}; rr_info.count() > 1) {
1713-
unsigned limit = active - rr_info.data(), idx = (limit + 1) % rr_info.count();
1714-
while ((idx = (idx + 1) % rr_info.count()) != limit && !rr_info[idx].is_up()) {}
1715-
active = &rr_info[idx];
1713+
unsigned limit = active - rr_info.data();
1714+
size_t idx = (limit + 1) % rr_info.count();
1715+
for (; idx != limit; idx = (idx + 1) % rr_info.count()) {
1716+
if (!rr_info[idx].is_down(now, fail_window)) {
1717+
active = &rr_info[idx];
1718+
break;
1719+
}
1720+
}
1721+
17161722
return idx != limit; // if the active record was actually changed.
17171723
}
17181724
}

src/proxy/http/HttpConfig.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,10 @@ HttpConfig::reconfigure()
13481348
"will never redispatch to another server",
13491349
m_master.oride.connect_attempts_rr_retries, params->oride.connect_attempts_max_retries);
13501350
}
1351+
if (m_master.oride.connect_attempts_rr_retries > 0 && params->oride.connect_attempts_max_retries_down_server == 0) {
1352+
Warning("connect_attempts_max_retries_down_server=0 with round-robin enabled skips probing recovering (SUSPECT) origins; "
1353+
"set connect_attempts_max_retries_down_server >= 1 is recommended");
1354+
}
13511355
params->oride.connect_attempts_retry_backoff_base = m_master.oride.connect_attempts_retry_backoff_base;
13521356

13531357
params->oride.connect_attempts_rr_retries = m_master.oride.connect_attempts_rr_retries;

src/proxy/http/HttpSM.cc

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
#include "proxy/http/HttpConfig.h"
2626
#include "tscore/ink_hrtime.h"
27+
#include "tscore/ink_time.h"
2728
#include "tsutil/Metrics.h"
2829
#include "tsutil/ts_bw_format.h"
2930
#include "proxy/ProxyTransaction.h"
@@ -4731,9 +4732,12 @@ HttpSM::do_hostdb_update_if_necessary()
47314732
t_state.dns_info.active->http_version = t_state.updated_server_version;
47324733
}
47334734

4735+
char addrbuf[INET6_ADDRPORTSTRLEN];
4736+
SMDbg(dbg_ctl_http, "update hostdb info: %s", ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf)));
4737+
47344738
// Check to see if we need to report or clear a connection failure
47354739
if (track_connect_fail()) {
4736-
this->mark_host_failure(&t_state.dns_info, ts_clock::from_time_t(t_state.client_request_time));
4740+
this->mark_host_failure(&t_state.dns_info, ts_clock::now());
47374741
} else {
47384742
if (t_state.dns_info.mark_active_server_up()) {
47394743
char addrbuf[INET6_ADDRPORTSTRLEN];
@@ -4748,8 +4752,6 @@ HttpSM::do_hostdb_update_if_necessary()
47484752
}
47494753
}
47504754

4751-
char addrbuf[INET6_ADDRPORTSTRLEN];
4752-
SMDbg(dbg_ctl_http, "server info = %s", ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf)));
47534755
return;
47544756
}
47554757

@@ -5521,12 +5523,6 @@ HttpSM::do_http_server_open(bool raw, bool only_direct)
55215523
return;
55225524
}
55235525
}
5524-
if (HttpTransact::is_server_negative_cached(&t_state) == true &&
5525-
t_state.txn_conf->connect_attempts_max_retries_down_server <= 0) {
5526-
SMDbg(dbg_ctl_http_seq, "Not connecting to the server because it is marked down.");
5527-
call_transact_and_set_next_state(HttpTransact::OriginDown);
5528-
return;
5529-
}
55305526

55315527
// Check for self loop.
55325528
if (!_ua.get_txn()->is_outbound_transparent() && HttpTransact::will_this_request_self_loop(&t_state)) {
@@ -5972,34 +5968,35 @@ HttpSM::do_transform_open()
59725968
void
59735969
HttpSM::mark_host_failure(ResolveInfo *info, ts_time time_down)
59745970
{
5975-
char addrbuf[INET6_ADDRPORTSTRLEN];
5971+
ink_assert(time_down != TS_TIME_ZERO);
59765972

5977-
if (info->active) {
5978-
if (time_down != TS_TIME_ZERO) {
5979-
ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
5980-
// Increment the fail_count
5981-
if (auto [down, fail_count] = info->active->increment_fail_count(time_down, t_state.txn_conf->connect_attempts_rr_retries,
5982-
t_state.txn_conf->down_server_timeout);
5983-
down) {
5984-
char *url_str = t_state.hdr_info.client_request.url_string_get_ref(nullptr);
5985-
std::string_view host_name{t_state.unmapped_url.host_get()};
5986-
swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for host='{}' url='{}' fail_count='{}' marking down",
5987-
swoc::bwf::Errno(t_state.current.server->connect_result), t_state.current.server->dst_addr, host_name,
5988-
swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
5989-
Log::error("%s", error_bw_buffer.c_str());
5990-
SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
5991-
ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
5992-
} else {
5993-
ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
5994-
SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, fail_count);
5995-
}
5996-
} else { // Clear the failure
5997-
info->active->mark_up();
5998-
}
5973+
if (info->active == nullptr) {
5974+
return;
5975+
}
5976+
5977+
char addrbuf[INET6_ADDRPORTSTRLEN];
5978+
ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
5979+
5980+
uint8_t max_connect_retries = HttpTransact::origin_server_connect_attempts_max_retries(&t_state);
5981+
ts_seconds fail_window = t_state.txn_conf->down_server_timeout;
5982+
5983+
// Mark the host DOWN only after every attempt has failed. `max_connect_retries` counts only "retries", so the total attempt
5984+
// budget is `max_connect_retries + 1` (the initial connect plus each retry).
5985+
auto [down, fail_count] = info->active->increment_fail_count(time_down, max_connect_retries + 1, fail_window);
5986+
5987+
if (down) {
5988+
char *url_str = t_state.hdr_info.client_request.url_string_get_ref(nullptr);
5989+
std::string_view host_name{t_state.unmapped_url.host_get()};
5990+
swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for host='{}' url='{}' fail_count='{}' marking down",
5991+
swoc::bwf::Errno(t_state.current.server->connect_result), t_state.current.server->dst_addr, host_name,
5992+
swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
5993+
Log::error("%s", error_bw_buffer.c_str());
5994+
SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
5995+
ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
5996+
} else {
5997+
ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
5998+
SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, fail_count);
59995999
}
6000-
#ifdef DEBUG
6001-
ink_assert(std::chrono::system_clock::now() + t_state.txn_conf->down_server_timeout > time_down);
6002-
#endif
60036000
}
60046001

60056002
void

0 commit comments

Comments
 (0)