Skip to content

Commit 7994168

Browse files
committed
fix(healthcheck): update targets incrementally instead of rebuilding
When an upstream's nodes change but its `checks` config is unchanged, the health-check manager destroyed the existing checker and built a new one. Two problems followed: 1. fetch_checker() keys the working checker by a version derived from both modifiedIndex and the nodes version, so a node-only change makes it return nil until the timer rebuilds the checker. During that window api_ctx.up_checker is nil and the balancer routes traffic to nodes already known to be unhealthy (#13282). 2. The rebuild throws away the checker's accumulated health state and re-probes every node from scratch. The manager now reconciles the existing checker's targets in place with add_target/remove_target when the `checks` config is unchanged (compared with core.table.deep_eq), keeping the checker and its state alive. timer_working_pool_check no longer destroys a checker for a node-only version change, and when a rebuild is genuinely required (the `checks` config changed) the new checker is created and inserted into the working pool before the old one is released, so fetch_checker never observes a nil gap. Bumps the lua-resty-healthcheck-api7 rockspec dependency to 3.3.0-0, which contains the companion library fix (clean every checker each window + release the periodic lock when idle) required by this change. Adds t/node/healthcheck-incremental-update.t: a node-only change must not destroy/rebuild the checker (no "clear checker"), while a checks-config change still rebuilds it.
1 parent b79329d commit 7994168

2 files changed

Lines changed: 264 additions & 20 deletions

File tree

apisix/healthcheck_manager.lua

Lines changed: 115 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ local jp = require("jsonpath")
3131
local config_util = require("apisix.core.config_util")
3232

3333
local _M = {}
34-
local working_pool = {} -- resource_path -> {version = ver, checker = checker}
34+
-- resource_path -> {version = ver, checker = checker, checks = checks}
35+
local working_pool = {}
3536
local waiting_pool = {} -- resource_path -> resource_ver
3637

3738
local DELAYED_CLEAR_TIMEOUT = 10
@@ -44,6 +45,31 @@ end
4445
_M.get_healthchecker_name = get_healthchecker_name
4546

4647

48+
-- Compute the desired set of health-check targets for an upstream config.
49+
-- Returns a map keyed by "host:port:hostheader" so the working set can be
50+
-- diffed cheaply against a checker's current targets.
51+
local function compute_targets(up_conf)
52+
local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host
53+
local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port
54+
local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host
55+
local use_node_hdr = up_conf.pass_host == "node" or nil
56+
57+
local targets = {}
58+
for _, node in ipairs(up_conf.nodes) do
59+
local host_hdr = up_hdr or (use_node_hdr and node.domain) or nil
60+
local target = {
61+
host = node.host,
62+
port = port or node.port,
63+
check_host = host,
64+
host_hdr = host_hdr,
65+
}
66+
local key = target.host .. ":" .. tostring(target.port) .. ":" .. tostring(host_hdr or "")
67+
targets[key] = target
68+
end
69+
return targets
70+
end
71+
72+
4773
local function create_checker(up_conf)
4874
if not up_conf.checks then
4975
return nil
@@ -71,25 +97,66 @@ local function create_checker(up_conf)
7197
end
7298

7399
-- Add target nodes
74-
local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host
75-
local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port
76-
local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host
77-
local use_node_hdr = up_conf.pass_host == "node" or nil
78-
79-
for _, node in ipairs(up_conf.nodes) do
80-
local host_hdr = up_hdr or (use_node_hdr and node.domain)
81-
local ok, err = checker:add_target(node.host, port or node.port, host,
82-
true, host_hdr)
100+
for _, target in pairs(compute_targets(up_conf)) do
101+
local ok, err = checker:add_target(target.host, target.port, target.check_host,
102+
true, target.host_hdr)
83103
if not ok then
84-
core.log.error("failed to add healthcheck target: ", node.host, ":",
85-
port or node.port, " err: ", err)
104+
core.log.error("failed to add healthcheck target: ", target.host, ":",
105+
target.port, " err: ", err)
86106
end
87107
end
88108

89109
return checker
90110
end
91111

92112

113+
-- Incrementally reconcile an existing checker's targets to match up_conf.
114+
-- Used when only the upstream nodes changed but the `checks` config did not,
115+
-- so the checker can keep running (and keep its accumulated health state)
116+
-- instead of being destroyed and rebuilt.
117+
local function sync_checker_targets(checker, up_conf)
118+
local desired = compute_targets(up_conf)
119+
120+
-- index current targets the same way as desired. Read the authoritative
121+
-- shm target list (the per-worker checker.targets array can lag behind a
122+
-- recent add/remove event).
123+
if not healthcheck then
124+
healthcheck = require("resty.healthcheck")
125+
end
126+
local current = {}
127+
local target_list = healthcheck.get_target_list(get_healthchecker_name(up_conf),
128+
healthcheck_shdict_name) or {}
129+
for _, t in ipairs(target_list) do
130+
-- target_list entries carry hostheader; map it back to our key shape
131+
local key = t.ip .. ":" .. tostring(t.port) .. ":" .. tostring(t.hostheader or "")
132+
current[key] = t
133+
end
134+
135+
-- add targets that are desired but not present
136+
for key, target in pairs(desired) do
137+
if not current[key] then
138+
local ok, err = checker:add_target(target.host, target.port, target.check_host,
139+
true, target.host_hdr)
140+
if not ok then
141+
core.log.error("failed to add healthcheck target: ", target.host, ":",
142+
target.port, " err: ", err)
143+
end
144+
end
145+
end
146+
147+
-- remove targets that are present but no longer desired
148+
for key, t in pairs(current) do
149+
if not desired[key] then
150+
local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
151+
if not ok then
152+
core.log.error("failed to remove healthcheck target: ", t.ip, ":",
153+
t.port, " err: ", err)
154+
end
155+
end
156+
end
157+
end
158+
159+
93160
function _M.fetch_checker(resource_path, resource_ver)
94161
local working_item = working_pool[resource_path]
95162
if working_item and working_item.version == resource_ver then
@@ -130,10 +197,11 @@ function _M.fetch_node_status(checker, ip, port, hostname)
130197
end
131198

132199

133-
local function add_working_pool(resource_path, resource_ver, checker)
200+
local function add_working_pool(resource_path, resource_ver, checker, checks)
134201
working_pool[resource_path] = {
135202
version = resource_ver,
136-
checker = checker
203+
checker = checker,
204+
checks = checks,
137205
}
138206
end
139207

@@ -202,22 +270,43 @@ local function timer_create_checker()
202270
goto continue
203271
end
204272

205-
-- if a checker exists then delete it before creating a new one
273+
-- If a checker already exists and the `checks` config is unchanged
274+
-- (only the upstream nodes changed), reconcile its targets in place
275+
-- instead of destroying and rebuilding it. A destroy-and-rebuild
276+
-- leaves `up_checker == nil` for the rebuild window, during which
277+
-- traffic is routed to nodes already known to be unhealthy, and it
278+
-- throws away the checker's accumulated health state.
206279
local existing_checker = working_pool[resource_path]
280+
if existing_checker and existing_checker.checker
281+
and not existing_checker.checker.dead
282+
and upstream.checks
283+
and core.table.deep_eq(existing_checker.checks, upstream.checks) then
284+
sync_checker_targets(existing_checker.checker, upstream)
285+
add_working_pool(resource_path, resource_ver, existing_checker.checker,
286+
upstream.checks)
287+
core.log.info("reused checker with incremental targets: ",
288+
tostring(existing_checker.checker), " for resource: ",
289+
resource_path, " and version: ", resource_ver)
290+
goto continue
291+
end
292+
293+
-- The checks config changed (or no checker exists): build a fresh
294+
-- checker first, and only release the old one *after* the new one is
295+
-- in the working pool, so fetch_checker never observes a nil gap.
296+
local checker = create_checker(upstream)
297+
if not checker then
298+
goto continue
299+
end
207300
if existing_checker then
208301
existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
209302
existing_checker.checker:stop()
210303
core.log.info("releasing existing checker: ", tostring(existing_checker.checker),
211304
" for resource: ", resource_path, " and version: ",
212305
existing_checker.version)
213306
end
214-
local checker = create_checker(upstream)
215-
if not checker then
216-
goto continue
217-
end
218307
core.log.info("create new checker: ", tostring(checker), " for resource: ",
219308
resource_path, " and version: ", resource_ver)
220-
add_working_pool(resource_path, resource_ver, checker)
309+
add_working_pool(resource_path, resource_ver, checker, upstream.checks)
221310
end
222311

223312
::continue::
@@ -258,6 +347,12 @@ local function timer_working_pool_check()
258347
" current version: ", current_ver, " item version: ", item.version)
259348
if item.version == current_ver then
260349
need_destroy = false
350+
elseif upstream.checks and core.table.deep_eq(item.checks, upstream.checks) then
351+
-- Version changed but only because of the upstream nodes; the
352+
-- `checks` config is identical. Keep the checker alive so
353+
-- timer_create_checker can reconcile its targets incrementally
354+
-- (avoids a destroy-and-rebuild nil window for the checker).
355+
need_destroy = false
261356
end
262357
end
263358

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
use t::APISIX 'no_plan';
18+
19+
repeat_each(1);
20+
log_level('warn');
21+
no_root_location();
22+
no_shuffle();
23+
24+
run_tests();
25+
26+
__DATA__
27+
28+
=== TEST 1: node-only change reuses the checker (no destroy-and-rebuild)
29+
--- extra_init_worker_by_lua
30+
local healthcheck = require("resty.healthcheck")
31+
local new = healthcheck.new
32+
healthcheck.new = function(...)
33+
ngx.log(ngx.WARN, "create new checker")
34+
local obj = new(...)
35+
local clear = obj.delayed_clear
36+
obj.delayed_clear = function(...)
37+
ngx.log(ngx.WARN, "clear checker")
38+
return clear(...)
39+
end
40+
return obj
41+
end
42+
43+
--- config
44+
location /t {
45+
content_by_lua_block {
46+
local checks = [[{
47+
"active":{
48+
"http_path":"/hello",
49+
"timeout":1,
50+
"type":"http",
51+
"healthy":{ "interval":1, "successes":1 },
52+
"unhealthy":{ "interval":1, "http_failures":2 }
53+
}
54+
}]]
55+
local function cfg(nodes)
56+
return [[{
57+
"upstream": {
58+
"nodes": ]] .. nodes .. [[,
59+
"type": "roundrobin",
60+
"checks": ]] .. checks .. [[
61+
},
62+
"uri": "/hello"
63+
}]]
64+
end
65+
66+
local t = require("lib.test_admin").test
67+
-- initial config: one node -> creates the checker
68+
assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
69+
cfg('{"127.0.0.1:1980": 1}')) < 300)
70+
t('/hello', ngx.HTTP_GET)
71+
ngx.sleep(2)
72+
73+
-- node-only change (checks unchanged): should reconcile in place,
74+
-- NOT create a new checker nor delayed_clear the old one
75+
assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
76+
cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1981": 1}')) < 300)
77+
t('/hello', ngx.HTTP_GET)
78+
ngx.sleep(2)
79+
ngx.say("done")
80+
}
81+
}
82+
83+
--- request
84+
GET /t
85+
--- response_body
86+
done
87+
--- no_error_log
88+
clear checker
89+
--- error_log
90+
create new checker
91+
--- timeout: 8
92+
93+
94+
95+
=== TEST 2: checks-config change still rebuilds the checker
96+
--- extra_init_worker_by_lua
97+
local healthcheck = require("resty.healthcheck")
98+
local new = healthcheck.new
99+
healthcheck.new = function(...)
100+
local obj = new(...)
101+
local clear = obj.delayed_clear
102+
obj.delayed_clear = function(...)
103+
ngx.log(ngx.WARN, "clear checker")
104+
return clear(...)
105+
end
106+
return obj
107+
end
108+
109+
--- config
110+
location /t {
111+
content_by_lua_block {
112+
local function cfg(interval)
113+
return [[{
114+
"upstream": {
115+
"nodes": {"127.0.0.1:1980": 1},
116+
"type": "roundrobin",
117+
"checks": {
118+
"active":{
119+
"http_path":"/hello",
120+
"timeout":1,
121+
"type":"http",
122+
"healthy":{ "interval":]] .. interval .. [[, "successes":1 },
123+
"unhealthy":{ "interval":1, "http_failures":2 }
124+
}
125+
}
126+
},
127+
"uri": "/hello"
128+
}]]
129+
end
130+
131+
local t = require("lib.test_admin").test
132+
assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(1)) < 300)
133+
t('/hello', ngx.HTTP_GET)
134+
ngx.sleep(2)
135+
-- change the checks config -> must rebuild (delayed_clear old checker)
136+
assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(2)) < 300)
137+
t('/hello', ngx.HTTP_GET)
138+
ngx.sleep(2)
139+
ngx.say("done")
140+
}
141+
}
142+
143+
--- request
144+
GET /t
145+
--- response_body
146+
done
147+
--- error_log
148+
clear checker
149+
--- timeout: 8

0 commit comments

Comments
 (0)