@@ -31,7 +31,8 @@ local jp = require("jsonpath")
3131local config_util = require (" apisix.core.config_util" )
3232
3333local _M = {}
34- local working_pool = {} -- resource_path -> {version = ver, checker = checker}
34+ -- resource_path -> {version = ver, checker = checker, checks = checks}
35+ local working_pool = {}
3536local waiting_pool = {} -- resource_path -> resource_ver
3637
3738local DELAYED_CLEAR_TIMEOUT = 10
4445_M .get_healthchecker_name = get_healthchecker_name
4546
4647
-- Compute the desired set of health-check targets for an upstream config.
--
-- @param up_conf upstream configuration table; reads `checks.active.host`,
--                `checks.active.port`, `pass_host`, `upstream_host` and `nodes`
-- @return a map keyed by "host:port:hostheader" whose values are
--         `{host, port, check_host, host_hdr}` tables, so the working set can
--         be diffed cheaply against a checker's current targets. Nodes that
--         produce the same key collapse into a single entry (last one wins).
--         An upstream without `nodes` yields an empty map.
local function compute_targets(up_conf)
    local active = up_conf.checks and up_conf.checks.active
    local check_host = active and active.host
    -- when set, the active check port overrides every node's own port
    local check_port = active and active.port

    -- "rewrite" pins one Host header for all nodes; "node" uses each node's
    -- own domain (when it has one); any other mode sends no explicit header
    local rewrite_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host
    local use_node_hdr = up_conf.pass_host == "node"

    local targets = {}
    -- `nodes` may be absent; treat that as "no targets" instead of letting
    -- ipairs(nil) raise inside the caller
    for _, node in ipairs(up_conf.nodes or {}) do
        local host_hdr = rewrite_hdr or (use_node_hdr and node.domain) or nil
        local port = check_port or node.port
        local key = node.host .. ":" .. tostring(port) .. ":" .. tostring(host_hdr or "")
        targets[key] = {
            host = node.host,
            port = port,
            check_host = check_host,
            host_hdr = host_hdr,
        }
    end
    return targets
end
71+
72+
4773local function create_checker (up_conf )
4874 if not up_conf .checks then
4975 return nil
@@ -71,25 +97,66 @@ local function create_checker(up_conf)
7197 end
7298
7399 -- Add target nodes
74- local host = up_conf .checks and up_conf .checks .active and up_conf .checks .active .host
75- local port = up_conf .checks and up_conf .checks .active and up_conf .checks .active .port
76- local up_hdr = up_conf .pass_host == " rewrite" and up_conf .upstream_host
77- local use_node_hdr = up_conf .pass_host == " node" or nil
78-
79- for _ , node in ipairs (up_conf .nodes ) do
80- local host_hdr = up_hdr or (use_node_hdr and node .domain )
81- local ok , err = checker :add_target (node .host , port or node .port , host ,
82- true , host_hdr )
100+ for _ , target in pairs (compute_targets (up_conf )) do
101+ local ok , err = checker :add_target (target .host , target .port , target .check_host ,
102+ true , target .host_hdr )
83103 if not ok then
84- core .log .error (" failed to add healthcheck target: " , node .host , " :" ,
85- port or node .port , " err: " , err )
104+ core .log .error (" failed to add healthcheck target: " , target .host , " :" ,
105+ target .port , " err: " , err )
86106 end
87107 end
88108
89109 return checker
90110end
91111
92112
-- Incrementally reconcile an existing checker's targets to match up_conf.
-- Used when only the upstream nodes changed but the `checks` config did not,
-- so the checker can keep running (and keep its accumulated health state)
-- instead of being destroyed and rebuilt.
--
-- @param checker  a running resty.healthcheck checker instance
-- @param up_conf  upstream configuration table the checker should now reflect
--
-- Failures to add or remove an individual target are logged and do not stop
-- the reconciliation of the remaining targets.
local function sync_checker_targets(checker, up_conf)
    local desired = compute_targets(up_conf)

    -- Index current targets the same way as desired. Read the authoritative
    -- shm target list (the per-worker checker.targets array can lag behind a
    -- recent add/remove event).
    if not healthcheck then
        healthcheck = require("resty.healthcheck")
    end
    local checker_name = get_healthchecker_name(up_conf)
    local target_list, list_err = healthcheck.get_target_list(checker_name,
                                                              healthcheck_shdict_name)
    if not target_list then
        -- Without the list nothing can be removed; the adds below are still
        -- attempted. Surface the reason instead of silently assuming "empty".
        if list_err then
            core.log.warn("failed to read healthcheck target list for ",
                          checker_name, ": ", list_err)
        end
        target_list = {}
    end

    local current = {}
    for _, t in ipairs(target_list) do
        -- target_list entries carry hostheader; map it back to our key shape
        local key = t.ip .. ":" .. tostring(t.port) .. ":" .. tostring(t.hostheader or "")
        current[key] = t
    end

    -- Remove stale targets BEFORE adding new ones. The diff key includes the
    -- host header, while remove_target below addresses a target by
    -- ip/port/hostname only. NOTE(review): resty.healthcheck is understood to
    -- treat add_target on an already-registered ip/port/hostname as a no-op --
    -- confirm against the library version in use. If so, adding first would
    -- skip a target whose host header alone changed, and the removal pass
    -- would then delete it, leaving the node unchecked until the next sync.
    for key, t in pairs(current) do
        if not desired[key] then
            local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
            if not ok then
                core.log.error("failed to remove healthcheck target: ", t.ip, ":",
                               t.port, " err: ", err)
            end
        end
    end

    -- add targets that are desired but not present
    for key, target in pairs(desired) do
        if not current[key] then
            local ok, err = checker:add_target(target.host, target.port, target.check_host,
                                               true, target.host_hdr)
            if not ok then
                core.log.error("failed to add healthcheck target: ", target.host, ":",
                               target.port, " err: ", err)
            end
        end
    end
end
158+
159+
93160function _M .fetch_checker (resource_path , resource_ver )
94161 local working_item = working_pool [resource_path ]
95162 if working_item and working_item .version == resource_ver then
@@ -130,10 +197,11 @@ function _M.fetch_node_status(checker, ip, port, hostname)
130197end
131198
132199
133- local function add_working_pool (resource_path , resource_ver , checker )
200+ local function add_working_pool (resource_path , resource_ver , checker , checks )
134201 working_pool [resource_path ] = {
135202 version = resource_ver ,
136- checker = checker
203+ checker = checker ,
204+ checks = checks ,
137205 }
138206end
139207
@@ -202,22 +270,43 @@ local function timer_create_checker()
202270 goto continue
203271 end
204272
205- -- if a checker exists then delete it before creating a new one
273+ -- If a checker already exists and the `checks` config is unchanged
274+ -- (only the upstream nodes changed), reconcile its targets in place
275+ -- instead of destroying and rebuilding it. A destroy-and-rebuild
276+ -- leaves `up_checker == nil` for the rebuild window, during which
277+ -- traffic is routed to nodes already known to be unhealthy, and it
278+ -- throws away the checker's accumulated health state.
206279 local existing_checker = working_pool [resource_path ]
280+ if existing_checker and existing_checker .checker
281+ and not existing_checker .checker .dead
282+ and upstream .checks
283+ and core .table .deep_eq (existing_checker .checks , upstream .checks ) then
284+ sync_checker_targets (existing_checker .checker , upstream )
285+ add_working_pool (resource_path , resource_ver , existing_checker .checker ,
286+ upstream .checks )
287+ core .log .info (" reused checker with incremental targets: " ,
288+ tostring (existing_checker .checker ), " for resource: " ,
289+ resource_path , " and version: " , resource_ver )
290+ goto continue
291+ end
292+
293+ -- The checks config changed (or no checker exists): build a fresh
294+ -- checker first, and only release the old one *after* the new one is
295+ -- in the working pool, so fetch_checker never observes a nil gap.
296+ local checker = create_checker (upstream )
297+ if not checker then
298+ goto continue
299+ end
207300 if existing_checker then
208301 existing_checker .checker :delayed_clear (DELAYED_CLEAR_TIMEOUT )
209302 existing_checker .checker :stop ()
210303 core .log .info (" releasing existing checker: " , tostring (existing_checker .checker ),
211304 " for resource: " , resource_path , " and version: " ,
212305 existing_checker .version )
213306 end
214- local checker = create_checker (upstream )
215- if not checker then
216- goto continue
217- end
218307 core .log .info (" create new checker: " , tostring (checker ), " for resource: " ,
219308 resource_path , " and version: " , resource_ver )
220- add_working_pool (resource_path , resource_ver , checker )
309+ add_working_pool (resource_path , resource_ver , checker , upstream . checks )
221310 end
222311
223312 :: continue::
@@ -258,6 +347,12 @@ local function timer_working_pool_check()
258347 " current version: " , current_ver , " item version: " , item .version )
259348 if item .version == current_ver then
260349 need_destroy = false
350+ elseif upstream .checks and core .table .deep_eq (item .checks , upstream .checks ) then
351+ -- Version changed but only because of the upstream nodes; the
352+ -- `checks` config is identical. Keep the checker alive so
353+ -- timer_create_checker can reconcile its targets incrementally
354+ -- (avoids a destroy-and-rebuild nil window for the checker).
355+ need_destroy = false
261356 end
262357 end
263358
0 commit comments