From 54dfc90bbb72cb35b7e4f6c7dc7303c98b0bd915 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Mon, 22 Jun 2026 11:02:06 +0800
Subject: [PATCH 01/15] fix(healthcheck): update targets incrementally instead
 of rebuilding

When an upstream's nodes change but its `checks` config is unchanged,
the health-check manager destroyed the existing checker and built a new
one. Two problems followed:

1. fetch_checker() keys the working checker by a version derived from
   both modifiedIndex and the nodes version, so a node-only change makes
   it return nil until the timer rebuilds the checker. During that
   window api_ctx.up_checker is nil and the balancer routes traffic to
   nodes already known to be unhealthy (apache/apisix#13282).

2. The rebuild throws away the checker's accumulated health state and
   re-probes every node from scratch.

The manager now reconciles the existing checker's targets in place with
add_target/remove_target when the `checks` config is unchanged
(compared with core.table.deep_eq), keeping the checker and its state
alive. timer_working_pool_check no longer destroys a checker for a
node-only version change, and when a rebuild is genuinely required (the
`checks` config changed) the new checker is created and inserted into
the working pool before the old one is released, so fetch_checker never
observes a nil gap.

Bumps the lua-resty-healthcheck-api7 rockspec dependency to 3.3.0-0,
which contains the companion library fix (clean every checker each
window + release the periodic lock when idle) required by this change.

Adds t/node/healthcheck-incremental-update.t: a node-only change must
not destroy/rebuild the checker (no "clear checker"), while a
checks-config change still rebuilds it.
---
 apisix/healthcheck_manager.lua          | 135 +++++++++++++++++----
 t/node/healthcheck-incremental-update.t | 149 ++++++++++++++++++++++++
 2 files changed, 264 insertions(+), 20 deletions(-)
 create mode 100644 t/node/healthcheck-incremental-update.t

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index f2750ff49b78..8793f7c088f5 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -31,7 +31,8 @@ local jp = require("jsonpath")
 local config_util = require("apisix.core.config_util")
 
 local _M = {}
-local working_pool = {}     -- resource_path -> {version = ver, checker = checker}
+-- resource_path -> {version = ver, checker = checker, checks = checks}
+local working_pool = {}
 local waiting_pool = {}      -- resource_path -> resource_ver
 
 local DELAYED_CLEAR_TIMEOUT = 10
@@ -44,6 +45,31 @@ end
 _M.get_healthchecker_name = get_healthchecker_name
 
 
+-- Compute the desired set of health-check targets for an upstream config.
+-- Returns a map keyed by "host:port:hostheader" so the working set can be
+-- diffed cheaply against a checker's current targets.
+local function compute_targets(up_conf)
+    local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host
+    local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port
+    local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host
+    local use_node_hdr = up_conf.pass_host == "node" or nil
+
+    local targets = {}
+    for _, node in ipairs(up_conf.nodes) do
+        local host_hdr = up_hdr or (use_node_hdr and node.domain) or nil
+        local target = {
+            host = node.host,
+            port = port or node.port,
+            check_host = host,
+            host_hdr = host_hdr,
+        }
+        local key = target.host .. ":" .. tostring(target.port) .. ":" .. tostring(host_hdr or "")
+        targets[key] = target
+    end
+    return targets
+end
+
+
 local function create_checker(up_conf)
     if not up_conf.checks then
         return nil
@@ -71,18 +97,12 @@ local function create_checker(up_conf)
     end
 
     -- Add target nodes
-    local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host
-    local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port
-    local up_hdr = up_conf.pass_host == "rewrite" and up_conf.upstream_host
-    local use_node_hdr = up_conf.pass_host == "node" or nil
-
-    for _, node in ipairs(up_conf.nodes) do
-        local host_hdr = up_hdr or (use_node_hdr and node.domain)
-        local ok, err = checker:add_target(node.host, port or node.port, host,
-                                        true, host_hdr)
+    for _, target in pairs(compute_targets(up_conf)) do
+        local ok, err = checker:add_target(target.host, target.port, target.check_host,
+                                        true, target.host_hdr)
         if not ok then
-            core.log.error("failed to add healthcheck target: ", node.host, ":",
-                          port or node.port, " err: ", err)
+            core.log.error("failed to add healthcheck target: ", target.host, ":",
+                          target.port, " err: ", err)
         end
     end
 
@@ -90,6 +110,53 @@ local function create_checker(up_conf)
 end
 
 
+-- Incrementally reconcile an existing checker's targets to match up_conf.
+-- Used when only the upstream nodes changed but the `checks` config did not,
+-- so the checker can keep running (and keep its accumulated health state)
+-- instead of being destroyed and rebuilt.
+local function sync_checker_targets(checker, up_conf)
+    local desired = compute_targets(up_conf)
+
+    -- index current targets the same way as desired. Read the authoritative
+    -- shm target list (the per-worker checker.targets array can lag behind a
+    -- recent add/remove event).
+    if not healthcheck then
+        healthcheck = require("resty.healthcheck")
+    end
+    local current = {}
+    local target_list = healthcheck.get_target_list(get_healthchecker_name(up_conf),
+                                                    healthcheck_shdict_name) or {}
+    for _, t in ipairs(target_list) do
+        -- target_list entries carry hostheader; map it back to our key shape
+        local key = t.ip .. ":" .. tostring(t.port) .. ":" .. tostring(t.hostheader or "")
+        current[key] = t
+    end
+
+    -- add targets that are desired but not present
+    for key, target in pairs(desired) do
+        if not current[key] then
+            local ok, err = checker:add_target(target.host, target.port, target.check_host,
+                                            true, target.host_hdr)
+            if not ok then
+                core.log.error("failed to add healthcheck target: ", target.host, ":",
+                              target.port, " err: ", err)
+            end
+        end
+    end
+
+    -- remove targets that are present but no longer desired
+    for key, t in pairs(current) do
+        if not desired[key] then
+            local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
+            if not ok then
+                core.log.error("failed to remove healthcheck target: ", t.ip, ":",
+                              t.port, " err: ", err)
+            end
+        end
+    end
+end
+
+
 function _M.fetch_checker(resource_path, resource_ver)
     local working_item = working_pool[resource_path]
     if working_item and working_item.version == resource_ver then
@@ -130,10 +197,11 @@ function _M.fetch_node_status(checker, ip, port, hostname)
 end
 
 
-local function add_working_pool(resource_path, resource_ver, checker)
+local function add_working_pool(resource_path, resource_ver, checker, checks)
     working_pool[resource_path] = {
         version = resource_ver,
-        checker = checker
+        checker = checker,
+        checks = checks,
     }
 end
 
@@ -209,8 +277,33 @@ local function timer_create_checker()
                 goto continue
             end
 
-            -- if a checker exists then delete it before creating a new one
+            -- If a checker already exists and the `checks` config is unchanged
+            -- (only the upstream nodes changed), reconcile its targets in place
+            -- instead of destroying and rebuilding it. A destroy-and-rebuild
+            -- leaves `up_checker == nil` for the rebuild window, during which
+            -- traffic is routed to nodes already known to be unhealthy, and it
+            -- throws away the checker's accumulated health state.
             local existing_checker = working_pool[resource_path]
+            if existing_checker and existing_checker.checker
+               and not existing_checker.checker.dead
+               and upstream.checks
+               and core.table.deep_eq(existing_checker.checks, upstream.checks) then
+                sync_checker_targets(existing_checker.checker, upstream)
+                add_working_pool(resource_path, resource_ver, existing_checker.checker,
+                                 upstream.checks)
+                core.log.info("reused checker with incremental targets: ",
+                              tostring(existing_checker.checker), " for resource: ",
+                              resource_path, " and version: ", resource_ver)
+                goto continue
+            end
+
+            -- The checks config changed (or no checker exists): build a fresh
+            -- checker first, and only release the old one *after* the new one is
+            -- in the working pool, so fetch_checker never observes a nil gap.
+            local checker = create_checker(upstream)
+            if not checker then
+                goto continue
+            end
             if existing_checker then
                 existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
                 existing_checker.checker:stop()
@@ -218,13 +311,9 @@ local function timer_create_checker()
                               " for resource: ", resource_path, " and version: ",
                               existing_checker.version)
             end
-            local checker = create_checker(upstream)
-            if not checker then
-                goto continue
-            end
             core.log.info("create new checker: ", tostring(checker), " for resource: ",
                         resource_path, " and version: ", resource_ver)
-            add_working_pool(resource_path, resource_ver, checker)
+            add_working_pool(resource_path, resource_ver, checker, upstream.checks)
         end
 
         ::continue::
@@ -284,6 +373,12 @@ local function timer_working_pool_check()
                             " current version: ", current_ver, " item version: ", item.version)
                 if item.version == current_ver then
                     need_destroy = false
+                elseif upstream.checks and core.table.deep_eq(item.checks, upstream.checks) then
+                    -- Version changed but only because of the upstream nodes; the
+                    -- `checks` config is identical. Keep the checker alive so
+                    -- timer_create_checker can reconcile its targets incrementally
+                    -- (avoids a destroy-and-rebuild nil window for the checker).
+                    need_destroy = false
                 end
             end
         end
diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
new file mode 100644
index 000000000000..4f2b27cabc86
--- /dev/null
+++ b/t/node/healthcheck-incremental-update.t
@@ -0,0 +1,149 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+log_level('warn');
+no_root_location();
+no_shuffle();
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: node-only change reuses the checker (no destroy-and-rebuild)
+--- extra_init_worker_by_lua
+    local healthcheck = require("resty.healthcheck")
+    local new = healthcheck.new
+    healthcheck.new = function(...)
+        ngx.log(ngx.WARN, "create new checker")
+        local obj = new(...)
+        local clear = obj.delayed_clear
+        obj.delayed_clear = function(...)
+            ngx.log(ngx.WARN, "clear checker")
+            return clear(...)
+        end
+        return obj
+    end
+
+--- config
+location /t {
+    content_by_lua_block {
+        local checks = [[{
+            "active":{
+                "http_path":"/hello",
+                "timeout":1,
+                "type":"http",
+                "healthy":{ "interval":1, "successes":1 },
+                "unhealthy":{ "interval":1, "http_failures":2 }
+            }
+        }]]
+        local function cfg(nodes)
+            return [[{
+                "upstream": {
+                    "nodes": ]] .. nodes .. [[,
+                    "type": "roundrobin",
+                    "checks": ]] .. checks .. [[
+                },
+                "uri": "/hello"
+            }]]
+        end
+
+        local t = require("lib.test_admin").test
+        -- initial config: one node -> creates the checker
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
+                 cfg('{"127.0.0.1:1980": 1}')) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        -- node-only change (checks unchanged): should reconcile in place,
+        -- NOT create a new checker nor delayed_clear the old one
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
+                 cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1981": 1}')) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+        ngx.say("done")
+    }
+}
+
+--- request
+GET /t
+--- response_body
+done
+--- no_error_log
+clear checker
+--- error_log
+create new checker
+--- timeout: 8
+
+
+
+=== TEST 2: checks-config change still rebuilds the checker
+--- extra_init_worker_by_lua
+    local healthcheck = require("resty.healthcheck")
+    local new = healthcheck.new
+    healthcheck.new = function(...)
+        local obj = new(...)
+        local clear = obj.delayed_clear
+        obj.delayed_clear = function(...)
+            ngx.log(ngx.WARN, "clear checker")
+            return clear(...)
+        end
+        return obj
+    end
+
+--- config
+location /t {
+    content_by_lua_block {
+        local function cfg(interval)
+            return [[{
+                "upstream": {
+                    "nodes": {"127.0.0.1:1980": 1},
+                    "type": "roundrobin",
+                    "checks": {
+                        "active":{
+                            "http_path":"/hello",
+                            "timeout":1,
+                            "type":"http",
+                            "healthy":{ "interval":]] .. interval .. [[, "successes":1 },
+                            "unhealthy":{ "interval":1, "http_failures":2 }
+                        }
+                    }
+                },
+                "uri": "/hello"
+            }]]
+        end
+
+        local t = require("lib.test_admin").test
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(1)) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+        -- change the checks config -> must rebuild (delayed_clear old checker)
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(2)) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+        ngx.say("done")
+    }
+}
+
+--- request
+GET /t
+--- response_body
+done
+--- error_log
+clear checker
+--- timeout: 8

From 4efc73623484690e42e8ba405534b2179f478ec9 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Mon, 22 Jun 2026 14:16:30 +0800
Subject: [PATCH 02/15] fix(healthcheck): publish new checker before stopping
 the old one

When the checks config changes, install the freshly created checker into
the working pool before stopping the previous one. This prevents a request
from briefly fetching a stopped checker for the old version during the
swap window.
---
 apisix/healthcheck_manager.lua | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index 8793f7c088f5..2a7b37742a83 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -298,22 +298,25 @@ local function timer_create_checker()
             end
 
             -- The checks config changed (or no checker exists): build a fresh
-            -- checker first, and only release the old one *after* the new one is
-            -- in the working pool, so fetch_checker never observes a nil gap.
+            -- checker first, install it into the working pool, and only then
+            -- release the old one. Publishing the new checker before stopping
+            -- the old one ensures fetch_checker never observes a nil gap nor a
+            -- stopped checker for this resource.
             local checker = create_checker(upstream)
             if not checker then
                 goto continue
             end
+            core.log.info("create new checker: ", tostring(checker), " for resource: ",
+                        resource_path, " and version: ", resource_ver)
+            add_working_pool(resource_path, resource_ver, checker, upstream.checks)
             if existing_checker then
+                existing_checker.checker.dead = true
                 existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
                 existing_checker.checker:stop()
                 core.log.info("releasing existing checker: ", tostring(existing_checker.checker),
                               " for resource: ", resource_path, " and version: ",
                               existing_checker.version)
             end
-            core.log.info("create new checker: ", tostring(checker), " for resource: ",
-                        resource_path, " and version: ", resource_ver)
-            add_working_pool(resource_path, resource_ver, checker, upstream.checks)
         end
 
         ::continue::

From 7fb8d887d6c7741d81270d7c48f1748a73353803 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Mon, 22 Jun 2026 15:08:20 +0800
Subject: [PATCH 03/15] fix(healthcheck): preserve target order, destroy
 checker when nodes drop to 0

- compute_targets now returns an ordered array (preserving node order) so
  targets are added deterministically, keeping ordered error-log assertions
  stable.
- Only keep/reuse a checker incrementally when the upstream still has nodes;
  when the node count drops to 0 the checker is destroyed as before.
- Update existing healthcheck tests to assert the new incremental-reuse
  behaviour (a checker is reused instead of recreated when only the nodes
  change, and is not cleared in that case).
---
 apisix/healthcheck_manager.lua              | 36 +++++++++++++--------
 t/node/healthcheck-discovery.t              |  7 ++--
 t/node/healthcheck-dns.t                    |  3 +-
 t/node/healthcheck-leak-bugfix.t            |  4 ++-
 t/node/healthcheck-service-discovery.t      |  3 +-
 t/node/healthchecker-independent-upstream.t |  4 ---
 6 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index 2a7b37742a83..cfa34d3ab509 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -46,8 +46,10 @@ _M.get_healthchecker_name = get_healthchecker_name
 
 
 -- Compute the desired set of health-check targets for an upstream config.
--- Returns a map keyed by "host:port:hostheader" so the working set can be
--- diffed cheaply against a checker's current targets.
+-- Returns an ordered array preserving up_conf.nodes order so that targets are
+-- always added to a checker deterministically; each entry also carries a
+-- "host:port:hostheader" key so the working set can be diffed cheaply against
+-- a checker's current targets.
 local function compute_targets(up_conf)
     local host = up_conf.checks and up_conf.checks.active and up_conf.checks.active.host
     local port = up_conf.checks and up_conf.checks.active and up_conf.checks.active.port
@@ -57,14 +59,14 @@ local function compute_targets(up_conf)
     local targets = {}
     for _, node in ipairs(up_conf.nodes) do
         local host_hdr = up_hdr or (use_node_hdr and node.domain) or nil
-        local target = {
+        local target_port = port or node.port
+        targets[#targets + 1] = {
             host = node.host,
-            port = port or node.port,
+            port = target_port,
             check_host = host,
             host_hdr = host_hdr,
+            key = node.host .. ":" .. tostring(target_port) .. ":" .. tostring(host_hdr or ""),
         }
-        local key = target.host .. ":" .. tostring(target.port) .. ":" .. tostring(host_hdr or "")
-        targets[key] = target
     end
     return targets
 end
@@ -97,7 +99,7 @@ local function create_checker(up_conf)
     end
 
     -- Add target nodes
-    for _, target in pairs(compute_targets(up_conf)) do
+    for _, target in ipairs(compute_targets(up_conf)) do
         local ok, err = checker:add_target(target.host, target.port, target.check_host,
                                         true, target.host_hdr)
         if not ok then
@@ -115,7 +117,11 @@ end
 -- so the checker can keep running (and keep its accumulated health state)
 -- instead of being destroyed and rebuilt.
 local function sync_checker_targets(checker, up_conf)
-    local desired = compute_targets(up_conf)
+    -- index the desired targets by key so they can be diffed against current
+    local desired = {}
+    for _, target in ipairs(compute_targets(up_conf)) do
+        desired[target.key] = target
+    end
 
     -- index current targets the same way as desired. Read the authoritative
     -- shm target list (the per-worker checker.targets array can lag behind a
@@ -287,6 +293,7 @@ local function timer_create_checker()
             if existing_checker and existing_checker.checker
                and not existing_checker.checker.dead
                and upstream.checks
+               and upstream.nodes and #upstream.nodes > 0
                and core.table.deep_eq(existing_checker.checks, upstream.checks) then
                 sync_checker_targets(existing_checker.checker, upstream)
                 add_working_pool(resource_path, resource_ver, existing_checker.checker,
@@ -376,11 +383,14 @@ local function timer_working_pool_check()
                             " current version: ", current_ver, " item version: ", item.version)
                 if item.version == current_ver then
                     need_destroy = false
-                elseif upstream.checks and core.table.deep_eq(item.checks, upstream.checks) then
-                    -- Version changed but only because of the upstream nodes; the
-                    -- `checks` config is identical. Keep the checker alive so
-                    -- timer_create_checker can reconcile its targets incrementally
-                    -- (avoids a destroy-and-rebuild nil window for the checker).
+                elseif upstream.checks and upstream.nodes and #upstream.nodes > 0
+                       and core.table.deep_eq(item.checks, upstream.checks) then
+                    -- Version changed but only because of the upstream nodes (and at
+                    -- least one node remains); the `checks` config is identical. Keep
+                    -- the checker alive so timer_create_checker can reconcile its
+                    -- targets incrementally (avoids a destroy-and-rebuild nil window).
+                    -- When the node count drops to 0 we deliberately fall through to
+                    -- destroy the checker, matching the original behaviour.
                     need_destroy = false
                 end
             end
diff --git a/t/node/healthcheck-discovery.t b/t/node/healthcheck-discovery.t
index 9978bba93f2f..57a832bb32e9 100644
--- a/t/node/healthcheck-discovery.t
+++ b/t/node/healthcheck-discovery.t
@@ -108,7 +108,7 @@ unhealthy TCP increment (1/2) for '127.0.0.1(0.0.0.0:1988)'
 
 
 
-=== TEST 2: create new checker when nodes change
+=== TEST 2: reuse checker incrementally when nodes change
 --- apisix_yaml
 routes:
   -
@@ -150,11 +150,10 @@ routes:
         }
     }
 --- grep_error_log eval
-qr/(create new checker|releasing existing checker): table/
+qr/(create new checker|reused checker with incremental targets): table/
 --- grep_error_log_out
 create new checker: table
-releasing existing checker: table
-create new checker: table
+reused checker with incremental targets: table
 --- timeout: 30
 
 
diff --git a/t/node/healthcheck-dns.t b/t/node/healthcheck-dns.t
index d0a76dbb636c..cc433e81a162 100644
--- a/t/node/healthcheck-dns.t
+++ b/t/node/healthcheck-dns.t
@@ -140,6 +140,5 @@ First request status: 200
 Second request status: 200
 --- error_log
 create new checker
-releasing existing checker
-create new checker
+reused checker with incremental targets
 --- timeout: 10
diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t
index bcab5689d152..703ead63adbf 100644
--- a/t/node/healthcheck-leak-bugfix.t
+++ b/t/node/healthcheck-leak-bugfix.t
@@ -25,7 +25,7 @@ run_tests();
 
 __DATA__
 
-=== TEST 1: ensure the old check is cleared after configuration updated
+=== TEST 1: reuse the checker without clearing it when only the nodes change
 --- extra_init_worker_by_lua
     local healthcheck = require("resty.healthcheck")
     local new = healthcheck.new
@@ -110,5 +110,7 @@ location /t {
 --- request
 GET /t
 --- error_log
+reused checker with incremental targets
+--- no_error_log
 clear checker
 --- timeout: 7
diff --git a/t/node/healthcheck-service-discovery.t b/t/node/healthcheck-service-discovery.t
index 7b60141531f1..2163f524fe8b 100644
--- a/t/node/healthcheck-service-discovery.t
+++ b/t/node/healthcheck-service-discovery.t
@@ -149,6 +149,5 @@ routes:
     }
 --- error_log
 create new checker
-releasing existing checker
-create new checker
+reused checker with incremental targets
 --- timeout: 10
diff --git a/t/node/healthchecker-independent-upstream.t b/t/node/healthchecker-independent-upstream.t
index 957aef2c4e52..d6af917073c5 100644
--- a/t/node/healthchecker-independent-upstream.t
+++ b/t/node/healthchecker-independent-upstream.t
@@ -109,7 +109,3 @@ passed
 qr/create new checker: table:/
 --- grep_error_log_out
 create new checker: table:
-create new checker: table:
-create new checker: table:
-create new checker: table:
-create new checker: table:

From f86c567231d44a41a3c3a27d60cab0c29e8812a2 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Tue, 23 Jun 2026 04:04:38 +0800
Subject: [PATCH 04/15] test(healthcheck): trigger reconcile with a request
 after re-applying config

The incremental-reuse reconcile runs in timer_create_checker, which only
acts on resources placed into the waiting pool by fetch_checker (i.e. when a
request is routed). Re-route a request after the second PUT so the manager
observes the new version and logs the incremental reuse.
---
 t/node/healthcheck-leak-bugfix.t | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t
index 703ead63adbf..9882244bc2cc 100644
--- a/t/node/healthcheck-leak-bugfix.t
+++ b/t/node/healthcheck-leak-bugfix.t
@@ -103,6 +103,9 @@ location /t {
         t('/hello', ngx.HTTP_GET)
         ngx.sleep(2)
         assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300)
+        -- re-route a request so fetch_checker observes the new version and the
+        -- manager reconciles the existing checker's targets incrementally
+        t('/hello', ngx.HTTP_GET)
         ngx.sleep(2)
     }
 }

From 492e2bd404fb4b592fc2f457d7ef64f0e734772d Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Tue, 23 Jun 2026 04:35:05 +0800
Subject: [PATCH 05/15] test(healthcheck): run leak-bugfix at info level so
 reuse log is captured

---
 t/node/healthcheck-leak-bugfix.t | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t
index 9882244bc2cc..8b3f6c7b4053 100644
--- a/t/node/healthcheck-leak-bugfix.t
+++ b/t/node/healthcheck-leak-bugfix.t
@@ -17,7 +17,8 @@
 use t::APISIX 'no_plan';
 
 repeat_each(1);
-log_level('warn');
+# the reuse path logs "reused checker with incremental targets" at info level
+log_level('info');
 no_root_location();
 no_shuffle();
 

From 811276ecb375dd1f849da78a6e08e0b1d6e8b660 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Tue, 30 Jun 2026 16:15:16 +0800
Subject: [PATCH 06/15] fix(healthcheck): correct host-header reconcile order
 and guard partial sync failures

- resty.healthcheck keys targets by ip+port+hostname (the Host header is not
  part of the identity). Remove stale targets before adding new ones so a
  Host-header-only change (pass_host/upstream_host) re-applies the new header
  instead of add_target() being a no-op on the still-present old target and the
  subsequent remove dropping the live target.
- sync_checker_targets now returns whether every add/remove succeeded; the
  caller only commits the reused checker for the new version when the sync fully
  succeeds, otherwise it falls through to a full rebuild so the upstream still
  converges instead of being stuck on a stale target set.
---
 apisix/healthcheck_manager.lua | 42 ++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index cfa34d3ab509..b2a2ce20f14b 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -116,6 +116,8 @@ end
 -- Used when only the upstream nodes changed but the `checks` config did not,
 -- so the checker can keep running (and keep its accumulated health state)
 -- instead of being destroyed and rebuilt.
+-- Returns true only if every add/remove succeeded; on a partial failure the
+-- caller must not treat the checker as reconciled for this version.
 local function sync_checker_targets(checker, up_conf)
     -- index the desired targets by key so they can be diffed against current
     local desired = {}
@@ -138,28 +140,39 @@ local function sync_checker_targets(checker, up_conf)
         current[key] = t
     end
 
+    local synced = true
+
+    -- Remove stale targets BEFORE adding new ones. resty.healthcheck identifies a
+    -- target by ip+port+hostname; the Host header is not part of that identity. A
+    -- Host-header-only change (e.g. pass_host/upstream_host) therefore produces a
+    -- removal of the old key and an addition of the new key for the same identity.
+    -- Removing first frees that identity so the following add_target actually
+    -- applies the new Host header instead of being a no-op on an existing target.
+    for key, t in pairs(current) do
+        if not desired[key] then
+            local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
+            if not ok then
+                synced = false
+                core.log.error("failed to remove healthcheck target: ", t.ip, ":",
+                              t.port, " err: ", err)
+            end
+        end
+    end
+
     -- add targets that are desired but not present
     for key, target in pairs(desired) do
         if not current[key] then
             local ok, err = checker:add_target(target.host, target.port, target.check_host,
                                             true, target.host_hdr)
             if not ok then
+                synced = false
                 core.log.error("failed to add healthcheck target: ", target.host, ":",
                               target.port, " err: ", err)
             end
         end
     end
 
-    -- remove targets that are present but no longer desired
-    for key, t in pairs(current) do
-        if not desired[key] then
-            local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
-            if not ok then
-                core.log.error("failed to remove healthcheck target: ", t.ip, ":",
-                              t.port, " err: ", err)
-            end
-        end
-    end
+    return synced
 end
 
 
@@ -289,13 +302,18 @@ local function timer_create_checker()
             -- leaves `up_checker == nil` for the rebuild window, during which
             -- traffic is routed to nodes already known to be unhealthy, and it
             -- throws away the checker's accumulated health state.
+            -- sync_checker_targets is the last condition so it only runs when the
+            -- checker is reuse-eligible; if it reports a partial failure the whole
+            -- guard is false and we fall through to a full rebuild below, which
+            -- converges the upstream to the desired targets instead of committing
+            -- the new version against a half-reconciled checker.
             local existing_checker = working_pool[resource_path]
             if existing_checker and existing_checker.checker
                and not existing_checker.checker.dead
                and upstream.checks
                and upstream.nodes and #upstream.nodes > 0
-               and core.table.deep_eq(existing_checker.checks, upstream.checks) then
-                sync_checker_targets(existing_checker.checker, upstream)
+               and core.table.deep_eq(existing_checker.checks, upstream.checks)
+               and sync_checker_targets(existing_checker.checker, upstream) then
                 add_working_pool(resource_path, resource_ver, existing_checker.checker,
                                  upstream.checks)
                 core.log.info("reused checker with incremental targets: ",

From 92129c7a9e8a1d7b522873df57c5f944c04900d6 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Tue, 30 Jun 2026 19:21:48 +0800
Subject: [PATCH 07/15] fix(healthcheck): clear old checker before rebuilding
 so live targets are not purged

On a checks-config change the manager rebuilds the checker. The new checker
shares the old one's shm target list (same healthchecker name). add_target()
only un-marks a target's purge_time when the target is re-added *after* being
marked, so doing create_checker() (re-add) before the old checker's
delayed_clear() left the surviving targets marked with a purge_time that nobody
cleared -- the cleanup then purged the live checker's own targets once the
delayed-clear window elapsed.

Run delayed_clear() before create_checker() so the re-add un-marks surviving
targets while genuinely dropped ones keep their purge_time. The old checker is
still only stopped after the new one is published, so there is no rebuild gap.

Add a regression test asserting the surviving targets remain in /v1/healthcheck
after the delayed-clear window following a checks-config change.
---
 apisix/healthcheck_manager.lua          | 19 +++++---
 t/node/healthcheck-incremental-update.t | 62 +++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index b2a2ce20f14b..aea041869785 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -322,11 +322,19 @@ local function timer_create_checker()
                 goto continue
             end
 
-            -- The checks config changed (or no checker exists): build a fresh
-            -- checker first, install it into the working pool, and only then
-            -- release the old one. Publishing the new checker before stopping
-            -- the old one ensures fetch_checker never observes a nil gap nor a
-            -- stopped checker for this resource.
+            -- The checks config changed (or no checker exists): rebuild the
+            -- checker. delayed_clear() MUST run before create_checker() re-adds
+            -- the targets: the new checker shares the same shm target list, and
+            -- add_target() only un-marks a target's purge_time when it is re-added
+            -- *after* being marked. Clearing first lets surviving targets get
+            -- un-marked on re-add, while genuinely dropped targets keep their
+            -- purge_time and are cleaned up; clearing after create (the reverse)
+            -- would leave the live checker's targets marked and purge them later.
+            -- The old checker is only stopped after the new one is published, so
+            -- fetch_checker never observes a nil/stopped checker (no rebuild gap).
+            if existing_checker then
+                existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
+            end
             local checker = create_checker(upstream)
             if not checker then
                 goto continue
@@ -336,7 +344,6 @@ local function timer_create_checker()
             add_working_pool(resource_path, resource_ver, checker, upstream.checks)
             if existing_checker then
                 existing_checker.checker.dead = true
-                existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
                 existing_checker.checker:stop()
                 core.log.info("releasing existing checker: ", tostring(existing_checker.checker),
                               " for resource: ", resource_path, " and version: ",
diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index 4f2b27cabc86..f7d7c6142517 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -147,3 +147,65 @@ done
 --- error_log
 clear checker
 --- timeout: 8
+
+
+
+=== TEST 3: surviving targets are not purged after a checks-config rebuild
+# Changing the checks config rebuilds the checker, which delayed_clear()s the old
+# one. Because the new checker shares the same shm target list, the surviving
+# nodes must keep being health-checked: they must NOT be purged once the
+# delayed-clear window elapses. A wrong rebuild order (clear after re-add) would
+# leave the live checker's targets marked and purge them here.
+--- config
+location /t {
+    content_by_lua_block {
+        local json = require("apisix.core.json")
+        local t = require("lib.test_admin").test
+        local function cfg(interval)
+            return [[{
+                "upstream": {
+                    "nodes": {"127.0.0.1:1980": 1, "127.0.0.1:1981": 1},
+                    "type": "roundrobin",
+                    "checks": {
+                        "active":{
+                            "http_path":"/hello",
+                            "type":"http",
+                            "healthy":{ "interval":]] .. interval .. [[, "successes":1 },
+                            "unhealthy":{ "interval":1, "http_failures":2 }
+                        }
+                    }
+                },
+                "uri": "/hello"
+            }]]
+        end
+        local function count_nodes()
+            local _, _, res = t('/v1/healthcheck', ngx.HTTP_GET)
+            local n = 0
+            for _, info in ipairs(json.decode(res)) do
+                n = n + #(info.nodes or {})
+            end
+            return n
+        end
+
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(1)) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        -- change the checks config (interval 1 -> 2) while keeping both nodes:
+        -- this rebuilds the checker through the delayed_clear path
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(2)) < 300)
+        t('/hello', ngx.HTTP_GET)
+
+        -- wait past DELAYED_CLEAR_TIMEOUT (10s) plus a cleanup window
+        ngx.sleep(15)
+
+        -- both surviving nodes must still be present in the live checker
+        ngx.say("nodes_after: ", count_nodes())
+    }
+}
+--- request
+GET /t
+--- response_body
+nodes_after: 2
+--- ignore_error_log
+--- timeout: 30

From 3688210f369780616c80d54b04b54b5f9081b939 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Wed, 1 Jul 2026 16:43:01 +0800
Subject: [PATCH 08/15] fix(healthcheck): serve the live checker during a
 node-only version transition

A node-only change (discovery/DNS bumps _nodes_ver) queues the new version and
timer_create_checker reconciles the existing checker in place. But fetch_checker
only returned a checker on an exact version match, so during the ~1s until the
timer ran it returned nil, api_ctx.up_checker was nil, and the balancer fell back
to all upstream nodes -- a node already known unhealthy could take traffic during
the transition (the request-path side of apache/apisix#13282).

Return the existing live (non-dead) checker for the resource while the new
version is queued for reconciliation, so requests keep using the preserved health
state throughout the transition. Regression test: mark a node unhealthy, apply a
node-only update, and assert a burst of requests in the transition window never
hit the unhealthy node.
---
 apisix/healthcheck_manager.lua          | 20 ++++++--
 t/node/healthcheck-incremental-update.t | 67 +++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index aea041869785..c76eff15ecb2 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -182,13 +182,23 @@ function _M.fetch_checker(resource_path, resource_ver)
         return working_item.checker
     end
 
-    if waiting_pool[resource_path] == resource_ver then
-        return nil
+    -- The requested version differs from the working checker -- e.g. a
+    -- discovery/DNS change bumped _nodes_ver. Enqueue the new version so
+    -- timer_create_checker reconciles (or rebuilds) it, but keep returning the
+    -- existing live checker in the meantime: its accumulated health state is
+    -- still valid, so requests during the ~1s transition keep filtering
+    -- unhealthy nodes instead of falling back to "all nodes available", which
+    -- would let a node already known to be unhealthy receive traffic
+    -- (apache/apisix#13282).
+    if waiting_pool[resource_path] ~= resource_ver then
+        core.log.info("adding ", resource_path, " to waiting pool with version: ", resource_ver)
+        waiting_pool[resource_path] = resource_ver
+    end
+
+    if working_item and working_item.checker and not working_item.checker.dead then
+        return working_item.checker
     end
 
-    -- Add to waiting pool with version
-    core.log.info("adding ", resource_path, " to waiting pool with version: ", resource_ver)
-    waiting_pool[resource_path] = resource_ver
     return nil
 end
 
diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index f7d7c6142517..31b49320a842 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -209,3 +209,70 @@ GET /t
 nodes_after: 2
 --- ignore_error_log
 --- timeout: 30
+
+
+
+=== TEST 4: a node-only update keeps filtering an already-unhealthy node during the transition
+# Before the fetch_checker fix, a node-only version change made fetch_checker()
+# return nil until the 1s timer reconciled, so api_ctx.up_checker was nil and the
+# balancer fell back to all nodes -- a node already known unhealthy could take
+# traffic during the transition (apache/apisix#13282 health-filter bypass window).
+--- config
+location /t {
+    content_by_lua_block {
+        local t = require("lib.test_admin").test
+        local http = require("resty.http")
+
+        local function put(nodes)
+            return t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{
+                "uri": "/hello",
+                "upstream": {
+                    "type": "roundrobin",
+                    "retries": 0,
+                    "nodes": ]] .. nodes .. [[,
+                    "checks": {
+                        "active": {
+                            "type": "tcp",
+                            "healthy":   { "interval": 1, "successes": 1 },
+                            "unhealthy": { "interval": 1, "tcp_failures": 1 }
+                        }
+                    }
+                }
+            }]])
+        end
+
+        -- start with only the healthy node so the checker is created without the
+        -- dead node ever being in the picker yet
+        assert(put('{"127.0.0.1:1980": 1}') < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(1)
+
+        -- add the dead node (node-only change) and let active checks mark it
+        -- unhealthy; a request is needed to enqueue the reconcile
+        assert(put('{"127.0.0.1:1980": 1, "127.0.0.1:1970": 1}') < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(3)
+
+        -- another node-only update opens a fresh version-transition window;
+        -- immediately burst requests before timer_create_checker reconciles
+        assert(put('{"127.0.0.1:1980": 1, "127.0.0.1:1970": 1}') < 300)
+        local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello"
+        local errors = 0
+        for _ = 1, 20 do
+            local httpc = http.new()
+            local res = httpc:request_uri(uri, { method = "GET", keepalive = false })
+            if not res or res.status ~= 200 then
+                errors = errors + 1
+            end
+        end
+        -- the already-unhealthy dead node must stay filtered throughout
+        ngx.say("errors: ", errors)
+    }
+}
+--- request
+GET /t
+--- response_body
+errors: 0
+--- error_log
+unhealthy TCP increment
+--- timeout: 15

From d745b2f8d174398376cf6d74d9662d5ae2699c1b Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 07:48:29 +0800
Subject: [PATCH 09/15] fix(healthcheck): reconcile stale shm targets in
 create_checker for multi-worker

With multiple workers, a peer worker can create the checker and populate the
shared shm target list, after which a node is removed from the upstream. A worker
that never had the checker in its own working_pool then reaches create_checker(),
which only added targets -- so the removed node lingered in the shm and kept
being probed and returned by /v1/healthcheck.

create_checker() now also removes targets present in the shm but absent from the
config, reconciling it to the desired set. The add pass is kept as-is (re-adding
un-marks any delayed_clear purge_time, preserving the checks-rebuild fix), so a
straight switch to sync_checker_targets() -- which only adds missing targets --
is deliberately avoided. Regression test seeds a stale target into the shm and
asserts create_checker() drops it.

Reported-by: arunmat27
---
 apisix/healthcheck_manager.lua          | 24 +++++++++-
 t/node/healthcheck-incremental-update.t | 60 +++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index c76eff15ecb2..6b5527fa4689 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -98,8 +98,12 @@ local function create_checker(up_conf)
         return nil
     end
 
-    -- Add target nodes
+    -- Add target nodes. Re-adding an already-present target is a no-op except
+    -- that it clears any pending purge_time, which is what un-marks surviving
+    -- targets after a delayed_clear() on a checks-config rebuild.
+    local desired = {}
     for _, target in ipairs(compute_targets(up_conf)) do
+        desired[target.key] = true
         local ok, err = checker:add_target(target.host, target.port, target.check_host,
                                         true, target.host_hdr)
         if not ok then
@@ -108,6 +112,24 @@ local function create_checker(up_conf)
         end
     end
 
+    -- The shared shm target list may already hold nodes this config no longer
+    -- has -- e.g. another worker created the checker first and a node was later
+    -- removed; the worker that never had the checker reaches create_checker(),
+    -- which otherwise only adds. Remove the stale targets so they stop being
+    -- probed and reported by /v1/healthcheck (apache/apisix#13282, multi-worker).
+    local target_list = healthcheck.get_target_list(get_healthchecker_name(up_conf),
+                                                    healthcheck_shdict_name) or {}
+    for _, t in ipairs(target_list) do
+        local key = t.ip .. ":" .. tostring(t.port) .. ":" .. tostring(t.hostheader or "")
+        if not desired[key] then
+            local ok, err = checker:remove_target(t.ip, t.port, t.hostname)
+            if not ok then
+                core.log.error("failed to remove healthcheck target: ", t.ip, ":",
+                              t.port, " err: ", err)
+            end
+        end
+    end
+
     return checker
 end
 
diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index 31b49320a842..b4705c929bbd 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -276,3 +276,63 @@ errors: 0
 --- error_log
 unhealthy TCP increment
 --- timeout: 15
+
+
+
+=== TEST 5: create_checker removes targets left stale in the shm by another worker
+# Multi-worker: a peer worker created the checker with a node that was later
+# removed, leaving it in the shared shm. A worker that never had the checker
+# reaches create_checker(), which must reconcile the shm (not just add) so the
+# stale node stops being probed and reported by /v1/healthcheck
+# (apache/apisix#13282, multi-worker).
+--- config
+location /t {
+    content_by_lua_block {
+        local json = require("apisix.core.json")
+        local t = require("lib.test_admin").test
+
+        -- simulate a peer worker: seed route 1's checker shm target list with a
+        -- node (1970) that the config below will not contain
+        local healthcheck = require("resty.healthcheck")
+        local seed = healthcheck.new({
+            name = "upstream#/apisix/routes/1",
+            shm_name = "upstream-healthcheck",
+            events_module = "resty.events",
+            checks = { active = { type = "tcp",
+                healthy = { interval = 100, successes = 1 },
+                unhealthy = { interval = 100, tcp_failures = 1 } } },
+        })
+        seed:add_target("127.0.0.1", 1970, nil, true)
+
+        -- this worker has no checker in its working pool, so the first request
+        -- goes through create_checker() for the current config {1980}
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{
+            "uri": "/hello",
+            "upstream": {
+                "type": "roundrobin",
+                "nodes": {"127.0.0.1:1980": 1},
+                "checks": { "active": { "type": "tcp",
+                    "healthy": { "interval": 1, "successes": 1 },
+                    "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+            }
+        }]]) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        -- create_checker's reconcile must have removed the stale 1970
+        local _, _, res = t('/v1/healthcheck', ngx.HTTP_GET)
+        local has_1970 = false
+        for _, info in ipairs(json.decode(res)) do
+            for _, node in ipairs(info.nodes or {}) do
+                if node.port == 1970 then has_1970 = true end
+            end
+        end
+        ngx.say("stale_1970: ", tostring(has_1970))
+    }
+}
+--- request
+GET /t
+--- response_body
+stale_1970: false
+--- ignore_error_log
+--- timeout: 8

From e60470860325b6130d3c8541b100f759f3db2dd2 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 09:11:27 +0800
Subject: [PATCH 10/15] fix(healthcheck): don't clear shared shm when a peer
 checker still owns it

When only the checks config changes, a worker that did not serve traffic
keeps its old-version checker in working_pool; timer_working_pool_check
then destroys that stale local handle. Because the checker's shm target
list is shared by name, a same-name replacement checker (built by whichever
worker serves traffic) owns it. Unconditionally calling delayed_clear() on
destroy marks that live checker's targets for purge on every worker.

Only tear down the shm when no same-name checker will own it (resource
deleted, or new config has no checks/nodes); otherwise drop the local
handle with stop() and leave the shm to the replacement checker.
---
 apisix/healthcheck_manager.lua          | 20 ++++++-
 t/node/healthcheck-incremental-update.t | 71 +++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua
index 6b5527fa4689..47768dab7f1c 100644
--- a/apisix/healthcheck_manager.lua
+++ b/apisix/healthcheck_manager.lua
@@ -399,6 +399,7 @@ local function timer_working_pool_check()
         --- remove from working pool if resource doesn't exist
         local res_conf = resource.fetch_latest_conf(resource_path)
         local need_destroy = true
+        local has_live_replacement = false
         if res_conf and res_conf.value then
             local ok, upstream, err
             local plugin_name = get_plugin_name(resource_path)
@@ -450,13 +451,30 @@ local function timer_working_pool_check()
                     -- destroy the checker, matching the original behaviour.
                     need_destroy = false
                 end
+
+                -- Whether a same-name checker will still own the shared shm target
+                -- list after this worker drops its stale handle: the new config
+                -- still defines checks and has at least one node, so whichever
+                -- worker serves traffic (re)builds a checker under the same shm
+                -- name. This worker must then NOT clear that shm on destroy.
+                if upstream.checks and upstream.nodes and #upstream.nodes > 0 then
+                    has_live_replacement = true
+                end
             end
         end
 
         if need_destroy then
             working_pool[resource_path] = nil
             item.checker.dead = true
-            item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
+            -- Only tear down the shared shm target list when no same-name checker
+            -- will own it (resource deleted, or new config has no checks/nodes).
+            -- If the config still has checks and nodes, a replacement checker built
+            -- by whichever worker serves traffic owns the shm; clearing it here
+            -- would purge that live checker's targets on every worker
+            -- (apache/apisix#13282, multi-worker).
+            if not has_live_replacement then
+                item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT)
+            end
             item.checker:stop()
             core.log.info("try to release checker: ", tostring(item.checker), " for resource: ",
                         resource_path, " and version : ", item.version)
diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index b4705c929bbd..f91598a46828 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -336,3 +336,74 @@ GET /t
 stale_1970: false
 --- ignore_error_log
 --- timeout: 8
+
+
+
+=== TEST 6: destroying a stale local checker does not purge a peer worker's live shm targets
+# Multi-worker: after a checks-config change, a worker that did NOT serve traffic
+# keeps its old-version checker in working_pool. timer_working_pool_check then
+# destroys that stale local handle. Because the checker's shm target list is
+# shared by name, a peer worker's live checker (built for the new config) owns it.
+# The destroy path must NOT delayed_clear() that shm, or the peer's live targets
+# are purged on every worker once the delayed-clear window elapses
+# (apache/apisix#13282, multi-worker).
+--- config
+location /t {
+    content_by_lua_block {
+        local healthcheck = require("resty.healthcheck")
+        local t = require("lib.test_admin").test
+
+        local NAME = "upstream#/apisix/routes/1"
+        local SHM = "upstream-healthcheck"
+
+        -- peer worker (worker A): a live, running checker for the same resource
+        -- that owns the shared shm target list and holds node 1980
+        local peer = healthcheck.new({
+            name = NAME, shm_name = SHM, events_module = "resty.events",
+            checks = { active = { type = "tcp",
+                healthy = { interval = 1, successes = 1 },
+                unhealthy = { interval = 1, tcp_failures = 1 } } },
+        })
+        peer:add_target("127.0.0.1", 1980, nil, true)
+
+        local function cfg(interval)
+            return [[{
+                "uri": "/hello",
+                "upstream": {
+                    "type": "roundrobin",
+                    "nodes": {"127.0.0.1:1980": 1},
+                    "checks": { "active": { "type": "tcp",
+                        "healthy": { "interval": ]] .. interval .. [[, "successes": 1 },
+                        "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+                }
+            }]]
+        end
+
+        -- this worker builds its own checker at checks-interval 1 (serves once)
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(1)) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        -- change the checks config but send NO request to route 1: this worker
+        -- never rebuilds, so working_pool keeps the old-version checker.
+        -- timer_working_pool_check sees the checks change and destroys it.
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg(2)) < 300)
+
+        -- wait past DELAYED_CLEAR_TIMEOUT (10s) + a cleanup window
+        ngx.sleep(15)
+
+        -- the peer's live target must survive in the shared shm
+        local list = healthcheck.get_target_list(NAME, SHM) or {}
+        local live_1980 = false
+        for _, tg in ipairs(list) do
+            if tg.port == 1980 then live_1980 = true end
+        end
+        ngx.say("live_1980: ", tostring(live_1980))
+    }
+}
+--- request
+GET /t
+--- response_body
+live_1980: true
+--- ignore_error_log
+--- timeout: 30

From 765518a69886459beb6a6137b18ccff158542db5 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 09:36:06 +0800
Subject: [PATCH 11/15] test(healthcheck): cover delete/recreate and
 multi-worker reconcile

Add regression coverage for more real-world multi-worker scenarios:
- recreating the same upstream id within the delayed-clear window keeps
  its targets (re-add un-marks the pending purge_time)
- deleting an upstream cleans its shm target list
- a new 2-worker test asserting the shared shm target set converges to the
  desired node set under node churn (incremental) and a checks-config change
  (rebuild), with no orphan and no purge of surviving targets
---
 t/node/healthcheck-incremental-update.t    |  99 ++++++++++++++++++
 t/node/healthcheck-multiworker-reconcile.t | 113 +++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 t/node/healthcheck-multiworker-reconcile.t

diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index f91598a46828..3d0c963fefff 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -407,3 +407,102 @@ GET /t
 live_1980: true
 --- ignore_error_log
 --- timeout: 30
+
+
+
+=== TEST 7: recreating the same upstream id during the delayed-clear window keeps its targets
+# Deleting an upstream schedules a delayed_clear() of its shm target list. If the
+# same id is recreated within that window and served, create_checker() re-adds the
+# targets, which must un-mark the pending purge_time so the recreated node is NOT
+# purged when the window elapses (apache/apisix#13282).
+--- config
+location /t {
+    content_by_lua_block {
+        local healthcheck = require("resty.healthcheck")
+        local t = require("lib.test_admin").test
+        local NAME = "upstream#/apisix/routes/1"
+        local SHM = "upstream-healthcheck"
+
+        local cfg = [[{
+            "uri": "/hello",
+            "upstream": {
+                "type": "roundrobin",
+                "nodes": {"127.0.0.1:1980": 1},
+                "checks": { "active": { "type": "tcp",
+                    "healthy": { "interval": 1, "successes": 1 },
+                    "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+            }
+        }]]
+
+        -- create + build checker
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        -- delete: timer_working_pool_check destroys and delayed_clear()s the shm
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_DELETE) < 300)
+        ngx.sleep(2)  -- let the destroy fire, still within the 10s clear window
+
+        -- recreate the SAME id within the window and serve it -> create_checker re-adds
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, cfg) < 300)
+        t('/hello', ngx.HTTP_GET)
+
+        -- wait past the original delayed_clear window
+        ngx.sleep(12)
+
+        local list = healthcheck.get_target_list(NAME, SHM) or {}
+        local live_1980 = false
+        for _, tg in ipairs(list) do
+            if tg.port == 1980 then live_1980 = true end
+        end
+        ngx.say("live_1980: ", tostring(live_1980))
+    }
+}
+--- request
+GET /t
+--- response_body
+live_1980: true
+--- ignore_error_log
+--- timeout: 40
+
+
+
+=== TEST 8: deleting an upstream cleans its shm target list
+# Deleting an upstream (with a live checker) must eventually remove its targets
+# from the shared shm so a stale node is no longer probed or reported by
+# /v1/healthcheck.
+--- config
+location /t {
+    content_by_lua_block {
+        local healthcheck = require("resty.healthcheck")
+        local t = require("lib.test_admin").test
+        local NAME = "upstream#/apisix/routes/1"
+        local SHM = "upstream-healthcheck"
+
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{
+            "uri": "/hello",
+            "upstream": {
+                "type": "roundrobin",
+                "nodes": {"127.0.0.1:1980": 1},
+                "checks": { "active": { "type": "tcp",
+                    "healthy": { "interval": 1, "successes": 1 },
+                    "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+            }
+        }]]) < 300)
+        t('/hello', ngx.HTTP_GET)
+        ngx.sleep(2)
+
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_DELETE) < 300)
+        -- wait past the delayed_clear window plus a cleanup margin
+        ngx.sleep(15)
+
+        local list = healthcheck.get_target_list(NAME, SHM) or {}
+        ngx.say("targets_after_delete: ", #list)
+    }
+}
+--- request
+GET /t
+--- response_body
+targets_after_delete: 0
+--- ignore_error_log
+--- timeout: 30
diff --git a/t/node/healthcheck-multiworker-reconcile.t b/t/node/healthcheck-multiworker-reconcile.t
new file mode 100644
index 000000000000..ff148cd9dcfc
--- /dev/null
+++ b/t/node/healthcheck-multiworker-reconcile.t
@@ -0,0 +1,113 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+log_level('info');
+no_root_location();
+no_shuffle();
+workers(2);
+worker_connections(256);
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: shared shm target list converges to the desired set under config churn (2 workers)
+# With real requests round-robining across two workers, each worker reconciles the
+# checker through its own request path/timers. The target membership lives in a
+# shm shared by checker name, so its final state must equal the desired node set
+# after every kind of change -- node add/remove (incremental) and a checks-config
+# change (rebuild) -- with no orphan targets and no purge of surviving targets
+# (apache/apisix#13282, multi-worker).
+--- config
+location /t {
+    content_by_lua_block {
+        local healthcheck = require("resty.healthcheck")
+        local http = require("resty.http")
+        local t = require("lib.test_admin").test
+
+        local NAME = "upstream#/apisix/routes/1"
+        local SHM = "upstream-healthcheck"
+
+        local function cfg(nodes, interval)
+            return [[{
+                "uri": "/hello",
+                "upstream": {
+                    "type": "roundrobin",
+                    "nodes": ]] .. nodes .. [[,
+                    "checks": { "active": { "type": "tcp",
+                        "healthy": { "interval": ]] .. interval .. [[, "successes": 1 },
+                        "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+                }
+            }]]
+        end
+
+        -- drive real traffic so requests spread across both workers
+        local function drive()
+            local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/hello"
+            for _ = 1, 16 do
+                local httpc = http.new()
+                httpc:request_uri(uri, { method = "GET", keepalive = false })
+            end
+        end
+
+        -- the desired set is worker-agnostic: read it from the shared shm
+        local function shm_ports()
+            local list = healthcheck.get_target_list(NAME, SHM) or {}
+            local ports = {}
+            for _, tg in ipairs(list) do
+                ports[#ports + 1] = tg.port
+            end
+            table.sort(ports)
+            return table.concat(ports, ",")
+        end
+
+        ngx.sleep(2) -- let both workers settle
+
+        -- phase 1: two nodes -> both must be registered
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
+                 cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1981": 1}', 1)) < 300)
+        drive()
+        ngx.sleep(3)
+        ngx.say("phase1: ", shm_ports())
+
+        -- phase 2: node-only change (remove 1981, add 1982) -> incremental reconcile,
+        -- shm must drop the orphan and add the new node
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
+                 cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1982": 1}', 1)) < 300)
+        drive()
+        ngx.sleep(3)
+        ngx.say("phase2: ", shm_ports())
+
+        -- phase 3: checks-config change (interval 1 -> 2), same nodes -> rebuild.
+        -- wait past DELAYED_CLEAR_TIMEOUT (10s): surviving targets must NOT be purged
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT,
+                 cfg('{"127.0.0.1:1980": 1, "127.0.0.1:1982": 1}', 2)) < 300)
+        drive()
+        ngx.sleep(14)
+        ngx.say("phase3: ", shm_ports())
+    }
+}
+--- request
+GET /t
+--- response_body
+phase1: 1980,1981
+phase2: 1980,1982
+phase3: 1980,1982
+--- ignore_error_log
+--- timeout: 40

From d17855153ce906a6765a1ba92a8a22d9c3d69177 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 09:53:42 +0800
Subject: [PATCH 12/15] test(healthcheck): assert timer paths don't error
 instead of ignoring the log

Replace the blanket `--- ignore_error_log` with a targeted `--- no_error_log`
that asserts the health-check timer and target-reconcile paths do not log an
error. The timers are pcall-wrapped, so a Lua fault in the code under test was
previously logged and swallowed while the test still passed on response_body
alone. Targeting specific patterns keeps the missing-optional-dependency noise
from failing the run while still catching a real crash.

Also note in the multi-worker test that it is a convergence/non-regression
check, not a deterministic reproduction of the cross-worker asymmetry.
---
 t/node/healthcheck-incremental-update.t    | 24 ++++++++++++++++++----
 t/node/healthcheck-multiworker-reconcile.t | 12 ++++++++++-
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index 3d0c963fefff..b361d2a2c931 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -334,7 +334,11 @@ location /t {
 GET /t
 --- response_body
 stale_1970: false
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 8
 
 
@@ -405,7 +409,11 @@ location /t {
 GET /t
 --- response_body
 live_1980: true
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 30
 
 
@@ -462,7 +470,11 @@ location /t {
 GET /t
 --- response_body
 live_1980: true
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 40
 
 
@@ -504,5 +516,9 @@ location /t {
 GET /t
 --- response_body
 targets_after_delete: 0
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 30
diff --git a/t/node/healthcheck-multiworker-reconcile.t b/t/node/healthcheck-multiworker-reconcile.t
index ff148cd9dcfc..a098a7e72d17 100644
--- a/t/node/healthcheck-multiworker-reconcile.t
+++ b/t/node/healthcheck-multiworker-reconcile.t
@@ -34,6 +34,12 @@ __DATA__
 # after every kind of change -- node add/remove (incremental) and a checks-config
 # change (rebuild) -- with no orphan targets and no purge of surviving targets
 # (apache/apisix#13282, multi-worker).
+#
+# NOTE: this is a convergence/non-regression check, not a deterministic
+# reproduction of the cross-worker asymmetry -- requests are distributed across
+# workers by the OS accept(), so it cannot force the "one worker rebuilds while
+# another destroys" interleaving. The authoritative deterministic reproduction
+# is TEST 6 in healthcheck-incremental-update.t.
 --- config
 location /t {
     content_by_lua_block {
@@ -109,5 +115,9 @@ GET /t
 phase1: 1980,1981
 phase2: 1980,1982
 phase3: 1980,1982
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 40

From dc39a6522265978e3f8acae9cb7751665421309e Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 14:36:41 +0800
Subject: [PATCH 13/15] test(healthcheck): convert TEST 3 and add
 create-checker failure pattern

Follow-up to the error-log hardening: convert the one remaining blanket
`--- ignore_error_log` (the checks-config rebuild test) to the same targeted
`--- no_error_log`, and add `failed to create healthcheck` to the pattern set
so a healthcheck.new() failure on the create_checker path is also caught.
---
 t/node/healthcheck-incremental-update.t    | 11 ++++++++++-
 t/node/healthcheck-multiworker-reconcile.t |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/t/node/healthcheck-incremental-update.t b/t/node/healthcheck-incremental-update.t
index b361d2a2c931..3d2e8739bddb 100644
--- a/t/node/healthcheck-incremental-update.t
+++ b/t/node/healthcheck-incremental-update.t
@@ -207,7 +207,12 @@ location /t {
 GET /t
 --- response_body
 nodes_after: 2
---- ignore_error_log
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to create healthcheck
+failed to add healthcheck target
+failed to remove healthcheck target
 --- timeout: 30
 
 
@@ -337,6 +342,7 @@ stale_1970: false
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
+failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 8
@@ -412,6 +418,7 @@ live_1980: true
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
+failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 30
@@ -473,6 +480,7 @@ live_1980: true
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
+failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 40
@@ -519,6 +527,7 @@ targets_after_delete: 0
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
+failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 30
diff --git a/t/node/healthcheck-multiworker-reconcile.t b/t/node/healthcheck-multiworker-reconcile.t
index a098a7e72d17..0a0df30d885a 100644
--- a/t/node/healthcheck-multiworker-reconcile.t
+++ b/t/node/healthcheck-multiworker-reconcile.t
@@ -118,6 +118,7 @@ phase3: 1980,1982
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
+failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 40

From a372ab88256d6cde807773f83e40631204e9fd7e Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 15:25:22 +0800
Subject: [PATCH 14/15] test(healthcheck): verify unhealthy-node status is
 filtered across workers

Add a 2-worker health-status test: a dead node (1970) alongside a healthy one
(1980) with active checks. Assert the shared shm reports 1970 unhealthy and,
once every worker's status cache has converged, retries=0 traffic never lands
on 1970 -- proving both workers filter it. This covers health STATUS
propagation, not just target membership.
---
 t/node/healthcheck-multiworker-reconcile.t | 87 ++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/t/node/healthcheck-multiworker-reconcile.t b/t/node/healthcheck-multiworker-reconcile.t
index 0a0df30d885a..04ce337561fe 100644
--- a/t/node/healthcheck-multiworker-reconcile.t
+++ b/t/node/healthcheck-multiworker-reconcile.t
@@ -122,3 +122,90 @@ failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
 --- timeout: 40
+
+
+
+=== TEST 2: an unhealthy node is detected and filtered consistently across workers
+# The basic multi-worker health scenario: a healthy node (1980) and a dead node
+# (1970). Active checks must mark 1970 unhealthy in the shared shm, and with
+# retries=0 every request across both workers must still succeed -- proving both
+# workers filter the unhealthy node (a per-worker miss would send some requests
+# to 1970 and fail). This verifies health STATUS propagation, not just membership.
+--- config
+location /t {
+    content_by_lua_block {
+        local http = require("resty.http")
+        local t = require("lib.test_admin").test
+        local json = require("apisix.core.json")
+
+        assert(t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{
+            "uri": "/server_port",
+            "upstream": {
+                "type": "roundrobin",
+                "retries": 0,
+                "nodes": {"127.0.0.1:1980": 1, "127.0.0.1:1970": 1},
+                "checks": { "active": { "type": "tcp",
+                    "healthy": { "interval": 1, "successes": 1 },
+                    "unhealthy": { "interval": 1, "tcp_failures": 1 } } }
+            }
+        }]]) < 300)
+
+        local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port"
+        local function burst(n)
+            local errors, non1980 = 0, 0
+            for _ = 1, n do
+                local httpc = http.new()
+                local r = httpc:request_uri(uri, { method = "GET", keepalive = false })
+                if not r or r.status ~= 200 then
+                    errors = errors + 1
+                elseif r.body ~= "1980" then
+                    non1980 = non1980 + 1
+                end
+            end
+            return errors, non1980
+        end
+
+        -- warm up both workers' checkers and let active checks converge on every
+        -- worker (the per-worker status cache is filled asynchronously by events;
+        -- until it converges a worker treats the unknown target as usable).
+        burst(16)
+        ngx.sleep(3)
+        burst(16)
+        ngx.sleep(3)
+
+        -- health status from the shared shm (worker-agnostic) via the control API
+        local function healthy(status)
+            return status == "healthy" or status == "mostly_healthy"
+        end
+        local _, _, res = t('/v1/healthcheck', ngx.HTTP_GET)
+        local h1970, h1980
+        for _, info in ipairs(json.decode(res)) do
+            for _, node in ipairs(info.nodes or {}) do
+                if node.port == 1970 then h1970 = healthy(node.status) end
+                if node.port == 1980 then h1980 = healthy(node.status) end
+            end
+        end
+        ngx.say("1970_healthy: ", tostring(h1970))
+        ngx.say("1980_healthy: ", tostring(h1980))
+
+        -- steady state: with retries=0, a request landing on 1970 on any worker
+        -- would fail, so both workers must be filtering the unhealthy node now
+        local errors, non1980 = burst(30)
+        ngx.say("errors: ", errors)
+        ngx.say("non_1980: ", non1980)
+    }
+}
+--- request
+GET /t
+--- response_body
+1970_healthy: false
+1980_healthy: true
+errors: 0
+non_1980: 0
+--- no_error_log
+failed to run timer_working_pool_check
+failed to run timer_create_checker
+failed to create healthcheck
+failed to add healthcheck target
+failed to remove healthcheck target
+--- timeout: 20

From 6ea8dac4cef4fa86e3de62b0e7936703777d46a5 Mon Sep 17 00:00:00 2001
From: AlinsRan <alinsran@apache.org>
Date: Thu, 2 Jul 2026 15:47:44 +0800
Subject: [PATCH 15/15] test(healthcheck): gate status test on convergence
 instead of fixed sleeps

The strict zero-error assertion previously relied on two fixed sleeps to let
every worker's per-worker status cache converge. Under loaded CI a worker that
built its checker late might not have received the unhealthy event by the
burst, flaking. Drive traffic until several consecutive zero-error bursts
confirm both workers filter the dead node (bounded); a genuine per-worker miss
never converges and fails the test deterministically.
---
 t/node/healthcheck-multiworker-reconcile.t | 48 ++++++++++++----------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/t/node/healthcheck-multiworker-reconcile.t b/t/node/healthcheck-multiworker-reconcile.t
index 04ce337561fe..4b52d89aa192 100644
--- a/t/node/healthcheck-multiworker-reconcile.t
+++ b/t/node/healthcheck-multiworker-reconcile.t
@@ -151,27 +151,40 @@ location /t {
         }]]) < 300)
 
         local uri = "http://127.0.0.1:" .. ngx.var.server_port .. "/server_port"
+        -- a burst returns how many of n requests did NOT get 200 from 1980, i.e.
+        -- landed on the dead node 1970 (retries=0 -> hard failure) or errored
         local function burst(n)
-            local errors, non1980 = 0, 0
+            local errors = 0
             for _ = 1, n do
                 local httpc = http.new()
                 local r = httpc:request_uri(uri, { method = "GET", keepalive = false })
-                if not r or r.status ~= 200 then
+                if not r or r.status ~= 200 or r.body ~= "1980" then
                     errors = errors + 1
-                elseif r.body ~= "1980" then
-                    non1980 = non1980 + 1
                 end
             end
-            return errors, non1980
+            return errors
         end
 
-        -- warm up both workers' checkers and let active checks converge on every
-        -- worker (the per-worker status cache is filled asynchronously by events;
-        -- until it converges a worker treats the unknown target as usable).
-        burst(16)
-        ngx.sleep(3)
-        burst(16)
-        ngx.sleep(3)
+        -- Drive traffic until every worker has built its checker AND converged its
+        -- per-worker status cache to "1970 unhealthy". The shared shm reports 1970
+        -- unhealthy as soon as ANY one worker probes, so the control-API status is
+        -- necessary but NOT sufficient -- only routing proves BOTH workers filter.
+        -- Require several consecutive zero-error bursts (bounded); a genuine
+        -- per-worker filtering miss keeps producing errors and never converges,
+        -- so the test fails instead of flaking on a fixed sleep.
+        local clean_streak = 0
+        for _ = 1, 25 do
+            if burst(12) == 0 then
+                clean_streak = clean_streak + 1
+                if clean_streak >= 3 then
+                    break
+                end
+            else
+                clean_streak = 0
+            end
+            ngx.sleep(1)
+        end
+        ngx.say("converged: ", tostring(clean_streak >= 3))
 
         -- health status from the shared shm (worker-agnostic) via the control API
         local function healthy(status)
@@ -187,25 +200,18 @@ location /t {
         end
         ngx.say("1970_healthy: ", tostring(h1970))
         ngx.say("1980_healthy: ", tostring(h1980))
-
-        -- steady state: with retries=0, a request landing on 1970 on any worker
-        -- would fail, so both workers must be filtering the unhealthy node now
-        local errors, non1980 = burst(30)
-        ngx.say("errors: ", errors)
-        ngx.say("non_1980: ", non1980)
     }
 }
 --- request
 GET /t
 --- response_body
+converged: true
 1970_healthy: false
 1980_healthy: true
-errors: 0
-non_1980: 0
 --- no_error_log
 failed to run timer_working_pool_check
 failed to run timer_create_checker
 failed to create healthcheck
 failed to add healthcheck target
 failed to remove healthcheck target
---- timeout: 20
+--- timeout: 40