From 0865e6ebdc47c6177911bf8333a65635b36a5085 Mon Sep 17 00:00:00 2001 From: Joe Julian Date: Sun, 8 Mar 2026 10:29:27 -0700 Subject: [PATCH] tests: add reconnect gating reproducer Add a test-only protocol/client option, debug-disconnect-notify-holdfile. When the option is set and the named path exists at the time client_rpc_notify reaches the new hook (inside the !conf->skip_notify branch), the client logs a warning, polls the path once per second for as long as it exists, and logs a second warning once the path is gone. The new test mounts with the option pointing at an existing holdfile, kills the brick, waits for the "is blocking RPC_CLNT_DISCONNECT notify" warning to show up in the mount log, restarts the brick with "volume start force", and then expects a second "Connected, attached to remote volume" line in the mount log while the holdfile is still present. As the comment in the test explains, current code schedules the reconnect only after notify returns, so that last check is expected to time out until reconnect is no longer gated on the synchronous disconnect notify. --- tests/basic/fuse/fuse-daemon-stall-enotconn.t | 74 +++++++++++++++++++ xlators/protocol/client/src/client.c | 36 +++++++++ xlators/protocol/client/src/client.h | 1 + 3 files changed, 111 insertions(+) create mode 100644 tests/basic/fuse/fuse-daemon-stall-enotconn.t diff --git a/tests/basic/fuse/fuse-daemon-stall-enotconn.t b/tests/basic/fuse/fuse-daemon-stall-enotconn.t new file mode 100644 index 00000000000..8cb9cb8fcc2 --- /dev/null +++ b/tests/basic/fuse/fuse-daemon-stall-enotconn.t @@ -0,0 +1,74 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +holdfile="" + +function cleanup_test { + rm -f "$holdfile" + force_umount "$M0" >/dev/null 2>&1 || true + cleanup +} + +function mount_log { + local logdir="" + + logdir=$($CLI --print-logdir) + ls "$logdir"/mnt-glusterfs-*.log 2>/dev/null | head -1 +} + +function hold_hook_logged { + local logfile="" + + logfile=$(mount_log) + grep -c "debug-disconnect-notify-holdfile is blocking RPC_CLNT_DISCONNECT notify" "$logfile" +} + +function connect_count { + local logfile="" + + logfile=$(mount_log) + grep -c "Connected, attached to remote volume" "$logfile" +} + +cleanup + +holdfile="$B0/$V0-child-down.hold" +trap cleanup_test EXIT + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 $H0:$B0/${V0}1 +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.client-io-threads off +TEST $CLI volume set $V0 ping-timeout 2 +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +TEST touch "$holdfile" +TEST $GFS --xlator-option="$V0-client-0.debug-disconnect-notify-holdfile=$holdfile" \ 
--volfile-id=/$V0 --volfile-server=$H0 $M0 +TEST touch $M0/preflight + +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 + +# Wait until the client has entered the intentionally stalled disconnect notify. +EXPECT_WITHIN 8 "1" hold_hook_logged + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 + +# This is the bug reproducer: reconnect should not be gated on synchronous +# disconnect notify. Current code schedules reconnect only after notify +# returns, so the second successful connect never appears while the holdfile +# exists. +EXPECT_WITHIN 8 "2" connect_count + +trap - EXIT +cleanup_test diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index e0508776546..33a0e14faed 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -167,6 +167,32 @@ client_notify_dispatch(xlator_t *this, int32_t event, void *data, ...) 
return ret; } +static void +client_debug_hold_disconnect_notify(xlator_t *this) +{ + clnt_conf_t *conf = NULL; + + conf = this->private; + if (!conf || !conf->debug_disconnect_notify_holdfile) + return; + + if (access(conf->debug_disconnect_notify_holdfile, F_OK) != 0) + return; + + gf_log(this->name, GF_LOG_WARNING, + "debug-disconnect-notify-holdfile is blocking RPC_CLNT_DISCONNECT " + "notify on %s", + conf->debug_disconnect_notify_holdfile); + + while (access(conf->debug_disconnect_notify_holdfile, F_OK) == 0) { + sleep(1); + } + + gf_log(this->name, GF_LOG_WARNING, + "debug-disconnect-notify-holdfile released RPC_CLNT_DISCONNECT " + "notify"); +} + int client_submit_request(xlator_t *this, void *req, call_frame_t *frame, rpc_clnt_prog_t *prog, int procnum, fop_cbk_fn_t cbkfn, @@ -2237,6 +2263,8 @@ client_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, client_mark_fd_bad(this); if (!conf->skip_notify) { + client_debug_hold_disconnect_notify(this); + if (conf->can_log_disconnect) { if (!conf->disconnect_err_logged) { gf_smsg(this->name, GF_LOG_INFO, 0, @@ -2426,6 +2454,8 @@ build_client_config(xlator_t *this, clnt_conf_t *conf) GF_OPTION_INIT("ping-timeout", conf->opt.ping_timeout, time, out); GF_OPTION_INIT("remote-subvolume", conf->opt.remote_subvolume, path, out); + GF_OPTION_INIT("debug-disconnect-notify-holdfile", + conf->debug_disconnect_notify_holdfile, path, out); if (!conf->opt.remote_subvolume) gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_REMOTE_SUBVOL_NOT_GIVEN, NULL); @@ -2932,6 +2962,12 @@ struct volume_options options[] = { {.key = {"remote-subvolume"}, .type = GF_OPTION_TYPE_ANY, .default_value = "{{ brick.path }}"}, + { + .key = {"debug-disconnect-notify-holdfile"}, + .type = GF_OPTION_TYPE_PATH, + .description = "Test-only option that blocks RPC_CLNT_DISCONNECT " + "notify while the given path exists.", + }, {.key = {"frame-timeout", "rpc-timeout"}, .type = GF_OPTION_TYPE_TIME, .min = 0, diff --git 
a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h index 880d3f8b173..8b45c5fcd9f 100644 --- a/xlators/protocol/client/src/client.h +++ b/xlators/protocol/client/src/client.h @@ -145,6 +145,7 @@ typedef struct clnt_conf { complaince as bricks cleanup any granted locks when a client disconnects. */ + char *debug_disconnect_notify_holdfile; gf_boolean_t connection_to_brick; /*True from attempt to connect to brick till disconnection to brick*/ pthread_cond_t fini_complete_cond; /* Used to wait till we finsh the fini