From bd3c3619e988e1cff4727f9e60ca8570f5ae5176 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Tue, 10 Mar 2026 05:46:27 +0000 Subject: [PATCH] CA-424403: Host reboot hangs due to root disk iscsi node.startup does not survive restart_daemon iscsi local_db path is `/var/lib/iscsi` In iscsi.restart_daemon, `local_db/{nodes,send_targets}` was removed to clean stalled nodes. In iscsi.discovery, `iscsiadm -m discovery` was run to discovery nodes, however, this will reset all nodes.startup to default `automatic`, including the root disk nodes. To resovle this, `local_db/nodes` was backup and restore before and after the command. However, if restart_daemon was run before discovery, the local_db would be desotryed and discovery can't back up the local_db. To fix the issue, the local_db is backup and restored during restart_daemon as well, so it can survive restart_daemon. Signed-off-by: Lin Liu --- libs/sm/core/iscsi.py | 92 ++++++++++++++++++++++++++++++------------- tests/test_iscsi.py | 71 +++++++++++++++++++++++++-------- 2 files changed, 119 insertions(+), 44 deletions(-) diff --git a/libs/sm/core/iscsi.py b/libs/sm/core/iscsi.py index 6e3f82628..02dae853f 100644 --- a/libs/sm/core/iscsi.py +++ b/libs/sm/core/iscsi.py @@ -26,6 +26,8 @@ from sm.core import lock import glob import tempfile +from contextlib import contextmanager +from pathlib import Path from configparser import RawConfigParser import io @@ -119,15 +121,21 @@ def save_rootdisk_nodes(tmpdirname): root_iqns = get_rootdisk_IQNs() if root_iqns: srcdirs = [os.path.join(_ISCSI_DB_PATH, 'nodes', iqn) for iqn in root_iqns] - util.doexec(['/bin/cp', '-a'] + srcdirs + [tmpdirname]) + (rc, stdout, stderr) = util.doexec(['/bin/cp', '-a'] + srcdirs + [tmpdirname]) + if rc != 0: + util.SMlog("save_rootdisk_nodes: cp FAILED rc=%d stderr=%r stdout=%r" + % (rc, stderr, stdout)) def restore_rootdisk_nodes(tmpdirname): root_iqns = get_rootdisk_IQNs() if root_iqns: srcdirs = [os.path.join(tmpdirname, iqn) for iqn in root_iqns] - util.doexec(['/bin/cp', '-a'] + srcdirs + - [os.path.join(_ISCSI_DB_PATH, 'nodes')]) + (rc, stdout, stderr) = util.doexec(['/bin/cp', '-a'] + srcdirs + + [os.path.join(_ISCSI_DB_PATH, 'nodes')]) + if rc != 0: + util.SMlog("restore_rootdisk_nodes: cp FAILED rc=%d stderr=%r stdout=%r" + % (rc, stderr, stdout)) def rescan_target(portal, target): @@ -159,12 +167,7 @@ def discovery(target, port, chapuser, chappass, targetIQN="any", # Save configuration of root LUN nodes and restore after discovery # otherwise when we do a discovery on the same filer as is hosting # our root disk we'll reset the config of the root LUNs - - # FIXME: Replace this with TemporaryDirectory when moving to Python3 - tmpdirname = tempfile.mkdtemp() try: - save_rootdisk_nodes(tmpdirname) - if ':' in target: targetstring = "[%s]:%s" % (target, str(port)) else: @@ -180,7 +183,8 @@ def discovery(target, port, chapuser, chappass, targetIQN="any", "-n", "discovery.sendtargets.auth.password", "-v", chappass] fail_msg = "Discovery failed. Check target settings and " \ "username/password (if applicable)" - try: + + with saved_rootdisk_nodes(): if chapuser != "" and chappass != "": exn_on_failure(cmd_discdb + ["-o", "new"], fail_msg) exn_on_failure(cmd_discdb + ["-o", "update"] + auth_args, fail_msg) @@ -188,12 +192,10 @@ def discovery(target, port, chapuser, chappass, targetIQN="any", else: cmd = cmd_disc (stdout, stderr) = exn_on_failure(cmd, fail_msg) - except: - raise xs_errors.XenError('ISCSILogin') - finally: - restore_rootdisk_nodes(tmpdirname) - finally: - shutil.rmtree(tmpdirname) + except xs_errors.XenError: + raise + except: + raise xs_errors.XenError('ISCSILogin') return parse_node_output(stdout, targetIQN) @@ -412,20 +414,37 @@ def stop_daemon(): exn_on_failure(cmd, failuremessage) -def restart_daemon(): - stop_daemon() - if os.path.exists(os.path.join(_ISCSI_DB_PATH, 'nodes')): - try: - shutil.rmtree(os.path.join(_ISCSI_DB_PATH, 'nodes')) - except: - pass +def _clear_dir(dirpath): + """Remove all *contents* of dirpath, keeping the directory itself. + Creates the directory if it does not yet exist. Errors on individual + entries are logged and swallowed so that a single bad entry does not + abort the entire wipe.""" + p = Path(dirpath) + p.mkdir(parents=True, exist_ok=True) + for child in p.iterdir(): try: - shutil.rmtree(os.path.join(_ISCSI_DB_PATH, 'send_targets')) - except: - pass - cmd = ["/usr/bin/systemctl", "start", "iscsid.service"] - failuremessage = "Failed to start iscsi daemon" - exn_on_failure(cmd, failuremessage) + if child.is_dir(follow_symlinks=False): + shutil.rmtree(child) + else: + child.unlink() + except Exception as e: + util.SMlog("_clear_dir: failed to remove %s: %s" % (child, e)) + + +def restart_daemon(): + """Stop iscsid, wipe the node/send_targets DB, then restart. + + Boot-IQN node entries are saved before the wipe and restored + afterwards via the saved_rootdisk_nodes context manager so that + the boot nodes survive the restart. + """ + with saved_rootdisk_nodes(): + stop_daemon() + for subdir in ('nodes', 'send_targets'): + _clear_dir(os.path.join(_ISCSI_DB_PATH, subdir)) + cmd = ["/usr/bin/systemctl", "start", "iscsid.service"] + failuremessage = "Failed to start iscsi daemon" + exn_on_failure(cmd, failuremessage) def wait_for_devs(targetIQN, portal): @@ -565,6 +584,23 @@ def _checkAnyTGT(): return False +@contextmanager +def saved_rootdisk_nodes(): + """Context manager that snapshots the boot-IQN node DB entries into a + temporary directory on entry and restores them on exit. + + The restore runs unconditionally in the finally block so it is safe to + use around operations that may raise. The temporary directory is + cleaned up automatically when the context exits. + """ + with tempfile.TemporaryDirectory() as tmpdir: + save_rootdisk_nodes(tmpdir) + try: + yield + finally: + restore_rootdisk_nodes(tmpdir) + + def ensure_daemon_running_ok(localiqn): """Check that the daemon is running and the initiator name is correct""" if not is_iscsi_daemon_running(): diff --git a/tests/test_iscsi.py b/tests/test_iscsi.py index 6e9094cb9..b4b3a31e2 100644 --- a/tests/test_iscsi.py +++ b/tests/test_iscsi.py @@ -9,7 +9,7 @@ class Test_iscsi(unittest.TestCase): @mock.patch('sm.core.iscsi.get_rootdisk_IQNs') - @mock.patch('sm.core.iscsi.util.doexec') + @mock.patch('sm.core.iscsi.util.doexec', return_value=(0, '', '')) def test_save_rootdisk_nodes(self, doexec, get_rootdisk_iqns): get_rootdisk_iqns.return_value = [TEST_IQN] @@ -20,9 +20,9 @@ def test_save_rootdisk_nodes(self, doexec, get_rootdisk_iqns): '00.ecd28.mo121', '/my/safe/tempdir']) + @mock.patch('sm.core.iscsi.util.doexec', return_value=(0, '', '')) @mock.patch('sm.core.iscsi.get_rootdisk_IQNs') - @mock.patch('sm.core.iscsi.util.doexec') - def test_restore_rootdisk_nodes(self, doexec, get_rootdisk_iqns): + def test_restore_rootdisk_nodes(self, get_rootdisk_iqns, doexec): get_rootdisk_iqns.return_value = [TEST_IQN] iscsi.restore_rootdisk_nodes("/my/safe/tempdir") @@ -31,25 +31,65 @@ def test_restore_rootdisk_nodes(self, doexec, get_rootdisk_iqns): '/my/safe/tempdir/iqn.2003-01.com.bla:00.ecd28.mo121', '/var/lib/iscsi/nodes']) + @mock.patch('sm.core.iscsi._clear_dir') @mock.patch('sm.core.iscsi.stop_daemon', mock.Mock()) @mock.patch('sm.core.iscsi.exn_on_failure', mock.Mock()) - @mock.patch('sm.core.iscsi.util.doexec', mock.Mock()) - @mock.patch('sm.core.iscsi.os.path.exists') - @mock.patch('sm.core.iscsi.shutil.rmtree') - def test_restart_daemon(self, rmtree, exists): - exists.return_value = True - + @mock.patch('sm.core.iscsi.saved_rootdisk_nodes') + def test_restart_daemon(self, _saved, clear_dir): + """restart_daemon must clear nodes/ and send_targets/ (keeping the + directories), via _clear_dir, before starting iscsid.""" iscsi.restart_daemon() - rmtree.assert_has_calls([mock.call('/var/lib/iscsi/nodes'), - mock.call('/var/lib/iscsi/send_targets')]) - + clear_dir.assert_any_call('/var/lib/iscsi/nodes') + clear_dir.assert_any_call('/var/lib/iscsi/send_targets') + + @mock.patch('sm.core.iscsi.restart_daemon') + @mock.patch('sm.core.iscsi.set_current_initiator_name') + @mock.patch('sm.core.iscsi.is_iscsi_daemon_running', return_value=False) + def test_ensure_daemon_running_ok_not_running( + self, _is_running, set_iqn, restart): + """When iscsid is not running, set initiator name and restart.""" + iscsi.ensure_daemon_running_ok(TEST_IQN) + + set_iqn.assert_called_once_with(TEST_IQN) + restart.assert_called_once() + + @mock.patch('sm.core.iscsi.restart_daemon') + @mock.patch('sm.core.iscsi.set_current_initiator_name') + @mock.patch('sm.core.iscsi._checkAnyTGT', return_value=False) + @mock.patch('sm.core.iscsi.get_current_initiator_name', return_value='other-iqn') + @mock.patch('sm.core.iscsi.is_iscsi_daemon_running', return_value=True) + def test_ensure_daemon_running_ok_iqn_mismatch( + self, _is_running, _get_iqn, _check_tgt, set_iqn, restart): + """IQN mismatch with no non-root sessions: set initiator name and restart.""" + iscsi.ensure_daemon_running_ok(TEST_IQN) + + set_iqn.assert_called_once_with(TEST_IQN) + restart.assert_called_once() + + @mock.patch('sm.core.iscsi.restart_daemon') + @mock.patch('sm.core.iscsi.get_current_initiator_name', return_value=TEST_IQN) + @mock.patch('sm.core.iscsi.is_iscsi_daemon_running', return_value=True) + def test_ensure_daemon_running_ok_no_restart_needed( + self, _is_running, _get_iqn, restart): + """When iscsid is running with the correct IQN, no restart occurs.""" + iscsi.ensure_daemon_running_ok(TEST_IQN) + + restart.assert_not_called() + + @mock.patch('sm.core.iscsi.restart_daemon', side_effect=Exception("start failed")) + @mock.patch('sm.core.iscsi.set_current_initiator_name') + @mock.patch('sm.core.iscsi.is_iscsi_daemon_running', return_value=False) + def test_ensure_daemon_running_ok_restart_fails_propagates( + self, _is_running, _set_iqn, _restart): + """If restart_daemon raises, the exception propagates out.""" + with self.assertRaises(Exception): + iscsi.ensure_daemon_running_ok(TEST_IQN) @mock.patch('sm.core.iscsi.util.doexec', mock.Mock()) @mock.patch('sm.core.iscsi.exn_on_failure') @mock.patch('sm.core.iscsi.tempfile', autospec=True) - @mock.patch('sm.core.iscsi.shutil.rmtree', autospec=True) - def test_discovery_success(self, rmtree, mock_tempfile, mock_exc): + def test_discovery_success(self, mock_tempfile, mock_exc): mock_exc.return_value = ("test-target,1000 " + TEST_IQN, "") iscsi.discovery('test-target', 3260, "", "") @@ -62,8 +102,7 @@ def test_discovery_success(self, rmtree, mock_tempfile, mock_exc): @mock.patch('sm.core.iscsi.util.doexec', mock.Mock()) @mock.patch('sm.core.iscsi.exn_on_failure', autospec=True) @mock.patch('sm.core.iscsi.tempfile', autospec=True) - @mock.patch('sm.core.iscsi.shutil.rmtree', autospec=True) - def test_discovery_chap_success(self, rmtree, mock_tempfile, mock_exc): + def test_discovery_chap_success(self, mock_tempfile, mock_exc): mock_exc.side_effect = [ ("New discovery record for [test-target:3260] added", ""), ("",""),