Skip to content

Commit 56cb5f5

Browse files
committed
[ironic] Ensure unprovision happens for new states
States were added to the Ironic API to enable the node servicing feature, which can be performed on nodes provisioned with Nova instances. Current nova, if asked to delete these instances, will only remove the instance metadata and not tear them down. This change has two parts: - I have added the new, relevant states to _UNPROVISION_STATES in driver.py, which now allows Nova to know that SERVIC* states and DEPLOYHOLD are safe to unprovision from. - I have added all existing ironic states to ironic_states.py and the PROVISION_STATE_LIST constant and check the state against it -- in a case where a completely unknown state is returned, we should attempt an unprovision. This fix needs to be backported as far as possible, as this bug has existed since Antelope / 2023.1 (DEPLOYHOLD) or Bobcat / 2023.3 (SERVIC*). Assisted-by: Claude Code Closes-bug: #2131960 Change-Id: I31c70d35b0e6e9f8d2252bfb2f0bdec477cc6cc7 Signed-off-by: Jay Faulkner <jay@jvf.cc>
1 parent 53aadaf commit 56cb5f5

4 files changed

Lines changed: 157 additions & 8 deletions

File tree

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,6 +1746,91 @@ def test_destroy(self):
17461746
for state in ironic_states.PROVISION_STATE_LIST:
17471747
self._test_destroy(state)
17481748

1749+
@mock.patch.object(ironic_driver.IronicDriver,
1750+
'_remove_instance_info_from_node')
1751+
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
1752+
def test_destroy_servicing_states_unprovision(self, mock_cleanup_deploy,
1753+
mock_remove_instance_info):
1754+
"""Test that servicing and deployhold states trigger unprovisioning.
1755+
1756+
This is a regression test for bug 2131960 where nodes in servicing
1757+
states (SERVICING, SERVICEWAIT, SERVICEHOLD, SERVICEFAIL) and
1758+
DEPLOYHOLD were not being properly unprovisioned when destroyed.
1759+
"""
1760+
# Test all servicing-related states and deployhold
1761+
servicing_states = [
1762+
ironic_states.DEPLOYHOLD,
1763+
ironic_states.SERVICING,
1764+
ironic_states.SERVICEWAIT,
1765+
ironic_states.SERVICEFAIL,
1766+
ironic_states.SERVICEHOLD,
1767+
]
1768+
1769+
for state in servicing_states:
1770+
mock_cleanup_deploy.reset_mock()
1771+
mock_remove_instance_info.reset_mock()
1772+
self.mock_conn.set_node_provision_state.reset_mock()
1773+
1774+
node_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
1775+
network_info = 'foo'
1776+
1777+
node = _get_cached_node(
1778+
driver='fake', id=node_id, provision_state=state)
1779+
instance = fake_instance.fake_instance_obj(self.ctx, node=node_id)
1780+
1781+
def fake_set_node_provision_state(*_):
1782+
node.provision_state = None
1783+
1784+
self.mock_conn.nodes.return_value = iter([node])
1785+
self.mock_conn.set_node_provision_state.side_effect = \
1786+
fake_set_node_provision_state
1787+
self.driver.destroy(self.ctx, instance, network_info, None)
1788+
1789+
# All these states should trigger unprovisioning, not cleanup
1790+
self.mock_conn.set_node_provision_state.assert_called_once_with(
1791+
node_id, 'deleted',
1792+
)
1793+
self.assertFalse(mock_remove_instance_info.called)
1794+
self.assertFalse(mock_cleanup_deploy.called)
1795+
1796+
@mock.patch.object(ironic_driver.IronicDriver,
1797+
'_remove_instance_info_from_node')
1798+
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
1799+
def test_destroy_unknown_state_unprovision(self, mock_cleanup_deploy,
1800+
mock_remove_instance_info):
1801+
"""Test that unknown provision states trigger unprovisioning.
1802+
1803+
This is a regression test for bug 2131960. As a safety mechanism,
1804+
nodes in provision states that are not recognized (not in
1805+
PROVISION_STATE_LIST) should trigger unprovisioning rather than
1806+
cleanup to ensure proper node teardown even when encountering
1807+
unexpected or future Ironic states.
1808+
"""
1809+
# Test with a completely unknown state that doesn't exist in
1810+
# PROVISION_STATE_LIST
1811+
node_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
1812+
network_info = 'foo'
1813+
unknown_state = 'unknown-future-state'
1814+
1815+
node = _get_cached_node(
1816+
driver='fake', id=node_id, provision_state=unknown_state)
1817+
instance = fake_instance.fake_instance_obj(self.ctx, node=node_id)
1818+
1819+
def fake_set_node_provision_state(*_):
1820+
node.provision_state = None
1821+
1822+
self.mock_conn.nodes.return_value = iter([node])
1823+
self.mock_conn.set_node_provision_state.side_effect = \
1824+
fake_set_node_provision_state
1825+
self.driver.destroy(self.ctx, instance, network_info, None)
1826+
1827+
# Unknown states should trigger unprovisioning for safety
1828+
self.mock_conn.set_node_provision_state.assert_called_once_with(
1829+
node_id, 'deleted',
1830+
)
1831+
self.assertFalse(mock_remove_instance_info.called)
1832+
self.assertFalse(mock_cleanup_deploy.called)
1833+
17491834
@mock.patch.object(ironic_driver.IronicDriver,
17501835
'_validate_instance_and_node')
17511836
@mock.patch.object(ironic_driver.IronicDriver,

nova/virt/ironic/driver.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,12 @@
7171

7272
_UNPROVISION_STATES = (ironic_states.ACTIVE, ironic_states.DEPLOYFAIL,
7373
ironic_states.ERROR, ironic_states.DEPLOYWAIT,
74-
ironic_states.DEPLOYING, ironic_states.RESCUE,
75-
ironic_states.RESCUING, ironic_states.RESCUEWAIT,
76-
ironic_states.RESCUEFAIL, ironic_states.UNRESCUING,
77-
ironic_states.UNRESCUEFAIL)
74+
ironic_states.DEPLOYING, ironic_states.DEPLOYHOLD,
75+
ironic_states.RESCUE, ironic_states.RESCUING,
76+
ironic_states.RESCUEWAIT, ironic_states.RESCUEFAIL,
77+
ironic_states.UNRESCUING, ironic_states.UNRESCUEFAIL,
78+
ironic_states.SERVICING, ironic_states.SERVICEWAIT,
79+
ironic_states.SERVICEFAIL, ironic_states.SERVICEHOLD)
7880

7981
_NODE_FIELDS = ('uuid', 'power_state', 'target_power_state', 'provision_state',
8082
'target_provision_state', 'last_error', 'maintenance',
@@ -1370,13 +1372,18 @@ def destroy(self, context, instance, network_info,
13701372
# without raising any exceptions.
13711373
return
13721374

1373-
if node.provision_state in _UNPROVISION_STATES:
1375+
if (node.provision_state in _UNPROVISION_STATES or
1376+
node.provision_state not in ironic_states.PROVISION_STATE_LIST):
13741377
# NOTE(mgoddard): Ironic's node tear-down procedure includes all of
13751378
# the things we do in _cleanup_deploy, so let's not repeat them
13761379
# here. Doing so would also race with the node cleaning process,
13771380
# which may acquire the node lock and prevent us from making
13781381
# changes to the node. See
13791382
# https://bugs.launchpad.net/nova/+bug/2019977
1383+
# NOTE(JayF): For maximum safety, we call unprovision also in cases
1384+
# where we see an unrecognized state. See
1385+
# https://bugs.launchpad.net/nova/+bug/2131960 for more
1386+
# information.
13801387
self._unprovision(instance, node)
13811388
else:
13821389
self._cleanup_deploy(node, instance, network_info)

nova/virt/ironic/ironic_states.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@
110110
CLEANFAIL = 'clean failed'
111111
""" Node failed cleaning. This requires operator intervention to resolve. """
112112

113+
CLEANHOLD = 'clean hold'
114+
""" Node is being held by a cleaning step. """
115+
113116
ERROR = 'error'
114117
""" An error occurred during node processing.
115118
@@ -131,6 +134,8 @@
131134
INSPECTFAIL = 'inspect failed'
132135
""" Node inspection failed. """
133136

137+
INSPECTWAIT = 'inspect wait'
138+
""" Node is waiting for inspection callback. """
134139

135140
RESCUE = 'rescue'
136141
""" Node is in rescue mode.
@@ -158,6 +163,30 @@
158163
UNRESCUING = "unrescuing"
159164
""" Node is unrescuing. """
160165

166+
DEPLOYHOLD = 'deploy hold'
167+
""" Node is being held by a deploy step. """
168+
169+
SERVICING = 'servicing'
170+
""" Node is actively being changed by a service step. """
171+
172+
SERVICEWAIT = 'service wait'
173+
""" Node is waiting for an operation to complete. """
174+
175+
SERVICEFAIL = 'service failed'
176+
""" Node has failed in a service step execution. """
177+
178+
SERVICEHOLD = 'service hold'
179+
""" Node is being held for direct intervention from a service step. """
180+
181+
ENROLL = 'enroll'
182+
""" Node being entrolled into Ironic.
183+
Nova should never see a node in this state."""
184+
185+
VERIFYING = 'verifying'
186+
""" Node driver attributes being verified.
187+
Nova should never see a node in this state."""
188+
189+
161190
##############
162191
# Power states
163192
##############
@@ -176,7 +205,10 @@
176205
##################
177206

178207
PROVISION_STATE_LIST = (NOSTATE, MANAGEABLE, AVAILABLE, ACTIVE, DEPLOYWAIT,
179-
DEPLOYING, DEPLOYFAIL, DEPLOYDONE, DELETING, DELETED,
180-
CLEANING, CLEANWAIT, CLEANFAIL, ERROR, REBUILD,
181-
INSPECTING, INSPECTFAIL)
208+
DEPLOYING, DEPLOYFAIL, DEPLOYDONE, DEPLOYHOLD,
209+
DELETING, DELETED, CLEANING, CLEANWAIT, CLEANFAIL,
210+
CLEANHOLD, ERROR, REBUILD, INSPECTING, INSPECTFAIL,
211+
INSPECTWAIT, RESCUE, RESCUEFAIL, RESCUEWAIT, RESCUING,
212+
SERVICING, SERVICEWAIT, SERVICEFAIL, SERVICEHOLD,
213+
UNRESCUEFAIL, UNRESCUING, ENROLL, VERIFYING)
182214
""" A list of all provision states. """
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
---
2+
security:
3+
- |
4+
[`bug 2131960 <https://bugs.launchpad.net/nova/+bug/2131960>`_]
5+
Fixed improper teardown of Ironic nodes in servicing states.
6+
Previously, when Nova's virt driver called ``destroy()`` on Ironic
7+
nodes in certain states (``deploy hold``, ``servicing``, ``service wait``,
8+
``service hold``, ``service failed``), these nodes would fail to undergo
9+
proper teardown, resulting in orphaned instances where instance metadata
10+
was removed but nodes remained unprovisioned in Ironic. In flat networking
11+
configurations, this could allow nodes traversing servicing states to
12+
briefly access tenant networks. In Neutron networking configurations,
13+
while VIFs would detach correctly, nodes would remain in Ironic with a
14+
shutdown IPA, and future deprovisioning attempts would fail due to missing
15+
networking metadata. These states are now properly handled to ensure
16+
complete node deprovisioning.
17+
fixes:
18+
- |
19+
[`bug 2131960 <https://bugs.launchpad.net/nova/+bug/2131960>`_]
20+
Fixed a bug where Ironic nodes in servicing-related provision states
21+
(``deploy hold``, ``servicing``, ``service wait``, ``service hold``,
22+
``service failed``) were not properly unprovisioned when instances were
23+
destroyed. The Nova Ironic virt driver now correctly triggers
24+
unprovisioning for nodes in these states, preventing orphaned node
25+
configurations in Ironic.

0 commit comments

Comments
 (0)