Skip to content

Commit 0bcb667

Browse files
committed
Wipe agent token on inspection completion to fix cleaning failures
The backport of f734efb ("Set node alive when inspection finished") causes a regression in 4.15 where ironic-inspector is still used. The patch makes nodes appear fast-trackable after inspection when the agent is dead and boot media is detached, causing the AgentConnectionFailed handler to park nodes in clean wait indefinitely. This happens because 4.15 uses ironic-inspector, which powers off the node after inspection. The f734efb fix was designed for 4.16 where built-in agent inspection keeps IPA running across the inspection-to-cleaning transition. Revert 0feaa17 and instead fix the root cause: the agent_secret_token persists from inspection into cleaning. When IPA reboots for cleaning, it cannot get a new token because Ironic refuses to generate one when one already exists, causing repeated lookup loops without heartbeats until clean_callback_timeout fires. The fix adds wipe_token_and_url() when inspection finishes successfully, both in the synchronous path (inspect_hardware) and the ironic-inspector callback path (continue_inspection). This follows the same pattern already used on inspection start and abort.
1 parent 42f3342 commit 0bcb667

9 files changed

Lines changed: 25 additions & 47 deletions

File tree

ironic/conductor/cleaning.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,6 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False):
9898
'out-of-band only cleaning has been requested for node '
9999
'%s', node.uuid)
100100
prepare_result = None
101-
except exception.AgentConnectionFailed:
102-
LOG.info('Agent is not yet running on node %(node)s, waiting for'
103-
' agent to come up for fast track', {'node': node.uuid})
104-
target_state = states.MANAGEABLE if manual_clean else None
105-
task.process_event('wait', target_state=target_state)
106-
return
107101
except Exception as e:
108102
msg = (_('Failed to prepare node %(node)s for cleaning: %(e)s')
109103
% {'node': node.uuid, 'e': e})

ironic/conductor/deployments.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,6 @@ def do_node_deploy(task, conductor_id=None, configdrive=None,
180180

181181
try:
182182
task.driver.deploy.prepare(task)
183-
except exception.AgentConnectionFailed:
184-
LOG.info('Agent is not yet running on node %(node)s, waiting for agent'
185-
' to come up for fast track', {'node': node.uuid})
186-
task.process_event('wait')
187-
return
188183
except exception.IronicException as e:
189184
with excutils.save_and_reraise_exception():
190185
utils.deploying_error_handler(

ironic/conductor/inspection.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def handle_failure(e, log_func=LOG.error):
6161
raise exception.HardwareInspectionFailure(error=error)
6262

6363
if new_state == states.MANAGEABLE:
64+
utils.wipe_token_and_url(task)
6465
task.process_event('done')
6566
LOG.info('Successfully inspected node %(node)s',
6667
{'node': node.uuid})
@@ -151,5 +152,6 @@ def continue_inspection(task, inventory, plugin_data):
151152
error=True)
152153
task.process_event('fail')
153154

155+
utils.wipe_token_and_url(task)
154156
task.process_event('done')
155157
LOG.info('Successfully finished inspection of node %s', node.uuid)

ironic/conductor/utils.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,18 +1119,13 @@ def fast_track_able(task):
11191119
def value_within_timeout(value, timeout):
11201120
"""Checks if the time is within the previous timeout seconds from now.
11211121
1122-
:param value: a datetime or string representing date and time or None.
1122+
:param value: a string representing date and time or None.
11231123
:param timeout: timeout in seconds.
11241124
"""
11251125
# use native datetime objects for conversion and compare
11261126
# slightly odd because py2 compatability :(
1127-
if isinstance(value, datetime.datetime):
1128-
# Converts to a offset-naive datetime(as created by timeutils.utcnow())
1129-
last = value.replace(tzinfo=None)
1130-
else:
1131-
defaultdt = '1970-01-01T00:00:00.000000'
1132-
last = datetime.datetime.strptime(value or defaultdt,
1133-
'%Y-%m-%dT%H:%M:%S.%f')
1127+
last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
1128+
"%Y-%m-%dT%H:%M:%S.%f")
11341129
# If we found nothing, we assume that the time is essentially epoch.
11351130
time_delta = datetime.timedelta(seconds=timeout)
11361131
last_valid = timeutils.utcnow() - time_delta
@@ -1147,20 +1142,14 @@ def agent_is_alive(node, timeout=None):
11471142
:param node: A node object.
11481143
:param timeout: Heartbeat timeout, defaults to `fast_track_timeout`.
11491144
"""
1150-
1151-
timeout = timeout or CONF.deploy.fast_track_timeout
1152-
if node.power_state == states.POWER_ON and \
1153-
node.inspection_finished_at and \
1154-
value_within_timeout(node.inspection_finished_at, timeout):
1155-
return True
1156-
11571145
# If no agent_url is present then we have powered down since the
11581146
# last agent heartbeat
11591147
if not node.driver_internal_info.get('agent_url'):
11601148
return False
11611149

11621150
return value_within_timeout(
1163-
node.driver_internal_info.get('agent_last_heartbeat'), timeout)
1151+
node.driver_internal_info.get('agent_last_heartbeat'),
1152+
timeout or CONF.deploy.fast_track_timeout)
11641153

11651154

11661155
def is_fast_track(task):

ironic/drivers/modules/inspector/interface.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ def clean_up(task, finish=True):
366366
msg = _('Inspection clean up failed: %s') % errors
367367
inspection_error_handler(task, msg, raise_exc=False, clean_up=False)
368368
elif finish:
369+
cond_utils.wipe_token_and_url(task)
369370
LOG.info('Inspection finished successfully for node %s',
370371
task.node.uuid)
371372
task.process_event('done')

ironic/tests/unit/conductor/test_inspection.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ class TestContinueInspection(db_base.DbTestCase):
127127
def setUp(self):
128128
super().setUp()
129129
self.node = obj_utils.create_test_node(
130-
self.context, provision_state=states.INSPECTING)
130+
self.context, provision_state=states.INSPECTING,
131+
driver_internal_info={'agent_url': 'url',
132+
'agent_secret_token': 'token'})
131133
self.inventory = {"test": "inventory"}
132134
self.plugin_data = {"plugin": "data", "logs": "delete me"}
133135

@@ -144,6 +146,9 @@ def test_ok(self, mock_store, mock_continue):
144146
self.assertNotIn("logs", self.plugin_data)
145147
self.node.refresh()
146148
self.assertEqual(states.MANAGEABLE, self.node.provision_state)
149+
self.assertNotIn('agent_url', self.node.driver_internal_info)
150+
self.assertNotIn('agent_secret_token',
151+
self.node.driver_internal_info)
147152

148153
@mock.patch.object(inspect_utils, 'store_inspection_data', autospec=True)
149154
def test_ok_asynchronous(self, mock_store, mock_continue):

ironic/tests/unit/conductor/test_utils.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2184,18 +2184,6 @@ def test_is_fast_track_no_heartbeat(self, mock_get_power):
21842184
self.context, self.node.uuid, shared=False) as task:
21852185
self.assertFalse(conductor_utils.is_fast_track(task))
21862186

2187-
def test_is_fast_track_inspected_no_heartbeat(self, mock_get_power):
2188-
mock_get_power.return_value = states.POWER_ON
2189-
self.node = obj_utils.create_test_node(
2190-
self.context, driver='fake-hardware',
2191-
uuid=uuidutils.generate_uuid(),
2192-
inspection_finished_at=timeutils.utcnow(),
2193-
power_state=states.POWER_ON
2194-
)
2195-
with task_manager.acquire(
2196-
self.context, self.node.uuid, shared=False) as task:
2197-
self.assertTrue(conductor_utils.is_fast_track(task))
2198-
21992187
def test_is_fast_track_powered_after_heartbeat(self, mock_get_power):
22002188
mock_get_power.return_value = states.POWER_ON
22012189
with task_manager.acquire(

ironic/tests/unit/drivers/modules/inspector/test_interface.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,9 @@ def test_exception_ignored(self, mock_client):
364364
self.assertFalse(self.task.process_event.called)
365365

366366
def test_status_ok(self, mock_client):
367+
self.node.set_driver_internal_info('agent_url', 'url')
368+
self.node.set_driver_internal_info('agent_secret_token', 'token')
369+
self.node.save()
367370
mock_get = mock_client.return_value.get_introspection
368371
mock_get.return_value = mock.Mock(is_finished=True,
369372
error=None,
@@ -374,10 +377,15 @@ def test_status_ok(self, mock_client):
374377
self.assertFalse(self.driver.network.remove_inspection_network.called)
375378
self.assertFalse(self.driver.boot.clean_up_ramdisk.called)
376379
self.assertFalse(self.driver.power.set_power_state.called)
380+
self.assertNotIn('agent_url', self.node.driver_internal_info)
381+
self.assertNotIn('agent_secret_token',
382+
self.node.driver_internal_info)
377383

378384
def test_status_ok_managed(self, mock_client):
379385
utils.set_node_nested_field(self.node, 'driver_internal_info',
380386
'inspector_manage_boot', True)
387+
self.node.set_driver_internal_info('agent_url', 'url')
388+
self.node.set_driver_internal_info('agent_secret_token', 'token')
381389
self.node.save()
382390
mock_get = mock_client.return_value.get_introspection
383391
mock_get.return_value = mock.Mock(is_finished=True,
@@ -391,6 +399,9 @@ def test_status_ok_managed(self, mock_client):
391399
self.driver.boot.clean_up_ramdisk.assert_called_once_with(self.task)
392400
self.driver.power.set_power_state.assert_called_once_with(
393401
self.task, 'power off', timeout=None)
402+
self.assertNotIn('agent_url', self.node.driver_internal_info)
403+
self.assertNotIn('agent_secret_token',
404+
self.node.driver_internal_info)
394405

395406
def test_status_ok_managed_no_power_off(self, mock_client):
396407
CONF.set_override('power_off', False, group='inspector')

releasenotes/notes/set-node-alive-when-inspection-finished-1ec74828852eaeef.yaml

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)