Skip to content

Commit 5ed520d

Browse files
committed
Handle the node being locked
If the node is locked, a lookup cannot be performed when an agent token needs to be generated, which tends to error like this: ironic_python_agent.ironic_api_client [-] Failed looking up node with addresses '00:6f:bb:34:b3:4d,00:6f:bb:34:b3:4b' at https://172.22.0.2:6385. Error 409: Node c25e451b-d2fb-4168-b690-f15bc8365520 is locked by host 172.22.0.2, please retry after the current operation is completed.. Check if inspection has completed. Problem is, if we keep pounding on the door, we can actually worsen the situation, and previously we would just just let tenacity retry. We will now hold for 30 seconds before proceeding, so we have hopefully allowed the operation to complete. Also fixes the error logging to help human's sanity. Change-Id: I97d3e27e2adb731794a7746737d3788c6e7977a0
1 parent b68a4c8 commit 5ed520d

File tree

3 files changed

+65
-2
lines changed

3 files changed

+65
-2
lines changed

ironic_python_agent/ironic_api_client.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import json
16+
import time
1617

1718
from oslo_config import cfg
1819
from oslo_log import log
@@ -45,6 +46,7 @@ class APIClient(object):
4546
heartbeat_api = '/%s/heartbeat/{uuid}' % api_version
4647
_ironic_api_version = None
4748
agent_token = None
49+
lookup_lock_pause = 0
4850

4951
def __init__(self, api_url):
5052
self.api_url = api_url.rstrip('/')
@@ -177,7 +179,7 @@ def heartbeat(self, uuid, advertise_address, advertise_protocol='http',
177179
raise errors.HeartbeatError(error)
178180

179181
def lookup_node(self, hardware_info, timeout, starting_interval,
180-
node_uuid=None, max_interval=30):
182+
node_uuid=None, max_interval=60):
181183
retry = tenacity.retry(
182184
retry=tenacity.retry_if_result(lambda r: r is False),
183185
stop=tenacity.stop_after_delay(timeout),
@@ -245,10 +247,31 @@ def _do_lookup(self, hardware_info, node_uuid):
245247
LOG.error(exc_msg)
246248
raise errors.LookupNodeError(msg)
247249

250+
if response.status_code == requests.codes.CONFLICT:
251+
if self.lookup_lock_pause == 0:
252+
self.lookup_lock_pause = 5
253+
elif self.lookup_lock_pause == 5:
254+
self.lookup_lock_pause = 10
255+
elif self.lookup_lock_pause == 10:
256+
# If we're reaching this point, we've got a long held
257+
# persistent lock, which means things can go very sideways
258+
# or the ironic deployment is downright grumpy. Either way,
259+
# we need to slow things down.
260+
self.lookup_lock_pause = 30
261+
LOG.warning(
262+
'Ironic has responded with a conflict, signaling the '
263+
'node is locked. We will wait %(time)s seconds before trying '
264+
'again. %(err)s',
265+
{'time': self.lookup_lock_pause,
266+
'error': self._error_from_response(response)}
267+
)
268+
time.sleep(self.lookup_lock_pause)
269+
return False
270+
248271
if response.status_code != requests.codes.OK:
249272
LOG.warning(
250273
'Failed looking up node with addresses %r at %s. '
251-
'%s. Check if inspection has completed.',
274+
'Check if inspection has completed? %s',
252275
params['addresses'], self.api_url,
253276
self._error_from_response(response)
254277
)

ironic_python_agent/tests/unit/test_ironic_api_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import json
16+
import time
1617
from unittest import mock
1718

1819
from oslo_config import cfg
@@ -461,6 +462,32 @@ def test_do_lookup_transient_exceptions(self, mock_log):
461462
mock_log.error.assert_has_calls([])
462463
self.assertEqual(1, mock_log.warning.call_count)
463464

465+
@mock.patch.object(time, 'sleep', autospec=True)
466+
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
467+
def test_do_lookup_node_locked(self, mock_log, mock_sleep):
468+
response = FakeResponse(status_code=409, content={})
469+
self.api_client.session.request = mock.Mock()
470+
self.api_client.session.request.return_value = response
471+
self.assertEqual(0, self.api_client.lookup_lock_pause)
472+
error = self.api_client._do_lookup(self.hardware_info,
473+
node_uuid=None)
474+
self.assertFalse(error)
475+
mock_log.error.assert_has_calls([])
476+
self.assertEqual(1, mock_log.warning.call_count)
477+
self.assertEqual(1, mock_sleep.call_count)
478+
self.assertEqual(5, self.api_client.lookup_lock_pause)
479+
error = self.api_client._do_lookup(self.hardware_info,
480+
node_uuid=None)
481+
self.assertEqual(10, self.api_client.lookup_lock_pause)
482+
error = self.api_client._do_lookup(self.hardware_info,
483+
node_uuid=None)
484+
self.assertFalse(error)
485+
self.assertEqual(30, self.api_client.lookup_lock_pause)
486+
error = self.api_client._do_lookup(self.hardware_info,
487+
node_uuid=None)
488+
self.assertFalse(error)
489+
self.assertEqual(30, self.api_client.lookup_lock_pause)
490+
464491
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
465492
def test_do_lookup_unknown_exception(self, mock_log):
466493
self.api_client.session.request = mock.Mock()
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
fixes:
3+
- |
4+
Fixes, or at least lessens the case where a running Ironic agent can stack
5+
up numerous lookup requests against an Ironic deployment when a node is
6+
locked. In particular, this is beause the lookup also drives generation of
7+
the agent token, which requires the conductor to allocate a worker, and
8+
generate the token, and return the result to the API client.
9+
Ironic's retry logic will now wait up to ``60`` seconds, and if an HTTP
10+
Conflict (409) message is received, the agent will automatically pause
11+
lookup operations for thirty seconds as opposed continue to attempt
12+
lookups which could create more work for the Ironic deployment
13+
needlessly.

0 commit comments

Comments
 (0)