Skip to content

Commit 54e5fcc

Browse files
committed
Handle the node being locked
If the node is locked, a lookup cannot be performed when an agent token needs to be generated, which tends to error like this: ironic_python_agent.ironic_api_client [-] Failed looking up node with addresses '00:6f:bb:34:b3:4d,00:6f:bb:34:b3:4b' at https://172.22.0.2:6385. Error 409: Node c25e451b-d2fb-4168-b690-f15bc8365520 is locked by host 172.22.0.2, please retry after the current operation is completed.. Check if inspection has completed. Problem is, if we keep pounding on the door, we can actually worsen the situation, and previously we would just just let tenacity retry. We will now hold for 30 seconds before proceeding, so we have hopefully allowed the operation to complete. Also fixes the error logging to help human's sanity. Change-Id: I97d3e27e2adb731794a7746737d3788c6e7977a0 (cherry picked from commit 5ed520d) (cherry picked from commit 06214d7)
1 parent 9f84c8b commit 54e5fcc

File tree

3 files changed

+66
-2
lines changed

3 files changed

+66
-2
lines changed

ironic_python_agent/ironic_api_client.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import time
16+
1517
from oslo_config import cfg
1618
from oslo_log import log
1719
from oslo_serialization import jsonutils
@@ -44,6 +46,7 @@ class APIClient(object):
4446
heartbeat_api = '/%s/heartbeat/{uuid}' % api_version
4547
_ironic_api_version = None
4648
agent_token = None
49+
lookup_lock_pause = 0
4750

4851
def __init__(self, api_url):
4952
self.api_url = api_url.rstrip('/')
@@ -175,7 +178,7 @@ def heartbeat(self, uuid, advertise_address, advertise_protocol='http',
175178
raise errors.HeartbeatError(error)
176179

177180
def lookup_node(self, hardware_info, timeout, starting_interval,
178-
node_uuid=None, max_interval=30):
181+
node_uuid=None, max_interval=60):
179182
retry = tenacity.retry(
180183
retry=tenacity.retry_if_result(lambda r: r is False),
181184
stop=tenacity.stop_after_delay(timeout),
@@ -243,10 +246,31 @@ def _do_lookup(self, hardware_info, node_uuid):
243246
LOG.error(exc_msg)
244247
raise errors.LookupNodeError(msg)
245248

249+
if response.status_code == requests.codes.CONFLICT:
250+
if self.lookup_lock_pause == 0:
251+
self.lookup_lock_pause = 5
252+
elif self.lookup_lock_pause == 5:
253+
self.lookup_lock_pause = 10
254+
elif self.lookup_lock_pause == 10:
255+
# If we're reaching this point, we've got a long held
256+
# persistent lock, which means things can go very sideways
257+
# or the ironic deployment is downright grumpy. Either way,
258+
# we need to slow things down.
259+
self.lookup_lock_pause = 30
260+
LOG.warning(
261+
'Ironic has responded with a conflict, signaling the '
262+
'node is locked. We will wait %(time)s seconds before trying '
263+
'again. %(err)s',
264+
{'time': self.lookup_lock_pause,
265+
'error': self._error_from_response(response)}
266+
)
267+
time.sleep(self.lookup_lock_pause)
268+
return False
269+
246270
if response.status_code != requests.codes.OK:
247271
LOG.warning(
248272
'Failed looking up node with addresses %r at %s. '
249-
'%s. Check if inspection has completed.',
273+
'Check if inspection has completed? %s',
250274
params['addresses'], self.api_url,
251275
self._error_from_response(response)
252276
)

ironic_python_agent/tests/unit/test_ironic_api_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import json
16+
import time
1617
from unittest import mock
1718

1819
from oslo_config import cfg
@@ -462,6 +463,32 @@ def test_do_lookup_transient_exceptions(self, mock_log):
462463
mock_log.error.assert_has_calls([])
463464
self.assertEqual(1, mock_log.warning.call_count)
464465

466+
@mock.patch.object(time, 'sleep', autospec=True)
467+
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
468+
def test_do_lookup_node_locked(self, mock_log, mock_sleep):
469+
response = FakeResponse(status_code=409, content={})
470+
self.api_client.session.request = mock.Mock()
471+
self.api_client.session.request.return_value = response
472+
self.assertEqual(0, self.api_client.lookup_lock_pause)
473+
error = self.api_client._do_lookup(self.hardware_info,
474+
node_uuid=None)
475+
self.assertFalse(error)
476+
mock_log.error.assert_has_calls([])
477+
self.assertEqual(1, mock_log.warning.call_count)
478+
self.assertEqual(1, mock_sleep.call_count)
479+
self.assertEqual(5, self.api_client.lookup_lock_pause)
480+
error = self.api_client._do_lookup(self.hardware_info,
481+
node_uuid=None)
482+
self.assertEqual(10, self.api_client.lookup_lock_pause)
483+
error = self.api_client._do_lookup(self.hardware_info,
484+
node_uuid=None)
485+
self.assertFalse(error)
486+
self.assertEqual(30, self.api_client.lookup_lock_pause)
487+
error = self.api_client._do_lookup(self.hardware_info,
488+
node_uuid=None)
489+
self.assertFalse(error)
490+
self.assertEqual(30, self.api_client.lookup_lock_pause)
491+
465492
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
466493
def test_do_lookup_unknown_exception(self, mock_log):
467494
self.api_client.session.request = mock.Mock()
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
fixes:
3+
- |
4+
Fixes, or at least lessens the case where a running Ironic agent can stack
5+
up numerous lookup requests against an Ironic deployment when a node is
6+
locked. In particular, this is beause the lookup also drives generation of
7+
the agent token, which requires the conductor to allocate a worker, and
8+
generate the token, and return the result to the API client.
9+
Ironic's retry logic will now wait up to ``60`` seconds, and if an HTTP
10+
Conflict (409) message is received, the agent will automatically pause
11+
lookup operations for thirty seconds as opposed continue to attempt
12+
lookups which could create more work for the Ironic deployment
13+
needlessly.

0 commit comments

Comments
 (0)