Skip to content

Commit 02b08ff

Browse files
authored
Merge pull request #65 from stackhpc/upstream/xena-2024-02-12
Synchronise xena with upstream
2 parents 105ac36 + 5acb4d6 commit 02b08ff

File tree

3 files changed

+66
-2
lines changed

3 files changed

+66
-2
lines changed

ironic_python_agent/ironic_api_client.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import time
16+
1517
from oslo_config import cfg
1618
from oslo_log import log
1719
from oslo_serialization import jsonutils
@@ -44,6 +46,7 @@ class APIClient(object):
4446
heartbeat_api = '/%s/heartbeat/{uuid}' % api_version
4547
_ironic_api_version = None
4648
agent_token = None
49+
lookup_lock_pause = 0
4750

4851
def __init__(self, api_url):
4952
self.api_url = api_url.rstrip('/')
@@ -175,7 +178,7 @@ def heartbeat(self, uuid, advertise_address, advertise_protocol='http',
175178
raise errors.HeartbeatError(error)
176179

177180
def lookup_node(self, hardware_info, timeout, starting_interval,
178-
node_uuid=None, max_interval=30):
181+
node_uuid=None, max_interval=60):
179182
retry = tenacity.retry(
180183
retry=tenacity.retry_if_result(lambda r: r is False),
181184
stop=tenacity.stop_after_delay(timeout),
@@ -243,10 +246,31 @@ def _do_lookup(self, hardware_info, node_uuid):
243246
LOG.error(exc_msg)
244247
raise errors.LookupNodeError(msg)
245248

249+
if response.status_code == requests.codes.CONFLICT:
250+
if self.lookup_lock_pause == 0:
251+
self.lookup_lock_pause = 5
252+
elif self.lookup_lock_pause == 5:
253+
self.lookup_lock_pause = 10
254+
elif self.lookup_lock_pause == 10:
255+
# If we're reaching this point, we've got a long held
256+
# persistent lock, which means things can go very sideways
257+
# or the ironic deployment is downright grumpy. Either way,
258+
# we need to slow things down.
259+
self.lookup_lock_pause = 30
260+
LOG.warning(
261+
'Ironic has responded with a conflict, signaling the '
262+
'node is locked. We will wait %(time)s seconds before trying '
263+
'again. %(err)s',
264+
{'time': self.lookup_lock_pause,
265+
'error': self._error_from_response(response)}
266+
)
267+
time.sleep(self.lookup_lock_pause)
268+
return False
269+
246270
if response.status_code != requests.codes.OK:
247271
LOG.warning(
248272
'Failed looking up node with addresses %r at %s. '
249-
'%s. Check if inspection has completed.',
273+
'Check if inspection has completed? %s',
250274
params['addresses'], self.api_url,
251275
self._error_from_response(response)
252276
)

ironic_python_agent/tests/unit/test_ironic_api_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import json
16+
import time
1617
from unittest import mock
1718

1819
from oslo_config import cfg
@@ -462,6 +463,32 @@ def test_do_lookup_transient_exceptions(self, mock_log):
462463
mock_log.error.assert_has_calls([])
463464
self.assertEqual(1, mock_log.warning.call_count)
464465

466+
@mock.patch.object(time, 'sleep', autospec=True)
467+
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
468+
def test_do_lookup_node_locked(self, mock_log, mock_sleep):
469+
response = FakeResponse(status_code=409, content={})
470+
self.api_client.session.request = mock.Mock()
471+
self.api_client.session.request.return_value = response
472+
self.assertEqual(0, self.api_client.lookup_lock_pause)
473+
error = self.api_client._do_lookup(self.hardware_info,
474+
node_uuid=None)
475+
self.assertFalse(error)
476+
mock_log.error.assert_has_calls([])
477+
self.assertEqual(1, mock_log.warning.call_count)
478+
self.assertEqual(1, mock_sleep.call_count)
479+
self.assertEqual(5, self.api_client.lookup_lock_pause)
480+
error = self.api_client._do_lookup(self.hardware_info,
481+
node_uuid=None)
482+
self.assertEqual(10, self.api_client.lookup_lock_pause)
483+
error = self.api_client._do_lookup(self.hardware_info,
484+
node_uuid=None)
485+
self.assertFalse(error)
486+
self.assertEqual(30, self.api_client.lookup_lock_pause)
487+
error = self.api_client._do_lookup(self.hardware_info,
488+
node_uuid=None)
489+
self.assertFalse(error)
490+
self.assertEqual(30, self.api_client.lookup_lock_pause)
491+
465492
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
466493
def test_do_lookup_unknown_exception(self, mock_log):
467494
self.api_client.session.request = mock.Mock()
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
fixes:
3+
- |
4+
Fixes, or at least lessens the case where a running Ironic agent can stack
5+
up numerous lookup requests against an Ironic deployment when a node is
6+
locked. In particular, this is beause the lookup also drives generation of
7+
the agent token, which requires the conductor to allocate a worker, and
8+
generate the token, and return the result to the API client.
9+
Ironic's retry logic will now wait up to ``60`` seconds, and if an HTTP
10+
Conflict (409) message is received, the agent will automatically pause
11+
lookup operations for thirty seconds as opposed continue to attempt
12+
lookups which could create more work for the Ironic deployment
13+
needlessly.

0 commit comments

Comments
 (0)