Skip to content

Synchronise wallaby with upstream #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions ironic_python_agent/ironic_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import time

from oslo_config import cfg
from oslo_log import log
from oslo_serialization import jsonutils
Expand Down Expand Up @@ -44,6 +46,7 @@ class APIClient(object):
heartbeat_api = '/%s/heartbeat/{uuid}' % api_version
_ironic_api_version = None
agent_token = None
lookup_lock_pause = 0

def __init__(self, api_url):
self.api_url = api_url.rstrip('/')
Expand Down Expand Up @@ -175,7 +178,7 @@ def heartbeat(self, uuid, advertise_address, advertise_protocol='http',
raise errors.HeartbeatError(error)

def lookup_node(self, hardware_info, timeout, starting_interval,
node_uuid=None, max_interval=30):
node_uuid=None, max_interval=60):
retry = tenacity.retry(
retry=tenacity.retry_if_result(lambda r: r is False),
stop=tenacity.stop_after_delay(timeout),
Expand Down Expand Up @@ -243,10 +246,31 @@ def _do_lookup(self, hardware_info, node_uuid):
LOG.error(exc_msg)
raise errors.LookupNodeError(msg)

if response.status_code == requests.codes.CONFLICT:
if self.lookup_lock_pause == 0:
self.lookup_lock_pause = 5
elif self.lookup_lock_pause == 5:
self.lookup_lock_pause = 10
elif self.lookup_lock_pause == 10:
# If we're reaching this point, we've got a long held
# persistent lock, which means things can go very sideways
# or the ironic deployment is downright grumpy. Either way,
# we need to slow things down.
self.lookup_lock_pause = 30
LOG.warning(
'Ironic has responded with a conflict, signaling the '
'node is locked. We will wait %(time)s seconds before trying '
'again. %(err)s',
{'time': self.lookup_lock_pause,
'error': self._error_from_response(response)}
)
time.sleep(self.lookup_lock_pause)
return False

if response.status_code != requests.codes.OK:
LOG.warning(
'Failed looking up node with addresses %r at %s. '
'%s. Check if inspection has completed.',
'Check if inspection has completed? %s',
params['addresses'], self.api_url,
self._error_from_response(response)
)
Expand Down
27 changes: 27 additions & 0 deletions ironic_python_agent/tests/unit/test_ironic_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import json
import time
from unittest import mock

from oslo_config import cfg
Expand Down Expand Up @@ -462,6 +463,32 @@ def test_do_lookup_transient_exceptions(self, mock_log):
mock_log.error.assert_has_calls([])
self.assertEqual(1, mock_log.warning.call_count)

@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
def test_do_lookup_node_locked(self, mock_log, mock_sleep):
response = FakeResponse(status_code=409, content={})
self.api_client.session.request = mock.Mock()
self.api_client.session.request.return_value = response
self.assertEqual(0, self.api_client.lookup_lock_pause)
error = self.api_client._do_lookup(self.hardware_info,
node_uuid=None)
self.assertFalse(error)
mock_log.error.assert_has_calls([])
self.assertEqual(1, mock_log.warning.call_count)
self.assertEqual(1, mock_sleep.call_count)
self.assertEqual(5, self.api_client.lookup_lock_pause)
error = self.api_client._do_lookup(self.hardware_info,
node_uuid=None)
self.assertEqual(10, self.api_client.lookup_lock_pause)
error = self.api_client._do_lookup(self.hardware_info,
node_uuid=None)
self.assertFalse(error)
self.assertEqual(30, self.api_client.lookup_lock_pause)
error = self.api_client._do_lookup(self.hardware_info,
node_uuid=None)
self.assertFalse(error)
self.assertEqual(30, self.api_client.lookup_lock_pause)

@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
def test_do_lookup_unknown_exception(self, mock_log):
self.api_client.session.request = mock.Mock()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
fixes:
- |
Fixes, or at least lessens the case where a running Ironic agent can stack
up numerous lookup requests against an Ironic deployment when a node is
locked. In particular, this is beause the lookup also drives generation of
the agent token, which requires the conductor to allocate a worker, and
generate the token, and return the result to the API client.
Ironic's retry logic will now wait up to ``60`` seconds, and if an HTTP
Conflict (409) message is received, the agent will automatically pause
lookup operations for thirty seconds as opposed continue to attempt
lookups which could create more work for the Ironic deployment
needlessly.