Skip to content

Commit f41358c

Browse files
committed
Add StackHPC Ironic tunings
1 parent 1fd7190 commit f41358c

File tree

9 files changed

+165
-0
lines changed

9 files changed

+165
-0
lines changed

doc/source/configuration/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ the various features provided.
1111
walled-garden
1212
release-train
1313
host-images
14+
ironic
1415
lvm
1516
swap
1617
cephadm

doc/source/configuration/ironic.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
======
2+
Ironic
3+
======
4+
5+
Cleaning
6+
========
7+
8+
Storage
9+
-------
10+
11+
Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is
12+
enabled by default. This is normally dependent on the `Hardware Manager
13+
<https://docs.openstack.org/ironic-python-agent/latest/contributor/hardware_managers.html>`__
14+
in use. For example, when using the GenericHardwareManager the priority would
15+
be 10, whereas if using the `ProliantHardwareManager
16+
<https://docs.openstack.org/ironic/latest/admin/drivers/ilo.html#disk-erase-support>`__
17+
it would be 0. The idea is to prevent the catastrophic case where
18+
data could be leaked to another tenant, which means you must explicitly relax
19+
this setting if that is a risk you want to take. This can be customised by
20+
editing the following variables:
21+
22+
.. code-block::
23+
:caption: $KAYOBE_CONFIG_PATH/kolla/config/ironic/ironic-conductor.conf
24+
25+
[deploy]
26+
erase_devices_priority=10
27+
erase_devices_metadata_priority=0
28+
29+
See `Ironic documentation
30+
<https://docs.openstack.org/ironic/latest/admin/cleaning.html>`__ for more
31+
details.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[DEFAULT]
2+
timeout = 0
3+
{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
4+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
5+
# generic switch driver, which synchronously applies switch configuration for
6+
# each ironic port during node provisioning and tear down.
7+
# The specific API calls that require this long timeout are:
8+
# - Creation and deletion of VLAN networks.
9+
# - Creation or update of ports, adding binding information.
10+
# - Update of ports, removing binding information.
11+
# - Deletion of ports.
12+
rpc_response_timeout = 360
13+
{% endif %}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[DEFAULT]
2+
# Avoid some timeouts of heartbeats and vif deletes
3+
rpc_response_timeout = 360
4+
5+
[neutron]
6+
timeout = 300
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
[DEFAULT]
2+
# Make direct deploy faster, transfer sparse qcow2 images
3+
force_raw_images = False
4+
# Avoid some rpc timeouts
5+
rpc_response_timeout = 360
6+
7+
[conductor]
8+
automated_clean=true
9+
# We have busy conductors failing to heartbeat
10+
# Default is 10 secs
11+
heartbeat_interval = 30
12+
# Default is 60 seconds
13+
heartbeat_timeout = 360
14+
sync_local_state_interval = 360
15+
16+
# Normally this is 100. We see eventlet threads
17+
# not making much progress, so for safety reduce
18+
# this by half, which should leave work on the rabbit queue
19+
workers_pool_size = 50
20+
# Normally this is 8, keep it same
21+
period_max_workers = 8
22+
23+
# Increase power sync interval to reduce load
24+
sync_power_state_interval = 120
25+
power_failure_recovery_interval = 120
26+
# Check for orphaned allocations every 120 seconds (0 would disable the check)
27+
check_allocations_interval = 120
28+
29+
# Wait much longer before provision timeout check, to reduce background load
30+
# The default is 60 seconds
31+
check_provision_state_interval = 120
32+
check_rescue_state_interval = 120
33+
34+
[database]
35+
# Usually this is 50, reduce to stop DB connection timeouts
36+
# and instead just make eventlet threads wait a bit longer
37+
max_overflow = 5
38+
# By default this is 30 seconds, but as we reduce
39+
# the pool overflow, some people will need to wait longer
40+
pool_timeout = 60
41+
42+
[deploy]
43+
# Force Hardware assisted secure erase by default.
44+
erase_devices_priority=10
45+
erase_devices_metadata_priority=0
46+
47+
[pxe]
48+
# Increase cache size to 120GiB (122880 MiB) and TTL to 70 days (100800 minutes)
49+
image_cache_size = 122880
50+
image_cache_ttl = 100800
51+
52+
[neutron]
53+
# Increase the neutron client timeout to allow for the slow management
54+
# switches.
55+
timeout = 300
56+
request_timeout = 300
57+
58+
[glance]
59+
# Retry image download at least once on failure
60+
num_retries = 1

etc/kayobe/kolla/config/neutron.conf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[DEFAULT]
2+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
3+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
4+
# generic switch driver, which synchronously applies switch configuration for
5+
# each ironic port during node provisioning and tear down.
6+
# The specific API calls that require this long timeout are:
7+
# - Creation and deletion of VLAN networks.
8+
# - Creation or update of ports, adding binding information.
9+
# - Update of ports, removing binding information.
10+
# - Deletion of ports.
11+
rpc_response_timeout = 360
12+
{% endif %}

etc/kayobe/kolla/config/nova.conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,15 @@
1+
[DEFAULT]
2+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
3+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
4+
# generic switch driver, which synchronously applies switch configuration for
5+
# each ironic port during node provisioning and tear down.
6+
# The specific API calls that require this long timeout are:
7+
# - Creation and deletion of VLAN networks.
8+
# - Creation or update of ports, adding binding information.
9+
# - Update of ports, removing binding information.
10+
# - Deletion of ports.
11+
rpc_response_timeout = 360
12+
{% endif %}
13+
114
[libvirt]
215
hw_machine_type = x86_64=q35

etc/kayobe/kolla/config/nova/nova-compute-ironic.conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,16 @@
22
[DEFAULT]
33
host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }}
44
{% endif %}
5+
# Raise the limit on concurrent builds for the nova ironic compute service
6+
# above the default; a value of 0 would remove the limit entirely.
7+
max_concurrent_builds = 35
8+
9+
force_config_drive = True
10+
11+
[ironic]
12+
# Ramp up maximum retries to allow time for baremetal node reboot and switch configs
13+
api_max_retries = 720
14+
15+
[compute]
16+
# Don't disable the compute service due to failed builds.
17+
consecutive_build_service_disable_threshold = 0

etc/kayobe/kolla/globals.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,19 @@ prometheus_instance_label: "{% raw %}{{ ansible_facts.hostname }}{% endraw %}"
5353
# in Yoga. This is required to include a valid value for the flavor_id label on
5454
# openstack_nova_server_status metrics.
5555
prometheus_openstack_exporter_compute_api_version: "2.1"
56+
57+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
58+
# NOTE: We are increasing the HAProxy timeouts to 5 minutes due to the neutron
59+
# generic switch driver, which synchronously applies switch configuration for
60+
# each ironic port during node provisioning and tear down.
61+
# The specific API calls that require this long timeout are:
62+
# - Creation and deletion of VLAN networks.
63+
# - Creation or update of ports, adding binding information.
64+
# - Update of ports, removing binding information.
65+
# - Deletion of ports.
66+
haproxy_client_timeout: 5m30
67+
haproxy_server_timeout: 5m30
68+
# If using Neutron backend TLS:
69+
neutron_tls_proxy_client_timeout: 5m30
70+
neutron_tls_proxy_server_timeout: 5m30
71+
{% endif %}

0 commit comments

Comments
 (0)