Skip to content

Commit 4f90796

Browse files
committed
Add StackHPC Ironic tunings
1 parent 4effb00 commit 4f90796

9 files changed

Lines changed: 179 additions & 1 deletion

File tree

doc/source/configuration/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ the various features provided.
1010

1111
release-train
1212
host-images
13+
ironic
1314
lvm
1415
cephadm
1516
monitoring
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
======
2+
Ironic
3+
======
4+
5+
Cleaning
6+
========
7+
8+
Storage
9+
-------
10+
11+
Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is
12+
enabled by default. This is normally dependent on the `Hardware Manager
13+
<https://docs.openstack.org/ironic-python-agent/latest/contributor/hardware_managers.html>`__
14+
in use. For example, when using the GenericHardwareManager the priority would
15+
be 10, whereas if using the `ProliantHardwareManager
16+
<https://docs.openstack.org/ironic/latest/admin/drivers/ilo.html#disk-erase-support>`__
17+
it would be 0. The idea is that we will prevent the catastrophic case where
18+
data could be leaked to another tenant, forcing you to explicitly relax
19+
this setting if this is a risk you want to take. This can be customised by
20+
editing the following variables:
21+
22+
.. code-block::
23+
:caption: $KAYOBE_CONFIG_PATH/kolla/config/ironic/ironic-conductor.conf
24+
25+
[deploy]
26+
erase_devices_priority=10
27+
erase_devices_metadata_priority=0
28+
29+
See `Ironic documentation
30+
<https://docs.openstack.org/ironic/latest/admin/cleaning.html>`__ for more
31+
details.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[DEFAULT]
2+
timeout = 0
3+
{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
4+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
5+
# generic switch driver, which synchronously applies switch configuration for
6+
# each ironic port during node provisioning and tear down.
7+
# The specific API calls that require this long timeout are:
8+
# - Creation and deletion of VLAN networks.
9+
# - Creation or update of ports, adding binding information.
10+
# - Update of ports, removing binding information.
11+
# - Deletion of ports.
12+
rpc_response_timeout = 360
13+
{% endif %}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[DEFAULT]
2+
# Avoid some timeouts of heartbeats and vif deletes
3+
rpc_response_timeout = 360
4+
5+
[neutron]
6+
timeout = 300
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
[DEFAULT]
2+
# Make direct deploy faster, transfer sparse qcow2 images
3+
force_raw_images = False
4+
# Avoid some rpc timeouts
5+
rpc_response_timeout = 360
6+
7+
[conductor]
8+
automated_clean=true
9+
# We have busy conductors failing to heartbeat
10+
# Default is 10 secs
11+
heartbeat_interval = 30
12+
# Default is 60 seconds
13+
heartbeat_timeout = 360
14+
sync_local_state_interval = 360
15+
16+
# Normally this is 100. We see eventlet threads
17+
# not making much progress, so for safety reduce
18+
# this by half, should leave work on rabbit queue
19+
workers_pool_size = 50
20+
# Normally this is 8, keep it same
21+
period_max_workers = 8
22+
23+
# Increase power sync interval to reduce load
24+
sync_power_state_interval = 120
25+
power_failure_recovery_interval = 120
26+
# Check for orphan allocations less often to reduce load
27+
check_allocations_interval = 120
28+
29+
# Wait much longer before provision timeout check, to reduce background load
30+
# The default is 60 seconds
31+
check_provision_state_interval = 120
32+
check_rescue_state_interval = 120
33+
34+
[database]
35+
# Usually this is 50, reduce to stop DB connection timeouts
36+
# and instead just make eventlet threads wait a bit longer
37+
max_overflow = 5
38+
# By default this is 30 seconds, but as we reduce
39+
# the pool overflow, some people will need to wait longer
40+
pool_timeout = 60
41+
42+
[deploy]
43+
# Force Hardware assisted secure erase by default.
44+
erase_devices_priority=10
45+
erase_devices_metadata_priority=0
46+
47+
[pxe]
48+
# Increase cache size to 120GB and TTL to 70 days (image_cache_ttl is in minutes)
49+
image_cache_size = 122880
50+
image_cache_ttl = 100800
51+
52+
[neutron]
53+
# Increase the neutron client timeout to allow for the slow management
54+
# switches.
55+
timeout = 300
56+
request_timeout = 300
57+
58+
[glance]
59+
# Retry image download at least once on failure
60+
num_retries = 1
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[DEFAULT]
2+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
3+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
4+
# generic switch driver, which synchronously applies switch configuration for
5+
# each ironic port during node provisioning and tear down.
6+
# The specific API calls that require this long timeout are:
7+
# - Creation and deletion of VLAN networks.
8+
# - Creation or update of ports, adding binding information.
9+
# - Update of ports, removing binding information.
10+
# - Deletion of ports.
11+
rpc_response_timeout = 360
12+
{% endif %}

etc/kayobe/kolla/config/nova.conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,15 @@
1+
[DEFAULT]
2+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
3+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
4+
# generic switch driver, which synchronously applies switch configuration for
5+
# each ironic port during node provisioning and tear down.
6+
# The specific API calls that require this long timeout are:
7+
# - Creation and deletion of VLAN networks.
8+
# - Creation or update of ports, adding binding information.
9+
# - Update of ports, removing binding information.
10+
# - Deletion of ports.
11+
rpc_response_timeout = 360
12+
{% endif %}
13+
114
[libvirt]
215
hw_machine_type = x86_64=q35
Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %}
21
[DEFAULT]
2+
{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %}
33
host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }}
44
{% endif %}
5+
# Raise the limit on concurrent builds (default is 10) for the nova ironic compute
6+
# service.
7+
max_concurrent_builds = 35
8+
9+
force_config_drive = True
10+
11+
[ironic]
12+
# Ramp up maximum retries to allow time for baremetal node reboot and switch configs
13+
api_max_retries = 720
14+
15+
[compute]
16+
# Don't disable the compute service due to failed builds.
17+
consecutive_build_service_disable_threshold = 0

etc/kayobe/kolla/globals.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,32 @@ rabbitmq_image: "{% raw %}{{ docker_registry ~ '/' if docker_registry else '' }}
6565
# https://github.com/sysown/proxysql/issues/4877 or K-A bumps ProxySQL to 3.x.
6666
database_enable_tls_internal: false
6767
database_enable_tls_backend: false
68+
69+
#############################################################################
70+
# HAProxy tunings
71+
72+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
73+
# NOTE: We are increasing the HAProxy timeouts to 5 minutes due to the neutron
74+
# generic switch driver, which synchronously applies switch configuration for
75+
# each ironic port during node provisioning and tear down.
76+
# The specific API calls that require this long timeout are:
77+
# - Creation and deletion of VLAN networks.
78+
# - Creation or update of ports, adding binding information.
79+
# - Update of ports, removing binding information.
80+
# - Deletion of ports.
81+
haproxy_client_timeout: 5m30
82+
haproxy_server_timeout: 5m30
83+
# If using Neutron backend TLS:
84+
neutron_tls_proxy_client_timeout: 5m30
85+
neutron_tls_proxy_server_timeout: 5m30
86+
{% endif %}
87+
88+
#############################################################################
89+
# Neutron tunings
90+
91+
{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
92+
neutron_tls_proxy_client_timeout: 5m30
93+
neutron_tls_proxy_server_timeout: 5m30
94+
{% endif %}
95+
96+
#############################################################################

0 commit comments

Comments
 (0)