Skip to content

Commit 772b9c9

Browse files
mwdd146980 and claude authored
[consul] Make health check status cache size and TTL configurable (DataDog#23603)
* Make consul health check status cache configurable

  Adds health_checks_cache_size and health_checks_cache_ttl options so users with large Consul clusters (>5000 distinct health checks) can size the in-memory transition-detection cache appropriately. Defaults preserve the prior 5000/3600 behavior.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Add changelog entry for DataDog#23603

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Reword health_checks_cache_size description for plainer language

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Address review: mark cache options fleet_configurable, cover init_config path

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Reject non-positive health check cache settings via spec minimum

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 53f43d3 commit 772b9c9

8 files changed

Lines changed: 99 additions & 2 deletions

File tree

consul/assets/configuration/spec.yaml

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,27 @@ files:
157157
display_default: false
158158
example: true
159159

160+
- name: health_checks_cache_size
161+
fleet_configurable: true
162+
description: |
163+
Maximum number of health check entries kept in the cache used to detect status
164+
transitions. Increase this if your Consul cluster reports more than 5000 distinct
165+
health checks across nodes. Evicted entries cause status transitions to be missed
166+
and failure events to be re-emitted on subsequent check runs.
167+
value:
168+
type: integer
169+
example: 5000
170+
minimum: 1
171+
172+
- name: health_checks_cache_ttl
173+
fleet_configurable: true
174+
description: |
175+
Time-to-live in seconds for entries in the health check status cache.
176+
value:
177+
type: integer
178+
example: 3600
179+
minimum: 1
180+
160181
- template: instances/http
161182
- template: instances/default
162183
- template: logs

consul/changelog.d/23603.added

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Make the health check status cache size and TTL configurable via ``health_checks_cache_size`` and ``health_checks_cache_ttl``.

consul/datadog_checks/consul/common.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,10 @@
2424
# Increase the number of threads to collect consul services checks
2525
THREADS_COUNT = 1
2626

27+
# Defaults for the in-memory cache used to detect health check status transitions
28+
HEALTH_CHECKS_CACHE_SIZE = 5000
29+
HEALTH_CHECKS_CACHE_TTL = 3600
30+
2731
STATUS_SC = {
2832
'up': AgentCheck.OK,
2933
'passing': AgentCheck.OK,

consul/datadog_checks/consul/config_models/defaults.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,14 @@ def instance_enable_legacy_tags_normalization():
4848
return True
4949

5050

51+
def instance_health_checks_cache_size():
52+
return 5000
53+
54+
55+
def instance_health_checks_cache_ttl():
56+
return 3600
57+
58+
5159
def instance_kerberos_auth():
5260
return 'disabled'
5361

consul/datadog_checks/consul/config_models/instance.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from types import MappingProxyType
1313
from typing import Any, Optional
1414

15-
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
15+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
1616
from typing_extensions import Literal
1717

1818
from datadog_checks.base.utils.functions import identity
@@ -76,6 +76,8 @@ class InstanceConfig(BaseModel):
7676
enable_legacy_tags_normalization: Optional[bool] = None
7777
extra_headers: Optional[MappingProxyType[str, Any]] = None
7878
headers: Optional[MappingProxyType[str, Any]] = None
79+
health_checks_cache_size: Optional[int] = Field(None, ge=1)
80+
health_checks_cache_ttl: Optional[int] = Field(None, ge=1)
7981
kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None
8082
kerberos_cache: Optional[str] = None
8183
kerberos_delegate: Optional[bool] = None

consul/datadog_checks/consul/consul.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@
2424
CONSUL_CHECK,
2525
HEALTH_CHECK,
2626
HEALTH_CHECK_METRIC,
27+
HEALTH_CHECKS_CACHE_SIZE,
28+
HEALTH_CHECKS_CACHE_TTL,
2729
MAX_CONFIG_TTL,
2830
MAX_SERVICES,
2931
SOURCE_TYPE_NAME,
@@ -133,7 +135,15 @@ def __init__(self, name, init_config, instances):
133135
if 'acl_token' in self.instance:
134136
self.http.options['headers']['X-Consul-Token'] = self.instance['acl_token']
135137

136-
self.health_checks = TTLCache(ttl=3600, maxsize=5000)
138+
cache_size = self.instance.get(
139+
'health_checks_cache_size',
140+
self.init_config.get('health_checks_cache_size', HEALTH_CHECKS_CACHE_SIZE),
141+
)
142+
cache_ttl = self.instance.get(
143+
'health_checks_cache_ttl',
144+
self.init_config.get('health_checks_cache_ttl', HEALTH_CHECKS_CACHE_TTL),
145+
)
146+
self.health_checks = TTLCache(ttl=cache_ttl, maxsize=cache_size)
137147

138148
def _is_dogstatsd_configured(self):
139149
"""Check if the agent has a consul dogstatsd profile configured"""

consul/datadog_checks/consul/data/conf.yaml.example

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,19 @@ instances:
147147
#
148148
# collect_health_checks: true
149149

150+
## @param health_checks_cache_size - integer - optional - default: 5000
151+
## Maximum number of health check entries kept in the cache used to detect status
152+
## transitions. Increase this if your Consul cluster reports more than 5000 distinct
153+
## health checks across nodes. Evicted entries cause status transitions to be missed
154+
## and failure events to be re-emitted on subsequent check runs.
155+
#
156+
# health_checks_cache_size: 5000
157+
158+
## @param health_checks_cache_ttl - integer - optional - default: 3600
159+
## Time-to-live in seconds for entries in the health check status cache.
160+
#
161+
# health_checks_cache_ttl: 3600
162+
150163
## @param proxy - mapping - optional
151164
## This overrides the `proxy` setting in `init_config`.
152165
##

consul/tests/test_unit.py

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -671,3 +671,41 @@ def test_config(test_case, extra_config, expected_http_kwargs, mocker):
671671
}
672672
http_wargs.update(expected_http_kwargs)
673673
mock_session.get.assert_called_with('/v1/status/leader', **http_wargs)
674+
675+
676+
def test_health_checks_cache_defaults():
677+
consul_check = ConsulCheck(common.CHECK_NAME, {}, [consul_mocks.MOCK_CONFIG])
678+
assert consul_check.health_checks.maxsize == 5000
679+
assert consul_check.health_checks.ttl == 3600
680+
681+
682+
def test_health_checks_cache_configurable():
683+
config = dict(consul_mocks.MOCK_CONFIG)
684+
config['health_checks_cache_size'] = 10
685+
config['health_checks_cache_ttl'] = 60
686+
consul_check = ConsulCheck(common.CHECK_NAME, {}, [config])
687+
assert consul_check.health_checks.maxsize == 10
688+
assert consul_check.health_checks.ttl == 60
689+
690+
691+
def test_health_checks_cache_configurable_via_init_config():
692+
init_config = {'health_checks_cache_size': 42, 'health_checks_cache_ttl': 7}
693+
consul_check = ConsulCheck(common.CHECK_NAME, init_config, [consul_mocks.MOCK_CONFIG])
694+
assert consul_check.health_checks.maxsize == 42
695+
assert consul_check.health_checks.ttl == 7
696+
697+
698+
def test_health_checks_cache_eviction_re_emits_failure_event(aggregator):
699+
config = dict(consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG)
700+
config['collect_health_checks'] = True
701+
config['health_checks_cache_size'] = 1
702+
consul_check = ConsulCheck(common.CHECK_NAME, {}, [config])
703+
my_mocks = consul_mocks._get_consul_mocks()
704+
my_mocks['consul_request'] = consul_mocks.mock_get_health_check
705+
consul_mocks.mock_check(consul_check, my_mocks)
706+
707+
consul_check.check(None)
708+
consul_check.check(None)
709+
710+
failure_events = [e for e in aggregator.events if e['event_type'] == 'consul.check_failed']
711+
assert len(failure_events) == 2

0 commit comments

Comments
 (0)