diff --git a/consul/assets/configuration/spec.yaml b/consul/assets/configuration/spec.yaml index b43df67b44145..d9a2721d562e9 100644 --- a/consul/assets/configuration/spec.yaml +++ b/consul/assets/configuration/spec.yaml @@ -157,6 +157,27 @@ files: display_default: false example: true + - name: health_checks_cache_size + fleet_configurable: true + description: | + Maximum number of health check entries kept in the cache used to detect status + transitions. Increase this if your Consul cluster reports more than 5000 distinct + health checks across nodes. Evicted entries cause status transitions to be missed + and failure events to be re-emitted on subsequent check runs. + value: + type: integer + example: 5000 + minimum: 1 + + - name: health_checks_cache_ttl + fleet_configurable: true + description: | + Time-to-live in seconds for entries in the health check status cache. + value: + type: integer + example: 3600 + minimum: 1 + - template: instances/http - template: instances/default - template: logs diff --git a/consul/changelog.d/23603.added b/consul/changelog.d/23603.added new file mode 100644 index 0000000000000..fa7b7820e07bb --- /dev/null +++ b/consul/changelog.d/23603.added @@ -0,0 +1 @@ +Make the health check status cache size and TTL configurable via ``health_checks_cache_size`` and ``health_checks_cache_ttl``. \ No newline at end of file diff --git a/consul/datadog_checks/consul/common.py b/consul/datadog_checks/consul/common.py index bd3d9f78be898..9ae36d1bc8288 100644 --- a/consul/datadog_checks/consul/common.py +++ b/consul/datadog_checks/consul/common.py @@ -24,6 +24,10 @@ # Increase the number of threads to collect consul services checks THREADS_COUNT = 1 +# Defaults for the in-memory cache used to detect health check status transitions +HEALTH_CHECKS_CACHE_SIZE = 5000 +HEALTH_CHECKS_CACHE_TTL = 3600 + STATUS_SC = { 'up': AgentCheck.OK, 'passing': AgentCheck.OK, diff --git a/consul/datadog_checks/consul/config_models/defaults.py b/consul/datadog_checks/consul/config_models/defaults.py index 97798a85f9c69..8366ca94f0602 100644 --- a/consul/datadog_checks/consul/config_models/defaults.py +++ b/consul/datadog_checks/consul/config_models/defaults.py @@ -48,6 +48,14 @@ def instance_enable_legacy_tags_normalization(): return True +def instance_health_checks_cache_size(): + return 5000 + + +def instance_health_checks_cache_ttl(): + return 3600 + + def instance_kerberos_auth(): return 'disabled' diff --git a/consul/datadog_checks/consul/config_models/instance.py b/consul/datadog_checks/consul/config_models/instance.py index 8d8be33d6d249..5f65a49cb7088 100644 --- a/consul/datadog_checks/consul/config_models/instance.py +++ b/consul/datadog_checks/consul/config_models/instance.py @@ -12,7 +12,7 @@ from types import MappingProxyType from typing import Any, Optional -from pydantic import BaseModel, ConfigDict, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from typing_extensions import Literal from datadog_checks.base.utils.functions import identity @@ -76,6 +76,8 @@ class InstanceConfig(BaseModel): enable_legacy_tags_normalization: Optional[bool] = None extra_headers: Optional[MappingProxyType[str, Any]] = None headers: Optional[MappingProxyType[str, Any]] = None + health_checks_cache_size: Optional[int] = Field(None, ge=1) + health_checks_cache_ttl: Optional[int] = Field(None, ge=1) kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None kerberos_cache: Optional[str] = None kerberos_delegate: Optional[bool] = None diff --git a/consul/datadog_checks/consul/consul.py b/consul/datadog_checks/consul/consul.py index eb33aa991a396..1501e2b5a48ef 100644 --- a/consul/datadog_checks/consul/consul.py +++ b/consul/datadog_checks/consul/consul.py @@ -24,6 +24,8 @@ CONSUL_CHECK, HEALTH_CHECK, HEALTH_CHECK_METRIC, + HEALTH_CHECKS_CACHE_SIZE, + HEALTH_CHECKS_CACHE_TTL, MAX_CONFIG_TTL, MAX_SERVICES, SOURCE_TYPE_NAME, @@ -133,7 +135,15 @@ def __init__(self, name, init_config, instances): if 'acl_token' in self.instance: self.http.options['headers']['X-Consul-Token'] = self.instance['acl_token'] - self.health_checks = TTLCache(ttl=3600, maxsize=5000) + cache_size = self.instance.get( + 'health_checks_cache_size', + self.init_config.get('health_checks_cache_size', HEALTH_CHECKS_CACHE_SIZE), + ) + cache_ttl = self.instance.get( + 'health_checks_cache_ttl', + self.init_config.get('health_checks_cache_ttl', HEALTH_CHECKS_CACHE_TTL), + ) + self.health_checks = TTLCache(ttl=cache_ttl, maxsize=cache_size) def _is_dogstatsd_configured(self): """Check if the agent has a consul dogstatsd profile configured""" diff --git a/consul/datadog_checks/consul/data/conf.yaml.example b/consul/datadog_checks/consul/data/conf.yaml.example index 547288a234296..13b918baca614 100644 --- a/consul/datadog_checks/consul/data/conf.yaml.example +++ b/consul/datadog_checks/consul/data/conf.yaml.example @@ -147,6 +147,19 @@ instances: # # collect_health_checks: true + ## @param health_checks_cache_size - integer - optional - default: 5000 + ## Maximum number of health check entries kept in the cache used to detect status + ## transitions. Increase this if your Consul cluster reports more than 5000 distinct + ## health checks across nodes. Evicted entries cause status transitions to be missed + ## and failure events to be re-emitted on subsequent check runs. + # + # health_checks_cache_size: 5000 + + ## @param health_checks_cache_ttl - integer - optional - default: 3600 + ## Time-to-live in seconds for entries in the health check status cache. + # + # health_checks_cache_ttl: 3600 + ## @param proxy - mapping - optional ## This overrides the `proxy` setting in `init_config`. ## diff --git a/consul/tests/test_unit.py b/consul/tests/test_unit.py index 2b844e3601da8..f7dd1bc797608 100644 --- a/consul/tests/test_unit.py +++ b/consul/tests/test_unit.py @@ -671,3 +671,41 @@ def test_config(test_case, extra_config, expected_http_kwargs, mocker): } http_wargs.update(expected_http_kwargs) mock_session.get.assert_called_with('/v1/status/leader', **http_wargs) + + +def test_health_checks_cache_defaults(): + consul_check = ConsulCheck(common.CHECK_NAME, {}, [consul_mocks.MOCK_CONFIG]) + assert consul_check.health_checks.maxsize == 5000 + assert consul_check.health_checks.ttl == 3600 + + +def test_health_checks_cache_configurable(): + config = dict(consul_mocks.MOCK_CONFIG) + config['health_checks_cache_size'] = 10 + config['health_checks_cache_ttl'] = 60 + consul_check = ConsulCheck(common.CHECK_NAME, {}, [config]) + assert consul_check.health_checks.maxsize == 10 + assert consul_check.health_checks.ttl == 60 + + +def test_health_checks_cache_configurable_via_init_config(): + init_config = {'health_checks_cache_size': 42, 'health_checks_cache_ttl': 7} + consul_check = ConsulCheck(common.CHECK_NAME, init_config, [consul_mocks.MOCK_CONFIG]) + assert consul_check.health_checks.maxsize == 42 + assert consul_check.health_checks.ttl == 7 + + +def test_health_checks_cache_eviction_re_emits_failure_event(aggregator): + config = dict(consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG) + config['collect_health_checks'] = True + config['health_checks_cache_size'] = 1 + consul_check = ConsulCheck(common.CHECK_NAME, {}, [config]) + my_mocks = consul_mocks._get_consul_mocks() + my_mocks['consul_request'] = consul_mocks.mock_get_health_check + consul_mocks.mock_check(consul_check, my_mocks) + + consul_check.check(None) + consul_check.check(None) + + failure_events = [e for e in aggregator.events if e['event_type'] == 'consul.check_failed'] + assert len(failure_events) == 2 diff --git a/postgres/changelog.d/23647.fixed b/postgres/changelog.d/23647.fixed new file mode 100644 index 0000000000000..2b40d45842aa9 --- /dev/null +++ b/postgres/changelog.d/23647.fixed @@ -0,0 +1 @@ +Eliminate reference cycle in diagnostic instrumentation \ No newline at end of file diff --git a/postgres/datadog_checks/postgres/diagnose.py b/postgres/datadog_checks/postgres/diagnose.py index 9044799e911f7..7520e0f34c185 100644 --- a/postgres/datadog_checks/postgres/diagnose.py +++ b/postgres/datadog_checks/postgres/diagnose.py @@ -32,6 +32,11 @@ RECOMMENDED_TRACK_ACTIVITY_QUERY_SIZE = 4096 +def run_diagnostics(check): + """Entry point for ``Diagnosis.register()``; creates a short-lived worker per invocation.""" + PostgresDiagnose(check)._run() + + class PostgresDiagnose: """Explicit pre-flight diagnostics for `datadog-agent diagnose`.""" @@ -42,21 +47,6 @@ def __init__(self, check): # when shared_preload_libraries is empty). Reset at the top of the first orchestrator. self._failed = set() - # -- registration --------------------------------------------------------- - - def register(self): - """Register the diagnostic entry point with the check's Diagnosis object. - - Idempotent: re-invoking `register` on the same Diagnosis object is a no-op. - ``Diagnosis.register`` extends an internal list, so without this guard a - repeated call would stack the entry point and produce NĂ— the diagnostics. - """ - d = self._check.diagnosis - if getattr(d, '_postgres_diagnostics_registered', False): - return - d._postgres_diagnostics_registered = True - d.register(self._run) - # -- orchestrator --------------------------------------------------------- def _run(self): diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index 113e4402af133..0d9223662d657 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -47,7 +47,7 @@ from .__about__ import __version__ from .config import build_config, sanitize -from .diagnose import PostgresDiagnose +from .diagnose import run_diagnostics from .util import ( ANALYZE_PROGRESS_METRICS, AWS_RDS_HOSTNAME_SUFFIX, @@ -191,8 +191,7 @@ def __init__(self, name, init_config, instances): ttl=self._config.database_instance_collection_interval, ) # type: TTLCache - # Register explicit pre-flight diagnostics for `datadog-agent diagnose`. - PostgresDiagnose(self).register() + self.diagnosis.register(functools.partial(run_diagnostics, self)) def _submit_initialization_health_event(self): try: