Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions consul/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,27 @@ files:
display_default: false
example: true

- name: health_checks_cache_size
fleet_configurable: true
description: |
Maximum number of health check entries kept in the cache used to detect status
transitions. Increase this if your Consul cluster reports more than 5000 distinct
health checks across nodes. Evicted entries cause status transitions to be missed
and failure events to be re-emitted on subsequent check runs.
value:
type: integer
example: 5000
minimum: 1

- name: health_checks_cache_ttl
fleet_configurable: true
description: |
Time-to-live in seconds for entries in the health check status cache.
value:
type: integer
example: 3600
minimum: 1

- template: instances/http
- template: instances/default
- template: logs
Expand Down
1 change: 1 addition & 0 deletions consul/changelog.d/23603.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make the health check status cache size and TTL configurable via ``health_checks_cache_size`` and ``health_checks_cache_ttl``.
4 changes: 4 additions & 0 deletions consul/datadog_checks/consul/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
# Increase the number of threads to collect consul services checks
THREADS_COUNT = 1

# Defaults for the in-memory cache used to detect health check status transitions
HEALTH_CHECKS_CACHE_SIZE = 5000
HEALTH_CHECKS_CACHE_TTL = 3600

STATUS_SC = {
'up': AgentCheck.OK,
'passing': AgentCheck.OK,
Expand Down
8 changes: 8 additions & 0 deletions consul/datadog_checks/consul/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ def instance_enable_legacy_tags_normalization():
return True


def instance_health_checks_cache_size():
    """Return the default value for the ``health_checks_cache_size`` instance option."""
    default_size = 5000
    return default_size


def instance_health_checks_cache_ttl():
    """Return the default value for the ``health_checks_cache_ttl`` instance option."""
    default_ttl = 3600
    return default_ttl


def instance_kerberos_auth():
    """Return the default value for the ``kerberos_auth`` instance option."""
    default_mode = 'disabled'
    return default_mode

Expand Down
4 changes: 3 additions & 1 deletion consul/datadog_checks/consul/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from types import MappingProxyType
from typing import Any, Optional

from pydantic import BaseModel, ConfigDict, field_validator, model_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from typing_extensions import Literal

from datadog_checks.base.utils.functions import identity
Expand Down Expand Up @@ -76,6 +76,8 @@ class InstanceConfig(BaseModel):
enable_legacy_tags_normalization: Optional[bool] = None
extra_headers: Optional[MappingProxyType[str, Any]] = None
headers: Optional[MappingProxyType[str, Any]] = None
health_checks_cache_size: Optional[int] = Field(None, ge=1)
health_checks_cache_ttl: Optional[int] = Field(None, ge=1)
kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None
kerberos_cache: Optional[str] = None
kerberos_delegate: Optional[bool] = None
Expand Down
12 changes: 11 additions & 1 deletion consul/datadog_checks/consul/consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
CONSUL_CHECK,
HEALTH_CHECK,
HEALTH_CHECK_METRIC,
HEALTH_CHECKS_CACHE_SIZE,
HEALTH_CHECKS_CACHE_TTL,
MAX_CONFIG_TTL,
MAX_SERVICES,
SOURCE_TYPE_NAME,
Expand Down Expand Up @@ -133,7 +135,15 @@ def __init__(self, name, init_config, instances):
if 'acl_token' in self.instance:
self.http.options['headers']['X-Consul-Token'] = self.instance['acl_token']

self.health_checks = TTLCache(ttl=3600, maxsize=5000)
cache_size = self.instance.get(
'health_checks_cache_size',
self.init_config.get('health_checks_cache_size', HEALTH_CHECKS_CACHE_SIZE),
)
cache_ttl = self.instance.get(
'health_checks_cache_ttl',
self.init_config.get('health_checks_cache_ttl', HEALTH_CHECKS_CACHE_TTL),
)
self.health_checks = TTLCache(ttl=cache_ttl, maxsize=cache_size)

def _is_dogstatsd_configured(self):
"""Check if the agent has a consul dogstatsd profile configured"""
Expand Down
13 changes: 13 additions & 0 deletions consul/datadog_checks/consul/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,19 @@ instances:
#
# collect_health_checks: true

## @param health_checks_cache_size - integer - optional - default: 5000
## Maximum number of health check entries kept in the cache used to detect status
## transitions. Increase this if your Consul cluster reports more than 5000 distinct
## health checks across nodes. Evicted entries cause status transitions to be missed
## and failure events to be re-emitted on subsequent check runs.
#
# health_checks_cache_size: 5000

## @param health_checks_cache_ttl - integer - optional - default: 3600
## Time-to-live in seconds for entries in the health check status cache.
#
# health_checks_cache_ttl: 3600

## @param proxy - mapping - optional
## This overrides the `proxy` setting in `init_config`.
##
Expand Down
38 changes: 38 additions & 0 deletions consul/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,3 +671,41 @@ def test_config(test_case, extra_config, expected_http_kwargs, mocker):
}
http_wargs.update(expected_http_kwargs)
mock_session.get.assert_called_with('/v1/status/leader', **http_wargs)


def test_health_checks_cache_defaults():
    """Without any overrides, the health check cache is built with size 5000 and TTL 3600."""
    check = ConsulCheck(common.CHECK_NAME, {}, [consul_mocks.MOCK_CONFIG])
    cache = check.health_checks
    assert cache.maxsize == 5000
    assert cache.ttl == 3600


def test_health_checks_cache_configurable():
    """Instance-level ``health_checks_cache_size`` / ``health_checks_cache_ttl`` are applied to the cache."""
    # Work on a shallow copy so the shared mock config is never mutated.
    instance = {
        **consul_mocks.MOCK_CONFIG,
        'health_checks_cache_size': 10,
        'health_checks_cache_ttl': 60,
    }
    check = ConsulCheck(common.CHECK_NAME, {}, [instance])
    cache = check.health_checks
    assert cache.maxsize == 10
    assert cache.ttl == 60


def test_health_checks_cache_configurable_via_init_config():
    """Cache size and TTL given in ``init_config`` are used when the instance does not set them."""
    shared_settings = {'health_checks_cache_size': 42, 'health_checks_cache_ttl': 7}
    check = ConsulCheck(common.CHECK_NAME, shared_settings, [consul_mocks.MOCK_CONFIG])
    cache = check.health_checks
    assert cache.maxsize == 42
    assert cache.ttl == 7


def test_health_checks_cache_eviction_re_emits_failure_event(aggregator):
    """With a one-entry cache, two consecutive check runs yield two ``consul.check_failed`` events."""
    instance = {
        **consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG,
        'collect_health_checks': True,
        # Smallest allowed cache, so entries do not survive between runs.
        'health_checks_cache_size': 1,
    }
    check = ConsulCheck(common.CHECK_NAME, {}, [instance])

    mocks = consul_mocks._get_consul_mocks()
    mocks['consul_request'] = consul_mocks.mock_get_health_check
    consul_mocks.mock_check(check, mocks)

    # Run the check twice back to back.
    for _ in range(2):
        check.check(None)

    failed = [event for event in aggregator.events if event['event_type'] == 'consul.check_failed']
    assert len(failed) == 2
1 change: 1 addition & 0 deletions postgres/changelog.d/23647.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Eliminate reference cycle in diagnostic instrumentation.
20 changes: 5 additions & 15 deletions postgres/datadog_checks/postgres/diagnose.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
RECOMMENDED_TRACK_ACTIVITY_QUERY_SIZE = 4096


def run_diagnostics(check):
    """Run the Postgres diagnostics for ``check``.

    Intended as the callable handed to ``Diagnosis.register()``. Each call
    builds a fresh ``PostgresDiagnose`` worker, so no worker instance is kept
    between invocations.
    """
    worker = PostgresDiagnose(check)
    worker._run()


class PostgresDiagnose:
"""Explicit pre-flight diagnostics for `datadog-agent diagnose`."""

Expand All @@ -42,21 +47,6 @@ def __init__(self, check):
# when shared_preload_libraries is empty). Reset at the top of the first orchestrator.
self._failed = set()

# -- registration ---------------------------------------------------------

def register(self):
"""Register the diagnostic entry point with the check's Diagnosis object.

Idempotent: re-invoking `register` on the same Diagnosis object is a no-op.
``Diagnosis.register`` extends an internal list, so without this guard a
repeated call would stack the entry point and produce N× the diagnostics.
"""
d = self._check.diagnosis
if getattr(d, '_postgres_diagnostics_registered', False):
return
d._postgres_diagnostics_registered = True
d.register(self._run)

# -- orchestrator ---------------------------------------------------------

def _run(self):
Expand Down
5 changes: 2 additions & 3 deletions postgres/datadog_checks/postgres/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

from .__about__ import __version__
from .config import build_config, sanitize
from .diagnose import PostgresDiagnose
from .diagnose import run_diagnostics
from .util import (
ANALYZE_PROGRESS_METRICS,
AWS_RDS_HOSTNAME_SUFFIX,
Expand Down Expand Up @@ -191,8 +191,7 @@ def __init__(self, name, init_config, instances):
ttl=self._config.database_instance_collection_interval,
) # type: TTLCache

# Register explicit pre-flight diagnostics for `datadog-agent diagnose`.
PostgresDiagnose(self).register()
self.diagnosis.register(functools.partial(run_diagnostics, self))

def _submit_initialization_health_event(self):
try:
Expand Down
Loading