Skip to content

Commit 8e13820

Browse files
Emit Warning Events for Consul Integration (DataDog#23779)
* Add new config health_check_warning_events
* Emit event for warnings
* Removing whitespace to match original message
* Add test for warning events
* Add changelog
* Fix typo
* Apply lint
* Change wording
1 parent cf213ba commit 8e13820

8 files changed

Lines changed: 95 additions & 16 deletions

File tree

consul/assets/configuration/spec.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,16 @@ files:
178178
example: 3600
179179
minimum: 1
180180

181+
- name: health_check_warning_events
182+
fleet_configurable: true
183+
description: |
184+
Whether to emit an event when a Consul health check transitions to `warning`.
185+
Events for critical health checks are always emitted when `collect_health_checks` is enabled.
186+
value:
187+
type: boolean
188+
example: true
189+
default: false
190+
181191
- template: instances/http
182192
- template: instances/default
183193
- template: logs

consul/changelog.d/23779.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add new config `health_check_warning_events` to emit an event when a Consul health check transitions to `warning`.

consul/datadog_checks/consul/config_models/defaults.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def instance_enable_legacy_tags_normalization():
4848
return True
4949

5050

51+
def instance_health_check_warning_events():
    """Return the default for the `health_check_warning_events` instance option.

    The default is `False`, matching `default: false` in the configuration spec.
    """
    return False
53+
54+
5155
def instance_health_checks_cache_size():
5256
return 5000
5357

consul/datadog_checks/consul/config_models/instance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ class InstanceConfig(BaseModel):
7676
enable_legacy_tags_normalization: Optional[bool] = None
7777
extra_headers: Optional[MappingProxyType[str, Any]] = None
7878
headers: Optional[MappingProxyType[str, Any]] = None
79+
health_check_warning_events: Optional[bool] = None
7980
health_checks_cache_size: Optional[int] = Field(None, ge=1)
8081
health_checks_cache_ttl: Optional[int] = Field(None, ge=1)
8182
kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None

consul/datadog_checks/consul/consul.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ def __init__(self, name, init_config, instances):
116116
self.collect_health_checks = self.instance.get(
117117
'collect_health_checks', self.init_config.get('collect_health_checks', False)
118118
)
119+
self.health_check_warning_events = is_affirmative(
120+
self.instance.get(
121+
'health_check_warning_events',
122+
self.init_config.get('health_check_warning_events', False),
123+
)
124+
)
119125

120126
if self.threads_count > 1:
121127
self.thread_pool = ThreadPool(self.threads_count)
@@ -407,22 +413,33 @@ def check(self, _):
407413
self.gauge(HEALTH_CHECK_METRIC, status_value, tags=main_tags + node_tags)
408414
self.health_checks[hc_id] = status_value
409415

410-
if last_hc_value != status_value and status_value == 3:
411-
check_name = check.get("Name", "Consul Health Check")
412-
check_output = check.get("Output", "")
413-
self.event(
414-
{
415-
"timestamp": timestamp(),
416-
"event_type": "consul.check_failed",
417-
"alert_type": "error",
418-
"source_type_name": SOURCE_TYPE_NAME,
419-
"msg_title": f"{check_name} Failed",
420-
"aggregation_key": "consul.status_check",
421-
"msg_text": f"Check {check_id} for service {service_name}, id: {service_id}"
422-
f"failed on node {node_name}: {check_output}",
423-
"tags": node_tags,
424-
}
425-
)
416+
if last_hc_value != status_value:
417+
if status_value == 3 or (status_value == 2 and self.health_check_warning_events):
418+
check_name = check.get("Name", "Consul Health Check")
419+
check_output = check.get("Output", "")
420+
421+
if status_value == 3:
422+
event_type = "consul.check_failed"
423+
alert_type = "error"
424+
label = "failed"
425+
else:
426+
event_type = "consul.check_warning"
427+
alert_type = "warning"
428+
label = "warning"
429+
430+
self.event(
431+
{
432+
"timestamp": timestamp(),
433+
"event_type": event_type,
434+
"alert_type": alert_type,
435+
"source_type_name": SOURCE_TYPE_NAME,
436+
"msg_title": f"{check_name} {label.capitalize()}",
437+
"aggregation_key": "consul.status_check",
438+
"msg_text": f"Check {check_id} for service {service_name}, id: {service_id}"
439+
f"{label} on node {node_name}: {check_output}",
440+
"tags": node_tags,
441+
}
442+
)
426443

427444
if sc_id not in service_checks:
428445
service_checks[sc_id] = {'status': status, 'tags': tags}

consul/datadog_checks/consul/data/conf.yaml.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,12 @@ instances:
160160
#
161161
# health_checks_cache_ttl: 3600
162162

163+
## @param health_check_warning_events - boolean - optional - default: false
164+
## Whether to emit an event when a Consul health check transitions to `warning`.
165+
## Events for critical health checks are always emitted when `collect_health_checks` is enabled.
166+
#
167+
# health_check_warning_events: true
168+
163169
## @param proxy - mapping - optional
164170
## This overrides the `proxy` setting in `init_config`.
165171
##

consul/tests/consul_mocks.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,13 @@ def mock_get_coord_nodes_benchmark(num_nodes):
306306
return nodes
307307

308308

309+
def mock_get_health_check_with_warning(_):
    """Return the `mock_get_health_check` payload with its first check in `warning`.

    The argument is forwarded unchanged to `mock_get_health_check`. Only the
    first entry of the returned list is modified: its `Status` becomes
    `"warning"` and its `Output` becomes `"disk usage high"`.
    """
    checks = mock_get_health_check(_)
    first_check = checks[0]
    first_check["Status"] = "warning"
    first_check["Output"] = "disk usage high"
    return checks
314+
315+
309316
def mock_get_health_check(_):
310317
return [
311318
{

consul/tests/test_unit.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,39 @@ def test_health_checks(aggregator, collect_health_checks, expected_metric_count,
331331
aggregator.assert_event(exact_match=False, count=expected_metric_count, **event)
332332

333333

334+
@pytest.mark.parametrize(
    'health_check_warning_events, expected_warning_events',
    [
        pytest.param(True, 1, id="warning events enabled"),
        pytest.param(False, 0, id="warning events disabled"),
    ],
)
def test_health_check_warning_events(aggregator, health_check_warning_events, expected_warning_events):
    """Check that `consul.check_warning` events follow `health_check_warning_events`.

    With the option enabled, one warning event is expected after the first run
    and no additional one after a second run with the same mocked status.
    With the option disabled, no warning event is expected after either run.
    """

    def collect_warning_events():
        # Filter on event_type so any other events submitted by the check are ignored.
        return [e for e in aggregator.events if e['event_type'] == 'consul.check_warning']

    config = consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG.copy()
    config['collect_health_checks'] = True
    config['health_check_warning_events'] = health_check_warning_events
    consul_check = ConsulCheck(common.CHECK_NAME, {}, [config])
    my_mocks = consul_mocks._get_consul_mocks()
    my_mocks['consul_request'] = consul_mocks.mock_get_health_check_with_warning
    consul_mocks.mock_check(consul_check, my_mocks)
    consul_check.check(None)

    warning_events = collect_warning_events()
    assert len(warning_events) == expected_warning_events
    if expected_warning_events:
        first_event = warning_events[0]
        assert first_event['alert_type'] == 'warning'
        assert first_event['msg_title'] == "Service 'server-loadbalancer' check Warning"
        # The expected text has no space between the service id and "warning":
        # the check builds msg_text from two adjacent f-strings without a separator.
        assert (
            first_event['msg_text']
            == "Check server-loadbalancer for service server-loadbalancer, id: server-loadbalancerwarning "
            "on node node-2: disk usage high"
        )

    # Second run with the same mocked status: the number of warning events must
    # not grow (stays at 1 when enabled, stays at 0 when disabled).
    consul_check.check(None)
    assert len(collect_warning_events()) == expected_warning_events
365+
366+
334367
def test_service_checks_disable_service_tag(aggregator):
335368
consul_check = ConsulCheck(common.CHECK_NAME, {}, [consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG])
336369
my_mocks = consul_mocks._get_consul_mocks()

0 commit comments

Comments
 (0)