Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions consul/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,16 @@ files:
example: 3600
minimum: 1

- name: health_check_warning_events
fleet_configurable: true
description: |
Whether to emit an event when a Consul health check transitions to `warning`.
Events for critical health checks are always emitted when `collect_health_checks` is enabled.
value:
type: boolean
example: true
default: false

- template: instances/http
- template: instances/default
- template: logs
Expand Down
1 change: 1 addition & 0 deletions consul/changelog.d/23779.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add new config `health_check_warning_events` to emit an event when a Consul health check transitions to `warning`.
4 changes: 4 additions & 0 deletions consul/datadog_checks/consul/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ def instance_enable_legacy_tags_normalization():
return True


def instance_health_check_warning_events():
    """Return the default for the `health_check_warning_events` instance option (disabled)."""
    return False


def instance_health_checks_cache_size():
    """Return the default for the `health_checks_cache_size` instance option."""
    return 5000

Expand Down
1 change: 1 addition & 0 deletions consul/datadog_checks/consul/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class InstanceConfig(BaseModel):
enable_legacy_tags_normalization: Optional[bool] = None
extra_headers: Optional[MappingProxyType[str, Any]] = None
headers: Optional[MappingProxyType[str, Any]] = None
health_check_warning_events: Optional[bool] = None
health_checks_cache_size: Optional[int] = Field(None, ge=1)
health_checks_cache_ttl: Optional[int] = Field(None, ge=1)
kerberos_auth: Optional[Literal['required', 'optional', 'disabled']] = None
Expand Down
49 changes: 33 additions & 16 deletions consul/datadog_checks/consul/consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ def __init__(self, name, init_config, instances):
self.collect_health_checks = self.instance.get(
'collect_health_checks', self.init_config.get('collect_health_checks', False)
)
self.health_check_warning_events = is_affirmative(
self.instance.get(
'health_check_warning_events',
self.init_config.get('health_check_warning_events', False),
)
)

if self.threads_count > 1:
self.thread_pool = ThreadPool(self.threads_count)
Expand Down Expand Up @@ -407,22 +413,33 @@ def check(self, _):
self.gauge(HEALTH_CHECK_METRIC, status_value, tags=main_tags + node_tags)
self.health_checks[hc_id] = status_value

if last_hc_value != status_value and status_value == 3:
check_name = check.get("Name", "Consul Health Check")
check_output = check.get("Output", "")
self.event(
{
"timestamp": timestamp(),
"event_type": "consul.check_failed",
"alert_type": "error",
"source_type_name": SOURCE_TYPE_NAME,
"msg_title": f"{check_name} Failed",
"aggregation_key": "consul.status_check",
"msg_text": f"Check {check_id} for service {service_name}, id: {service_id}"
f"failed on node {node_name}: {check_output}",
"tags": node_tags,
}
)
# Emit an event only when the status differs from the value recorded on the
# previous run, so an unchanged check does not produce repeated events.
if last_hc_value != status_value:
# Status value 3 always produces an event; status value 2 produces one only
# when the `health_check_warning_events` option is enabled.
if status_value == 3 or (status_value == 2 and self.health_check_warning_events):
check_name = check.get("Name", "Consul Health Check")
check_output = check.get("Output", "")

# Pick the event type, alert level and wording for this status.
if status_value == 3:
event_type = "consul.check_failed"
alert_type = "error"
label = "failed"
else:
event_type = "consul.check_warning"
alert_type = "warning"
label = "warning"

self.event(
{
"timestamp": timestamp(),
"event_type": event_type,
"alert_type": alert_type,
"source_type_name": SOURCE_TYPE_NAME,
"msg_title": f"{check_name} {label.capitalize()}",
# Failed and warning events share the same aggregation key.
"aggregation_key": "consul.status_check",
# NOTE(review): the two adjacent f-strings below are concatenated with no
# separator, so the text reads "... id: <service_id><label> on node ...".
# A space was probably intended, but the unit tests assert the current
# output, so the code and the tests would have to change together.
"msg_text": f"Check {check_id} for service {service_name}, id: {service_id}"
f"{label} on node {node_name}: {check_output}",
"tags": node_tags,
}
)

if sc_id not in service_checks:
service_checks[sc_id] = {'status': status, 'tags': tags}
Expand Down
6 changes: 6 additions & 0 deletions consul/datadog_checks/consul/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,12 @@ instances:
#
# health_checks_cache_ttl: 3600

## @param health_check_warning_events - boolean - optional - default: false
## Whether to emit an event when a Consul health check transitions to `warning`.
## Events for critical health checks are always emitted when `collect_health_checks` is enabled.
#
# health_check_warning_events: true

## @param proxy - mapping - optional
## This overrides the `proxy` setting in `init_config`.
##
Expand Down
7 changes: 7 additions & 0 deletions consul/tests/consul_mocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,13 @@ def mock_get_coord_nodes_benchmark(num_nodes):
return nodes


def mock_get_health_check_with_warning(_):
    """Return the `mock_get_health_check` payload with its first check set to `warning`.

    The argument is forwarded unchanged to `mock_get_health_check`.
    """
    checks = mock_get_health_check(_)
    # Only the first entry is altered; every other check keeps its original fields.
    checks[0]["Status"] = "warning"
    checks[0]["Output"] = "disk usage high"
    return checks


def mock_get_health_check(_):
return [
{
Expand Down
33 changes: 33 additions & 0 deletions consul/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,39 @@ def test_health_checks(aggregator, collect_health_checks, expected_metric_count,
aggregator.assert_event(exact_match=False, count=expected_metric_count, **event)


@pytest.mark.parametrize(
    'health_check_warning_events, expected_warning_events',
    [
        pytest.param(True, 1, id="warning events enabled"),
        pytest.param(False, 0, id="warning events disabled"),
    ],
)
def test_health_check_warning_events(aggregator, health_check_warning_events, expected_warning_events):
    """Warning events are emitted only when `health_check_warning_events` is enabled.

    A second run with the same health-check payload must not add another
    warning event, whether the option is enabled or not.
    """

    def warning_events():
        # Only `consul.check_warning` events are of interest here.
        return [e for e in aggregator.events if e['event_type'] == 'consul.check_warning']

    config = consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG.copy()
    config['collect_health_checks'] = True
    config['health_check_warning_events'] = health_check_warning_events
    consul_check = ConsulCheck(common.CHECK_NAME, {}, [config])
    my_mocks = consul_mocks._get_consul_mocks()
    my_mocks['consul_request'] = consul_mocks.mock_get_health_check_with_warning
    consul_mocks.mock_check(consul_check, my_mocks)

    consul_check.check(None)
    events = warning_events()
    assert len(events) == expected_warning_events
    if expected_warning_events:
        assert events[0]['alert_type'] == 'warning'
        assert events[0]['msg_title'] == "Service 'server-loadbalancer' check Warning"
        # The expected text has no space between the service id and "warning";
        # this is the exact string the event is asserted to carry.
        assert (
            events[0]['msg_text']
            == "Check server-loadbalancer for service server-loadbalancer, id: server-loadbalancerwarning "
            "on node node-2: disk usage high"
        )

    # Same payload again: the event count must stay where it was after the first run.
    consul_check.check(None)
    assert len(warning_events()) == expected_warning_events


def test_service_checks_disable_service_tag(aggregator):
consul_check = ConsulCheck(common.CHECK_NAME, {}, [consul_mocks.MOCK_CONFIG_DISABLE_SERVICE_TAG])
my_mocks = consul_mocks._get_consul_mocks()
Expand Down
2 changes: 2 additions & 0 deletions ddev/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ e2e-env = false
dependencies = [
"pyyaml",
"vcrpy",
# vcrpy uses aiohttp.streams.AsyncStreamReaderMixin, removed in aiohttp 3.14
"aiohttp<3.14",
]
# TODO: remove this when the old CLI is gone
pre-install-commands = [
Expand Down
1 change: 1 addition & 0 deletions external_dns/changelog.d/23671.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add metrics `external_dns.controller.last_reconcile` and `external_dns.controller.consecutive.soft.errors`.
2 changes: 2 additions & 0 deletions external_dns/datadog_checks/external_dns/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@
'source_errors_total': 'source.errors.total',
'registry_errors_total': 'registry.errors.total',
'external_dns_controller_last_sync_timestamp_seconds': 'controller.last_sync',
'external_dns_controller_consecutive_soft_errors': 'controller.consecutive.soft.errors',
'external_dns_controller_last_reconcile_timestamp_seconds': 'controller.last_reconcile',
}
14 changes: 8 additions & 6 deletions external_dns/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric
external_dns.controller.last_sync,gauge,,second,,Timestamp of last successful sync with the DNS provider,0,external_dns,controller last sync timestamp,
external_dns.registry.endpoints.total,gauge,,resource,,Number of registry endpoints,0,external_dns,registry endpoints,
external_dns.registry.errors.total,gauge,,error,,Number of registry errors,-1,external_dns,registry errors,
external_dns.source.endpoints.total,gauge,,resource,,Number of source endpoints,0,external_dns,source endpoints,
external_dns.source.errors.total,gauge,,error,,Number of source errors,-1,external_dns,source errors,
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
external_dns.controller.consecutive.soft.errors,gauge,,error,,Number of consecutive soft errors in reconciliation loop,-1,external_dns,controller consecutive soft errors,,
external_dns.controller.last_reconcile,gauge,,second,,Timestamp of last reconcile attempt,0,external_dns,controller last reconcile timestamp,,
external_dns.controller.last_sync,gauge,,second,,Timestamp of last successful sync with the DNS provider,0,external_dns,controller last sync timestamp,,
external_dns.registry.endpoints.total,gauge,,resource,,Number of registry endpoints,0,external_dns,registry endpoints,,
external_dns.registry.errors.total,gauge,,error,,Number of registry errors,-1,external_dns,registry errors,,
external_dns.source.endpoints.total,gauge,,resource,,Number of source endpoints,0,external_dns,source endpoints,,
external_dns.source.errors.total,gauge,,error,,Number of source errors,-1,external_dns,source errors,,
8 changes: 7 additions & 1 deletion external_dns/tests/fixtures/metrics.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,10 @@ registry_errors_total 0
source_errors_total 0
# HELP external_dns_controller_last_sync_timestamp_seconds Timestamp of last successful sync with the DNS provider
# TYPE external_dns_controller_last_sync_timestamp_seconds gauge
external_dns_controller_last_sync_timestamp_seconds 1.6343090342347014e+09
external_dns_controller_last_sync_timestamp_seconds 1.6343090342347014e+09
# HELP external_dns_controller_consecutive_soft_errors Number of consecutive soft errors in reconciliation loop
# TYPE external_dns_controller_consecutive_soft_errors gauge
external_dns_controller_consecutive_soft_errors 0
# HELP external_dns_controller_last_reconcile_timestamp_seconds Timestamp of last reconcile attempt
# TYPE external_dns_controller_last_reconcile_timestamp_seconds gauge
external_dns_controller_last_reconcile_timestamp_seconds 1.715520123e+09
8 changes: 8 additions & 0 deletions tibco_ems/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,12 @@ files:
value:
type: string
require_trusted_provider: true
- name: use_ssl
description: |
Set to `true` to connect to the Tibco EMS server using SSL (`ssl://host:port`).
When `false`, the check uses `tcp://host:port`.
value:
type: boolean
default: false
example: true
- template: instances/default
1 change: 1 addition & 0 deletions tibco_ems/changelog.d/23732.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add new config `use_ssl` to connect via SSL instead of TCP.
4 changes: 4 additions & 0 deletions tibco_ems/datadog_checks/tibco_ems/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,7 @@ def instance_min_collection_interval():

def instance_port():
    """Return the default for the `port` instance option."""
    return 7222


def instance_use_ssl():
    """Return the default for the `use_ssl` instance option (disabled)."""
    return False
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class InstanceConfig(BaseModel):
service: Optional[str] = None
tags: Optional[tuple[str, ...]] = None
tibemsadmin: Optional[str] = None
use_ssl: Optional[bool] = None
username: Optional[str] = None

@model_validator(mode='before')
Expand Down
6 changes: 6 additions & 0 deletions tibco_ems/datadog_checks/tibco_ems/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ instances:
#
# tibemsadmin: <TIBEMSADMIN>

## @param use_ssl - boolean - optional - default: false
## Set to `true` to connect to the Tibco EMS server using SSL (`ssl://host:port`).
## When `false`, the check uses `tcp://host:port`.
#
# use_ssl: true

## @param tags - list of strings - optional
## A list of tags to attach to every metric and service check emitted by this instance.
##
Expand Down
11 changes: 8 additions & 3 deletions tibco_ems/datadog_checks/tibco_ems/tibco_ems.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
import subprocess
from typing import Any # noqa: F401

from datadog_checks.base import AgentCheck
from datadog_checks.base import AgentCheck, is_affirmative

from .constants import SHOW_METRIC_DATA, UNIT_PATTERN

DEFAULT_HOST = 'localhost'
DEFAULT_PORT = 7222
TO_BYTES = {'b': 1, 'kb': 1e3, 'mb': 1e6, 'gb': 1e9, 'tb': 1e12}
CONNECTION_STRING = 'tcp://{}:{}'
TCP_CONNECTION_STRING = 'tcp://{}:{}'
SSL_CONNECTION_STRING = 'ssl://{}:{}'


class TibcoEMSCheck(AgentCheck):
Expand All @@ -29,7 +30,11 @@ def __init__(self, name, init_config, instances):
username = self.instance.get('username')
password = self.instance.get('password')
script_path = self.instance.get('script_path')
server_string = CONNECTION_STRING.format(host, port)
use_ssl = is_affirmative(self.instance.get('use_ssl', False))
if not use_ssl:
server_string = TCP_CONNECTION_STRING.format(host, port)
else:
server_string = SSL_CONNECTION_STRING.format(host, port)
self.tags = self.instance.get('tags', [])

self.cmd = tibemsadmin_cmd + [
Expand Down
5 changes: 5 additions & 0 deletions tibco_ems/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,8 @@ def test_base_tags(dd_run_check, instance):

# assert the length of tags does not grow indefinitely
assert len(check.tags) == 3


def test_use_ssl_server_string(instance):
    """With `use_ssl` enabled, the command carries the `ssl://` server string, not `tcp://`."""
    check = TibcoEMSCheck('tibco_ems', {}, [{**instance, 'use_ssl': True}])
    assert 'ssl://localhost:7222' in check.cmd
    # The two schemes are mutually exclusive: the TCP form must not be present as well.
    assert 'tcp://localhost:7222' not in check.cmd
Loading