diff --git a/.github/workflows/assign-issue.yml b/.github/workflows/assign-issue.yml index 316c16ecc9fcf..5872b05a0c2d7 100644 --- a/.github/workflows/assign-issue.yml +++ b/.github/workflows/assign-issue.yml @@ -17,8 +17,7 @@ jobs: contents: read issues: write actions: write - id-token: write - uses: DataDog/issue-triage-action/.github/workflows/issue-triage.yml@ee200860ba35ba6ed3ad565e918afb16811fe975 # v1.0.0 + uses: DataDog/issue-triage-action/.github/workflows/issue-triage.yml@b39f0bc12abc52fe8aa70dc9b9353bf307a13219 # v1.0.1 with: slack_map_path: .github/workflows/config/github_slack_map.yaml environment_name: issue-assigner diff --git a/envoy/changelog.d/24237.added b/envoy/changelog.d/24237.added new file mode 100644 index 0000000000000..6ee50fc52801a --- /dev/null +++ b/envoy/changelog.d/24237.added @@ -0,0 +1 @@ +Add adaptive concurrency gradient controller metrics. diff --git a/envoy/datadog_checks/envoy/metrics.py b/envoy/datadog_checks/envoy/metrics.py index 9a8aea6d73730..bce35f2dfc824 100644 --- a/envoy/datadog_checks/envoy/metrics.py +++ b/envoy/datadog_checks/envoy/metrics.py @@ -387,6 +387,27 @@ 'envoy_http_rbac_denied': 'http.rbac_denied', 'envoy_http_rbac_shadow_allowed': 'http.rbac_shadow_allowed', 'envoy_http_rbac_shadow_denied': 'http.rbac_shadow_denied', + 'envoy_http_adaptive_concurrency_gradient_controller_rq_blocked': ( + 'http.adaptive_concurrency.gradient_controller.rq_blocked' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_min_rtt_calculation_active': ( + 'http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_concurrency_limit': ( + 'http.adaptive_concurrency.gradient_controller.concurrency_limit' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_gradient': ( + 'http.adaptive_concurrency.gradient_controller.gradient' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_burst_queue_size': ( + 'http.adaptive_concurrency.gradient_controller.burst_queue_size' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_min_rtt_msecs': ( + 'http.adaptive_concurrency.gradient_controller.min_rtt_msecs' + ), + 'envoy_http_adaptive_concurrency_gradient_controller_sample_rtt_msecs': ( + 'http.adaptive_concurrency.gradient_controller.sample_rtt_msecs' + ), 'envoy_http_local_rate_limit_enabled': 'http.local_rate_limit_enabled', 'envoy_http_local_rate_limit_enforced': 'http.local_rate_limit_enforced', 'envoy_http_local_rate_limit_rate_limited': 'http.local_rate_limit_rate_limited', @@ -4092,6 +4113,69 @@ ), 'method': 'monotonic_count', }, + 'http.adaptive_concurrency.gradient_controller.rq_blocked': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'monotonic_count', + }, + 'http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, + 'http.adaptive_concurrency.gradient_controller.concurrency_limit': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, + 'http.adaptive_concurrency.gradient_controller.gradient': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, + 'http.adaptive_concurrency.gradient_controller.burst_queue_size': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, + 'http.adaptive_concurrency.gradient_controller.min_rtt_msecs': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, + 'http.adaptive_concurrency.gradient_controller.sample_rtt_msecs': { + 'tags': ( + ('stat_prefix',), + (), + (), + (), + ), + 'method': 'gauge', + }, } # fmt: on diff --git a/envoy/metadata.csv b/envoy/metadata.csv index a11fdd89d5532..c0556c7cecc73 100644 --- a/envoy/metadata.csv +++ b/envoy/metadata.csv @@ -1029,3 +1029,11 @@ envoy.tls_inspector.sni.not_found.count,count,,,,[OpenMetrics V2] Total number o envoy.tls_inspector.bytes_processed.bucket,count,,,,[OpenMetrics V2] Records sizes which records the number of bytes the tls_inspector processed while analyzing for tls usage,0,envoy,, envoy.tls_inspector.bytes_processed.count,count,,,,[OpenMetrics V2] Count of records sizes which records the number of bytes the tls_inspector processed while analyzing for tls usage,0,envoy,, envoy.tls_inspector.bytes_processed.sum,count,,,,[OpenMetrics V2] Total sum of records sizes which records the number of bytes the tls_inspector processed while analyzing for tls usage,0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.rq_blocked,count,,request,,[Legacy] Total requests blocked by the adaptive concurrency filter.,-1,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.rq_blocked.count,count,,request,,[OpenMetrics V2] Total requests blocked by the adaptive concurrency filter.,-1,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active,gauge,,,,"[OpenMetrics V2] Set to 1 if the gradient controller is in the process of a minRTT calculation, and 0 otherwise.",0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.concurrency_limit,gauge,,request,,[OpenMetrics V2] The current concurrency limit.,0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.gradient,gauge,,,,"[OpenMetrics V2] The current gradient value multiplied by 1,000.",0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.burst_queue_size,gauge,,request,,[OpenMetrics V2] The current headroom value in the concurrency limit calculation.,0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.min_rtt_msecs,gauge,,millisecond,,[OpenMetrics V2] The current measured minimum round-trip time.,0,envoy,, +envoy.http.adaptive_concurrency.gradient_controller.sample_rtt_msecs,gauge,,millisecond,,[OpenMetrics V2] The current measured sample round-trip time aggregate.,0,envoy,, diff --git a/envoy/tests/common.py b/envoy/tests/common.py index 228eaa0a79537..eea2bd9a4e6f0 100644 --- a/envoy/tests/common.py +++ b/envoy/tests/common.py @@ -398,6 +398,28 @@ CONNECTION_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:ingress_http' +ADAPTIVE_CONCURRENCY_PROMETHEUS_COUNTER_METRIC = "http.adaptive_concurrency.gradient_controller.rq_blocked.count" + +ADAPTIVE_CONCURRENCY_PROMETHEUS_VALUES = { + ADAPTIVE_CONCURRENCY_PROMETHEUS_COUNTER_METRIC: 5, + "http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active": 0, + "http.adaptive_concurrency.gradient_controller.concurrency_limit": 30, + "http.adaptive_concurrency.gradient_controller.gradient": 1000, + "http.adaptive_concurrency.gradient_controller.burst_queue_size": 22, + "http.adaptive_concurrency.gradient_controller.min_rtt_msecs": 100, + "http.adaptive_concurrency.gradient_controller.sample_rtt_msecs": 95, +} + +ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS = list(ADAPTIVE_CONCURRENCY_PROMETHEUS_VALUES.keys()) + +ADAPTIVE_CONCURRENCY_PROMETHEUS_STAT_PREFIX_TAG = 'stat_prefix:ingress_http' + +ADAPTIVE_CONCURRENCY_PROMETHEUS_GAUGE_VALUES = { + metric: value + for metric, value in ADAPTIVE_CONCURRENCY_PROMETHEUS_VALUES.items() + if metric != ADAPTIVE_CONCURRENCY_PROMETHEUS_COUNTER_METRIC +} + TLS_INSPECTOR_METRICS = [ "tls_inspector.client_hello_too_large.count", "tls_inspector.tls.found.count", diff --git a/envoy/tests/docker/api_v2/front-envoy.yaml b/envoy/tests/docker/api_v2/front-envoy.yaml index d99116ae561fa..f6b953d3b0b85 100644 --- a/envoy/tests/docker/api_v2/front-envoy.yaml +++ b/envoy/tests/docker/api_v2/front-envoy.yaml @@ -24,6 +24,22 @@ static_resources: route: cluster: service2 http_filters: + - name: envoy.filters.http.adaptive_concurrency + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.adaptive_concurrency.v3.AdaptiveConcurrency + gradient_controller_config: + sample_aggregate_percentile: + value: 90 + concurrency_limit_params: + concurrency_update_interval: 0.1s + min_rtt_calc_params: + jitter: + value: 10 + interval: 60s + request_count: 50 + enabled: + default_value: true + runtime_key: "adaptive_concurrency.enabled" - name: envoy.router clusters: - name: service1 diff --git a/envoy/tests/docker/api_v3/front-envoy.yaml b/envoy/tests/docker/api_v3/front-envoy.yaml index 24b6590fd1b78..682f8e882b095 100644 --- a/envoy/tests/docker/api_v3/front-envoy.yaml +++ b/envoy/tests/docker/api_v3/front-envoy.yaml @@ -132,6 +132,22 @@ static_resources: grpc_service: envoy_grpc: cluster_name: ext-proc + - name: envoy.filters.http.adaptive_concurrency + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.adaptive_concurrency.v3.AdaptiveConcurrency + gradient_controller_config: + sample_aggregate_percentile: + value: 90 + concurrency_limit_params: + concurrency_update_interval: 0.1s + min_rtt_calc_params: + jitter: + value: 10 + interval: 60s + request_count: 50 + enabled: + default_value: true + runtime_key: "adaptive_concurrency.enabled" - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router diff --git a/envoy/tests/fixtures/legacy/adaptive_concurrency.txt b/envoy/tests/fixtures/legacy/adaptive_concurrency.txt new file mode 100644 index 0000000000000..2060fc44b8ce0 --- /dev/null +++ b/envoy/tests/fixtures/legacy/adaptive_concurrency.txt @@ -0,0 +1,7 @@ +http.ingress_http.adaptive_concurrency.gradient_controller.rq_blocked: 5 +http.ingress_http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active: 1 +http.ingress_http.adaptive_concurrency.gradient_controller.concurrency_limit: 30 +http.ingress_http.adaptive_concurrency.gradient_controller.gradient: 1000 +http.ingress_http.adaptive_concurrency.gradient_controller.burst_queue_size: 22 +http.ingress_http.adaptive_concurrency.gradient_controller.min_rtt_msecs: 100 +http.ingress_http.adaptive_concurrency.gradient_controller.sample_rtt_msecs: 95 diff --git a/envoy/tests/fixtures/openmetrics/openmetrics.txt b/envoy/tests/fixtures/openmetrics/openmetrics.txt index d2882e2a89bfe..54bbfb668a0b4 100644 --- a/envoy/tests/fixtures/openmetrics/openmetrics.txt +++ b/envoy/tests/fixtures/openmetrics/openmetrics.txt @@ -1377,3 +1377,17 @@ envoy_tls_inspector_bytes_processed_bucket{le="3600000"} 0 envoy_tls_inspector_bytes_processed_bucket{le="+Inf"} 0 envoy_tls_inspector_bytes_processed_sum{} 0 envoy_tls_inspector_bytes_processed_count{} 0 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_rq_blocked counter +envoy_http_adaptive_concurrency_gradient_controller_rq_blocked{envoy_http_conn_manager_prefix="ingress_http"} 5 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_min_rtt_calculation_active gauge +envoy_http_adaptive_concurrency_gradient_controller_min_rtt_calculation_active{envoy_http_conn_manager_prefix="ingress_http"} 0 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_concurrency_limit gauge +envoy_http_adaptive_concurrency_gradient_controller_concurrency_limit{envoy_http_conn_manager_prefix="ingress_http"} 30 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_gradient gauge +envoy_http_adaptive_concurrency_gradient_controller_gradient{envoy_http_conn_manager_prefix="ingress_http"} 1000 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_burst_queue_size gauge +envoy_http_adaptive_concurrency_gradient_controller_burst_queue_size{envoy_http_conn_manager_prefix="ingress_http"} 22 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_min_rtt_msecs gauge +envoy_http_adaptive_concurrency_gradient_controller_min_rtt_msecs{envoy_http_conn_manager_prefix="ingress_http"} 100 +# TYPE envoy_http_adaptive_concurrency_gradient_controller_sample_rtt_msecs gauge +envoy_http_adaptive_concurrency_gradient_controller_sample_rtt_msecs{envoy_http_conn_manager_prefix="ingress_http"} 95 diff --git a/envoy/tests/legacy/common.py b/envoy/tests/legacy/common.py index ddeec416d1b82..1c3dc144b180c 100644 --- a/envoy/tests/legacy/common.py +++ b/envoy/tests/legacy/common.py @@ -90,3 +90,17 @@ ] RBAC_METRICS = RBAC_ENFORCE_METRICS + RBAC_SHADOW_METRICS + +ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG = ['stat_prefix:ingress_http'] + +ADAPTIVE_CONCURRENCY_METRIC_VALUES = { + "envoy.http.adaptive_concurrency.gradient_controller.rq_blocked": 5, + "envoy.http.adaptive_concurrency.gradient_controller.min_rtt_calculation_active": 1, + "envoy.http.adaptive_concurrency.gradient_controller.concurrency_limit": 30, + "envoy.http.adaptive_concurrency.gradient_controller.gradient": 1000, + "envoy.http.adaptive_concurrency.gradient_controller.burst_queue_size": 22, + "envoy.http.adaptive_concurrency.gradient_controller.min_rtt_msecs": 100, + "envoy.http.adaptive_concurrency.gradient_controller.sample_rtt_msecs": 95, +} + +ADAPTIVE_CONCURRENCY_METRICS = list(ADAPTIVE_CONCURRENCY_METRIC_VALUES.keys()) diff --git a/envoy/tests/legacy/test_e2e.py b/envoy/tests/legacy/test_e2e.py index 0a2dc2f8c4b63..1ac898b22a108 100644 --- a/envoy/tests/legacy/test_e2e.py +++ b/envoy/tests/legacy/test_e2e.py @@ -7,7 +7,7 @@ from datadog_checks.dev.utils import get_metadata_metrics from datadog_checks.envoy import Envoy -from .common import FLAVOR, HOST +from .common import ADAPTIVE_CONCURRENCY_METRICS, ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG, FLAVOR, HOST pytestmark = [pytest.mark.e2e] @@ -288,6 +288,11 @@ def test_e2e(dd_agent_check, exercise_envoy): for metric in METRICS: aggregator.assert_metric(metric) + for metric in ADAPTIVE_CONCURRENCY_METRICS: + aggregator.assert_metric(metric) + for tag in ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG: + aggregator.assert_metric_has_tag(metric, tag) + if FLAVOR == 'api_v2': for metric in METRICS_V2: aggregator.assert_metric(metric) diff --git a/envoy/tests/legacy/test_integration.py b/envoy/tests/legacy/test_integration.py index 60d6942df21eb..11d8f174ffca6 100644 --- a/envoy/tests/legacy/test_integration.py +++ b/envoy/tests/legacy/test_integration.py @@ -7,7 +7,14 @@ from datadog_checks.dev.utils import get_metadata_metrics from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS -from .common import ENVOY_VERSION, EXT_AUTHZ_METRICS, INSTANCES, RBAC_METRICS +from .common import ( + ADAPTIVE_CONCURRENCY_METRICS, + ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG, + ENVOY_VERSION, + EXT_AUTHZ_METRICS, + INSTANCES, + RBAC_METRICS, +) CHECK_NAME = 'envoy' UNIQUE_METRICS = EXT_AUTHZ_METRICS + RBAC_METRICS @@ -53,6 +60,20 @@ def test_success(aggregator, check, dd_run_check): aggregator.assert_metrics_using_metadata(metadata_metrics) +def test_adaptive_concurrency_metrics(aggregator, check, dd_run_check): + instance = INSTANCES['main'] + c = check(instance) + dd_run_check(c) + + # Guard against a vacuous pass: the controller needs real traffic to emit stats. + assert any(metric in aggregator.metric_names for metric in ADAPTIVE_CONCURRENCY_METRICS) + + for metric in ADAPTIVE_CONCURRENCY_METRICS: + aggregator.assert_metric(metric) + for tag in ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG: + aggregator.assert_metric_has_tag(metric, tag) + + def test_metadata_integration(datadog_agent, check): instance = INSTANCES['main'] c = check(instance) diff --git a/envoy/tests/legacy/test_unit.py b/envoy/tests/legacy/test_unit.py index b43b531857fc6..cb92a5e48eb25 100644 --- a/envoy/tests/legacy/test_unit.py +++ b/envoy/tests/legacy/test_unit.py @@ -12,6 +12,9 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + ADAPTIVE_CONCURRENCY_METRIC_VALUES, + ADAPTIVE_CONCURRENCY_METRICS, + ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG, CONNECTION_LIMIT_METRICS, CONNECTION_LIMIT_STAT_PREFIX_TAG, ENVOY_VERSION, @@ -351,3 +354,19 @@ def test_connection_limit_metrics(aggregator, fixture_path, mock_http_response, aggregator.assert_metric_has_tag(metric, tag, count=1) aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + + +def test_adaptive_concurrency_metrics(aggregator, fixture_path, mock_http_response, check, dd_run_check): + instance = INSTANCES['main'] + c = check(instance) + + mock_http_response(file_path=fixture_path('./legacy/adaptive_concurrency.txt')) + dd_run_check(c) + + # Pin the fixture values so a wrong mapping or metric type would be caught, not just a wrong name. + for metric in ADAPTIVE_CONCURRENCY_METRICS: + aggregator.assert_metric(metric, value=ADAPTIVE_CONCURRENCY_METRIC_VALUES[metric]) + for tag in ADAPTIVE_CONCURRENCY_STAT_PREFIX_TAG: + aggregator.assert_metric_has_tag(metric, tag, count=1) + + aggregator.assert_metrics_using_metadata(get_metadata_metrics()) diff --git a/envoy/tests/test_e2e.py b/envoy/tests/test_e2e.py index 69839ea093cd7..7c485350467b9 100644 --- a/envoy/tests/test_e2e.py +++ b/envoy/tests/test_e2e.py @@ -8,6 +8,7 @@ from datadog_checks.envoy import Envoy from .common import ( + ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS, CONNECTION_LIMIT_METRICS, DEFAULT_INSTANCE, FLAKY_METRICS, @@ -24,7 +25,13 @@ def test_e2e(dd_agent_check, exercise_envoy): aggregator = dd_agent_check(DEFAULT_INSTANCE, rate=True) - for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS + TLS_INSPECTOR_METRICS: + for metric in ( + PROMETHEUS_METRICS + + LOCAL_RATE_LIMIT_METRICS + + CONNECTION_LIMIT_METRICS + + TLS_INSPECTOR_METRICS + + ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS + ): formatted_metric = "envoy.{}".format(metric) if metric in FLAKY_METRICS: aggregator.assert_metric(formatted_metric, at_least=0) diff --git a/envoy/tests/test_integration.py b/envoy/tests/test_integration.py index 40f26db186439..91ac13e79514a 100644 --- a/envoy/tests/test_integration.py +++ b/envoy/tests/test_integration.py @@ -9,6 +9,7 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS, CONNECTION_LIMIT_METRICS, DEFAULT_INSTANCE, ENVOY_VERSION, @@ -38,7 +39,13 @@ def test_check(aggregator, dd_run_check, check): dd_run_check(c) dd_run_check(c) - for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS + TLS_INSPECTOR_METRICS: + for metric in ( + PROMETHEUS_METRICS + + LOCAL_RATE_LIMIT_METRICS + + CONNECTION_LIMIT_METRICS + + TLS_INSPECTOR_METRICS + + ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS + ): formatted_metric = "envoy.{}".format(metric) if metric in FLAKY_METRICS: aggregator.assert_metric(formatted_metric, at_least=0) @@ -61,6 +68,35 @@ def test_check(aggregator, dd_run_check, check): ) +def test_adaptive_concurrency_symmetric(aggregator, dd_run_check, check): + """Verify both collection directions for adaptive concurrency metrics. + + Uses a scoped metadata subset to avoid the ~650 exclusions a global + check_symmetric_inclusion=True would need. + """ + c = check(DEFAULT_INSTANCE) + dd_run_check(c) + dd_run_check(c) + + # The legacy check emits 'rq_blocked'; OpenMetrics emits 'rq_blocked.count'. This test scrapes /stats/prometheus. + legacy_only = 'envoy.http.adaptive_concurrency.gradient_controller.rq_blocked' + adaptive_metadata = { + k: v for k, v in get_metadata_metrics().items() if 'adaptive_concurrency' in k and k != legacy_only + } + + # Guard against a vacuous pass: the controller needs real traffic to emit stats. + assert any('adaptive_concurrency' in m for m in aggregator.metric_names) + + # Restrict the forward direction to adaptive concurrency metrics. + non_adaptive = [m for m in aggregator.metric_names if 'adaptive_concurrency' not in m] + + aggregator.assert_metrics_using_metadata( + adaptive_metadata, + check_symmetric_inclusion=True, + exclude=non_adaptive, + ) + + def test_metadata_integration(dd_run_check, datadog_agent, check): c = check(DEFAULT_INSTANCE) c.check_id = 'test:123' diff --git a/envoy/tests/test_unit.py b/envoy/tests/test_unit.py index dfa88c05efa24..58d1561a03a18 100644 --- a/envoy/tests/test_unit.py +++ b/envoy/tests/test_unit.py @@ -9,6 +9,9 @@ from datadog_checks.envoy.metrics import PROMETHEUS_METRICS_MAP from .common import ( + ADAPTIVE_CONCURRENCY_PROMETHEUS_GAUGE_VALUES, + ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS, + ADAPTIVE_CONCURRENCY_PROMETHEUS_STAT_PREFIX_TAG, CLUSTER_AND_LISTENER_SSL_METRICS, CONNECT_STATE_METRIC, CONNECTION_LIMIT_METRICS, @@ -53,6 +56,14 @@ def test_check(aggregator, dd_run_check, check, mock_http_response): aggregator.assert_metric('envoy.{}'.format(metric)) aggregator.assert_metric_has_tag('envoy.{}'.format(metric), CONNECTION_LIMIT_STAT_PREFIX_TAG) + for metric in ADAPTIVE_CONCURRENCY_PROMETHEUS_METRICS: + aggregator.assert_metric('envoy.{}'.format(metric)) + aggregator.assert_metric_has_tag('envoy.{}'.format(metric), ADAPTIVE_CONCURRENCY_PROMETHEUS_STAT_PREFIX_TAG) + + # Pin the gauge fixture values so a wrong mapping (e.g. a x1000-scaled gradient) would be caught. + for metric, value in ADAPTIVE_CONCURRENCY_PROMETHEUS_GAUGE_VALUES.items(): + aggregator.assert_metric('envoy.{}'.format(metric), value=value) + aggregator.assert_service_check( "envoy.openmetrics.health", status=AgentCheck.OK, tags=['endpoint:http://localhost:8001/stats/prometheus'] ) diff --git a/istio/changelog.d/23396.fixed b/istio/changelog.d/23396.fixed new file mode 100644 index 0000000000000..ce4a6dac038fe --- /dev/null +++ b/istio/changelog.d/23396.fixed @@ -0,0 +1 @@ +Fix collisions between gauge and counter metrics that share a name after the OpenMetrics V2 parser strips the ``_total`` suffix. ``istio.go.memstats.alloc_bytes`` (gauge) and ``istio.go.memstats.alloc_bytes.count`` (counter), and their ``istio.mesh.agent.go.memstats.alloc_bytes`` counterparts, are now both collected. diff --git a/istio/datadog_checks/istio/metrics.py b/istio/datadog_checks/istio/metrics.py index 6d447a7785d77..a47a1f3233157 100644 --- a/istio/datadog_checks/istio/metrics.py +++ b/istio/datadog_checks/istio/metrics.py @@ -497,12 +497,28 @@ # Helper function that will strip _total from both the raw metric name and the metric name def construct_metrics_config(metric_map): metrics = [] + + # Metrics where both gauge (X) and counter (X_total) exist must use native_dynamic + # to avoid the OpenMetrics V2 parser locking to the wrong type after stripping _total. + dynamic_metrics = { + name + for name in metric_map + if not name.endswith('_total') + and '{}_total'.format(name) in metric_map + and '{}_total'.format(name) not in NON_CONFORMING_METRICS + } + for raw_metric_name, metric_name in metric_map.items(): if raw_metric_name.endswith('_total') and raw_metric_name not in NON_CONFORMING_METRICS: - raw_metric_name = raw_metric_name[:-6] + base_name = raw_metric_name[:-6] + if base_name in dynamic_metrics: + continue + raw_metric_name = base_name metric_name = metric_name[:-6] config = {raw_metric_name: {'name': metric_name}} + if raw_metric_name in dynamic_metrics: + config[raw_metric_name]['type'] = 'native_dynamic' metrics.append(config) return metrics diff --git a/istio/metadata.csv b/istio/metadata.csv index 45fa2066634b1..b7f26c5ae8a89 100644 --- a/istio/metadata.csv +++ b/istio/metadata.csv @@ -297,6 +297,7 @@ istio.go.gc_duration_seconds.sum,count,,second,,[OpenMetrics V1 and V2 and Istio istio.go.goroutines,gauge,,thread,,[OpenMetrics V1 and V2 and Istio v1.5+] Number of goroutines that currently exist.,0,istio,, istio.go.info,gauge,,,,[OpenMetrics V1 and V2 and Istio v1.5+] Information about the Go environment.,0,istio,, istio.go.memstats.alloc_bytes,gauge,,byte,,[OpenMetrics V1 and V2 and Istio v1.5+] Number of bytes allocated and still in use.,0,istio,, +istio.go.memstats.alloc_bytes.count,count,,byte,,[OpenMetrics V2 and Istio v1.5+] Total number of bytes allocated even if freed.,0,istio,, istio.go.memstats.alloc_bytes_total,count,,byte,,[OpenMetrics V1 and V2 and Istio v1.5+] Total number of bytes allocated even if freed.,0,istio,, istio.go.memstats.buck_hash_sys_bytes,gauge,,byte,,[OpenMetrics V1 and V2 and Istio v1.5+] Number of bytes used by the profiling bucket hash table.,0,istio,, istio.go.memstats.frees_total,count,,byte,,[OpenMetrics V1 and V2 and Istio v1.5+] Total number of frees.,0,istio,, @@ -363,6 +364,7 @@ istio.mesh.agent.conflict.inbound_listener,gauge,,,,[OpenMetrics V1 and V2] Numb istio.mesh.agent.go.memstats.sys_bytes,gauge,,byte,,[OpenMetrics V1 and V2] Number of bytes obtained from system,0,istio,, istio.mesh.agent.pilot.xds,gauge,,,,[OpenMetrics V1 and V2] Number of endpoints connected to this pilot using XDS.,0,istio,, istio.mesh.agent.go.memstats.alloc_bytes,gauge,,byte,,[OpenMetrics V1 and V2] Number of bytes allocated and still in use.,0,istio,, +istio.mesh.agent.go.memstats.alloc_bytes.count,count,,byte,,[OpenMetrics V2] Total number of bytes allocated even if freed.,0,istio,, istio.mesh.agent.go.memstats.heap_idle_bytes,gauge,,byte,,[OpenMetrics V1 and V2] Number of idle bytes in the heap.,0,istio,, istio.mesh.agent.process.resident_memory_bytes,gauge,,byte,,[OpenMetrics V1 and V2] Resident memory size in bytes.,0,istio,, istio.mesh.agent.conflict.outbound_listener.tcp_over_current_tcp,gauge,,,,[OpenMetrics V1 and V2] Number of conflicting tcp listeners with current tcp listener.,-1,istio,, diff --git a/istio/tests/common.py b/istio/tests/common.py index bba05a7bb2cf2..ee679d35fd8a6 100644 --- a/istio/tests/common.py +++ b/istio/tests/common.py @@ -236,6 +236,7 @@ 'istio.go.goroutines', 'istio.go.info', 'istio.go.memstats.alloc_bytes', + 'istio.go.memstats.alloc_bytes.count', 'istio.go.memstats.buck_hash_sys_bytes', 'istio.go.memstats.frees.count', 'istio.go.memstats.gc_cpu_fraction', @@ -314,6 +315,7 @@ 'istio.mesh.agent.go.memstats.sys_bytes', 'istio.mesh.agent.pilot.xds', 'istio.mesh.agent.go.memstats.alloc_bytes', + 'istio.mesh.agent.go.memstats.alloc_bytes.count', 'istio.mesh.agent.go.memstats.heap_idle_bytes', 'istio.mesh.agent.process.resident_memory_bytes', 'istio.mesh.agent.conflict.outbound_listener.tcp_over_current_tcp', diff --git a/istio/tests/test_metrics_config.py b/istio/tests/test_metrics_config.py new file mode 100644 index 0000000000000..66182780f9ecf --- /dev/null +++ b/istio/tests/test_metrics_config.py @@ -0,0 +1,71 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from datadog_checks.istio.metrics import NON_CONFORMING_METRICS, construct_metrics_config + + +def _entries_for(metrics, key): + return [next(iter(cfg.values())) for cfg in metrics if key in cfg] + + +def test_pair_uses_native_dynamic_and_drops_total_entry(): + metrics = construct_metrics_config( + { + 'go_memstats_alloc_bytes': 'go.memstats.alloc_bytes', + 'go_memstats_alloc_bytes_total': 'go.memstats.alloc_bytes_total', + } + ) + + assert metrics == [{'go_memstats_alloc_bytes': {'name': 'go.memstats.alloc_bytes', 'type': 'native_dynamic'}}] + + +def test_lone_total_is_stripped_and_has_no_explicit_type(): + metrics = construct_metrics_config({'foo_bar_total': 'foo.bar_total'}) + + assert metrics == [{'foo_bar': {'name': 'foo.bar'}}] + + +def test_non_total_only_metric_is_passed_through(): + metrics = construct_metrics_config({'foo_bar': 'foo.bar'}) + + assert metrics == [{'foo_bar': {'name': 'foo.bar'}}] + + +def test_non_conforming_total_is_preserved(): + non_conforming = NON_CONFORMING_METRICS[0] + + metrics = construct_metrics_config({non_conforming: 'preserved.name_total'}) + + assert metrics == [{non_conforming: {'name': 'preserved.name_total'}}] + + +def test_pair_where_total_is_non_conforming_is_not_dynamic(): + non_conforming = NON_CONFORMING_METRICS[0] + base = non_conforming[:-6] + + metrics = construct_metrics_config( + { + base: 'base.name', + non_conforming: 'base.name_total', + } + ) + + base_entries = _entries_for(metrics, base) + total_entries = _entries_for(metrics, non_conforming) + assert base_entries == [{'name': 'base.name'}] + assert total_entries == [{'name': 'base.name_total'}] + + +def test_multiple_pairs_are_each_dynamic(): + metrics = construct_metrics_config( + { + 'a': 'a', + 'a_total': 'a_total', + 'b': 'b', + 'b_total': 'b_total', + } + ) + + assert {'a': {'name': 'a', 'type': 'native_dynamic'}} in metrics + assert {'b': {'name': 'b', 'type': 'native_dynamic'}} in metrics + assert not any('a_total' in cfg or 'b_total' in cfg for cfg in metrics) diff --git a/istio/tests/test_unit_istio_v2.py b/istio/tests/test_unit_istio_v2.py index a43be3b52ad92..3bb0f434e6265 100644 --- a/istio/tests/test_unit_istio_v2.py +++ b/istio/tests/test_unit_istio_v2.py @@ -27,6 +27,11 @@ def test_istiod(aggregator, dd_run_check, mock_http_response): for metric in common.ISTIOD_V2_METRICS: aggregator.assert_metric(metric) + aggregator.assert_metric('istio.go.memstats.alloc_bytes', value=2.9097592e07, metric_type=aggregator.GAUGE) + aggregator.assert_metric( + 'istio.go.memstats.alloc_bytes.count', value=1.123329752e09, metric_type=aggregator.MONOTONIC_COUNT + ) + aggregator.assert_metrics_using_metadata(get_metadata_metrics(), check_submission_type=True) aggregator.assert_all_metrics_covered() @@ -118,6 +123,15 @@ def test_istio_agent(aggregator, dd_run_check, mock_http_response): for metric in common.ISTIO_AGENT_METRICS: aggregator.assert_metric(metric) + aggregator.assert_metric( + 'istio.mesh.agent.go.memstats.alloc_bytes', value=7.647864e06, metric_type=aggregator.GAUGE + ) + aggregator.assert_metric( + 'istio.mesh.agent.go.memstats.alloc_bytes.count', + value=2.260668e07, + metric_type=aggregator.MONOTONIC_COUNT, + ) + aggregator.assert_metrics_using_metadata(get_metadata_metrics(), check_submission_type=True) diff --git a/mysql/changelog.d/24178.fixed b/mysql/changelog.d/24178.fixed new file mode 100644 index 0000000000000..23269d78f189d --- /dev/null +++ b/mysql/changelog.d/24178.fixed @@ -0,0 +1 @@ +Fix MariaDB multi-channel replication reporting 0 channels by using SHOW ALL REPLICAS STATUS when no specific channel is configured. diff --git a/mysql/datadog_checks/mysql/mysql.py b/mysql/datadog_checks/mysql/mysql.py index 099ad51c16f9d..c95b27d092045 100644 --- a/mysql/datadog_checks/mysql/mysql.py +++ b/mysql/datadog_checks/mysql/mysql.py @@ -1144,7 +1144,12 @@ def _get_replica_stats(self, db): for replica in replica_status: # MySQL <5.7 does not have Channel_Name. # For MySQL >=5.7 'Channel_Name' is set to an empty string by default - channel = self._config.replication_channel or replica.get('Channel_Name') or 'default' + channel = ( + self._config.replication_channel + or replica.get('Channel_Name') + or replica.get('Connection_name') + or 'default' + ) for key, value in replica.items(): if value is not None: replica_results[key]['channel:{0}'.format(channel)] = value diff --git a/mysql/datadog_checks/mysql/queries.py b/mysql/datadog_checks/mysql/queries.py index 76bd25bec4170..9c1d1e30e2c5d 100644 --- a/mysql/datadog_checks/mysql/queries.py +++ b/mysql/datadog_checks/mysql/queries.py @@ -288,13 +288,17 @@ def show_replica_status_query(version, is_mariadb: bool, channel: str = '') -> tuple[str, tuple[str, ...]]: if version.version_compatible((10, 5, 1)) or not is_mariadb and version.version_compatible((8, 0, 22)): - base_query = "SHOW REPLICA STATUS" + replica_keyword = "REPLICA" else: - base_query = "SHOW SLAVE STATUS" + replica_keyword = "SLAVE" + base_query = "SHOW {0} STATUS".format(replica_keyword) if channel and not is_mariadb: return ("{0} FOR CHANNEL %s".format(base_query), (channel,)) + elif is_mariadb and not channel: + # MariaDB uses Connection_name (not Channel_Name) to identify channels. + return ("SHOW ALL {0}S STATUS".format(replica_keyword), ()) else: - return ("{0}".format(base_query), ()) + return (base_query, ()) def get_indexes_query(version, is_mariadb, placeholders): diff --git a/mysql/tests/test_unit.py b/mysql/tests/test_unit.py index f99d0b6d93b46..9a601b0f68343 100644 --- a/mysql/tests/test_unit.py +++ b/mysql/tests/test_unit.py @@ -704,6 +704,20 @@ def test_collect_replication_metrics_returns_vars_when_has_replicas_connected(): assert results.get('Replicas_connected') == 2 +def test_get_replica_stats_tags_each_mariadb_connection(): + """Each MariaDB Connection_name maps to its own channel tag in _get_replica_stats.""" + mysql_check = MySql(common.CHECK_NAME, {}, instances=[{'server': 'localhost', 'user': 'datadog'}]) + mysql_check._config.replication_enabled = True + mysql_check._get_replica_replication_status = mock.MagicMock( + return_value=[ + {'Connection_name': 'conn_a', 'Seconds_Behind_Master': 1}, + {'Connection_name': 'conn_b', 'Seconds_Behind_Master': 2}, + ] + ) + results = mysql_check._get_replica_stats(mock.MagicMock()) + assert results['Seconds_Behind_Master'] == {'channel:conn_a': 1, 'channel:conn_b': 2} + + def test_source_with_zero_replicas_emits_warning_service_check(aggregator, instance_basic): """Test that a source with 0 connected replicas emits WARNING for replica-loss detection.""" mysql_check = MySql(common.CHECK_NAME, {}, instances=[instance_basic]) @@ -777,10 +791,28 @@ class TestShowReplicaStatusQuery: 'my-channel', 'SHOW REPLICA STATUS', (), - id='mariadb_ignores_channel', + id='mariadb_modern_with_channel', ), pytest.param( - '10.4.0-MariaDB', 'MariaDB', True, '', 'SHOW SLAVE STATUS', (), id='mariadb_legacy_no_channel' + '10.5.1-MariaDB', + 'MariaDB', + True, + '', + 'SHOW ALL REPLICAS STATUS', + (), + id='mariadb_modern_no_channel', + ), + pytest.param( + '10.4.0-MariaDB', 'MariaDB', True, '', 'SHOW ALL SLAVES STATUS', (), id='mariadb_legacy_no_channel' + ), + pytest.param( + '10.4.0-MariaDB', + 'MariaDB', + True, + 'my-channel', + 'SHOW SLAVE STATUS', + (), + id='mariadb_legacy_with_channel', ), ], )