From 9a8c93c50ada0070b133229609f580198032b761 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Mon, 8 Sep 2025 15:27:43 +0100 Subject: [PATCH 01/13] Add a KindLoad context manager that allows loading custom docker images into a kind cluster (#21288) * Add a KindLoad context manager that allows loading custom docker images into a kind cluster * Extract wrapper setup in kind_run * Add changelog * Fix changelog number * Fix changelog number * Clean up * Remove deprecated ContestManager type --- datadog_checks_dev/changelog.d/21288.added | 1 + datadog_checks_dev/datadog_checks/dev/kind.py | 61 ++++++++++++++++++- datadog_checks_dev/tests/test_kind.py | 55 ++++++++++++++++- 3 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 datadog_checks_dev/changelog.d/21288.added diff --git a/datadog_checks_dev/changelog.d/21288.added b/datadog_checks_dev/changelog.d/21288.added new file mode 100644 index 0000000000000..ae31bd602bb41 --- /dev/null +++ b/datadog_checks_dev/changelog.d/21288.added @@ -0,0 +1 @@ +Add KindLoad wrapper utility that allows to load Docker images into kind for testing \ No newline at end of file diff --git a/datadog_checks_dev/datadog_checks/dev/kind.py b/datadog_checks_dev/datadog_checks/dev/kind.py index f0a7cda5d13f5..02cde0ef2d784 100644 --- a/datadog_checks_dev/datadog_checks/dev/kind.py +++ b/datadog_checks_dev/datadog_checks/dev/kind.py @@ -1,8 +1,11 @@ # (C) Datadog, Inc. 
2019-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from __future__ import annotations + from contextlib import contextmanager from shutil import which +from typing import TYPE_CHECKING import pytest @@ -12,6 +15,28 @@ from .subprocess import run_command from .utils import get_active_env, get_current_check_name +if TYPE_CHECKING: + from contextlib import AbstractContextManager + from typing import Self + + +def _setup_wrappers(wrappers: list[AbstractContextManager] | None, cluster_name: str): + """Set up wrappers with cluster-specific configuration. + + :param wrappers: List of wrapper instances to configure + :param cluster_name: The name of the Kind cluster + """ + if not wrappers: + return + + for wrapper in wrappers: + match wrapper: + case KindLoad(): + wrapper.cluster_name = cluster_name + case _: + # No special setup needed for other wrapper types + pass + @contextmanager def kind_run( @@ -63,6 +88,9 @@ def kind_run( set_up = KindUp(cluster_name, kind_config) tear_down = KindDown(cluster_name) + # Set up wrappers with cluster-specific configuration + _setup_wrappers(wrappers, cluster_name) + with environment_run( up=set_up, down=tear_down, @@ -82,7 +110,7 @@ class KindUp(LazyFunction): `kind create cluster --name -cluster` """ - def __init__(self, cluster_name, kind_config): + def __init__(self, cluster_name: str, kind_config: str | None): self.cluster_name = cluster_name self.kind_config = kind_config @@ -100,8 +128,37 @@ def __call__(self): class KindDown(LazyFunction): """Delete the kind cluster, calling `delete cluster`.""" - def __init__(self, cluster_name): + def __init__(self, cluster_name: str): self.cluster_name = cluster_name def __call__(self): run_command(['kind', 'delete', 'cluster', '--name', self.cluster_name], check=True) + + +class KindLoad: + """Context manager for loading Docker images into a Kind cluster. 
+ + This context manager should be passed to the wrappers argument in environment_run + to load images into the Kind cluster after it's created. + + Example: + with kind_run(wrappers=[KindLoad("my-image:latest")]): + # The image is now loaded in the kind cluster + pass + """ + + def __init__(self, image: str): + self.image = image + self.cluster_name: str | None = None + + def __enter__(self) -> Self: + if self.cluster_name is None: + raise RuntimeError("cluster_name must be set before entering KindLoad context") + + load_cmd = ['kind', 'load', 'docker-image', self.image, '--name', self.cluster_name] + run_command(load_cmd, check=True) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit the context manager (no cleanup needed for image loading).""" + pass diff --git a/datadog_checks_dev/tests/test_kind.py b/datadog_checks_dev/tests/test_kind.py index 6dae58ade8773..5c91da8cddac7 100644 --- a/datadog_checks_dev/tests/test_kind.py +++ b/datadog_checks_dev/tests/test_kind.py @@ -6,7 +6,7 @@ from mock.mock import MagicMock, patch from datadog_checks.dev.ci import running_on_ci -from datadog_checks.dev.kind import kind_run +from datadog_checks.dev.kind import KindLoad, kind_run from .common import not_windows_ci @@ -58,3 +58,56 @@ def test_retry_condition_failed_only_on_first_run(self): pass assert condition.call_count == 2 + + +class TestKindLoad: + def test_kind_load_context_manager_without_cluster_name(self): + kind_load = KindLoad("test-image:latest") + + with pytest.raises(RuntimeError, match="cluster_name must be set before entering KindLoad context"): + with kind_load: + pass + + @patch('datadog_checks.dev.kind.run_command') + def test_kind_load_context_manager_with_cluster_name(self, mock_run_command): + """Test that KindLoad calls the correct kind load command when cluster_name is set.""" + image = "test-image:latest" + cluster_name = "test-cluster" + kind_load = KindLoad(image) + kind_load.cluster_name = cluster_name + + with kind_load as 
ctx: + assert ctx is kind_load + + mock_run_command.assert_called_once_with( + ['kind', 'load', 'docker-image', image, '--name', cluster_name], check=True + ) + + @not_windows_ci + @patch('datadog_checks.dev.kind.run_command') + def test_kind_load_integration_with_kind_run(self, mock_run_command): + """Test that KindLoad integrates correctly with kind_run.""" + image = "test-image:latest" + kind_load = KindLoad(image) + + with ( + patch('datadog_checks.dev.kind.KindUp') as mock_kind_up, + patch('datadog_checks.dev.kind.KindDown') as mock_kind_down, + ): + mock_up_instance = MagicMock() + mock_up_instance.return_value = "kubeconfig_path" + mock_kind_up.return_value = mock_up_instance + mock_down_instance = MagicMock() + mock_kind_down.return_value = mock_down_instance + + with kind_run(wrappers=[kind_load]): + # Verify that cluster_name was set on the KindLoad instance + assert kind_load.cluster_name is not None + assert kind_load.cluster_name.startswith('cluster-') + + # Verify that the kind load command was called + expected_calls = [ + call for call in mock_run_command.call_args_list if call[0][0][:3] == ['kind', 'load', 'docker-image'] + ] + assert len(expected_calls) == 1 + assert expected_calls[0][0][0] == ['kind', 'load', 'docker-image', image, '--name', kind_load.cluster_name] From 0b3bad2e7f97d9cff780777613c1d43ded856c8a Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Mon, 8 Sep 2025 15:36:45 +0100 Subject: [PATCH 02/13] Increate TTL in RateLimitingTTLCache cache test (#21290) --- datadog_checks_base/tests/base/utils/db/test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog_checks_base/tests/base/utils/db/test_util.py b/datadog_checks_base/tests/base/utils/db/test_util.py index cb56693b5d488..616a3d6c31a96 100644 --- a/datadog_checks_base/tests/base/utils/db/test_util.py +++ b/datadog_checks_base/tests/base/utils/db/test_util.py @@ -105,7 +105,7 @@ def test_constant_rate_limiter_shell_execute(): def 
test_ratelimiting_ttl_cache(): - ttl = 0.1 + ttl = 2 cache = RateLimitingTTLCache(maxsize=5, ttl=ttl) for i in range(5): From 767a1ffe5c6b910700f52fe20f008a83e4b11d3b Mon Sep 17 00:00:00 2001 From: davidfeng-datadog Date: Mon, 8 Sep 2025 11:24:20 -0400 Subject: [PATCH 03/13] [IDP-592] Add owner field to Windows Agent integrations (#21001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [IDP-592] Add owner field to Windows Agent integrations Add owner field set to "windows-agent" to manifest.json files for: - active_directory - aspdotnet - dotnetclr - exchange_server - hyperv - iis - win32_event_log - wincrashdetect - windows_certificate - windows_performance_counters - windows_registry - windows_service - wmi_check This categorizes these integrations under the Windows Agent team for better ownership tracking and maintenance. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * Update owner field from windows-agent to windows-products Based on feedback from the team that they've renamed their team. The correct Datadog team slug is now windows-products. 
--------- Co-authored-by: Claude --- active_directory/manifest.json | 1 + aspdotnet/manifest.json | 1 + dotnetclr/manifest.json | 1 + exchange_server/manifest.json | 1 + hyperv/manifest.json | 1 + iis/manifest.json | 1 + win32_event_log/manifest.json | 1 + wincrashdetect/manifest.json | 1 + windows_certificate/manifest.json | 1 + windows_performance_counters/manifest.json | 1 + windows_registry/manifest.json | 1 + windows_service/manifest.json | 1 + wmi_check/manifest.json | 1 + 13 files changed, 13 insertions(+) diff --git a/active_directory/manifest.json b/active_directory/manifest.json index 5b4d6cc325331..1f50ae53854b7 100644 --- a/active_directory/manifest.json +++ b/active_directory/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "e03a0916-8708-4417-82e4-1f0c7bbee655", "app_id": "active-directory", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/aspdotnet/manifest.json b/aspdotnet/manifest.json index dcdfaec3adce2..e81292a3445a3 100644 --- a/aspdotnet/manifest.json +++ b/aspdotnet/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "7d801e88-1fad-433e-81d9-07449fd45e13", "app_id": "aspdotnet", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/dotnetclr/manifest.json b/dotnetclr/manifest.json index b7a9370f58338..a57b93e25d743 100644 --- a/dotnetclr/manifest.json +++ b/dotnetclr/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "2147d078-2742-413e-83eb-58400657de56", "app_id": "dotnetclr", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/exchange_server/manifest.json b/exchange_server/manifest.json index 943f85bfa6ad0..6eaff89b842e5 100644 --- a/exchange_server/manifest.json +++ b/exchange_server/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": 
"e334d30a-a7df-4c06-9d1f-d8b6663df38a", "app_id": "exchange-server", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/hyperv/manifest.json b/hyperv/manifest.json index e59b2002cef1f..023db749a6563 100644 --- a/hyperv/manifest.json +++ b/hyperv/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "6024e97b-c3c6-45e3-ba71-a48adeebc191", "app_id": "hyper-v", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/iis/manifest.json b/iis/manifest.json index 88ce018d27f73..b958d78070457 100644 --- a/iis/manifest.json +++ b/iis/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "4620121f-b5ca-4b9c-aca2-c69bf18bc362", "app_id": "iis", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/win32_event_log/manifest.json b/win32_event_log/manifest.json index 79d40cbd8f2f2..c202fbe34cf43 100644 --- a/win32_event_log/manifest.json +++ b/win32_event_log/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "8a0f4809-8470-4f7c-a7e8-350ba64123aa", "app_id": "event-viewer", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/wincrashdetect/manifest.json b/wincrashdetect/manifest.json index 06c1d13dc6ce1..0e8c7ed63c4d3 100644 --- a/wincrashdetect/manifest.json +++ b/wincrashdetect/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "44210c4a-0fe6-4702-88bf-d720e492a806", "app_id": "wincrashdetect", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/windows_certificate/manifest.json b/windows_certificate/manifest.json index 206f71fe312a5..3b00f20026a8c 100644 --- a/windows_certificate/manifest.json +++ b/windows_certificate/manifest.json @@ -2,6 +2,7 @@ 
"manifest_version": "2.0.0", "app_uuid": "67feed3c-1676-4d6b-9d72-3ca8c0a6e3dc", "app_id": "windows-certificate", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/windows_performance_counters/manifest.json b/windows_performance_counters/manifest.json index 749b2be8e4208..88d4f0055573d 100644 --- a/windows_performance_counters/manifest.json +++ b/windows_performance_counters/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "ec86de4d-a080-4160-8b0a-b937bbea08e9", "app_id": "windows-performance-counters", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/windows_registry/manifest.json b/windows_registry/manifest.json index 409738934e29c..90794be6b0fe9 100644 --- a/windows_registry/manifest.json +++ b/windows_registry/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "cc166a5c-6742-4811-b3e1-93dbec0ac5b2", "app_id": "windows-registry", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/windows_service/manifest.json b/windows_service/manifest.json index e8688d9e60cd1..78446a596d637 100644 --- a/windows_service/manifest.json +++ b/windows_service/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "1d895e93-d6f1-49f9-82bc-a03df7ff215c", "app_id": "windows-service", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/wmi_check/manifest.json b/wmi_check/manifest.json index 2377cea00aea5..1e913ba889074 100644 --- a/wmi_check/manifest.json +++ b/wmi_check/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "ddd1578f-d511-4d57-b5dd-33c0ea7c391e", "app_id": "wmi", + "owner": "windows-products", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", From 
99220cf2ef9c7bd8eed9feb8e126f5df62c5eaca Mon Sep 17 00:00:00 2001 From: davidfeng-datadog Date: Mon, 8 Sep 2025 12:07:28 -0400 Subject: [PATCH 04/13] [IDP-592] Add owner field to Network Device Monitoring Core integrations (#21002) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `owner` field set to `"network-device-monitoring-core"` to manifest.json files for Network Device Monitoring Core integrations (snmp, snmp_chatsworth_products, snmp_arista, snmp_juniper, snmp_netapp, snmp_cisco, snmp_f5, snmp_dell, snmp_fortinet, snmp_check_point, snmp_aruba, snmp_hewlett_packard_enterprise, snmp_american_power_conversion). **Note:** These integrations are our best guesses for the Network Device Monitoring Core team. If you don't own these integrations, or if you also own other integrations that should be included, please let us know. This provides clear ownership tracking for these SNMP-based network device monitoring integrations as part of the initiative to add owner fields to all integration manifest.json files. **For reviewer:** Please confirm: 1. Is `"network-device-monitoring-core"` the correct Datadog team slug in org 2 for tracking ownership? 2. For the display-tile feature flags in SDP, what workday team should we use as the `team` tag? We believe it should be "Network Device Monitoring Core" - could you confirm this is the correct workday team name? Thank you for your review\! 
🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- snmp/manifest.json | 1 + snmp_american_power_conversion/manifest.json | 1 + snmp_arista/manifest.json | 1 + snmp_aruba/manifest.json | 1 + snmp_chatsworth_products/manifest.json | 1 + snmp_check_point/manifest.json | 1 + snmp_cisco/manifest.json | 1 + snmp_dell/manifest.json | 1 + snmp_f5/manifest.json | 1 + snmp_fortinet/manifest.json | 1 + snmp_hewlett_packard_enterprise/manifest.json | 1 + snmp_juniper/manifest.json | 1 + snmp_netapp/manifest.json | 1 + 13 files changed, 13 insertions(+) diff --git a/snmp/manifest.json b/snmp/manifest.json index da32f20277c7c..c8f23d127a47d 100644 --- a/snmp/manifest.json +++ b/snmp/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "4fc8e176-17ce-4346-9544-bec30ac47a00", "app_id": "snmp", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_american_power_conversion/manifest.json b/snmp_american_power_conversion/manifest.json index 2e3a52522ec3a..669157c1f18e7 100644 --- a/snmp_american_power_conversion/manifest.json +++ b/snmp_american_power_conversion/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "6b5325b8-443d-42e0-8545-f7dc42acacb4", "app_id": "snmp-american-power-conversion", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_arista/manifest.json b/snmp_arista/manifest.json index fde42e37bcb43..b326a4548a460 100644 --- a/snmp_arista/manifest.json +++ b/snmp_arista/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "b5d6950a-7880-4b47-b9e9-49fe38e00490", "app_id": "snmp-arista", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_aruba/manifest.json b/snmp_aruba/manifest.json index 840676c6d1c20..0e494e2e551ee 
100644 --- a/snmp_aruba/manifest.json +++ b/snmp_aruba/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "39ecfe88-b733-43f6-b8c5-99450430b776", "app_id": "snmp-aruba", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_chatsworth_products/manifest.json b/snmp_chatsworth_products/manifest.json index e54f03e76d8c6..0ad9afb337ce3 100644 --- a/snmp_chatsworth_products/manifest.json +++ b/snmp_chatsworth_products/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "344b37df-ba82-4352-b277-ba1f1ccf716f", "app_id": "snmp-chatsworth-products", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_check_point/manifest.json b/snmp_check_point/manifest.json index 7110fdf679bec..35002cc1c6802 100644 --- a/snmp_check_point/manifest.json +++ b/snmp_check_point/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "ea753ad3-1b17-4b05-bca5-d6933308e55a", "app_id": "snmp-check-point", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_cisco/manifest.json b/snmp_cisco/manifest.json index c8d85bdc5a147..1497b26eb4f86 100644 --- a/snmp_cisco/manifest.json +++ b/snmp_cisco/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "91202d4a-1af4-4c64-88e4-5ba02b23c69f", "app_id": "snmp-cisco", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_dell/manifest.json b/snmp_dell/manifest.json index 18765a2bc0c1e..d252fa91d9861 100644 --- a/snmp_dell/manifest.json +++ b/snmp_dell/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "2d90389f-0e85-49a8-8fd9-715ff1836a23", "app_id": "snmp-dell", + "owner": "network-device-monitoring-core", 
"display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_f5/manifest.json b/snmp_f5/manifest.json index e4a67b9f51760..4c1266094aef7 100644 --- a/snmp_f5/manifest.json +++ b/snmp_f5/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "07050d86-968b-49e2-970e-599f535eece2", "app_id": "snmp-f5", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_fortinet/manifest.json b/snmp_fortinet/manifest.json index 45c5c2d9a1620..b38d3bd695d10 100644 --- a/snmp_fortinet/manifest.json +++ b/snmp_fortinet/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "e501cab9-ba54-495c-80c2-ca3d373561a8", "app_id": "snmp-fortinet", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_hewlett_packard_enterprise/manifest.json b/snmp_hewlett_packard_enterprise/manifest.json index f3127cde41167..d4c8d14e2dc61 100644 --- a/snmp_hewlett_packard_enterprise/manifest.json +++ b/snmp_hewlett_packard_enterprise/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "48134faf-2af6-4512-9853-ebe2a8620515", "app_id": "snmp-hewlett-packard-enterprise", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_juniper/manifest.json b/snmp_juniper/manifest.json index 2d719e497e25f..abb2795490e62 100644 --- a/snmp_juniper/manifest.json +++ b/snmp_juniper/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "783d0088-b478-4b3c-9654-ec4fbfefc18d", "app_id": "snmp-juniper", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/snmp_netapp/manifest.json b/snmp_netapp/manifest.json index 9b3c3117bba16..6ce1d90e82644 100644 --- a/snmp_netapp/manifest.json +++ 
b/snmp_netapp/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "d50aeab6-c26b-49df-aeb1-910d5d1a3e48", "app_id": "snmp-netapp", + "owner": "network-device-monitoring-core", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", From 586c0a2fafb609e95680d08407f7791f8a1b27ae Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Mon, 8 Sep 2025 17:20:11 +0100 Subject: [PATCH 05/13] Move KindLoad to a condition instead of a wrapper (#21292) * Move KindLoad to a condition instead of a wrapper * Add changelog * Clean up typing --- datadog_checks_dev/changelog.d/21292.fixed | 1 + datadog_checks_dev/datadog_checks/dev/kind.py | 78 ++++++++----------- datadog_checks_dev/tests/test_kind.py | 16 ++-- 3 files changed, 40 insertions(+), 55 deletions(-) create mode 100644 datadog_checks_dev/changelog.d/21292.fixed diff --git a/datadog_checks_dev/changelog.d/21292.fixed b/datadog_checks_dev/changelog.d/21292.fixed new file mode 100644 index 0000000000000..6f46092a12f66 --- /dev/null +++ b/datadog_checks_dev/changelog.d/21292.fixed @@ -0,0 +1 @@ +Move KindLoad to a built-in condition instead of a wrapper \ No newline at end of file diff --git a/datadog_checks_dev/datadog_checks/dev/kind.py b/datadog_checks_dev/datadog_checks/dev/kind.py index 02cde0ef2d784..2a43fe95c042e 100644 --- a/datadog_checks_dev/datadog_checks/dev/kind.py +++ b/datadog_checks_dev/datadog_checks/dev/kind.py @@ -4,8 +4,9 @@ from __future__ import annotations from contextlib import contextmanager +from dataclasses import dataclass from shutil import which -from typing import TYPE_CHECKING +from typing import Any, Callable, ContextManager, Protocol, runtime_checkable import pytest @@ -15,59 +16,49 @@ from .subprocess import run_command from .utils import get_active_env, get_current_check_name -if TYPE_CHECKING: - from contextlib import AbstractContextManager - from typing import Self +def _setup_conditions(conditions: list[Callable[[], Any]] | None, cluster_config: 
ClusterConfig): + if not conditions: + return -def _setup_wrappers(wrappers: list[AbstractContextManager] | None, cluster_name: str): - """Set up wrappers with cluster-specific configuration. + for condition in conditions: + if isinstance(condition, ClusterCondition): + condition.add_cluster_info(cluster_config) - :param wrappers: List of wrapper instances to configure - :param cluster_name: The name of the Kind cluster - """ - if not wrappers: - return - for wrapper in wrappers: - match wrapper: - case KindLoad(): - wrapper.cluster_name = cluster_name - case _: - # No special setup needed for other wrapper types - pass +@dataclass +class ClusterConfig: + cluster_name: str + + +@runtime_checkable +class ClusterCondition(Protocol): + def add_cluster_info(self, cluster_config: ClusterConfig): ... @contextmanager def kind_run( - sleep=None, - endpoints=None, - conditions=None, - env_vars=None, - wrappers=None, - kind_config=None, - attempts=None, - attempts_wait=1, + sleep: float | None = None, + endpoints: str | list[str] | None = None, + conditions: list[Callable[[], Any]] | None = None, + env_vars: dict[str, str] | None = None, + wrappers: list[ContextManager] | None = None, + kind_config: str | None = None, + attempts: int | None = None, + attempts_wait: int = 1, ): """ This utility provides a convenient way to safely set up and tear down Kind environments. :param sleep: Number of seconds to wait before yielding. - :type sleep: ``float`` :param endpoints: Endpoints to verify access for before yielding. Shorthand for adding ``conditions.CheckEndpoints(endpoints)`` to the ``conditions`` argument. - :type endpoints: ``list`` of ``str``, or a single ``str`` :param conditions: A list of callable objects that will be executed before yielding to check for errors. - :type conditions: ``callable`` :param env_vars: A dictionary to update ``os.environ`` with during execution. - :type env_vars: ``dict`` :param wrappers: A list of context managers to use during execution. 
:param kind_config: A path to a yaml file that contains the configuration for creating the kind cluster. - :type kind_config: ``str`` :param attempts: Number of attempts to run `up` and the `conditions` successfully. Defaults to 2 in CI. - :type attempts: ``int`` :param attempts_wait: Time to wait between attempts. - :type attempts_wait: ``int`` """ if not which('kind'): pytest.skip('Kind not available') @@ -88,8 +79,7 @@ def kind_run( set_up = KindUp(cluster_name, kind_config) tear_down = KindDown(cluster_name) - # Set up wrappers with cluster-specific configuration - _setup_wrappers(wrappers, cluster_name) + _setup_conditions(conditions, ClusterConfig(cluster_name)) with environment_run( up=set_up, @@ -135,14 +125,14 @@ def __call__(self): run_command(['kind', 'delete', 'cluster', '--name', self.cluster_name], check=True) -class KindLoad: - """Context manager for loading Docker images into a Kind cluster. +class KindLoad(LazyFunction): + """Condition for loading Docker images into a Kind cluster. - This context manager should be passed to the wrappers argument in environment_run + This condition should be passed to the conditions argument in environment_run to load images into the Kind cluster after it's created. 
Example: - with kind_run(wrappers=[KindLoad("my-image:latest")]): + with kind_run(conditions=[KindLoad("my-image:latest")]): # The image is now loaded in the kind cluster pass """ @@ -151,14 +141,12 @@ def __init__(self, image: str): self.image = image self.cluster_name: str | None = None - def __enter__(self) -> Self: + def __call__(self): if self.cluster_name is None: - raise RuntimeError("cluster_name must be set before entering KindLoad context") + raise RuntimeError("cluster_name must be set before calling KindLoad") load_cmd = ['kind', 'load', 'docker-image', self.image, '--name', self.cluster_name] run_command(load_cmd, check=True) - return self - def __exit__(self, exc_type, exc_val, exc_tb): - """Exit the context manager (no cleanup needed for image loading).""" - pass + def add_cluster_info(self, cluster_config: ClusterConfig): + self.cluster_name = cluster_config.cluster_name diff --git a/datadog_checks_dev/tests/test_kind.py b/datadog_checks_dev/tests/test_kind.py index 5c91da8cddac7..61e595d103e8b 100644 --- a/datadog_checks_dev/tests/test_kind.py +++ b/datadog_checks_dev/tests/test_kind.py @@ -61,23 +61,20 @@ def test_retry_condition_failed_only_on_first_run(self): class TestKindLoad: - def test_kind_load_context_manager_without_cluster_name(self): + def test_kind_load_without_cluster_name(self): kind_load = KindLoad("test-image:latest") - with pytest.raises(RuntimeError, match="cluster_name must be set before entering KindLoad context"): - with kind_load: - pass + with pytest.raises(RuntimeError, match="cluster_name must be set before calling KindLoad"): + kind_load() @patch('datadog_checks.dev.kind.run_command') - def test_kind_load_context_manager_with_cluster_name(self, mock_run_command): - """Test that KindLoad calls the correct kind load command when cluster_name is set.""" + def test_kind_load_with_cluster_name(self, mock_run_command): image = "test-image:latest" cluster_name = "test-cluster" kind_load = KindLoad(image) kind_load.cluster_name = 
cluster_name - with kind_load as ctx: - assert ctx is kind_load + kind_load() mock_run_command.assert_called_once_with( ['kind', 'load', 'docker-image', image, '--name', cluster_name], check=True @@ -86,7 +83,6 @@ def test_kind_load_context_manager_with_cluster_name(self, mock_run_command): @not_windows_ci @patch('datadog_checks.dev.kind.run_command') def test_kind_load_integration_with_kind_run(self, mock_run_command): - """Test that KindLoad integrates correctly with kind_run.""" image = "test-image:latest" kind_load = KindLoad(image) @@ -100,7 +96,7 @@ def test_kind_load_integration_with_kind_run(self, mock_run_command): mock_down_instance = MagicMock() mock_kind_down.return_value = mock_down_instance - with kind_run(wrappers=[kind_load]): + with kind_run(conditions=[kind_load]): # Verify that cluster_name was set on the KindLoad instance assert kind_load.cluster_name is not None assert kind_load.cluster_name.startswith('cluster-') From 04d89efca407b44175d5f406ddbc7d0408c84d57 Mon Sep 17 00:00:00 2001 From: Juanpe Araque Date: Mon, 8 Sep 2025 17:54:29 +0100 Subject: [PATCH 06/13] Fix Velero tests and stop using bitnami images (#21274) * Move bitnami image used for testing to bitnami legacy * Add changelog * Increase timeout when installing velero * Decrease timeout for installation. Did not work. 
* Add custom kubectl image --- velero/changelog.d/21274.fixed | 1 + velero/pyproject.toml | 3 +++ velero/tests/conftest.py | 21 ++++++++++++++++++--- velero/tests/kind/kubectl.Dockerfile | 8 ++++++++ velero/tests/kind/velero-values.yaml | 11 ++++++++++- 5 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 velero/changelog.d/21274.fixed create mode 100644 velero/tests/kind/kubectl.Dockerfile diff --git a/velero/changelog.d/21274.fixed b/velero/changelog.d/21274.fixed new file mode 100644 index 0000000000000..c7f59bee120ae --- /dev/null +++ b/velero/changelog.d/21274.fixed @@ -0,0 +1 @@ +Remove usage of bitnami images for testing and add ruff rules from parent directory \ No newline at end of file diff --git a/velero/pyproject.toml b/velero/pyproject.toml index c8d8ff71f63f3..9e39462e1318d 100644 --- a/velero/pyproject.toml +++ b/velero/pyproject.toml @@ -58,3 +58,6 @@ include = [ dev-mode-dirs = [ ".", ] + +[tool.ruff] +extend = "../pyproject.toml" diff --git a/velero/tests/conftest.py b/velero/tests/conftest.py index 0c2a4a0874f8e..9d6aaf6bd3a64 100644 --- a/velero/tests/conftest.py +++ b/velero/tests/conftest.py @@ -2,13 +2,13 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import os -from contextlib import ExitStack +from contextlib import ExitStack, contextmanager import pytest from datadog_checks.dev import TempDir, run_command from datadog_checks.dev.fs import path_join -from datadog_checks.dev.kind import kind_run +from datadog_checks.dev.kind import KindLoad, kind_run from datadog_checks.dev.kube_port_forward import port_forward from .common import MOCKED_INSTANCE, PORT @@ -17,6 +17,19 @@ KIND_DIR = os.path.join(HERE, 'kind') +@contextmanager +def build_and_load_kubectl_image(image_tag: str): + print("Building custom kubectl image...") + dockerfile_path = os.path.join(KIND_DIR, 'kubectl.Dockerfile') + + # Build the custom kubectl image + run_command( + ['docker', 'build', '-t', image_tag, '-f', 
dockerfile_path, '.'], + check=True, + ) + yield + + def setup_velero(): """Set up Velero, MinIO and Nginx in the Kind cluster.""" # Apply MinIO deployment @@ -63,10 +76,12 @@ def get_instances(velero_host, velero_port, node_agent_host, node_agent_port): @pytest.fixture(scope='session') def dd_environment(): kind_config = os.path.join(KIND_DIR, 'kind-config.yaml') + custom_kubectl_image_tag = "custom-kubectl:latest" with TempDir('helm_dir') as helm_dir: with kind_run( - conditions=[setup_velero], + wrappers=[build_and_load_kubectl_image(custom_kubectl_image_tag)], + conditions=[KindLoad(custom_kubectl_image_tag), setup_velero], kind_config=kind_config, env_vars={ "HELM_CACHE_HOME": path_join(helm_dir, 'Caches'), diff --git a/velero/tests/kind/kubectl.Dockerfile b/velero/tests/kind/kubectl.Dockerfile new file mode 100644 index 0000000000000..1f1064408488c --- /dev/null +++ b/velero/tests/kind/kubectl.Dockerfile @@ -0,0 +1,8 @@ +FROM debian:bullseye-slim + +RUN apt-get update && apt-get install -y --no-install-recommends wget gnupg coreutils ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -q -O kubectl https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl && \ + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \ + rm kubectl diff --git a/velero/tests/kind/velero-values.yaml b/velero/tests/kind/velero-values.yaml index db9b4eb2ffc90..c3aa52e3af7fb 100644 --- a/velero/tests/kind/velero-values.yaml +++ b/velero/tests/kind/velero-values.yaml @@ -35,6 +35,15 @@ metrics: # Deploy node-agent since we're monitoring it deployNodeAgent: true +# Override kubectl image to avoid bitnami dependency +# This should be updated when velero has a final solution to depending on bitnami images +# https://github.com/vmware-tanzu/helm-charts/issues/698 +kubectl: + image: + repository: custom-kubectl + tag: "latest" + pullPolicy: Never + # Default resource requests/limits resources: requests: @@ -42,4 +51,4 @@ resources: memory: 128Mi limits: cpu: 1000m - 
memory: 512Mi \ No newline at end of file + memory: 512Mi From 391a631109376a36829f18618a40572aa26a8862 Mon Sep 17 00:00:00 2001 From: Bo Huang Date: Mon, 8 Sep 2025 14:58:16 -0400 Subject: [PATCH 07/13] [openai] Update documentation and dashboards for admin key support (#21177) * [openai] Update documentation and dashboards for admin key support * dashboard and readme updates * add to manifest * alphabetical * Apply suggestions from code review Co-authored-by: Rosa Trieu <107086888+rtrieu@users.noreply.github.com> * remove invalid metadata * dashboard tweak --------- Co-authored-by: Rosa Trieu <107086888+rtrieu@users.noreply.github.com> --- openai/README.md | 15 ++++--- .../usage_admin_overview_dashboard.json | 1 + openai/manifest.json | 3 +- openai/metadata.csv | 44 +++++++++++++------ 4 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 openai/assets/dashboards/usage_admin_overview_dashboard.json diff --git a/openai/README.md b/openai/README.md index f865ef7fe3740..056b292a65c22 100644 --- a/openai/README.md +++ b/openai/README.md @@ -13,7 +13,7 @@ Get cost estimation, prompt and completion sampling, error tracking, performance -**Note**: This setup method only collects `openai.api.usage.*` metrics. To collect all metrics provided by this integration, also follow the APM setup instructions. +**Note**: Providing an admin key only collects `audio_speeches`, `audio_transcriptions`, `code_interpreter_sessions`, `completions`, `embeddings`, `images`, `moderations`, and `vector_stores` metrics. Providing a project key is deprecated, and only collects `openai.api.usage.*` metrics. To collect all metrics provided by this integration, also follow the APM setup instructions. ### Installation @@ -27,6 +27,7 @@ Datadog's OpenAI integration allows you to collect usage metrics, cost data, and - An **OpenAI account** with the admin write permissions - A **valid OpenAI API key** with appropriate access for **usage and cost metrics** or **LLM Observability**. 
+- An admin-scoped API key is required to ingest usage and cost data. ## Setup @@ -49,9 +50,9 @@ Datadog's OpenAI integration allows you to collect usage metrics, cost data, and ### Additional Notes -- This integration only collects `openai.api.usage*` metrics. +- This integration only collects `audio_speeches`, `audio_transcriptions`, `code_interpreter_sessions`, `completions`, `embeddings`, `images`, `moderations`, and `vector_stores` metrics. - If you enable Cloud Cost Management for OpenAI, you have access to cost metrics. -- No additional permissions or setup are required for standard usage metrics. +- An admin-scoped API key is required. ## Additional Resources @@ -413,7 +414,7 @@ Validate that the APM Node.js library can communicate with your Agent by examini -**Note**: To collect `openai.api.usage.*` metrics, follow the API key setup instructions. +**Note**: To collect OpenAI `audio_speeches`, `audio_transcriptions`, `code_interpreter_sessions`, `completions`, `embeddings`, `images`, `moderations`, and `vector_stores` metrics, follow the API key setup instructions. ### Installation @@ -469,7 +470,11 @@ To validate that the APM PHP library can communicate with your Agent, examine th ### Metrics -The `openai.api.usage.*` metrics are only collected with the API key setup method. All remaining metrics below are collected with the APM setup methods. +The `openai.api.usage.*` metrics are collected when a project-scoped API key is provided. Project-scoped API key support will be deprecated in the near future. + +The `audio_speeches`, `audio_transcriptions`, `code_interpreter_sessions`, `completions`, `embeddings`, `images`, `moderations`, and `vector_stores` metrics are collected when an admin-scoped API key is provided. + +All remaining metrics below are collected with the APM setup methods. See [metadata.csv][4] for a list of metrics provided by this integration. 
diff --git a/openai/assets/dashboards/usage_admin_overview_dashboard.json b/openai/assets/dashboards/usage_admin_overview_dashboard.json new file mode 100644 index 0000000000000..b195baf6a487f --- /dev/null +++ b/openai/assets/dashboards/usage_admin_overview_dashboard.json @@ -0,0 +1 @@ +{"title":"OpenAI Admin Usage Overview","description":"Comprehensive dashboard for monitoring OpenAI API usage across all services, models, and projects","widgets":[{"id":8499078704982623,"definition":{"type":"image","url":"/static/images/logos/openai_large.svg","url_dark_theme":"/static/images/logos/openai_reversed_large.svg","sizing":"cover","has_background":true,"has_border":true,"vertical_align":"center","horizontal_align":"center"},"layout":{"x":0,"y":0,"width":4,"height":3}},{"id":1,"definition":{"type":"note","content":"# OpenAI Admin Usage Overview\nThis dashboard provides comprehensive monitoring of OpenAI usage across different services for all projects including:\n- **Completions**\n- **Audio Services**\n- **Embeddings And Vector Stores**\n- **Images and Moderations**","background_color":"white","font_size":"14","text_align":"left","vertical_align":"top","show_tick":false,"tick_pos":"50%","tick_edge":"left","has_padding":true},"layout":{"x":4,"y":0,"width":8,"height":3}},{"id":6068911808917193,"definition":{"title":"Completions","background_color":"vivid_blue","show_title":true,"type":"group","layout_type":"ordered","widgets":[{"id":3,"definition":{"title":"Total Completion 
Requests","show_legend":false,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"dog_classic","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":4,"definition":{"title":"Input vs Output Tokens","show_legend":true,"legend_layout":"horizontal","legend_columns":["avg","min","max","value","sum"],"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.input_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"green","line_type":"solid","line_width":"normal"},"display_type":"line"},{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.output_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"orange","line_type":"solid","line_width":"normal"},"display_type":"line"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":5,"definition":{"title":"Cached & Audio 
Tokens","show_legend":true,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.input_cached_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"purple"},"display_type":"line"},{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.input_audio_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"blue"},"display_type":"line"},{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.output_audio_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"pink"},"display_type":"line"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":3,"width":12,"height":3}},{"id":6032901899865607,"definition":{"title":"Audio Services","background_color":"vivid_green","show_title":true,"type":"group","layout_type":"ordered","widgets":[{"id":7,"definition":{"title":"Speech Generation (Characters)","show_legend":false,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.audio_speeches.characters{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"green","line_type":"solid","line_width":"normal"},"display_type":"area"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":8,"definition":{"title":"Transcription 
(Seconds)","show_legend":false,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.audio_transcriptions.seconds{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"blue","line_type":"solid","line_width":"normal"},"display_type":"area"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":9,"definition":{"title":"Total Audio Requests","type":"query_value","requests":[{"conditional_formats":[{"comparator":">","value":0,"palette":"green_on_white"}],"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.audio_speeches.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()","aggregator":"sum"},{"data_source":"metrics","name":"query2","query":"sum:openai.audio_transcriptions.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()","aggregator":"sum"}],"formulas":[{"formula":"query1 + query2"}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":6,"width":12,"height":3}},{"id":2193703564126644,"definition":{"title":"Embedding and Vector Stores","background_color":"vivid_purple","show_title":true,"type":"group","layout_type":"ordered","widgets":[{"id":11,"definition":{"title":"Embedding Requests & 
Tokens","show_legend":true,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.embeddings.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"purple"},"display_type":"bars"},{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.embeddings.input_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"formulas":[{"formula":"query1 / 100"}],"style":{"palette":"orange"},"display_type":"line"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":0,"y":0,"width":4,"height":2}},{"id":12,"definition":{"title":"Vector Store Usage (Bytes)","show_legend":false,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.vector_stores.usage_bytes{project_id:$project_id}.as_count()"}],"style":{"palette":"purple","line_type":"solid","line_width":"normal"},"display_type":"area"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":4,"y":0,"width":4,"height":2}},{"id":13,"definition":{"title":"Code Interpreter Sessions","type":"query_value","requests":[{"conditional_formats":[{"comparator":">","value":0,"palette":"purple_on_white"}],"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.code_interpreter_sessions.num_sessions{project_id:$project_id}.as_count()","aggregator":"sum"}]}],"autoscale":true,"precision":0},"layout":{"x":8,"y":0,"width":4,"height":2}}]},"layout":{"x":0,"y":9,"width":12,"height":3}},{"id":1488470520304589,"definition":{"title":"Images and Moderation","background_color":"vivid_orange","show_title":true,"type":"group","layout_type":"ordered","widgets":[{"id":15,"definition":{"title":"Image Generation 
Requests","show_legend":true,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.images.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id} by {size}.as_count()"}],"style":{"palette":"datadog16"},"display_type":"bars"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":0,"y":0,"width":3,"height":2}},{"id":16,"definition":{"title":"Images Generated","show_legend":true,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.images.images{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id} by {source}.as_count()"}],"style":{"palette":"datadog16"},"display_type":"area"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":3,"y":0,"width":3,"height":2}},{"id":17,"definition":{"title":"Moderation Requests","show_legend":false,"type":"timeseries","requests":[{"response_format":"timeseries","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.moderations.num_model_requests{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()"}],"style":{"palette":"red","line_type":"solid","line_width":"normal"},"display_type":"bars"}],"yaxis":{"scale":"linear","include_zero":true}},"layout":{"x":6,"y":0,"width":3,"height":2}},{"id":18,"definition":{"title":"Moderation 
Tokens","type":"query_value","requests":[{"conditional_formats":[{"comparator":">","value":0,"palette":"red_on_white"}],"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.moderations.input_tokens{model:$model,project_id:$project_id,user_id:$user_id,api_key_id:$api_key_id}.as_count()","aggregator":"sum"}]}],"autoscale":true,"precision":0},"layout":{"x":9,"y":0,"width":3,"height":2}}]},"layout":{"x":0,"y":12,"width":12,"height":3}},{"id":5973801688305818,"definition":{"title":"Usage by Project and User","background_color":"vivid_yellow","show_title":true,"type":"group","layout_type":"ordered","widgets":[{"id":20,"definition":{"title":"Top Projects by Total Requests","type":"toplist","requests":[{"style":{"palette":"datadog16"},"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.num_model_requests{*} by {project_id}.as_count()","aggregator":"sum"},{"data_source":"metrics","name":"query2","query":"sum:openai.embeddings.num_model_requests{*} by {project_id}.as_count()","aggregator":"sum"},{"data_source":"metrics","name":"query3","query":"sum:openai.images.num_model_requests{*} by {project_id}.as_count()","aggregator":"sum"}],"formulas":[{"formula":"query1 + query2 + query3"}],"sort":{"count":10,"order_by":[{"type":"formula","index":0,"order":"desc"}]}}]},"layout":{"x":0,"y":0,"width":3,"height":3}},{"id":21,"definition":{"title":"Top Users by Token Usage","type":"toplist","requests":[{"style":{"palette":"datadog16"},"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.input_tokens{*} by {user_id}.as_count()","aggregator":"sum"},{"data_source":"metrics","name":"query2","query":"sum:openai.completions.output_tokens{*} by {user_id}.as_count()","aggregator":"sum"}],"formulas":[{"formula":"query1 + 
query2"}],"sort":{"count":10,"order_by":[{"type":"formula","index":0,"order":"desc"}]}}]},"layout":{"x":3,"y":0,"width":3,"height":3}},{"id":22,"definition":{"title":"Top API Keys by Activity","type":"toplist","requests":[{"style":{"palette":"datadog16"},"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.num_model_requests{*} by {api_key_id}.as_count()","aggregator":"sum"}],"formulas":[{"formula":"query1"}],"sort":{"count":10,"order_by":[{"type":"formula","index":0,"order":"desc"}]}}]},"layout":{"x":6,"y":0,"width":3,"height":3}},{"id":23,"definition":{"title":"Top Models by Usage","type":"toplist","requests":[{"style":{"palette":"datadog16"},"response_format":"scalar","queries":[{"data_source":"metrics","name":"query1","query":"sum:openai.completions.num_model_requests{*} by {model}.as_count()","aggregator":"sum"}],"formulas":[{"formula":"query1"}],"sort":{"count":10,"order_by":[{"type":"formula","index":0,"order":"desc"}]}}]},"layout":{"x":9,"y":0,"width":3,"height":3}}]},"layout":{"x":0,"y":15,"width":12,"height":4}}],"template_variables":[{"name":"project_id","prefix":"project_id","available_values":[],"default":"*"},{"name":"model","prefix":"model","available_values":[],"default":"*"},{"name":"user_id","prefix":"user_id","available_values":[],"default":"*"},{"name":"api_key_id","prefix":"api_key_id","available_values":[],"default":"*"}],"layout_type":"ordered","notify_list":[],"reflow_type":"fixed"} \ No newline at end of file diff --git a/openai/manifest.json b/openai/manifest.json index 364ca327eb110..6cabf679bb8de 100644 --- a/openai/manifest.json +++ b/openai/manifest.json @@ -79,7 +79,8 @@ "dashboards": { "OpenAI Overview Dashboard": "assets/dashboards/overview_dashboard.json", "OpenAI Usage Overview": "assets/dashboards/usage_overview_dashboard.json", - "OpenAI Cost Overview": "assets/dashboards/cost_overview_dashboard.json" + "OpenAI Cost Overview": 
"assets/dashboards/cost_overview_dashboard.json", + "OpenAI Admin Usage Overview": "assets/dashboards/usage_admin_overview_dashboard.json" }, "monitors": { "OpenAI API usage is approaching rate limit": "assets/monitors/request_limits.json", diff --git a/openai/metadata.csv b/openai/metadata.csv index 347907744e597..c9d2259b41b32 100644 --- a/openai/metadata.csv +++ b/openai/metadata.csv @@ -1,13 +1,31 @@ -metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric -openai.api.usage.n_context_tokens_total,gauge,,token,,Total number of context tokens used (all-time),0,openai,n context tokens total, -openai.api.usage.n_generated_tokens_total,gauge,,token,,Total number of generated response tokens (all-time),0,openai,n generated tokens total, -openai.api.usage.n_requests,count,,request,,Total number of requests,0,openai,n requests, -openai.organization.ratelimit.requests.remaining,gauge,10,request,,Number of requests remaining in the rate limit.,0,openai,openai, -openai.organization.ratelimit.tokens.remaining,gauge,10,token,,Number of tokens remaining in the rate limit.,0,openai,openai, -openai.ratelimit.requests,gauge,10,request,,Number of requests in the rate limit.,0,openai,openai, -openai.ratelimit.tokens,gauge,10,token,,Number of tokens in the rate limit.,0,openai,openai, -openai.request.duration,gauge,10,nanosecond,,Request duration distribution.,0,openai,openai, -openai.request.error,count,10,error,,Number of errors.,0,openai,openai, -openai.tokens.completion,gauge,10,token,,Number of tokens used in the completion of a response from OpenAI.,0,openai,openai, -openai.tokens.prompt,gauge,10,token,,Number of tokens used in the prompt of a request to OpenAI.,0,openai,openai, -openai.tokens.total,gauge,10,token,,Total number of tokens used in a request to OpenAI.,0,openai,openai, +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags 
+openai.api.usage.n_context_tokens_total,gauge,,token,,(Deprecated) Total number of context tokens used (all-time),0,openai,n context tokens total,, +openai.api.usage.n_generated_tokens_total,gauge,,token,,(Deprecated) Total number of generated response tokens (all-time),0,openai,n generated tokens total,, +openai.api.usage.n_requests,count,,request,,(Deprecated) Total number of requests,0,openai,n requests,, +openai.audio_speeches.characters,count,,,,Number of characters generated for text-to-speech,0,openai,audio speech chars,, +openai.audio_speeches.num_model_requests,count,,request,,Number of text-to-speech model requests,0,openai,audio speech requests,, +openai.audio_transcriptions.num_model_requests,count,,request,,Number of audio transcription model requests,0,openai,audio transcription requests,, +openai.audio_transcriptions.seconds,count,,second,,Number of seconds of audio transcribed,0,openai,audio transcription secs,, +openai.code_interpreter_sessions.num_sessions,count,,session,,Number of code interpreter sessions,0,openai,code interpreter sessions,, +openai.completions.input_audio_tokens,count,,token,,Number of audio input tokens for completions,0,openai,audio input tokens,, +openai.completions.input_cached_tokens,count,,token,,Number of cached input tokens for completions,0,openai,cached input tokens,, +openai.completions.input_tokens,count,,token,,Number of input tokens for completions,0,openai,completion input tokens,, +openai.completions.num_model_requests,count,,request,,Number of completion model requests,0,openai,completion requests,, +openai.completions.output_audio_tokens,count,,token,,Number of audio output tokens for completions,0,openai,audio output tokens,, +openai.completions.output_tokens,count,,token,,Number of output tokens for completions,0,openai,completion output tokens,, +openai.embeddings.input_tokens,count,,token,,Number of input tokens for embeddings,0,openai,embedding input tokens,, 
+openai.embeddings.num_model_requests,count,,request,,Number of embedding model requests,0,openai,embedding requests,, +openai.images.images,count,,,,Number of images generated,0,openai,images generated,, +openai.images.num_model_requests,count,,request,,Number of image generation model requests,0,openai,image requests,, +openai.moderations.input_tokens,count,,token,,Number of input tokens for moderations,0,openai,moderation input tokens,, +openai.moderations.num_model_requests,count,,request,,Number of moderation model requests,0,openai,moderation requests,, +openai.organization.ratelimit.requests.remaining,gauge,10,request,,Number of requests remaining in the rate limit.,0,openai,openai,, +openai.organization.ratelimit.tokens.remaining,gauge,10,token,,Number of tokens remaining in the rate limit.,0,openai,openai,, +openai.ratelimit.requests,gauge,10,request,,Number of requests in the rate limit.,0,openai,openai,, +openai.ratelimit.tokens,gauge,10,token,,Number of tokens in the rate limit.,0,openai,openai,, +openai.request.duration,gauge,10,nanosecond,,Request duration distribution.,0,openai,openai,, +openai.request.error,count,10,error,,Number of errors.,0,openai,openai,, +openai.tokens.completion,gauge,10,token,,Number of tokens used in the completion of a response from OpenAI.,0,openai,openai,, +openai.tokens.prompt,gauge,10,token,,Number of tokens used in the prompt of a request to OpenAI.,0,openai,openai,, +openai.tokens.total,gauge,10,token,,Total number of tokens used in a request to OpenAI.,0,openai,openai,, +openai.vector_stores.usage_bytes,gauge,,byte,,Number of bytes used in vector stores,0,openai,vector store usage,, From 929aec4180693afad51f3c7907fd8e916b7ef893 Mon Sep 17 00:00:00 2001 From: Jon Rosario Date: Mon, 8 Sep 2025 15:18:45 -0400 Subject: [PATCH 08/13] Triviajon k8s dashboards patch 1 (#21280) * Fix replicasets dashboard invalid formula * Fix statefulsets dashboard invalid formula --- kubernetes/assets/dashboards/kubernetes_replicasets.json 
| 2 +- kubernetes/assets/dashboards/kubernetes_statefulsets.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/assets/dashboards/kubernetes_replicasets.json b/kubernetes/assets/dashboards/kubernetes_replicasets.json index 2cb8253c91608..7202f6746f698 100644 --- a/kubernetes/assets/dashboards/kubernetes_replicasets.json +++ b/kubernetes/assets/dashboards/kubernetes_replicasets.json @@ -432,7 +432,7 @@ { "name": "query1", "data_source": "metrics", - "query": "top(avg:kubernetes_state.replicaset.replicas{$scope,$kube_namespace,$kube_cluster_name,$kube_replica_set} by {kube_cluster_name,kube_namespace,kube_replica_set}, 10, 'mean', 'desc')", + "query": "avg:kubernetes_state.replicaset.replicas{$scope,$kube_namespace,$kube_cluster_name,$kube_replica_set} by {kube_cluster_name,kube_namespace,kube_replica_set}", "aggregator": "avg" } ] diff --git a/kubernetes/assets/dashboards/kubernetes_statefulsets.json b/kubernetes/assets/dashboards/kubernetes_statefulsets.json index e66eb92a4b9b7..8ca712ff0c2b5 100644 --- a/kubernetes/assets/dashboards/kubernetes_statefulsets.json +++ b/kubernetes/assets/dashboards/kubernetes_statefulsets.json @@ -1263,7 +1263,7 @@ { "name": "query1", "data_source": "metrics", - "query": "top(avg:kubernetes_state.daemonset.ready{$scope,$kube_namespace,$kube_cluster_name,$kube_stateful_set} by {kube_cluster_name,kube_namespace,kube_replica_set}, 10, 'mean', 'desc')", + "query": "avg:kubernetes_state.daemonset.ready{$scope,$kube_namespace,$kube_cluster_name,$kube_stateful_set} by {kube_cluster_name,kube_namespace,kube_replica_set}", "aggregator": "avg" } ] From 332ec19da97592fdd43679657b21fb66c9fbdb2b Mon Sep 17 00:00:00 2001 From: davidfeng-datadog Date: Mon, 8 Sep 2025 15:42:00 -0400 Subject: [PATCH 09/13] [IDP-592] Add owner field to Database Monitoring integrations (#20999) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [IDP-592] Add owner field to Database Monitoring 
integrations Add owner field set to "database-monitoring" to manifest.json files for: - oracle - postgres - sqlserver This categorizes these integrations under the Database Monitoring team for better ownership tracking and maintenance. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * add mongo and mysql --------- Co-authored-by: Claude --- mongo/manifest.json | 3 ++- mysql/manifest.json | 7 +++---- oracle/manifest.json | 1 + postgres/manifest.json | 1 + sqlserver/manifest.json | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mongo/manifest.json b/mongo/manifest.json index 20bbf8d9967cd..9b52b6f67e861 100644 --- a/mongo/manifest.json +++ b/mongo/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "54cca53a-3c87-4b53-beb4-fce95d1fcfb5", "app_id": "mongodb", + "owner": "database-monitoring", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", @@ -80,4 +81,4 @@ "mongodb_processes": "assets/saved_views/mongodb_processes.json" } } -} \ No newline at end of file +} diff --git a/mysql/manifest.json b/mysql/manifest.json index 6a1fa8356e9de..60a0e94f38bea 100644 --- a/mysql/manifest.json +++ b/mysql/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "f6177896-da1e-4bc4-ab19-fd32e8868647", "app_id": "mysql", + "owner": "database-monitoring", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", @@ -49,9 +50,7 @@ "service_checks": { "metadata_path": "assets/service_checks.json" }, - "process_signatures": [ - "mysqld" - ], + "process_signatures": ["mysqld"], "source_type_id": 18, "auto_install": true }, @@ -70,4 +69,4 @@ "mysql_processes": "assets/saved_views/mysql_processes.json" } } -} \ No newline at end of file +} diff --git a/oracle/manifest.json b/oracle/manifest.json index 819a771d8e703..979003a90645b 100644 --- a/oracle/manifest.json +++ b/oracle/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": 
"34835d2b-a812-4aac-8cc2-d298db851b80", "app_id": "oracle", + "owner": "database-monitoring", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/postgres/manifest.json b/postgres/manifest.json index e6a459bb6fcd6..0a03fadb76614 100644 --- a/postgres/manifest.json +++ b/postgres/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "e6b3c5ec-b293-4a22-9145-277a12a9abd4", "app_id": "postgres", + "owner": "database-monitoring", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", diff --git a/sqlserver/manifest.json b/sqlserver/manifest.json index b1acae09ce722..c03090c51492f 100644 --- a/sqlserver/manifest.json +++ b/sqlserver/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "bfa2f276-da05-4153-b8d4-48d4e41f5e40", "app_id": "sql-server", + "owner": "database-monitoring", "display_on_public_website": true, "tile": { "overview": "README.md#Overview", From a66488714226276358ee36b3194aedbd66dcbe54 Mon Sep 17 00:00:00 2001 From: Nicholas Muesch Date: Mon, 8 Sep 2025 15:54:08 -0400 Subject: [PATCH 10/13] [IXP-593] Migrate Amazon App Mesh (#21204) * Migrate Amazon App Mesh * Update README.md * Update README.md * Update manifest.json * Labeller * Update CHANGELOG.md * Update amazon_app_mesh/README.md Co-authored-by: David Jones * Update README and add missing metrics information * Remove metrics from the manifest --------- Co-authored-by: Nathan Adams Co-authored-by: David Jones Co-authored-by: Juanpe Araque --- .github/workflows/config/labeler.yml | 2 + amazon_app_mesh/CHANGELOG.md | 7 + amazon_app_mesh/README.md | 228 +++++++++++++++++++++ amazon_app_mesh/assets/service_checks.json | 1 + amazon_app_mesh/manifest.json | 48 +++++ 5 files changed, 286 insertions(+) create mode 100644 amazon_app_mesh/CHANGELOG.md create mode 100644 amazon_app_mesh/README.md create mode 100644 amazon_app_mesh/assets/service_checks.json create mode 100644 amazon_app_mesh/manifest.json diff 
--git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index c9feb765ecf46..def21bcde5106 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -45,6 +45,8 @@ integration/airbyte: - airbyte/**/* integration/airflow: - airflow/**/* +integration/amazon_app_mesh: +- amazon_app_mesh/**/* integration/amazon_eks: - amazon_eks/**/* integration/amazon_eks_blueprints: diff --git a/amazon_app_mesh/CHANGELOG.md b/amazon_app_mesh/CHANGELOG.md new file mode 100644 index 0000000000000..41036452ffa87 --- /dev/null +++ b/amazon_app_mesh/CHANGELOG.md @@ -0,0 +1,7 @@ +# CHANGELOG - AWS App Mesh + +## 1.0.0 / 2019-10-01 + +***Added***: + +* add Amazon App Mesh integration diff --git a/amazon_app_mesh/README.md b/amazon_app_mesh/README.md new file mode 100644 index 0000000000000..f8a0f08fe244e --- /dev/null +++ b/amazon_app_mesh/README.md @@ -0,0 +1,228 @@ +## Overview + +[AWS App Mesh][1] is a service mesh that provides application-level networking to your microservices running on Amazon ECS Fargate or Amazon EKS clusters. + + +## Setup + + + + +Use the instructions below to enable metric collection for the AWS App Mesh proxy sidecar, called Envoy. Users can choose to add sidecars in one of three modes: deploying, patching the deployment later, or using the AWS App Mesh injector controller. All modes are supported by the following steps. + +#### Metric collection + +**Prerequisite**: Deploy Datadog Agents as a DaemonSet in your Kubernetes cluster using the [EKS integration][1] documentation. + +1. Due to limitations in App Mesh, forwarding metrics from EKS to Datadog requires the Egress filter to be set to `Allow External Traffic`. + +2.
Create a ConfigMap in your cluster to automatically discover App Mesh's Envoy sidecars that are added to each pod: + + ```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: datadog-config + data: + envoy: |- + ad_identifiers: + - aws-appmesh-envoy + init_config: + instances: + - stats_url: http://%%host%%:9901/stats + tags: + - <TAG_KEY>:<TAG_VALUE> # Example - cluster:eks-appmesh + ``` + +3. Update the `volumeMounts` object in your Datadog Agent's DaemonSet YAML file: + + ```yaml + volumeMounts: + - name: datadog-config + mountPath: /conf.d + ``` + +4. Update the `volumes` object in your Datadog Agent's DaemonSet YAML file: + + ```yaml + volumes: + - name: datadog-config + configMap: + name: datadog-config + items: + - key: envoy + path: envoy.yaml + ``` + +#### Log collection + + + + + +#### Trace collection + +Select the namespace to deploy the `datadog-agent` and service, for example: `monitoring`. Use this in the option to deploy the appmesh-injector with: + +```shell + helm upgrade -i appmesh-controller eks/appmesh-controller \ + --namespace appmesh-system \ + --set sidecar.logLevel=debug \ + --set tracing.enabled=true \ + --set tracing.provider=datadog \ + --set tracing.address=ref:status.hostIP \ + --set tracing.port=8126 +``` + + +Alternatively, the appmesh injector can be deployed by following the [App Mesh with EKS][3] documentation using the option `enable-datadog-tracing=true` or environment variable `ENABLE_DATADOG_TRACING=true`. + +[1]: https://docs.datadoghq.com/integrations/amazon_eks/ +[2]: /agent/kubernetes/daemonset_setup/#log-collection +[3]: https://github.com/aws/aws-app-mesh-examples/blob/master/walkthroughs/eks/base.md#install-app-mesh--kubernetes-components + + + + +#### Metric collection + +**Prerequisite**: Add Datadog Agents to each of your Fargate task definitions with App Mesh enabled, such as an Envoy sidecar injected, using the [ECS Fargate integration][1] documentation. + +1.
Due to limitations in App Mesh, forwarding metrics from an ECS task to Datadog requires the Egress filter to be set to `Allow External Traffic`. + +2. Update all task definitions containing the Envoy sidecar and Datadog Agent with the following Docker labels. See [Integration Setup for ECS Fargate][2] for details. + + ```text + "dockerLabels": { + com.datadoghq.ad.instances : [{"stats_url": "http://%%host%%:9901/stats"}] + com.datadoghq.ad.check_names : ["envoy"] + com.datadoghq.ad.init_configs : [{}] + }, + ``` + +#### Log collection + + + + + +#### Trace collection + +1. Enable trace collection with the instructions in the [ECS Fargate integration][4] documentation. + +2. Set the AWS App Mesh parameters `ENABLE_ENVOY_DATADOG_TRACING` and `DATADOG_TRACER_PORT` as environment variables in the ECS Fargate task definition. Learn more in the [AWS App Mesh][5] documentation. + +[1]: https://docs.datadoghq.com/integrations/ecs_fargate/ +[2]: https://docs.datadoghq.com/integrations/faq/integration-setup-ecs-fargate/ +[3]: https://docs.datadoghq.com/integrations/ecs_fargate/#log-collection +[4]: https://docs.datadoghq.com/integrations/ecs_fargate/#trace-collection +[5]: https://docs.aws.amazon.com/app-mesh/latest/userguide/envoy.html + + + + +#### Metric collection + +**Prerequisite**: Add Datadog Agents to each of your ECS EC2 task definitions with App Mesh enabled, such as an Envoy sidecar injected, using the [ECS integration][1] documentation. + +1. Due to limitations in App Mesh, forwarding metrics from an ECS task to Datadog requires the Egress filter to be set to `Allow External Traffic`. + +2. Update all task definitions containing the Envoy sidecar and Datadog Agent with the following Docker labels. See [Integration Setup for ECS Fargate][2] for details.
+ + ```text + "dockerLabels": { + com.datadoghq.ad.instances : [{"stats_url": "http://%%host%%:9901/stats"}] + com.datadoghq.ad.check_names : ["envoy"] + com.datadoghq.ad.init_configs : [{}] + }, + ``` + +#### Log collection + + + + + +#### Trace collection + +1. Enable trace collection with the instructions in the [ECS integration][4] documentation. + +2. Set the AWS App Mesh parameters `ENABLE_ENVOY_DATADOG_TRACING` and `DATADOG_TRACER_PORT` as environment variables in the ECS task definition. Learn more in the [AWS App Mesh][5] documentation. + +[1]: https://docs.datadoghq.com/integrations/amazon_ecs/ +[2]: https://docs.datadoghq.com/integrations/faq/integration-setup-ecs-fargate/ +[3]: https://docs.datadoghq.com/integrations/amazon_ecs/#log-collection +[4]: https://docs.datadoghq.com/integrations/amazon_ecs/#trace-collection +[5]: https://docs.aws.amazon.com/app-mesh/latest/userguide/envoy.html + + + + +## Data Collected + +### Metrics + +See the [Envoy integration][2] for a list of metrics. + +### Events + +The AWS App Mesh integration does not include any events. + +### Service Checks + +The AWS App Mesh integration does not include any service checks. + +## Troubleshooting + +Need help? Contact [Datadog support][3]. 
+ +## Further Reading + +- [Envoy integration][4] + +[1]: https://aws.amazon.com/app-mesh +[2]: https://docs.datadoghq.com/integrations/envoy/#metrics +[3]: https://docs.datadoghq.com/help/ +[4]: https://docs.datadoghq.com/integrations/envoy/ diff --git a/amazon_app_mesh/assets/service_checks.json b/amazon_app_mesh/assets/service_checks.json new file mode 100644 index 0000000000000..0637a088a01e8 --- /dev/null +++ b/amazon_app_mesh/assets/service_checks.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/amazon_app_mesh/manifest.json b/amazon_app_mesh/manifest.json new file mode 100644 index 0000000000000..e838b44ba4c80 --- /dev/null +++ b/amazon_app_mesh/manifest.json @@ -0,0 +1,48 @@ +{ + "manifest_version": "2.0.0", + "app_uuid": "507d48b6-cfda-4f62-b95a-67de98a50cc6", + "app_id": "amazon-app-mesh", + "display_on_public_website": true, + "tile": { + "overview": "README.md#Overview", + "configuration": "README.md#Setup", + "support": "README.md#Support", + "changelog": "CHANGELOG.md", + "description": "Amazon App Mesh is an open source edge and service proxy.", + "title": "AWS App Mesh", + "media": [], + "classifier_tags": [ + "Category::AWS", + "Category::Cloud", + "Category::Log Collection", + "Category::Network", + "Category::Tracing", + "Offering::Integration" + ], + "resources": [ + { + "url": "https://docs.datadoghq.com/integrations/envoy/", + "resource_type": "documentation" + } + ] + }, + "assets": { + "integration": { + "auto_install": false, + "source_type_name": "Amazon App Mesh", + "events": { + "creates_events": true + }, + "service_checks": { + "metadata_path": "assets/service_checks.json" + }, + "source_type_id": 221 + } + }, + "author": { + "support_email": "help@datadoghq.com", + "name": "Datadog", + "homepage": "https://www.datadoghq.com", + "sales_email": "info@datadoghq.com" + } +} From 8eb2e3acd90d6fb722383b7ca13a89c12168ea85 Mon Sep 17 00:00:00 2001 From: mahipdeora25 Date: Mon, 8 Sep 2025 15:57:36 -0400 Subject: [PATCH 11/13] 
Update slurm_overview.json (#21295) fixed spelling error Co-authored-by: Steven Yuen --- slurm/assets/dashboards/slurm_overview.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm/assets/dashboards/slurm_overview.json b/slurm/assets/dashboards/slurm_overview.json index e394b8b32875e..bbe55082caa64 100644 --- a/slurm/assets/dashboards/slurm_overview.json +++ b/slurm/assets/dashboards/slurm_overview.json @@ -358,7 +358,7 @@ { "id": 4075969519773896, "definition": { - "title": "Parition and Node Metrics", + "title": "Partition and Node Metrics", "background_color": "vivid_blue", "show_title": true, "type": "group", From dedbd655c2aa508185a572c564a9067a7f041744 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Mon, 8 Sep 2025 16:08:41 -0400 Subject: [PATCH 12/13] Fix flaky Postgres cursor test (#21281) * Fix flaky cursor test * Remove debug lines * isolate db interaction by test name * run format --- postgres/tests/test_cursor.py | 7 ++-- postgres/tests/test_progress_stats.py | 47 +++++++++++++++++++-------- postgres/tests/utils.py | 16 ++++----- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/postgres/tests/test_cursor.py b/postgres/tests/test_cursor.py index 2482d4d5b0c68..589bb9cdcfea4 100644 --- a/postgres/tests/test_cursor.py +++ b/postgres/tests/test_cursor.py @@ -7,10 +7,10 @@ @pytest.mark.integration -@pytest.mark.flaky @pytest.mark.usefixtures('dd_environment') @pytest.mark.parametrize('ignore', [True, False]) def test_integration_connection_with_commenter_cursor(integration_check, pg_instance, ignore): + pg_instance['application_name'] = 'test_integration_connection_with_commenter_cursor_{}'.format(ignore) check = integration_check(pg_instance) with check.db() as conn: @@ -44,9 +44,10 @@ def __check_prepand_sql_comment(pg_instance, ignore): with super_conn.cursor() as cursor: cursor.execute( ( - "SELECT query FROM pg_stat_activity where query like '%generate_series%' " + "SELECT query FROM pg_stat_activity where 
application_name = %s and query like '%%generate_series%%' " "and query not like '%%pg_stat_activity%%'" - ) + ), + (pg_instance['application_name'],), ) result = cursor.fetchall() assert len(result) > 0 diff --git a/postgres/tests/test_progress_stats.py b/postgres/tests/test_progress_stats.py index 444de384b7fd3..11ccde6eae837 100644 --- a/postgres/tests/test_progress_stats.py +++ b/postgres/tests/test_progress_stats.py @@ -30,8 +30,8 @@ pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] -def _check_analyze_progress(check, pg_instance, table): - thread = run_vacuum_thread(pg_instance, f'ANALYZE {table}') +def _check_analyze_progress(check, pg_instance, table, application_name): + thread = run_vacuum_thread(pg_instance, f'ANALYZE {table}', application_name) # Wait for vacuum to be reported _wait_for_value( @@ -44,14 +44,15 @@ def _check_analyze_progress(check, pg_instance, table): check.check(pg_instance) # Kill vacuum and cleanup thread - kill_vacuum(pg_instance) + kill_vacuum(pg_instance, application_name) thread.join() @requires_over_13 def test_analyze_progress_inherited(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_analyze_progress_inherited' check = integration_check(pg_instance) - _check_analyze_progress(check, pg_instance, 'test_part') + _check_analyze_progress(check, pg_instance, 'test_part', pg_instance['application_name']) expected_tags = _get_expected_tags(check, pg_instance) + [ 'child_relation:test_part1', 'phase:acquiring inherited sample rows', @@ -64,8 +65,9 @@ def test_analyze_progress_inherited(aggregator, integration_check, pg_instance): @requires_over_13 def test_analyze_progress(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_analyze_progress' check = integration_check(pg_instance) - _check_analyze_progress(check, pg_instance, 'test_part1') + _check_analyze_progress(check, pg_instance, 'test_part1', pg_instance['application_name']) 
expected_tags = _get_expected_tags(check, pg_instance) + [ 'phase:acquiring sample rows', 'table:test_part1', @@ -77,10 +79,13 @@ def test_analyze_progress(aggregator, integration_check, pg_instance): @requires_over_17 def test_vacuum_progress(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_vacuum_progress' check = integration_check(pg_instance) # Start vacuum - thread = run_vacuum_thread(pg_instance, 'VACUUM (DISABLE_PAGE_SKIPPING) test_part1') + thread = run_vacuum_thread( + pg_instance, 'VACUUM (DISABLE_PAGE_SKIPPING) test_part1', pg_instance['application_name'] + ) # Wait for vacuum to be reported _wait_for_value( @@ -93,7 +98,7 @@ def test_vacuum_progress(aggregator, integration_check, pg_instance): check.check(pg_instance) # Kill vacuum and cleanup thread - kill_vacuum(pg_instance) + kill_vacuum(pg_instance, pg_instance['application_name']) thread.join() expected_tags = _get_expected_tags(check, pg_instance) + [ @@ -108,10 +113,13 @@ def test_vacuum_progress(aggregator, integration_check, pg_instance): @requires_over_12 @requires_under_17 def test_vacuum_progress_lt_17(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_vacuum_progress_lt_17' check = integration_check(pg_instance) # Start vacuum - thread = run_vacuum_thread(pg_instance, 'VACUUM (DISABLE_PAGE_SKIPPING) test_part1') + thread = run_vacuum_thread( + pg_instance, 'VACUUM (DISABLE_PAGE_SKIPPING) test_part1', pg_instance['application_name'] + ) # Wait for vacuum to be reported _wait_for_value( @@ -124,7 +132,7 @@ def test_vacuum_progress_lt_17(aggregator, integration_check, pg_instance): check.check(pg_instance) # Kill vacuum and cleanup thread - kill_vacuum(pg_instance) + kill_vacuum(pg_instance, pg_instance['application_name']) thread.join() expected_tags = _get_expected_tags(check, pg_instance) + [ @@ -138,13 +146,18 @@ def test_vacuum_progress_lt_17(aggregator, integration_check, pg_instance): @requires_over_12 def 
test_index_progress(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_index_progress' check = integration_check(pg_instance) # Keep test_part locked to prevent create index concurrently from finishing conn = lock_table(pg_instance, 'test_part1', 'ROW EXCLUSIVE') # Start vacuum in a thread - thread = run_query_thread(pg_instance, 'CREATE INDEX CONCURRENTLY test_progress_index ON test_part1 (id);') + thread = run_query_thread( + pg_instance, + 'CREATE INDEX CONCURRENTLY test_progress_index ON test_part1 (id);', + pg_instance['application_name'], + ) # Wait for blocked created index to appear _wait_for_value( @@ -156,7 +169,7 @@ def test_index_progress(aggregator, integration_check, pg_instance): check.check(pg_instance) # Kill the create index - kill_session(pg_instance, 'CREATE INDEX') + kill_session(pg_instance, 'CREATE INDEX', pg_instance['application_name']) # Cleanup connection and thread conn.close() @@ -177,15 +190,21 @@ def test_index_progress(aggregator, integration_check, pg_instance): @requires_over_12 def test_cluster_vacuum_progress(aggregator, integration_check, pg_instance): + pg_instance['application_name'] = 'test_cluster_vacuum_progress' check = integration_check(pg_instance) # Keep pg_class lock to block vacuum full during initilizing phase - conn = lock_table(pg_instance, 'pg_catalog.pg_class', 'EXCLUSIVE') + conn = lock_table(pg_instance, 'pg_catalog.pg_class', 'EXCLUSIVE', pg_instance['application_name']) # Start vacuum in a thread - thread = run_vacuum_thread(pg_instance, 'VACUUM FULL personsdup1') + thread = run_vacuum_thread(pg_instance, 'VACUUM FULL personsdup1', pg_instance['application_name']) - _wait_for_value(pg_instance, lower_threshold=0, query="select count(*) FROM pg_stat_progress_cluster;") + _wait_for_value( + pg_instance, + lower_threshold=0, + query="select count(*) FROM pg_stat_progress_cluster;", + application_name=pg_instance['application_name'], + ) check.check(pg_instance) # Cleanup 
connection and thread diff --git a/postgres/tests/utils.py b/postgres/tests/utils.py index dcf6ce13c41da..afc47ec52579e 100644 --- a/postgres/tests/utils.py +++ b/postgres/tests/utils.py @@ -67,16 +67,16 @@ def _get_superconn(db_instance, application_name='test', autocommit=True): ) -def lock_table(pg_instance, table, lock_mode): - lock_conn = _get_superconn(pg_instance) +def lock_table(pg_instance, table, lock_mode, application_name='test'): + lock_conn = _get_superconn(pg_instance, application_name) cur = lock_conn.cursor() cur.execute('BEGIN') cur.execute(f'lock {table} IN {lock_mode} MODE') return lock_conn -def kill_session(pg_instance, query_pattern): - with _get_superconn(pg_instance) as conn: +def kill_session(pg_instance, query_pattern, application_name='test'): + with _get_superconn(pg_instance, application_name) as conn: with conn.cursor() as cur: cur.execute( f"""SELECT pg_cancel_backend(pid) @@ -85,19 +85,19 @@ def kill_session(pg_instance, query_pattern): ) -def kill_vacuum(pg_instance): - kill_session(pg_instance, '^vacuum') +def kill_vacuum(pg_instance, application_name='test'): + kill_session(pg_instance, '^vacuum', application_name) # Wait until the query yielding a single value cross the provided threshold -def _wait_for_value(db_instance, lower_threshold, query, attempts=10): +def _wait_for_value(db_instance, lower_threshold, query, attempts=10, application_name='test'): value = 0 current_attempt = 0 # Stats table behave slightly differently than normal tables # Repeating the same query within a transaction will yield the # same value, despite the fact that the transaction is in READ COMMITED # To avoid this, we avoid transaction block created by the with statement - conn = _get_superconn(db_instance) + conn = _get_superconn(db_instance, application_name) while value <= lower_threshold and current_attempt < attempts: with conn.cursor() as cur: cur.execute(query) From 045fe96dc8e7ec1b1cef534f0e185ae624e477b6 Mon Sep 17 00:00:00 2001 From: 
davidfeng-datadog Date: Mon, 8 Sep 2025 16:13:38 -0400 Subject: [PATCH 13/13] Add owner field to Event Management manifest files (#21010) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- solarwinds/manifest.json | 1 + 1 file changed, 1 insertion(+) diff --git a/solarwinds/manifest.json b/solarwinds/manifest.json index 937d68503953b..65feabedae183 100644 --- a/solarwinds/manifest.json +++ b/solarwinds/manifest.json @@ -2,6 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "b9c52007-5909-44c7-b4a2-624f0efffa9e", "app_id": "solarwinds", + "owner": "event-management", "display_on_public_website": false, "tile": { "overview": "README.md#Overview",