Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
992 changes: 992 additions & 0 deletions gpu/assets/dashboards/aws_gpu_cost_overview.json

Large diffs are not rendered by default.

920 changes: 920 additions & 0 deletions gpu/assets/dashboards/azure_gpu_cost_overview.json

Large diffs are not rendered by default.

903 changes: 903 additions & 0 deletions gpu/assets/dashboards/gcp_gpu_cost_overview.json

Large diffs are not rendered by default.

1,458 changes: 1,458 additions & 0 deletions gpu/assets/dashboards/gpu_cost_overview.json

Large diffs are not rendered by default.

708 changes: 708 additions & 0 deletions gpu/assets/dashboards/k8s_gpu_cost_overview.json

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion gpu/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@
},
"assets": {
"dashboards": {
"network_metrics": "assets/dashboards/network_metrics.json"
"network_metrics": "assets/dashboards/network_metrics.json",
"gpu_cost_overview": "assets/dashboards/gpu_cost_overview.json",
"aws_gpu_cost_overview": "assets/dashboards/aws_gpu_cost_overview.json",
"azure_gpu_cost_overview": "assets/dashboards/azure_gpu_cost_overview.json",
"gcp_gpu_cost_overview": "assets/dashboards/gcp_gpu_cost_overview.json",
"k8s_gpu_cost_overview": "assets/dashboards/k8s_gpu_cost_overview.json"
},
"integration": {
"source_type_name": "GPU",
Expand Down
6 changes: 6 additions & 0 deletions kafka_consumer/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

<!-- towncrier release notes start -->

## 7.2.1 / 2026-05-12

***Fixed***:

* Switch cluster monitoring's earliest-offset fetch to AdminClient.list_offsets(earliest), and isolate its failures so an earliest-offset error no longer drops topic.message_rate, partition.isr, topic.config.*, and other unrelated topic-metadata metrics. ([#23580](https://github.com/DataDog/integrations-core/pull/23580))

## 7.2.0 / 2026-04-15

***Added***:
Expand Down
1 change: 0 additions & 1 deletion kafka_consumer/changelog.d/23580.fixed

This file was deleted.

2 changes: 1 addition & 1 deletion kafka_consumer/datadog_checks/kafka_consumer/__about__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

__version__ = "7.2.0"
__version__ = "7.2.1"
1 change: 1 addition & 0 deletions nutanix/changelog.d/23609.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Always emit `ntnx_host_type`, `ntnx_hypervisor_type`, and `ntnx_node_status` tags, with `$unknown` as the fallback when the source field is missing.
93 changes: 38 additions & 55 deletions nutanix/datadog_checks/nutanix/infrastructure_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,9 @@
from datadog_checks.nutanix.check import NutanixCheck


# Sentinel values from the Nutanix v4.0 specs that should be treated as unknown.
_SENTINEL_STATE_VALUES = frozenset({"$unknown", "$redacted", "undetermined"})


def _norm_state(value: object) -> str:
"""Lowercase ``value``, mapping spec sentinels and missing values to ``unknown``."""
if not isinstance(value, str) or not value:
return "unknown"
normalized = value.lower()
return "unknown" if normalized in _SENTINEL_STATE_VALUES else normalized
def _normalize_tag_value(value: object) -> str:
"""Lowercase ``value``; missing values fall back to ``$unknown`` (the API's spec sentinel)."""
return value.lower() if isinstance(value, str) and value else "$unknown"


@dataclass
Expand Down Expand Up @@ -245,8 +238,8 @@ def _report_vm_basic_metrics(self, vm: dict, hostname: str, vm_tags: list[str])
"""Report basic VM metrics (counts and status)."""
self.check.gauge("vm.count", 1, hostname=hostname, tags=vm_tags)

power_state = vm.get("powerState", "$UNKNOWN")
status_value = 0 if power_state == "ON" else 1 if power_state == "PAUSED" else 2
power_state = _normalize_tag_value(vm.get("powerState"))
status_value = 0 if power_state == "on" else 1 if power_state == "paused" else 2
self.check.gauge("vm.status", status_value, hostname=hostname, tags=vm_tags)

self._report_vm_capacity_metrics(vm, hostname, vm_tags)
Expand Down Expand Up @@ -477,10 +470,10 @@ def _report_host_capacity_metrics(self, host: dict, hostname: str, host_tags: li

def _report_host_status_metrics(self, host: dict, hostname: str, host_tags: list[str]) -> None:
"""Report host node status as a gauge (0=OK, 1=WARNING, 2=CRITICAL/UNKNOWN)."""
node_status_ok = {"NORMAL", "NEW_NODE", "PREPROTECTED"}
node_status_warning = {"TO_BE_PREPROTECTED", "TO_BE_REMOVED", "OK_TO_BE_REMOVED"}
node_status_ok = {"normal", "new_node", "preprotected"}
node_status_warning = {"to_be_preprotected", "to_be_removed", "ok_to_be_removed"}

node_status = host.get("nodeStatus", "$UNKNOWN")
node_status = _normalize_tag_value(host.get("nodeStatus"))

if node_status in node_status_ok:
status_value = 0
Expand All @@ -494,70 +487,59 @@ def _report_host_status_metrics(self, host: dict, hostname: str, host_tags: list

def _extract_host_tags(self, host: dict) -> list[str]:
"""Extract tags from a host object."""
tags = []
host_name = host.get("hostName")
host_type = _normalize_tag_value(host.get("hostType"))
maintenance_state = _normalize_tag_value(host.get("maintenanceState"))
hypervisor_name = get_nested(host, "hypervisor/fullName")
hypervisor_type = _normalize_tag_value(get_nested(host, "hypervisor/type"))
connection_state = _normalize_tag_value(get_nested(host, "hypervisor/acropolisConnectionState"))

tags = []
tags.append("ntnx_type:host")

if host_name := host.get("hostName"):
if host_name:
tags.append(f"ntnx_host_name:{host_name}")

if host_type := host.get("hostType"):
tags.append(f"ntnx_host_type:{host_type}")

tags.append(f"ntnx_maintenance_state:{_norm_state(host.get('maintenanceState'))}")

# hypervisor tags
if hypervisor_name := get_nested(host, "hypervisor/fullName"):
tags.append(f"ntnx_host_type:{host_type}")
tags.append(f"ntnx_maintenance_state:{maintenance_state}")
if hypervisor_name:
tags.append(f"ntnx_hypervisor_name:{hypervisor_name}")
if hypervisor_type := get_nested(host, "hypervisor/type"):
tags.append(f"ntnx_hypervisor_type:{hypervisor_type}")
tags.append(f"ntnx_connection_state:{_norm_state(get_nested(host, 'hypervisor/acropolisConnectionState'))}")

# Add category tags
tags.append(f"ntnx_hypervisor_type:{hypervisor_type}")
tags.append(f"ntnx_connection_state:{connection_state}")
tags.extend(self.check.extract_category_tags(host))

return tags

def _extract_cluster_tags(self, cluster: dict) -> list[str]:
"""Extract tags from a cluster object."""
tags = []

cluster_name = cluster.get("name")
operation_mode = _normalize_tag_value(get_nested(cluster, "config/operationMode"))

tags = []
if cluster_name:
tags.append(f"ntnx_cluster_name:{cluster_name}")

tags.append(f"ntnx_operation_mode:{_norm_state(get_nested(cluster, 'config/operationMode'))}")

# Add category tags
tags.append(f"ntnx_operation_mode:{operation_mode}")
tags.extend(self.check.extract_category_tags(cluster))

return tags

def _extract_vm_tags(self, vm: dict) -> list[str]:
"""Extract tags from a VM object."""
tags = []
vm_name = vm.get("name")
host_id = get_nested(vm, "host/extId")
cluster_id = get_nested(vm, "cluster/extId")
is_agent_vm = is_affirmative(vm.get("isAgentVm"))
power_state = _normalize_tag_value(vm.get("powerState"))

tags = []
tags.append("ntnx_type:vm")

vm_name = vm.get("name")
if vm_name:
tags.append(f"ntnx_vm_name:{vm_name}")

# Add category tags
tags.extend(self.check.extract_category_tags(vm))

host_id = get_nested(vm, "host/extId")
if host_id and host_id in self.host_names:
tags.append(f"ntnx_host_name:{self.host_names[host_id]}")

cluster_id = get_nested(vm, "cluster/extId")
if cluster_id and cluster_id in self.cluster_names:
tags.append(f"ntnx_cluster_name:{self.cluster_names[cluster_id]}")

is_agent_vm = is_affirmative(vm.get("isAgentVm"))
tags.append(f"ntnx_is_agent_vm:{is_agent_vm}")

tags.append(f"ntnx_power_state:{_norm_state(vm.get('powerState'))}")
tags.append(f"ntnx_power_state:{power_state}")

return tags

Expand Down Expand Up @@ -623,18 +605,19 @@ def _build_disks_by_host_cache(self) -> None:
self._disks_by_host.setdefault(node_id, []).append(disk)

def _aggregate_disk_status(self, disks: list[dict]) -> str:
"""Return the worst disk status across ``disks``: degraded > normal > unknown."""
statuses = {d.get("status") for d in disks if d.get("status")}
"""Return the worst disk status across ``disks``: degraded > normal > $unknown."""
statuses = {_normalize_tag_value(d.get("status")) for d in disks if d.get("status")}
if statuses & DEGRADED_DISK_STATUSES:
return "degraded"
if "NORMAL" in statuses:
if "normal" in statuses:
return "normal"
return "unknown"
return "$unknown"

def _get_disk_status_storage_tags(self, host_id: str) -> dict[str, list[str]]:
"""Return per-key extra tags adding ``ntnx_disk_status`` on host storage_* metrics."""
status = self._aggregate_disk_status(self._disks_by_host.get(host_id, []))
return {key: [f"ntnx_disk_status:{status}"] for key in HOST_STORAGE_STAT_KEYS}
disk_status_tag = f"ntnx_disk_status:{status}"
return {key: [disk_status_tag] for key in HOST_STORAGE_STAT_KEYS}

def _build_stats_params(self) -> dict[str, str | int]:
"""Build the common query parameters for stats API calls."""
Expand Down
6 changes: 3 additions & 3 deletions nutanix/datadog_checks/nutanix/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
# Disk status enum values that aggregate to ntnx_disk_status:degraded.
DEGRADED_DISK_STATUSES: frozenset[str] = frozenset(
{
"MARKED_FOR_REMOVAL_BUT_NOT_DETACHABLE",
"DATA_MIGRATION_INITIATED",
"DETACHABLE",
"marked_for_removal_but_not_detachable",
"data_migration_initiated",
"detachable",
}
)

Expand Down
4 changes: 2 additions & 2 deletions nutanix/tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_connection_state:connected',
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
'ntnx_host_type:HYPER_CONVERGED',
'ntnx_host_type:hyper_converged',
'ntnx_hypervisor_name:AHV 10.3',
'ntnx_hypervisor_type:AHV',
'ntnx_hypervisor_type:ahv',
'ntnx_maintenance_state:normal',
'ntnx_operation_mode:normal',
'ntnx_type:host',
Expand Down
2 changes: 1 addition & 1 deletion nutanix/tests/test_disk_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_disks_endpoint_failure_falls_back_to_unknown(dd_run_check, aggregator,
aggregator.assert_metric(
"nutanix.host.storage_capacity",
at_least=1,
tags=HOST_TAGS + ['ntnx_disk_status:unknown'],
tags=HOST_TAGS + ['ntnx_disk_status:$unknown'],
hostname=HOST_NAME,
)

Expand Down
46 changes: 45 additions & 1 deletion nutanix/tests/test_hosts.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,51 @@ def test_host_status_metrics(dd_run_check, aggregator, mock_instance, mock_http_
dd_run_check(check)

aggregator.assert_metric(
"nutanix.host.status", value=0, tags=HOST_TAGS + ['ntnx_node_status:NORMAL'], hostname=HOST_NAME
"nutanix.host.status", value=0, tags=HOST_TAGS + ['ntnx_node_status:normal'], hostname=HOST_NAME
)


def test_host_tags_fall_back_to_unknown_when_source_fields_missing(
dd_run_check, aggregator, mock_instance, mock_http_get, mocker
):
"""When hostType, hypervisor.type, or nodeStatus are missing, tags emit ``$unknown``."""
cluster_id = "00064715-c043-5d8f-ee4b-176ec875554d"
sparse_host = {
"extId": "d8787814-4fe8-4ba5-931f-e1ee31c294a6",
"hostName": HOST_NAME,
"hypervisor": {"fullName": "AHV 10.3"},
"cluster": {"uuid": cluster_id},
}
mocker.patch(
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._list_hosts_by_cluster",
side_effect=lambda cid: [sparse_host] if cid == cluster_id else [],
)
check = NutanixCheck('nutanix', {}, [mock_instance])
dd_run_check(check)

expected_tags = [
'Team:agent-integrations',
'cluster_category:cluster_value1',
'cluster_category:cluster_value2',
'cluster_category:cluster_value3',
'ntnx_cluster_name:datadog-nutanix-dev',
'ntnx_connection_state:$unknown',
f'ntnx_host_name:{HOST_NAME}',
'ntnx_host_type:$unknown',
'ntnx_hypervisor_name:AHV 10.3',
'ntnx_hypervisor_type:$unknown',
'ntnx_maintenance_state:$unknown',
'ntnx_operation_mode:normal',
'ntnx_type:host',
'nutanix',
'prism_central:10.0.0.197',
]
aggregator.assert_metric("nutanix.host.count", value=1, tags=expected_tags, hostname=HOST_NAME)
aggregator.assert_metric(
"nutanix.host.status",
value=2,
tags=expected_tags + ['ntnx_node_status:$unknown'],
hostname=HOST_NAME,
)


Expand Down
2 changes: 1 addition & 1 deletion requirements-agent-release.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ datadog-jboss-wildfly==3.4.0
datadog-journald==3.2.0
datadog-juniper-srx-firewall==1.3.0
datadog-kafka-actions==2.6.0
datadog-kafka-consumer==7.2.0
datadog-kafka-consumer==7.2.1
datadog-kafka==4.5.0
datadog-karpenter==3.4.1
datadog-keda==2.4.1
Expand Down
1 change: 0 additions & 1 deletion snmp/assets/dashboards/datacenter_overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -4728,6 +4728,5 @@
],
"layout_type": "ordered",
"notify_list": [],
"pause_auto_refresh": false,
"reflow_type": "fixed"
}
4 changes: 2 additions & 2 deletions teamcity/tests/docker/mockserver/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
services:
teamcity:
image: mockserver/mockserver:latest
image: mockserver/mockserver:5.15.0
container_name: teamcity
command: -serverPort 8111
ports:
- 8111:8111
environment:
MOCKSERVER_INITIALIZATION_JSON_PATH: /config/initializerJson.json
MOCKSERVER_WATCH_INITIALIZATION_JSON: true
MOCKSERVER_SERVER_PORT: 8111
volumes:
- ./config:/config
Loading