Skip to content

Commit 860ccb2

Browse files
NouemanKHAL and claude authored
[nutanix] Warn and skip entities missing extId or name (DataDog#23612)
* fix(nutanix): normalize all ntnx_* tags to lowercase with $unknown fallback

  Route every enum-backed ntnx_* tag (host_type, hypervisor_type, node_status, plus the previously-handled state tags) through a single _norm_state helper so they all follow one rule: lowercase the API value, fall back to "$unknown" when the source is missing.

  Picks "$unknown" (the API spec's own sentinel) as the fallback so there's no mismatch between "value present but says $UNKNOWN" and "value missing" — both surface as ntnx_X:$unknown. ntnx_disk_status's "unknown" fallback is updated to "$unknown" for the same reason.

* docs(nutanix): add changelog for tag normalization

* docs(nutanix): shorten changelog to one customer-facing line

* refactor(nutanix): extract tag values into named variables

  Hoist _norm_state and get_nested calls out of f-strings in the tag-extraction helpers. Each tag computation now binds to a named local first, making the read top-down and easier to step through.

* refactor(nutanix): rename _norm_state to _normalize_tag_value

  The helper is used for type, state, mode, and status tags — not just state — so the broader name better describes what it does.

* refactor(nutanix): collapse node_status to one variable, restore tags = []

  Lowercase the node-status comparison sets so the normalized tag value serves both the status_value lookup and the tag emission, removing the need for a separate node_status_tag local. Restore the tags = [] preamble in the tag-extraction helpers since it makes the building intent obvious.

* fix(nutanix): normalize powerState in vm.status gauge

  Match what _report_host_status_metrics does: route the powerState lookup through _normalize_tag_value and lowercase the comparison literals. Removes the asymmetry where vm.status was the only metric still relying on raw uppercase API values. Addresses review feedback on PR DataDog#23609.
* fix(nutanix): warn and skip entities missing the main id or name

  Hosts now warn-and-skip when hostName is missing (previously they emitted metrics with hostname=None, useless for correlation), VMs upgrade their existing missing-id/name skip from debug to warning, and clusters now warn-and-skip when name is missing (previously they were processed with no ntnx_cluster_name tag and weren't cached for VM/host cluster-name tagging).

* docs(nutanix): add changelog for entity-validation skip

* docs(nutanix): shorten changelog to one customer-facing line

* refactor(nutanix): rename VM hostname locals/params to vm_name

  The variable was named after how it's consumed downstream (the hostname= kwarg to gauge calls), not what it actually is — the VM's "name" field from the API. Rename to vm_name for symmetry with host_name on the host side; the gauge call still passes it as hostname=vm_name.

* refactor(nutanix): hoist cluster log label to a variable

* Delete nutanix/changelog.d/23609.fixed

* Keep collecting VMs for unnamed hosts

* test(nutanix): add tests for skip behavior on missing entity fields

  Cover the three new skip paths added in this PR:
  - VM missing extId: verify vm_count decrements and metric absent
  - VM missing name: same invariant
  - Cluster missing name: verify cluster_count unchanged and warning logged

  Each VM test is parametrized for both batch and non-batch collection modes.

  Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent e8805df commit 860ccb2

4 files changed

Lines changed: 184 additions & 48 deletions

File tree

nutanix/changelog.d/23612.fixed

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Skip hosts, VMs, and clusters missing an `extId` or name, and log a warning.

nutanix/datadog_checks/nutanix/infrastructure_monitor.py

Lines changed: 69 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -152,10 +152,12 @@ def collect_cluster_metrics(self) -> None:
152152
# Process each cluster
153153
processed, skipped = 0, 0
154154
for cluster in clusters:
155-
cluster_name = cluster.get("name", "unknown")
155+
cluster_id = cluster.get("extId")
156+
cluster_name = cluster.get("name")
157+
cluster_label = cluster_name or "unknown"
156158

157159
if self._is_prism_central_cluster(cluster):
158-
self.check.log.info("[%s] Skipping Prism Central cluster: %s", self._pc_label, cluster_name)
160+
self.check.log.info("[%s] Skipping Prism Central cluster: %s", self._pc_label, cluster_label)
159161
self._collect_pc_version_metadata(cluster)
160162
skipped += 1
161163
continue
@@ -164,9 +166,13 @@ def collect_cluster_metrics(self) -> None:
164166
skipped += 1
165167
continue
166168

167-
cluster_id = cluster.get("extId")
168169
if not cluster_id:
169-
self.check.log.warning("[%s][%s] Cluster has no extId, skipping", self._pc_label, cluster_name)
170+
self.check.log.warning("[%s] Cluster %s has no extId, skipping", self._pc_label, cluster_label)
171+
skipped += 1
172+
continue
173+
174+
if not cluster_name:
175+
self.check.log.warning("[%s] Cluster %s has no name, skipping", self._pc_label, cluster_id)
170176
skipped += 1
171177
continue
172178

@@ -220,29 +226,32 @@ def _report_cluster_metrics(
220226
def _process_vm(self, vm: dict, vm_stats_dict: dict[str, list[dict]], cluster_name: str) -> bool:
221227
"""Report metrics for a single VM if it passes filters."""
222228
vm_id = vm.get("extId")
223-
hostname = vm.get("name")
224-
if not vm_id or not hostname:
225-
self.check.log.debug("[%s][%s] Skipping VM missing extId or name: %r", self._pc_label, cluster_name, vm)
229+
vm_name = vm.get("name")
230+
if not vm_id:
231+
self.check.log.warning("[%s][%s] VM %s has no extId, skipping", self._pc_label, cluster_name, vm_name)
232+
return False
233+
if not vm_name:
234+
self.check.log.warning("[%s][%s] VM %s has no name, skipping", self._pc_label, cluster_name, vm_id)
226235
return False
227236

228237
if not self._should_collect_vm(vm):
229238
return False
230239

231240
vm_tags = self.check.base_tags + self._extract_vm_tags(vm)
232-
self._set_external_tags_for_host(hostname, vm_tags)
233-
self._report_vm_basic_metrics(vm, hostname, vm_tags)
234-
self._report_vm_stats(vm_id, hostname, vm_tags, vm_stats_dict, cluster_name)
241+
self._set_external_tags_for_host(vm_name, vm_tags)
242+
self._report_vm_basic_metrics(vm, vm_name, vm_tags)
243+
self._report_vm_stats(vm_id, vm_name, vm_tags, vm_stats_dict, cluster_name)
235244
return True
236245

237-
def _report_vm_basic_metrics(self, vm: dict, hostname: str, vm_tags: list[str]) -> None:
246+
def _report_vm_basic_metrics(self, vm: dict, vm_name: str, vm_tags: list[str]) -> None:
238247
"""Report basic VM metrics (counts and status)."""
239-
self.check.gauge("vm.count", 1, hostname=hostname, tags=vm_tags)
248+
self.check.gauge("vm.count", 1, hostname=vm_name, tags=vm_tags)
240249

241250
power_state = _normalize_tag_value(vm.get("powerState"))
242251
status_value = 0 if power_state == "on" else 1 if power_state == "paused" else 2
243-
self.check.gauge("vm.status", status_value, hostname=hostname, tags=vm_tags)
252+
self.check.gauge("vm.status", status_value, hostname=vm_name, tags=vm_tags)
244253

245-
self._report_vm_capacity_metrics(vm, hostname, vm_tags)
254+
self._report_vm_capacity_metrics(vm, vm_name, vm_tags)
246255

247256
def _extract_vm_capacity(self, vm: dict) -> tuple[int, int, int, int, int]:
248257
"""Return (sockets, cores_per_socket, threads_per_core, vcpus_allocated, memory_bytes) for a VM."""
@@ -258,19 +267,19 @@ def _extract_vm_disk_capacity_bytes(self, vm: dict) -> int:
258267
int(get_nested(d, "backingInfo/diskSizeBytes") or 0) for d in vm.get("disks") or [] if isinstance(d, dict)
259268
)
260269

261-
def _report_vm_capacity_metrics(self, vm: dict, hostname: str, vm_tags: list[str]) -> None:
270+
def _report_vm_capacity_metrics(self, vm: dict, vm_name: str, vm_tags: list[str]) -> None:
262271
"""Report VM capacity metrics (CPU, memory, and disk allocation)."""
263272
num_sockets, num_cores_per_socket, num_threads_per_core, vcpus_allocated, memory_bytes = (
264273
self._extract_vm_capacity(vm)
265274
)
266275

267-
self.check.gauge("vm.cpu.sockets", num_sockets, hostname=hostname, tags=vm_tags)
268-
self.check.gauge("vm.cpu.cores_per_socket", num_cores_per_socket, hostname=hostname, tags=vm_tags)
269-
self.check.gauge("vm.cpu.threads_per_core", num_threads_per_core, hostname=hostname, tags=vm_tags)
270-
self.check.gauge("vm.cpu.vcpus_allocated", vcpus_allocated, hostname=hostname, tags=vm_tags)
271-
self.check.gauge("vm.memory.allocated_bytes", memory_bytes, hostname=hostname, tags=vm_tags)
276+
self.check.gauge("vm.cpu.sockets", num_sockets, hostname=vm_name, tags=vm_tags)
277+
self.check.gauge("vm.cpu.cores_per_socket", num_cores_per_socket, hostname=vm_name, tags=vm_tags)
278+
self.check.gauge("vm.cpu.threads_per_core", num_threads_per_core, hostname=vm_name, tags=vm_tags)
279+
self.check.gauge("vm.cpu.vcpus_allocated", vcpus_allocated, hostname=vm_name, tags=vm_tags)
280+
self.check.gauge("vm.memory.allocated_bytes", memory_bytes, hostname=vm_name, tags=vm_tags)
272281
self.check.gauge(
273-
"vm.disk_capacity_bytes", self._extract_vm_disk_capacity_bytes(vm), hostname=hostname, tags=vm_tags
282+
"vm.disk_capacity_bytes", self._extract_vm_disk_capacity_bytes(vm), hostname=vm_name, tags=vm_tags
274283
)
275284

276285
def _report_cluster_basic_metrics(self, cluster: dict, cluster_tags: list[str]) -> None:
@@ -357,17 +366,17 @@ def _report_cluster_stats(self, cluster_name: str, cluster_id: str, cluster_tags
357366
)
358367

359368
def _report_vm_stats(
360-
self, vm_id: str, hostname: str, vm_tags: list[str], vm_stats_dict: dict, cluster_name: str
369+
self, vm_id: str, vm_name: str, vm_tags: list[str], vm_stats_dict: dict, cluster_name: str
361370
) -> None:
362371
"""Report time-series stats for a VM."""
363372
stats = vm_stats_dict.get(vm_id)
364373
if stats:
365374
self._report_stats(
366-
f"[{self._pc_label}][{cluster_name}] VM {hostname}",
375+
f"[{self._pc_label}][{cluster_name}] VM {vm_name}",
367376
stats,
368377
VM_STATS_METRICS,
369378
vm_tags,
370-
hostname=hostname,
379+
hostname=vm_name,
371380
)
372381

373382
def _process_hosts(
@@ -409,43 +418,55 @@ def _process_single_host(
409418
self.check.log.warning("[%s][%s] Host %s has no extId, skipping", self._pc_label, cluster_name, host_name)
410419
return
411420

421+
skip_host_metrics = False
422+
if not host_name:
423+
self.check.log.warning(
424+
"[%s][%s] Host %s has no hostName, skipping host metrics",
425+
self._pc_label,
426+
cluster_name,
427+
host_id,
428+
)
429+
skip_host_metrics = True
430+
412431
if not should_collect_resource("host", host, self.check.resource_filters, self.check.log):
413432
return
414433

415-
self.host_count += 1
416-
417-
if host_name:
434+
if not skip_host_metrics:
435+
self.host_count += 1
418436
self.host_names[host_id] = host_name
419437

420-
host_tags = cluster_tags + self._extract_host_tags(host)
421-
self.check.gauge("host.count", 1, hostname=host_name, tags=host_tags)
422-
self._report_host_status_metrics(host, host_name, host_tags)
423-
self._set_external_tags_for_host(host_name, host_tags)
424-
self._report_host_capacity_metrics(host, host_name, host_tags)
438+
host_tags = cluster_tags + self._extract_host_tags(host)
439+
self.check.gauge("host.count", 1, hostname=host_name, tags=host_tags)
440+
self._report_host_status_metrics(host, host_name, host_tags)
441+
self._set_external_tags_for_host(host_name, host_tags)
442+
self._report_host_capacity_metrics(host, host_name, host_tags)
425443

426-
try:
427-
stats = self._get_stats(f"api/clustermgmt/v4.0/stats/clusters/{cluster_id}/hosts/{host_id}")
428-
if stats:
429-
self._report_stats(
430-
f"[{self._pc_label}][{cluster_name}] Host {host_name}",
431-
stats,
432-
HOST_STATS_METRICS,
433-
host_tags,
434-
hostname=host_name,
435-
extra_tags_by_key=self._get_disk_status_storage_tags(host_id),
444+
try:
445+
stats = self._get_stats(f"api/clustermgmt/v4.0/stats/clusters/{cluster_id}/hosts/{host_id}")
446+
if stats:
447+
self._report_stats(
448+
f"[{self._pc_label}][{cluster_name}] Host {host_name}",
449+
stats,
450+
HOST_STATS_METRICS,
451+
host_tags,
452+
hostname=host_name,
453+
extra_tags_by_key=self._get_disk_status_storage_tags(host_id),
454+
)
455+
except Exception:
456+
self.check.log.exception(
457+
"[%s][%s] Failed to fetch stats for host %s", self._pc_label, cluster_name, host_name
436458
)
437-
except Exception:
438-
self.check.log.exception(
439-
"[%s][%s] Failed to fetch stats for host %s", self._pc_label, cluster_name, host_name
440-
)
441459

460+
host_label = host_name or host_id
442461
try:
443462
vms = self._get_vms_for_host(host_id)
444463
except Exception:
445-
self.check.log.exception("[%s][%s] Failed to list VMs for host %s", self._pc_label, cluster_name, host_name)
464+
self.check.log.exception(
465+
"[%s][%s] Failed to list VMs for host %s", self._pc_label, cluster_name, host_label
466+
)
446467
return
447468

448-
self.check.log.debug("[%s][%s] Host %s has %d VMs", self._pc_label, cluster_name, host_name, len(vms))
469+
self.check.log.debug("[%s][%s] Host %s has %d VMs", self._pc_label, cluster_name, host_label, len(vms))
449470
for vm in vms:
450471
if self._process_vm(vm, cluster_vm_stats_dict, cluster_name):
451472
self.vm_count += 1

nutanix/tests/test_clusters.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55

66
import logging
7+
from copy import deepcopy
78

89
import pytest
910

1011
from datadog_checks.nutanix import NutanixCheck
12+
from tests.conftest import load_fixture_page
1113
from tests.constants import BASE_TAGS, CLUSTER_TAGS
1214

1315
pytestmark = [pytest.mark.unit]
@@ -92,6 +94,24 @@ def test_prism_central_cluster_skipped(dd_run_check, aggregator, mock_instance,
9294
assert len(pc_metrics) == 0
9395

9496

97+
def test_cluster_with_no_name_is_skipped(
98+
dd_run_check, aggregator, mock_instance, mock_http_get, mocker, caplog
99+
) -> None:
100+
clusters = deepcopy(load_fixture_page("clusters.json", 0)["data"])
101+
clusters.append({"extId": "no-name-cluster-id", "config": {"clusterFunction": ["AOS"]}})
102+
mocker.patch(
103+
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._list_clusters",
104+
return_value=clusters,
105+
)
106+
107+
check = NutanixCheck('nutanix', {}, [mock_instance])
108+
with caplog.at_level(logging.WARNING):
109+
dd_run_check(check)
110+
111+
assert check.infrastructure_monitor.cluster_count == 2
112+
assert any("no-name-cluster-id" in r.message and "has no name" in r.message for r in caplog.records)
113+
114+
95115
def test_missing_pc_ip_raises_error(dd_run_check):
96116
with pytest.raises(Exception, match="(?s)pc_ip.*required"):
97117
check = NutanixCheck('nutanix', {}, [{"pc_username": "admin", "pc_password": "secret"}])

nutanix/tests/test_vms.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
# All rights reserved
33
# Licensed under a 3-clause BSD style license (see LICENSE)
44

5+
from copy import deepcopy
56

67
import pytest
78

89
from datadog_checks.nutanix import NutanixCheck
10+
from tests.conftest import load_fixture_page
911
from tests.constants import OFF_VM_NAME, OFF_VM_TAGS, PCVM_NAME, PCVM_TAGS
1012
from tests.metrics import VM_STATS_METRICS_REQUIRED
1113

@@ -68,6 +70,98 @@ def test_batch_and_non_batch_produce_same_counts(
6870
aggregator.assert_metric("nutanix.cluster.count", count=2)
6971

7072

73+
@pytest.mark.parametrize("batch_vm_collection", [True, False])
74+
def test_vms_collected_when_host_missing_name(
75+
dd_run_check, aggregator, mock_instance, mock_http_get, mocker, batch_vm_collection
76+
) -> None:
77+
mock_instance["batch_vm_collection"] = batch_vm_collection
78+
79+
hosts = deepcopy(load_fixture_page("hosts_00064715.json", 0)["data"])
80+
hosts[0].pop("hostName")
81+
hosts_by_cluster = {
82+
"00064715-c043-5d8f-ee4b-176ec875554d": hosts,
83+
"aabbccdd-1111-2222-3333-444455556666": deepcopy(load_fixture_page("hosts_aabbccdd.json", 0)["data"]),
84+
}
85+
mocker.patch(
86+
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._list_hosts_by_cluster",
87+
side_effect=lambda cluster_id: hosts_by_cluster[cluster_id],
88+
)
89+
90+
check = NutanixCheck('nutanix', {}, [mock_instance])
91+
dd_run_check(check)
92+
93+
vm_names = {
94+
tag.split(":", 1)[1]
95+
for m in aggregator.metrics("nutanix.vm.count")
96+
for tag in m.tags
97+
if tag.startswith("ntnx_vm_name:")
98+
}
99+
assert {PCVM_NAME, "ubuntu-vm", "random-vm"}.issubset(vm_names)
100+
assert check.infrastructure_monitor.host_count == 1
101+
aggregator.assert_metric("nutanix.vm.count", count=4)
102+
aggregator.assert_metric("nutanix.host.count", count=1)
103+
104+
105+
@pytest.mark.parametrize("batch_vm_collection", [True, False])
106+
def test_vm_with_no_extid_is_skipped(
107+
dd_run_check, aggregator, mock_instance, mock_http_get, mocker, batch_vm_collection
108+
) -> None:
109+
mock_instance["batch_vm_collection"] = batch_vm_collection
110+
111+
all_vms = deepcopy(load_fixture_page("vms.json", 0)["data"])
112+
vms_by_host: dict[str, list] = {}
113+
for vm in all_vms:
114+
host_id = (vm.get("host") or {}).get("extId") or ""
115+
vms_by_host.setdefault(host_id, []).append(vm)
116+
117+
ubuntu_vm = next(v for v in vms_by_host["d8787814-4fe8-4ba5-931f-e1ee31c294a6"] if v.get("name") == "ubuntu-vm")
118+
ubuntu_vm.pop("extId")
119+
120+
mocker.patch(
121+
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._get_vms_for_host",
122+
side_effect=lambda h: vms_by_host.get(h, []),
123+
)
124+
125+
check = NutanixCheck('nutanix', {}, [mock_instance])
126+
dd_run_check(check)
127+
128+
assert check.infrastructure_monitor.vm_count == 3
129+
vm_names = {
130+
tag.split(":", 1)[1]
131+
for m in aggregator.metrics("nutanix.vm.count")
132+
for tag in m.tags
133+
if tag.startswith("ntnx_vm_name:")
134+
}
135+
assert "ubuntu-vm" not in vm_names
136+
137+
138+
@pytest.mark.parametrize("batch_vm_collection", [True, False])
139+
def test_vm_with_no_name_is_skipped(
140+
dd_run_check, aggregator, mock_instance, mock_http_get, mocker, batch_vm_collection
141+
) -> None:
142+
mock_instance["batch_vm_collection"] = batch_vm_collection
143+
144+
all_vms = deepcopy(load_fixture_page("vms.json", 0)["data"])
145+
vms_by_host: dict[str, list] = {}
146+
for vm in all_vms:
147+
host_id = (vm.get("host") or {}).get("extId") or ""
148+
vms_by_host.setdefault(host_id, []).append(vm)
149+
150+
ubuntu_vm = next(v for v in vms_by_host["d8787814-4fe8-4ba5-931f-e1ee31c294a6"] if v.get("name") == "ubuntu-vm")
151+
ubuntu_vm.pop("name")
152+
153+
mocker.patch(
154+
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._get_vms_for_host",
155+
side_effect=lambda h: vms_by_host.get(h, []),
156+
)
157+
158+
check = NutanixCheck('nutanix', {}, [mock_instance])
159+
dd_run_check(check)
160+
161+
assert check.infrastructure_monitor.vm_count == 3
162+
aggregator.assert_metric("nutanix.vm.count", count=3)
163+
164+
71165
def test_external_tags_for_vm(dd_run_check, aggregator, mock_instance, mock_http_get, datadog_agent):
72166
check = NutanixCheck('nutanix', {}, [mock_instance])
73167
dd_run_check(check)

0 commit comments

Comments
 (0)