Skip to content

Commit 0a6cfb4

Browse files
authored
[nutanix] Normalize ntnx_* tags to lowercase with $unknown fallback (DataDog#23609)
* fix(nutanix): normalize all ntnx_* tags to lowercase with $unknown fallback
  Route every enum-backed ntnx_* tag (host_type, hypervisor_type, node_status, plus the previously-handled state tags) through a single _norm_state helper so they all follow one rule: lowercase the API value, fall back to "$unknown" when the source is missing. Picks "$unknown" (the API spec's own sentinel) as the fallback so there's no mismatch between "value present but says $UNKNOWN" and "value missing" — both surface as ntnx_X:$unknown. ntnx_disk_status's "unknown" fallback is updated to "$unknown" for the same reason.
* docs(nutanix): add changelog for tag normalization
* docs(nutanix): shorten changelog to one customer-facing line
* refactor(nutanix): extract tag values into named variables
  Hoist _norm_state and get_nested calls out of f-strings in the tag-extraction helpers. Each tag computation now binds to a named local first, making the read top-down and easier to step through.
* refactor(nutanix): rename _norm_state to _normalize_tag_value
  The helper is used for type, state, mode, and status tags — not just state — so the broader name better describes what it does.
* refactor(nutanix): collapse node_status to one variable, restore tags = []
  Lowercase the node-status comparison sets so the normalized tag value serves both the status_value lookup and the tag emission, removing the need for a separate node_status_tag local. Restore the tags = [] preamble in the tag-extraction helpers since it makes the building intent obvious.
* fix(nutanix): normalize powerState in vm.status gauge
  Match what _report_host_status_metrics does: route the powerState lookup through _normalize_tag_value and lowercase the comparison literals. Removes the asymmetry where vm.status was the only metric still relying on raw uppercase API values. Addresses review feedback on PR DataDog#23609.
* refactor(nutanix): normalize disk statuses in _aggregate_disk_status
  Lowercase DEGRADED_DISK_STATUSES at the constant and route disk ``status`` values through ``_normalize_tag_value`` so the comparison surface matches the rest of the module. Aligns with the convention introduced in ``_report_host_status_metrics``.
* test(nutanix): cover $unknown fallback for host enum tags
  Verify ``ntnx_host_type``, ``ntnx_hypervisor_type``, and ``ntnx_node_status`` emit ``$unknown`` when ``hostType``, ``hypervisor.type``, and ``nodeStatus`` are absent from the host payload, the always-emit behavior introduced in this PR.
* test(nutanix): align unknown-fallback test with repo conventions
1 parent 80bd7d4 commit 0a6cfb4

6 files changed

Lines changed: 90 additions & 62 deletions

File tree

nutanix/changelog.d/23609.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Always emit `ntnx_host_type`, `ntnx_hypervisor_type`, and `ntnx_node_status` tags, with `$unknown` as the fallback when the source field is missing.

nutanix/datadog_checks/nutanix/infrastructure_monitor.py

Lines changed: 38 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,9 @@
2323
from datadog_checks.nutanix.check import NutanixCheck
2424

2525

26-
# Sentinel values from the Nutanix v4.0 specs that should be treated as unknown.
27-
_SENTINEL_STATE_VALUES = frozenset({"$unknown", "$redacted", "undetermined"})
28-
29-
30-
def _norm_state(value: object) -> str:
31-
"""Lowercase ``value``, mapping spec sentinels and missing values to ``unknown``."""
32-
if not isinstance(value, str) or not value:
33-
return "unknown"
34-
normalized = value.lower()
35-
return "unknown" if normalized in _SENTINEL_STATE_VALUES else normalized
26+
def _normalize_tag_value(value: object) -> str:
27+
"""Lowercase ``value``; missing values fall back to ``$unknown`` (the API's spec sentinel)."""
28+
return value.lower() if isinstance(value, str) and value else "$unknown"
3629

3730

3831
@dataclass
@@ -245,8 +238,8 @@ def _report_vm_basic_metrics(self, vm: dict, hostname: str, vm_tags: list[str])
245238
"""Report basic VM metrics (counts and status)."""
246239
self.check.gauge("vm.count", 1, hostname=hostname, tags=vm_tags)
247240

248-
power_state = vm.get("powerState", "$UNKNOWN")
249-
status_value = 0 if power_state == "ON" else 1 if power_state == "PAUSED" else 2
241+
power_state = _normalize_tag_value(vm.get("powerState"))
242+
status_value = 0 if power_state == "on" else 1 if power_state == "paused" else 2
250243
self.check.gauge("vm.status", status_value, hostname=hostname, tags=vm_tags)
251244

252245
self._report_vm_capacity_metrics(vm, hostname, vm_tags)
@@ -477,10 +470,10 @@ def _report_host_capacity_metrics(self, host: dict, hostname: str, host_tags: li
477470

478471
def _report_host_status_metrics(self, host: dict, hostname: str, host_tags: list[str]) -> None:
479472
"""Report host node status as a gauge (0=OK, 1=WARNING, 2=CRITICAL/UNKNOWN)."""
480-
node_status_ok = {"NORMAL", "NEW_NODE", "PREPROTECTED"}
481-
node_status_warning = {"TO_BE_PREPROTECTED", "TO_BE_REMOVED", "OK_TO_BE_REMOVED"}
473+
node_status_ok = {"normal", "new_node", "preprotected"}
474+
node_status_warning = {"to_be_preprotected", "to_be_removed", "ok_to_be_removed"}
482475

483-
node_status = host.get("nodeStatus", "$UNKNOWN")
476+
node_status = _normalize_tag_value(host.get("nodeStatus"))
484477

485478
if node_status in node_status_ok:
486479
status_value = 0
@@ -494,70 +487,59 @@ def _report_host_status_metrics(self, host: dict, hostname: str, host_tags: list
494487

495488
def _extract_host_tags(self, host: dict) -> list[str]:
496489
"""Extract tags from a host object."""
497-
tags = []
490+
host_name = host.get("hostName")
491+
host_type = _normalize_tag_value(host.get("hostType"))
492+
maintenance_state = _normalize_tag_value(host.get("maintenanceState"))
493+
hypervisor_name = get_nested(host, "hypervisor/fullName")
494+
hypervisor_type = _normalize_tag_value(get_nested(host, "hypervisor/type"))
495+
connection_state = _normalize_tag_value(get_nested(host, "hypervisor/acropolisConnectionState"))
498496

497+
tags = []
499498
tags.append("ntnx_type:host")
500-
501-
if host_name := host.get("hostName"):
499+
if host_name:
502500
tags.append(f"ntnx_host_name:{host_name}")
503-
504-
if host_type := host.get("hostType"):
505-
tags.append(f"ntnx_host_type:{host_type}")
506-
507-
tags.append(f"ntnx_maintenance_state:{_norm_state(host.get('maintenanceState'))}")
508-
509-
# hypervisor tags
510-
if hypervisor_name := get_nested(host, "hypervisor/fullName"):
501+
tags.append(f"ntnx_host_type:{host_type}")
502+
tags.append(f"ntnx_maintenance_state:{maintenance_state}")
503+
if hypervisor_name:
511504
tags.append(f"ntnx_hypervisor_name:{hypervisor_name}")
512-
if hypervisor_type := get_nested(host, "hypervisor/type"):
513-
tags.append(f"ntnx_hypervisor_type:{hypervisor_type}")
514-
tags.append(f"ntnx_connection_state:{_norm_state(get_nested(host, 'hypervisor/acropolisConnectionState'))}")
515-
516-
# Add category tags
505+
tags.append(f"ntnx_hypervisor_type:{hypervisor_type}")
506+
tags.append(f"ntnx_connection_state:{connection_state}")
517507
tags.extend(self.check.extract_category_tags(host))
518508

519509
return tags
520510

521511
def _extract_cluster_tags(self, cluster: dict) -> list[str]:
522512
"""Extract tags from a cluster object."""
523-
tags = []
524-
525513
cluster_name = cluster.get("name")
514+
operation_mode = _normalize_tag_value(get_nested(cluster, "config/operationMode"))
515+
516+
tags = []
526517
if cluster_name:
527518
tags.append(f"ntnx_cluster_name:{cluster_name}")
528-
529-
tags.append(f"ntnx_operation_mode:{_norm_state(get_nested(cluster, 'config/operationMode'))}")
530-
531-
# Add category tags
519+
tags.append(f"ntnx_operation_mode:{operation_mode}")
532520
tags.extend(self.check.extract_category_tags(cluster))
533521

534522
return tags
535523

536524
def _extract_vm_tags(self, vm: dict) -> list[str]:
537525
"""Extract tags from a VM object."""
538-
tags = []
526+
vm_name = vm.get("name")
527+
host_id = get_nested(vm, "host/extId")
528+
cluster_id = get_nested(vm, "cluster/extId")
529+
is_agent_vm = is_affirmative(vm.get("isAgentVm"))
530+
power_state = _normalize_tag_value(vm.get("powerState"))
539531

532+
tags = []
540533
tags.append("ntnx_type:vm")
541-
542-
vm_name = vm.get("name")
543534
if vm_name:
544535
tags.append(f"ntnx_vm_name:{vm_name}")
545-
546-
# Add category tags
547536
tags.extend(self.check.extract_category_tags(vm))
548-
549-
host_id = get_nested(vm, "host/extId")
550537
if host_id and host_id in self.host_names:
551538
tags.append(f"ntnx_host_name:{self.host_names[host_id]}")
552-
553-
cluster_id = get_nested(vm, "cluster/extId")
554539
if cluster_id and cluster_id in self.cluster_names:
555540
tags.append(f"ntnx_cluster_name:{self.cluster_names[cluster_id]}")
556-
557-
is_agent_vm = is_affirmative(vm.get("isAgentVm"))
558541
tags.append(f"ntnx_is_agent_vm:{is_agent_vm}")
559-
560-
tags.append(f"ntnx_power_state:{_norm_state(vm.get('powerState'))}")
542+
tags.append(f"ntnx_power_state:{power_state}")
561543

562544
return tags
563545

@@ -623,18 +605,19 @@ def _build_disks_by_host_cache(self) -> None:
623605
self._disks_by_host.setdefault(node_id, []).append(disk)
624606

625607
def _aggregate_disk_status(self, disks: list[dict]) -> str:
626-
"""Return the worst disk status across ``disks``: degraded > normal > unknown."""
627-
statuses = {d.get("status") for d in disks if d.get("status")}
608+
"""Return the worst disk status across ``disks``: degraded > normal > $unknown."""
609+
statuses = {_normalize_tag_value(d.get("status")) for d in disks if d.get("status")}
628610
if statuses & DEGRADED_DISK_STATUSES:
629611
return "degraded"
630-
if "NORMAL" in statuses:
612+
if "normal" in statuses:
631613
return "normal"
632-
return "unknown"
614+
return "$unknown"
633615

634616
def _get_disk_status_storage_tags(self, host_id: str) -> dict[str, list[str]]:
635617
"""Return per-key extra tags adding ``ntnx_disk_status`` on host storage_* metrics."""
636618
status = self._aggregate_disk_status(self._disks_by_host.get(host_id, []))
637-
return {key: [f"ntnx_disk_status:{status}"] for key in HOST_STORAGE_STAT_KEYS}
619+
disk_status_tag = f"ntnx_disk_status:{status}"
620+
return {key: [disk_status_tag] for key in HOST_STORAGE_STAT_KEYS}
638621

639622
def _build_stats_params(self) -> dict[str, str | int]:
640623
"""Build the common query parameters for stats API calls."""

nutanix/datadog_checks/nutanix/metrics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
# Disk status enum values that aggregate to ntnx_disk_status:degraded.
1717
DEGRADED_DISK_STATUSES: frozenset[str] = frozenset(
1818
{
19-
"MARKED_FOR_REMOVAL_BUT_NOT_DETACHABLE",
20-
"DATA_MIGRATION_INITIATED",
21-
"DETACHABLE",
19+
"marked_for_removal_but_not_detachable",
20+
"data_migration_initiated",
21+
"detachable",
2222
}
2323
)
2424

nutanix/tests/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929
'ntnx_cluster_name:datadog-nutanix-dev',
3030
'ntnx_connection_state:connected',
3131
'ntnx_host_name:10-0-0-103-aws-us-east-1a',
32-
'ntnx_host_type:HYPER_CONVERGED',
32+
'ntnx_host_type:hyper_converged',
3333
'ntnx_hypervisor_name:AHV 10.3',
34-
'ntnx_hypervisor_type:AHV',
34+
'ntnx_hypervisor_type:ahv',
3535
'ntnx_maintenance_state:normal',
3636
'ntnx_operation_mode:normal',
3737
'ntnx_type:host',

nutanix/tests/test_disk_status.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_disks_endpoint_failure_falls_back_to_unknown(dd_run_check, aggregator,
2222
aggregator.assert_metric(
2323
"nutanix.host.storage_capacity",
2424
at_least=1,
25-
tags=HOST_TAGS + ['ntnx_disk_status:unknown'],
25+
tags=HOST_TAGS + ['ntnx_disk_status:$unknown'],
2626
hostname=HOST_NAME,
2727
)
2828

nutanix/tests/test_hosts.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,51 @@ def test_host_status_metrics(dd_run_check, aggregator, mock_instance, mock_http_
4646
dd_run_check(check)
4747

4848
aggregator.assert_metric(
49-
"nutanix.host.status", value=0, tags=HOST_TAGS + ['ntnx_node_status:NORMAL'], hostname=HOST_NAME
49+
"nutanix.host.status", value=0, tags=HOST_TAGS + ['ntnx_node_status:normal'], hostname=HOST_NAME
50+
)
51+
52+
53+
def test_host_tags_fall_back_to_unknown_when_source_fields_missing(
54+
dd_run_check, aggregator, mock_instance, mock_http_get, mocker
55+
):
56+
"""When hostType, hypervisor.type, or nodeStatus are missing, tags emit ``$unknown``."""
57+
cluster_id = "00064715-c043-5d8f-ee4b-176ec875554d"
58+
sparse_host = {
59+
"extId": "d8787814-4fe8-4ba5-931f-e1ee31c294a6",
60+
"hostName": HOST_NAME,
61+
"hypervisor": {"fullName": "AHV 10.3"},
62+
"cluster": {"uuid": cluster_id},
63+
}
64+
mocker.patch(
65+
"datadog_checks.nutanix.infrastructure_monitor.InfrastructureMonitor._list_hosts_by_cluster",
66+
side_effect=lambda cid: [sparse_host] if cid == cluster_id else [],
67+
)
68+
check = NutanixCheck('nutanix', {}, [mock_instance])
69+
dd_run_check(check)
70+
71+
expected_tags = [
72+
'Team:agent-integrations',
73+
'cluster_category:cluster_value1',
74+
'cluster_category:cluster_value2',
75+
'cluster_category:cluster_value3',
76+
'ntnx_cluster_name:datadog-nutanix-dev',
77+
'ntnx_connection_state:$unknown',
78+
f'ntnx_host_name:{HOST_NAME}',
79+
'ntnx_host_type:$unknown',
80+
'ntnx_hypervisor_name:AHV 10.3',
81+
'ntnx_hypervisor_type:$unknown',
82+
'ntnx_maintenance_state:$unknown',
83+
'ntnx_operation_mode:normal',
84+
'ntnx_type:host',
85+
'nutanix',
86+
'prism_central:10.0.0.197',
87+
]
88+
aggregator.assert_metric("nutanix.host.count", value=1, tags=expected_tags, hostname=HOST_NAME)
89+
aggregator.assert_metric(
90+
"nutanix.host.status",
91+
value=2,
92+
tags=expected_tags + ['ntnx_node_status:$unknown'],
93+
hostname=HOST_NAME,
5094
)
5195

5296

0 commit comments

Comments
 (0)