Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions infiniband/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,23 @@ instances:

See [metadata.csv][7] for a list of metrics provided by this integration.

This integration reads counters from the Linux RDMA/InfiniBand sysfs interface
at `/sys/class/infiniband`. Linux exposes RDMA devices through this interface
even when the port is using Ethernet/RoCE instead of native InfiniBand, so the
check can collect metrics from compatible RDMA NICs in either mode. Use the
`link_layer`, `netdev`, and `gid_type` tags to distinguish native InfiniBand
ports from Ethernet/RoCE-backed ports.

All metrics are tagged with `device` and `port`. When the kernel exposes the
corresponding sysfs files, metrics are also tagged with:

- `link_layer`, for example `link_layer:infiniband` or `link_layer:ethernet`
- `netdev` and `gid_type` from `gid_attrs`, for example `netdev:ens5f0` and `gid_type:roce_v2`
- `firmware_version`, `hca_type`, `board_id`, and `node_type` from device metadata

The check also submits the `infiniband.port.rate` gauge, in bits per second,
parsed from each port's negotiated link rate (the port's sysfs `rate` file).

### Events

The InfiniBand integration does not include any events.
Expand Down
120 changes: 69 additions & 51 deletions infiniband/assets/dashboards/infiniband_overview.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions infiniband/changelog.d/23901.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add port rate, link layer, network device, and device identity metadata to InfiniBand metrics.
114 changes: 102 additions & 12 deletions infiniband/datadog_checks/infiniband/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,28 @@
# Licensed under a 3-clause BSD style license (see LICENSE)
import glob
import os
import re
from typing import Any # noqa: F401

from datadog_checks.base import AgentCheck # noqa: F401

from .metrics import IB_COUNTERS, RDMA_COUNTERS, STATUS_COUNTERS

# (sysfs file name, tag name, strip_prefix) triples read from each device
# directory. When strip_prefix is True, everything up to and including the
# first ":" in the file content is dropped before the value is normalized.
DEVICE_TAG_FILES = (
("fw_ver", "firmware_version", False),
("hca_type", "hca_type", False),
("board_id", "board_id", False),
("node_type", "node_type", True),
)
# Scale factors keyed by the lower-cased unit prefix captured by RATE_RE
# ("" means the rate was already expressed in plain b/sec).
RATE_MULTIPLIERS = {
"": 1,
"k": 1_000,
"m": 1_000_000,
"g": 1_000_000_000,
}
# Matches runs of characters outside [a-z0-9_.-]; each run is replaced with
# "_" when a tag value is normalized.
TAG_VALUE_RE = re.compile(r'[^a-z0-9_.-]+')
# Matches the leading "<number> [k|m|g]b/sec" part of a rate string such as
# "100 Gb/sec (4X EDR)"; anything after the unit is ignored.
RATE_RE = re.compile(r'^\s*(?P<value>\d+(?:\.\d+)?)\s*(?P<prefix>[kmg]?)b/sec\b', re.IGNORECASE)


class InfinibandCheck(AgentCheck):
# This will be the prefix of every metric and service check the integration sends
Expand Down Expand Up @@ -58,17 +74,92 @@ def check(self, _):
self.log.debug("Skipping device %s as it does not have a ports directory", device)
continue

device_tags = self._get_device_tags(os.path.join(self.base_path, device))
for port in os.listdir(dev_path):
self._collect_counters(device, port)
self._collect_counters(device, port, device_tags)

def _collect_counters(self, device, port, device_tags=None):
    """Submit every metric family for one port of one device.

    The tag list is built from the instance tags, the ``device`` and ``port``
    tags, the optional device-level tags, the port's link layer tag (when one
    is returned) and the tags derived from the port's ``gid_attrs`` directory.
    The same list is then handed to each collector.

    Args:
        device: Device directory name under ``self.base_path``.
        port: Port directory name under ``<device>/ports``.
        device_tags: Tags shared by every port of the device. Optional so
            that callers using the two-argument form keep working.
    """
    port_path = os.path.join(self.base_path, device, "ports", port)

    # List concatenation creates a new list, so the appends below never
    # mutate ``self.tags`` or the caller's ``device_tags``.
    tags = self.tags + ["device:" + device, "port:" + port] + list(device_tags or ())

    link_layer_tag = self._get_link_layer_tag(port_path)
    if link_layer_tag:
        tags.append(link_layer_tag)
    tags.extend(self._get_gid_attr_tags(port_path))

    self._collect_port_rate(port_path, tags)
    self._collect_counter_metrics(port_path, tags)
    self._collect_hw_counter_metrics(port_path, tags)
    self._collect_status_metrics(port_path, tags)

def _read_sysfs_file(self, file_path):
    """Return the stripped text content of ``file_path``, or ``None``.

    Any failure to open, read or decode the file is logged at debug level and
    reported as ``None`` so that one unreadable attribute never aborts the
    whole check run.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content with surrounding whitespace removed, or ``None``
        when the file could not be read or decoded.
    """
    try:
        # Pin the encoding so decoding does not depend on the process locale.
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError; without it a
        # file holding non-UTF-8 bytes would raise out of the check.
        self.log.debug("Failed to read value from %s: %s", file_path, e)
        return None

def _normalize_tag_value(self, value, *, strip_prefix=False):
    """Turn raw file content into a tag-safe value.

    Args:
        value: Raw string to normalize.
        strip_prefix: When true and ``value`` contains a ``:``, only the text
            after the first ``:`` is kept.

    Returns:
        The lower-cased value with every run of characters matched by
        ``TAG_VALUE_RE`` replaced by ``_`` and leading/trailing ``_`` removed,
        or ``None`` when nothing is left.
    """
    _, separator, remainder = value.partition(":")
    if strip_prefix and separator:
        value = remainder

    lowered = value.strip().lower()
    cleaned = TAG_VALUE_RE.sub("_", lowered).strip("_")
    return cleaned if cleaned else None

def _append_tag(self, tags, tag):
    """Append ``tag`` to ``tags`` in place unless it is falsy or already present."""
    if not tag or tag in tags:
        return
    tags.append(tag)

def _get_file_tag(self, file_path, tag_name, *, strip_prefix=False):
    """Build a ``tag_name:value`` tag from the content of ``file_path``.

    Args:
        file_path: File whose content supplies the tag value.
        tag_name: Key placed before the ``:`` in the resulting tag.
        strip_prefix: Forwarded to ``_normalize_tag_value``.

    Returns:
        The formatted tag, or ``None`` when the file could not be read or its
        normalized content is empty.
    """
    raw = self._read_sysfs_file(file_path)
    normalized = None if raw is None else self._normalize_tag_value(raw, strip_prefix=strip_prefix)
    return None if normalized is None else f"{tag_name}:{normalized}"

def _get_device_tags(self, device_path):
    """Collect the device-level tags listed in ``DEVICE_TAG_FILES``.

    Args:
        device_path: Directory of the device whose metadata files are read.

    Returns:
        A list of distinct tags, in ``DEVICE_TAG_FILES`` order; files that
        yield no tag are skipped.
    """
    collected = []
    for entry in DEVICE_TAG_FILES:
        sysfs_name, tag_key, drop_prefix = entry
        source = os.path.join(device_path, sysfs_name)
        self._append_tag(collected, self._get_file_tag(source, tag_key, strip_prefix=drop_prefix))
    return collected

def _get_link_layer_tag(self, port_path):
    """Return the ``link_layer`` tag read from the port's ``link_layer`` file, if any."""
    link_layer_file = os.path.join(port_path, "link_layer")
    return self._get_file_tag(link_layer_file, "link_layer")

def _get_gid_attr_tags(self, port_path):
    """Collect ``netdev`` and ``gid_type`` tags from a port's ``gid_attrs`` directory.

    Every entry under ``gid_attrs/ndevs`` is treated as a GID index. For each
    index the ``ndevs/<index>`` file supplies a ``netdev`` tag and the
    ``types/<index>`` file supplies a ``gid_type`` tag. Entries that produce
    no tag are skipped and duplicates are collapsed.

    Args:
        port_path: Directory of the port being inspected.

    Returns:
        A list of distinct tags ordered by GID index; empty when the port has
        no ``gid_attrs/ndevs`` entries.
    """
    gid_attrs_path = os.path.join(port_path, "gid_attrs")

    def _by_gid_index(path):
        # Sort numerically so index 2 precedes index 10; a plain string sort
        # would put "10" first. Non-numeric names sort after the numeric ones.
        index = os.path.basename(path)
        if index.isdecimal():
            return (0, int(index), index)
        return (1, 0, index)

    tags = []
    ndev_paths = glob.glob(os.path.join(gid_attrs_path, "ndevs", "*"))
    for ndev_path in sorted(ndev_paths, key=_by_gid_index):
        gid_index = os.path.basename(ndev_path)
        self._append_tag(tags, self._get_file_tag(ndev_path, "netdev"))
        self._append_tag(tags, self._get_file_tag(os.path.join(gid_attrs_path, "types", gid_index), "gid_type"))
    return tags

def _parse_rate_bits_per_second(self, rate):
    """Convert a rate string into a number of bits per second.

    Args:
        rate: Text matched against ``RATE_RE``.

    Returns:
        The numeric part as a float multiplied by the ``RATE_MULTIPLIERS``
        entry for the lower-cased unit prefix, or ``None`` when ``rate`` does
        not match ``RATE_RE``.
    """
    parsed = RATE_RE.match(rate)
    if parsed is None:
        return None

    number, prefix = parsed.group("value", "prefix")
    return float(number) * RATE_MULTIPLIERS[prefix.lower()]

def _collect_port_rate(self, port_path, tags):
    """Submit the ``port.rate`` gauge for one port.

    Nothing is submitted when the port's ``rate`` file cannot be read. A value
    that cannot be parsed is logged at debug level and skipped.

    Args:
        port_path: Directory of the port whose ``rate`` file is read.
        tags: Tags attached to the submitted gauge.
    """
    rate_file = os.path.join(port_path, "rate")
    raw_rate = self._read_sysfs_file(rate_file)
    if raw_rate is not None:
        bits_per_second = self._parse_rate_bits_per_second(raw_rate)
        if bits_per_second is not None:
            self.gauge("port.rate", bits_per_second, tags)
        else:
            self.log.debug("Failed to parse port rate from %s: %s", port_path, raw_rate)

def _collect_counter_metrics(self, port_path, tags):
counters_path = os.path.join(port_path, "counters")
if not os.path.isdir(counters_path):
Expand Down Expand Up @@ -121,14 +212,13 @@ def _collect_status_metrics(self, port_path, tags):
self.monotonic_count(f"port_{status_file}.count", value, metric_tags)

def _submit_counter_metric(self, file_path, metric_name, tags):
    """Read an integer counter file and submit it per ``self.collection_type``.

    ``gauge`` and ``both`` submit ``metric_name`` as a gauge;
    ``monotonic_count`` and ``both`` submit ``<metric_name>.count`` as a
    monotonic count. Nothing is submitted when the file cannot be read or its
    content is not an integer.

    Args:
        file_path: Counter file to read.
        metric_name: Name of the gauge; the count metric adds ``.count``.
        tags: Tags attached to every submitted metric.
    """
    raw_value = self._read_sysfs_file(file_path)
    if raw_value is None:
        return

    try:
        value = int(raw_value)
    except ValueError:
        # Empty or non-numeric content would otherwise raise out of the check
        # and drop every remaining metric of the run.
        self.log.debug("Failed to parse value from %s: %r", file_path, raw_value)
        return

    if self.collection_type in {'gauge', 'both'}:
        self.gauge(metric_name, value, tags)

    if self.collection_type in {'monotonic_count', 'both'}:
        self.monotonic_count(f"{metric_name}.count", value, tags)
3 changes: 2 additions & 1 deletion infiniband/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ infiniband.multicast_rcv_packets,gauge,,packet,,"Number of multicast packets,inc
infiniband.multicast_rcv_packets.count,count,,packet,,Number of new multicast packets received since the last metric submission (legacy),0,infiniband,,,
infiniband.multicast_xmit_packets,gauge,,packet,,Number of multicast packets transmitted on all Virtual Lanes from the port (legacy),0,infiniband,,,
infiniband.multicast_xmit_packets.count,count,,packet,,Number of new multicast packets transmitted since the last metric submission (legacy),0,infiniband,,,
infiniband.port.rate,gauge,,bit,second,Negotiated port data rate in bits per second,0,infiniband,port rate,,
infiniband.port_multicast_rcv_packets,gauge,,packet,,Number of multicast packets received,0,infiniband,,,
infiniband.port_multicast_rcv_packets.count,count,,packet,,Number of new multicast packets received since the last metric submission,0,infiniband,,,
infiniband.port_multicast_xmit_packets,gauge,,packet,,Number of multicast packets transmitted on all Virtual Lanes from the port,0,infiniband,,,
Expand Down Expand Up @@ -192,4 +193,4 @@ infiniband.symbol_error.count,count,,error,,Number of new minor link errors dete
infiniband.unicast_rcv_packets,gauge,,packet,,"Number of unicast packets,including unicast packets containing errors (legacy)",0,infiniband,,,
infiniband.unicast_rcv_packets.count,count,,packet,,Number of new unicast packets received since the last metric submission (legacy),0,infiniband,,,
infiniband.unicast_xmit_packets,gauge,,packet,,Number of unicast packets transmitted on all Virtual Lanes from the port (legacy),0,infiniband,,,
infiniband.unicast_xmit_packets.count,count,,packet,,Number of new unicast packets transmitted since the last metric submission (legacy),0,infiniband,,,
infiniband.unicast_xmit_packets.count,count,,packet,,Number of new unicast packets transmitted since the last metric submission (legacy),0,infiniband,,,
14 changes: 14 additions & 0 deletions infiniband/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,19 @@
'phys_state': '5: LinkUp',
}

MOCK_RATE_DATA = '100 Gb/sec (4X EDR)'
MOCK_DEVICE_METADATA = {
'fw_ver': '16.35.4030',
'hca_type': 'MT4129',
'board_id': 'MT_0000000438',
'node_type': '1: CA',
}
MOCK_GID_ATTRS = {
'0': {
'netdev': 'ens5f0',
'type': 'RoCE v2',
},
}

MOCK_DEVICE = 'mlx5_0'
MOCK_PORT = '1'
18 changes: 17 additions & 1 deletion infiniband/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@

from .common import (
MOCK_DEVICE,
MOCK_DEVICE_METADATA,
MOCK_GID_ATTRS,
MOCK_IB_COUNTER_DATA,
MOCK_PORT,
MOCK_RATE_DATA,
MOCK_RDMA_COUNTER_DATA,
MOCK_STATUS_DATA,
)
Expand All @@ -24,7 +27,7 @@ def instance():

def _is_valid_directory(path):
# Mocks the directories name to return True so they exist
return any(x in path for x in ['infiniband', 'ports', 'counters', 'hw_counters'])
return any(x in path for x in ['infiniband', 'ports', 'counters', 'hw_counters', 'gid_attrs', 'ndevs', 'types'])


def _get_directory_contents(path):
Expand All @@ -47,18 +50,31 @@ def _get_glob_matches(pattern):
return [os.path.join(base_dir, f) for f in MOCK_IB_COUNTER_DATA.keys()]
elif 'hw_counters/*' in pattern:
return [os.path.join(base_dir, f) for f in MOCK_RDMA_COUNTER_DATA.keys()]
elif 'gid_attrs/ndevs/*' in pattern:
return [os.path.join(base_dir, gid_index) for gid_index in MOCK_GID_ATTRS]
return []


def _get_file_content(filename):
    """Return the mock content for ``filename``.

    Lookup order: any path ending in ``rate``; GID attribute files under
    ``ndevs`` or ``types``; then the counter, status and device metadata
    tables keyed by base name; then ``link_layer``. Anything else reads
    as ``'0'``.
    """
    base_name = os.path.basename(filename)
    parent_name = os.path.basename(os.path.dirname(filename))

    if filename.endswith('rate'):
        return MOCK_RATE_DATA
    if parent_name == 'ndevs' and base_name in MOCK_GID_ATTRS:
        return MOCK_GID_ATTRS[base_name]['netdev']
    if parent_name == 'types' and base_name in MOCK_GID_ATTRS:
        return MOCK_GID_ATTRS[base_name]['type']

    # Mock tables are consulted in a fixed order; the first table that knows
    # the base name wins.
    for table in (MOCK_IB_COUNTER_DATA, MOCK_RDMA_COUNTER_DATA, MOCK_STATUS_DATA, MOCK_DEVICE_METADATA):
        if base_name in table:
            return table[base_name]

    if base_name == 'link_layer':
        return 'InfiniBand'
    return '0'


Expand Down
33 changes: 31 additions & 2 deletions infiniband/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,36 @@ def _assert_metrics(aggregator, metrics, metric_prefix, tags, count=1, m_type='g
)


def _base_tags():
    """Return the tags expected on every metric submitted for the mock port."""
    identity_tags = [f'device:{MOCK_DEVICE}', f'port:{MOCK_PORT}']
    port_tags = ['link_layer:infiniband', 'netdev:ens5f0', 'gid_type:roce_v2']
    device_tags = [
        'firmware_version:16.35.4030',
        'hca_type:mt4129',
        'board_id:mt_0000000438',
        'node_type:ca',
    ]
    return identity_tags + port_tags + device_tags + ['custom:tag']


def test_check(aggregator, instance, mock_fs):
check = InfinibandCheck('infiniband', {}, [instance])
check.check({})

tags = ['device:' + MOCK_DEVICE, 'port:' + MOCK_PORT, 'custom:tag']
tags = _base_tags()

_assert_metrics(aggregator, MOCK_IB_COUNTER_DATA, 'infiniband', tags)
_assert_metrics(aggregator, MOCK_RDMA_COUNTER_DATA, 'infiniband.rdma', tags)
aggregator.assert_metric(
'infiniband.port.rate',
metric_type=aggregator.GAUGE,
value=100_000_000_000,
tags=tags,
count=1,
)

for status_name, status_value in MOCK_STATUS_DATA.items():
value, state_name = status_value.split(':', 1)
Expand Down Expand Up @@ -80,10 +102,17 @@ def test_collection_types(aggregator, mock_fs, collection_type, m_type, count):
check = InfinibandCheck('infiniband', {}, [instance])
check.check({})

tags = ['device:' + MOCK_DEVICE, 'port:' + MOCK_PORT, 'custom:tag']
tags = _base_tags()

_assert_metrics(aggregator, MOCK_IB_COUNTER_DATA, 'infiniband', tags, count=count, m_type=m_type)
_assert_metrics(aggregator, MOCK_RDMA_COUNTER_DATA, 'infiniband.rdma', tags, count=count, m_type=m_type)
aggregator.assert_metric(
'infiniband.port.rate',
metric_type=aggregator.GAUGE,
value=100_000_000_000,
tags=tags,
count=1,
)


def test_exclude_devices(aggregator, mock_fs):
Expand Down
Loading