Skip to content

Commit fab5e77

Browse files
committed
add EFA metrics
infiniband: add EFA collector unit tests; docs: document EFA support in the infiniband collector. Signed-off-by: Yizheng Jiao <jyizheng@gmail.com>
1 parent 735124b commit fab5e77

3 files changed

Lines changed: 823 additions & 0 deletions

File tree

collector/infiniband_linux.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,23 @@ import (
2020
"fmt"
2121
"log/slog"
2222
"os"
23+
"path/filepath"
2324
"strconv"
25+
"strings"
2426

2527
"github.com/prometheus/client_golang/prometheus"
2628
"github.com/prometheus/procfs/sysfs"
2729
)
2830

31+
// efaVendorID is the PCI vendor ID for AWS Elastic Fabric Adapter.
32+
// EFA devices register under /sys/class/infiniband but do NOT populate the
33+
// IB-spec port_xmit_data / port_rcv_data counters. Bytes/packets live in
34+
// hw_counters/{tx,rx}_{bytes,pkts} as raw values (no IB 4-octet scaling).
35+
// isEFADevice compares this string exactly (case-sensitively) against the
// whitespace-trimmed contents of the device's sysfs device/vendor file.
const efaVendorID = "0x1d0f"
36+
2937
type infinibandCollector struct {
3038
fs sysfs.FS
39+
sysPath string
3140
metricDescs map[string]*prometheus.Desc
3241
logger *slog.Logger
3342
subsystem string
@@ -46,6 +55,7 @@ func NewInfiniBandCollector(logger *slog.Logger) (Collector, error) {
4655
if err != nil {
4756
return nil, fmt.Errorf("failed to open sysfs: %w", err)
4857
}
58+
i.sysPath = *sysPath
4959
i.logger = logger
5060

5161
// Detailed description for all metrics.
@@ -115,6 +125,21 @@ func NewInfiniBandCollector(logger *slog.Logger) (Collector, error) {
115125
"rx_read_requests_total": "The number of received READ requests for the associated QPs.",
116126
"rx_write_requests_total": "The number of received WRITE requests for the associated QPs.",
117127
"rx_icrc_encapsulated_errors_total": "The number of RoCE packets with ICRC errors. This counter was added in MLNX_OFED 4.4 and kernel 4.19",
128+
129+
// EFA-specific hw_counters (vendor 0x1d0f). EFA NICs do not follow the
130+
// IB spec for port_xmit_data / port_rcv_data, so the IB code path leaves
131+
// port_data_*_bytes_total empty. The EFA branch in Update() fills those
132+
// from hw_counters/{tx,rx}_bytes and additionally emits the diagnostic
133+
// counters listed here under the efa_ prefix to avoid clashing with the
134+
// Mellanox-specific hw_counters above.
135+
"efa_rx_drops_total": "EFA: packets dropped on receive (hw_counters/rx_drops).",
136+
"efa_retrans_packets_total": "EFA: retransmitted packets (hw_counters/retrans_pkts).",
137+
"efa_retrans_bytes_total": "EFA: retransmitted bytes (hw_counters/retrans_bytes).",
138+
"efa_retrans_timeout_events_total": "EFA: retransmit timeout events (hw_counters/retrans_timeout_events).",
139+
"efa_unresponsive_remote_events_total": "EFA: unresponsive remote events (hw_counters/unresponsive_remote_events).",
140+
"efa_impaired_remote_conn_events_total": "EFA: impaired remote connection events (hw_counters/impaired_remote_conn_events).",
141+
"efa_rdma_read_bytes_total": "EFA: RDMA read bytes (hw_counters/rdma_read_bytes).",
142+
"efa_rdma_write_bytes_total": "EFA: RDMA write bytes (hw_counters/rdma_write_bytes).",
118143
}
119144

120145
i.metricDescs = make(map[string]*prometheus.Desc)
@@ -142,6 +167,45 @@ func (c *infinibandCollector) pushCounter(ch chan<- prometheus.Metric, name stri
142167
}
143168
}
144169

170+
// isEFADevice reports whether the InfiniBand-class device is an AWS EFA NIC
171+
// by checking its PCI vendor ID (0x1d0f). EFA NICs register under
172+
// /sys/class/infiniband but do not follow the IB spec for byte/packet
173+
// counters, so they need a different read path (hw_counters/).
174+
func (c *infinibandCollector) isEFADevice(deviceName string) bool {
175+
path := filepath.Join(c.sysPath, "class", "infiniband", deviceName, "device", "vendor")
176+
data, err := os.ReadFile(path)
177+
if err != nil {
178+
return false
179+
}
180+
return strings.TrimSpace(string(data)) == efaVendorID
181+
}
182+
183+
// readEFAHWCounter reads a single uint64 counter from
184+
// /sys/class/infiniband/<device>/ports/<port>/hw_counters/<counter>.
185+
// Returns nil if the file is missing or unparseable, so pushCounter can skip
186+
// emitting absent series.
187+
func (c *infinibandCollector) readEFAHWCounter(deviceName string, port uint, counter string) *uint64 {
188+
path := filepath.Join(c.sysPath, "class", "infiniband", deviceName,
189+
"ports", strconv.FormatUint(uint64(port), 10), "hw_counters", counter)
190+
data, err := os.ReadFile(path)
191+
if err != nil {
192+
return nil
193+
}
194+
v, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
195+
if err != nil {
196+
c.logger.Debug("failed to parse EFA hw_counter",
197+
"path", path, "err", err)
198+
return nil
199+
}
200+
return &v
201+
}
202+
203+
// pushEFACounter is a convenience wrapper that reads a single hw_counter and
204+
// pushes it as a Prometheus counter if present.
205+
func (c *infinibandCollector) pushEFACounter(ch chan<- prometheus.Metric, metricName, counterFile, deviceName string, port uint, portStr string) {
206+
c.pushCounter(ch, metricName, c.readEFAHWCounter(deviceName, port, counterFile), deviceName, portStr)
207+
}
208+
145209
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
146210
devices, err := c.fs.InfiniBandClass()
147211
if err != nil {
@@ -162,13 +226,42 @@ func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
162226
infoValue := 1.0
163227
ch <- prometheus.MustNewConstMetric(infoDesc, prometheus.GaugeValue, infoValue, device.Name, device.BoardID, device.FirmwareVersion, device.HCAType)
164228

229+
// EFA NICs share /sys/class/infiniband layout with IB but use
230+
// hw_counters/ for byte/packet stats (raw values, no IB ×4 scaling).
231+
// Detect once per device to avoid stat'ing /sys repeatedly per port.
232+
isEFA := c.isEFADevice(device.Name)
233+
165234
for _, port := range device.Ports {
166235
portStr := strconv.FormatUint(uint64(port.Port), 10)
167236

168237
c.pushMetric(ch, "state_id", uint64(port.StateID), port.Name, portStr, prometheus.GaugeValue)
169238
c.pushMetric(ch, "physical_state_id", uint64(port.PhysStateID), port.Name, portStr, prometheus.GaugeValue)
170239
c.pushMetric(ch, "rate_bytes_per_second", port.Rate, port.Name, portStr, prometheus.GaugeValue)
171240

241+
if isEFA {
242+
// EFA path: port.Counters (from procfs/sysfs IB-spec parser)
243+
// is empty/zero, so we read hw_counters/ directly and emit
244+
// under the existing port_data_* / port_packets_* metric
245+
// names so existing IB dashboards transparently see EFA data.
246+
c.pushEFACounter(ch, "port_data_transmitted_bytes_total", "tx_bytes", port.Name, port.Port, portStr)
247+
c.pushEFACounter(ch, "port_data_received_bytes_total", "rx_bytes", port.Name, port.Port, portStr)
248+
c.pushEFACounter(ch, "port_packets_transmitted_total", "tx_pkts", port.Name, port.Port, portStr)
249+
c.pushEFACounter(ch, "port_packets_received_total", "rx_pkts", port.Name, port.Port, portStr)
250+
251+
// EFA-only diagnostic counters — emitted under efa_* names to
252+
// avoid colliding with IB-spec semantics. Useful for tracking
253+
// fabric retransmissions and unresponsive peers.
254+
c.pushEFACounter(ch, "efa_rx_drops_total", "rx_drops", port.Name, port.Port, portStr)
255+
c.pushEFACounter(ch, "efa_retrans_packets_total", "retrans_pkts", port.Name, port.Port, portStr)
256+
c.pushEFACounter(ch, "efa_retrans_bytes_total", "retrans_bytes", port.Name, port.Port, portStr)
257+
c.pushEFACounter(ch, "efa_retrans_timeout_events_total", "retrans_timeout_events", port.Name, port.Port, portStr)
258+
c.pushEFACounter(ch, "efa_unresponsive_remote_events_total", "unresponsive_remote_events", port.Name, port.Port, portStr)
259+
c.pushEFACounter(ch, "efa_impaired_remote_conn_events_total", "impaired_remote_conn_events", port.Name, port.Port, portStr)
260+
c.pushEFACounter(ch, "efa_rdma_read_bytes_total", "rdma_read_bytes", port.Name, port.Port, portStr)
261+
c.pushEFACounter(ch, "efa_rdma_write_bytes_total", "rdma_write_bytes", port.Name, port.Port, portStr)
262+
continue
263+
}
264+
172265
c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr)
173266
c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr)
174267
c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr)

0 commit comments

Comments
 (0)