Skip to content

Commit 618342b

Browse files
sradcoAI Assistant
andcommitted
collector: add dmmultipath collector for DM-multipath
sysfs metrics

Add a new disabled-by-default collector that reads /sys/block/dm-* to discover Device Mapper multipath devices and expose path health metrics.

Multipath devices are identified by checking that dm/uuid starts with "mpath-", which distinguishes them from LVM or other DM device types.

The path state is reported as-is from /sys/block/<dev>/device/state, supporting both SCSI devices (running, offline, blocked, etc.) and NVMe devices (live, connecting, dead, etc.) without hardcoding a fixed set of states.

All device-level metrics include both the DM friendly name (device) and the kernel block device name (sysfs_name, e.g. dm-0) to enable direct correlation with node_disk_* I/O metrics without recording rules.

No special permissions are required — the collector reads only world-readable sysfs attributes.

Exposed metrics:
- node_dmmultipath_device_info
- node_dmmultipath_device_active
- node_dmmultipath_device_size_bytes
- node_dmmultipath_device_paths
- node_dmmultipath_device_paths_active
- node_dmmultipath_device_paths_failed
- node_dmmultipath_path_state

Signed-off-by: Shirly Radco <sradco@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent 1a4cac6 commit 618342b

File tree

4 files changed

+489
-0
lines changed

4 files changed

+489
-0
lines changed

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinf
191191
cgroups | A summary of the number of active and enabled cgroups | Linux
192192
cpu\_vulnerabilities | Exposes CPU vulnerability information from sysfs. | Linux
193193
devstat | Exposes device statistics | Dragonfly, FreeBSD
194+
dmmultipath | Exposes DM-multipath device and path metrics from `/sys/block/dm-*`. | Linux
194195
drm | Expose GPU metrics using sysfs / DRM, `amdgpu` is the only driver which exposes this information through DRM | Linux
195196
drbd | Exposes Distributed Replicated Block Device statistics (to version 8.4) | Linux
196197
ethtool | Exposes network interface information and network driver statistics equivalent to `ethtool`, `ethtool -S`, and `ethtool -i`. | Linux
@@ -339,6 +340,32 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### DM-Multipath Collector
344+
345+
The `dmmultipath` collector reads `/sys/block/dm-*` to discover Device Mapper
346+
multipath devices and expose path health metrics. It identifies multipath
347+
devices by checking that `dm/uuid` starts with `mpath-`, which distinguishes
348+
them from LVM or other DM device types.
349+
350+
No special permissions are required — the collector reads only world-readable
351+
sysfs attributes.
352+
353+
Enable it with `--collector.dmmultipath`.
354+
355+
#### Exposed metrics
356+
357+
| Metric | Type | Description |
358+
|--------|------|-------------|
359+
| `node_dmmultipath_device_info` | Gauge | Info metric with `device`, `sysfs_name`, and `uuid` (contains WWID for PV correlation). |
360+
| `node_dmmultipath_device_active` | Gauge | Whether the DM device is active (1) or suspended (0). Labels: `device`, `sysfs_name`. |
361+
| `node_dmmultipath_device_size_bytes` | Gauge | Size of the DM device in bytes. Labels: `device`, `sysfs_name`. |
362+
| `node_dmmultipath_device_paths` | Gauge | Number of paths. Labels: `device`, `sysfs_name`. |
363+
| `node_dmmultipath_device_paths_active` | Gauge | Number of paths in active state (SCSI `running` or NVMe `live`). Labels: `device`, `sysfs_name`. |
364+
| `node_dmmultipath_device_paths_failed` | Gauge | Number of paths not in active state. Labels: `device`, `sysfs_name`. |
365+
| `node_dmmultipath_path_state` | Gauge | Reports the underlying device state for each path. Labels: `device`, `path`, `state`. |
366+
367+
The `sysfs_name` label (e.g. `dm-0`) matches the `device` label in `node_disk_*` metrics, enabling direct correlation between multipath health and I/O statistics without recording rules.
368+
342369
### Filtering enabled collectors
343370

344371
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/dmmultipath_linux.go

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nodmmultipath
15+
16+
package collector
17+
18+
import (
19+
"errors"
20+
"fmt"
21+
"log/slog"
22+
"os"
23+
24+
"github.com/prometheus/client_golang/prometheus"
25+
"github.com/prometheus/procfs/blockdevice"
26+
)
27+
28+
// isPathActive reports whether state denotes a healthy, usable path.
// Only "running" (SCSI) and "live" (NVMe) count as active; every other
// value, including the empty string, is treated as not active. The
// comparison is exact: no trimming or case folding is applied.
func isPathActive(state string) bool {
	switch state {
	case "running", "live":
		return true
	default:
		return false
	}
}
33+
34+
type dmMultipathCollector struct {
35+
fs blockdevice.FS
36+
logger *slog.Logger
37+
38+
deviceInfo *prometheus.Desc
39+
deviceActive *prometheus.Desc
40+
deviceSizeBytes *prometheus.Desc
41+
devicePaths *prometheus.Desc
42+
devicePathsActive *prometheus.Desc
43+
devicePathsFailed *prometheus.Desc
44+
pathState *prometheus.Desc
45+
}
46+
47+
func init() {
48+
registerCollector("dmmultipath", defaultDisabled, NewDMMultipathCollector)
49+
}
50+
51+
// NewDMMultipathCollector returns a new Collector exposing Device Mapper
52+
// multipath device metrics from /sys/block/dm-*.
53+
func NewDMMultipathCollector(logger *slog.Logger) (Collector, error) {
54+
const subsystem = "dmmultipath"
55+
56+
fs, err := blockdevice.NewFS(*procPath, *sysPath)
57+
if err != nil {
58+
return nil, fmt.Errorf("failed to open sysfs: %w", err)
59+
}
60+
61+
deviceLabels := []string{"device", "sysfs_name"}
62+
63+
return &dmMultipathCollector{
64+
fs: fs,
65+
logger: logger,
66+
deviceInfo: prometheus.NewDesc(
67+
prometheus.BuildFQName(namespace, subsystem, "device_info"),
68+
"Non-numeric information about a DM-multipath device.",
69+
[]string{"device", "sysfs_name", "uuid"}, nil,
70+
),
71+
deviceActive: prometheus.NewDesc(
72+
prometheus.BuildFQName(namespace, subsystem, "device_active"),
73+
"Whether the multipath device-mapper device is active (1) or suspended (0).",
74+
deviceLabels, nil,
75+
),
76+
deviceSizeBytes: prometheus.NewDesc(
77+
prometheus.BuildFQName(namespace, subsystem, "device_size_bytes"),
78+
"Size of the multipath device in bytes, read from /sys/block/<dm>/size.",
79+
deviceLabels, nil,
80+
),
81+
devicePaths: prometheus.NewDesc(
82+
prometheus.BuildFQName(namespace, subsystem, "device_paths"),
83+
"Number of paths for a multipath device.",
84+
deviceLabels, nil,
85+
),
86+
devicePathsActive: prometheus.NewDesc(
87+
prometheus.BuildFQName(namespace, subsystem, "device_paths_active"),
88+
"Number of paths in active state (SCSI running or NVMe live) for a multipath device.",
89+
deviceLabels, nil,
90+
),
91+
devicePathsFailed: prometheus.NewDesc(
92+
prometheus.BuildFQName(namespace, subsystem, "device_paths_failed"),
93+
"Number of paths not in active state for a multipath device.",
94+
deviceLabels, nil,
95+
),
96+
pathState: prometheus.NewDesc(
97+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
98+
"Reports the underlying device state for a multipath path, as read from /sys/block/<dev>/device/state.",
99+
[]string{"device", "path", "state"}, nil,
100+
),
101+
}, nil
102+
}
103+
104+
func (c *dmMultipathCollector) Update(ch chan<- prometheus.Metric) error {
105+
devices, err := c.fs.DMMultipathDevices()
106+
if err != nil {
107+
if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
108+
c.logger.Debug("Could not read DM-multipath devices", "err", err)
109+
return ErrNoData
110+
}
111+
return fmt.Errorf("failed to scan DM-multipath devices: %w", err)
112+
}
113+
114+
for _, dev := range devices {
115+
ch <- prometheus.MustNewConstMetric(c.deviceInfo, prometheus.GaugeValue, 1,
116+
dev.Name, dev.SysfsName, dev.UUID)
117+
118+
active := 0.0
119+
if !dev.Suspended {
120+
active = 1.0
121+
}
122+
ch <- prometheus.MustNewConstMetric(c.deviceActive, prometheus.GaugeValue, active, dev.Name, dev.SysfsName)
123+
ch <- prometheus.MustNewConstMetric(c.deviceSizeBytes, prometheus.GaugeValue, float64(dev.SizeBytes), dev.Name, dev.SysfsName)
124+
125+
var activePaths, failedPaths float64
126+
for _, p := range dev.Paths {
127+
if isPathActive(p.State) {
128+
activePaths++
129+
} else {
130+
failedPaths++
131+
}
132+
133+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, 1,
134+
dev.Name, p.Device, p.State)
135+
}
136+
137+
ch <- prometheus.MustNewConstMetric(c.devicePaths, prometheus.GaugeValue, float64(len(dev.Paths)), dev.Name, dev.SysfsName)
138+
ch <- prometheus.MustNewConstMetric(c.devicePathsActive, prometheus.GaugeValue, activePaths, dev.Name, dev.SysfsName)
139+
ch <- prometheus.MustNewConstMetric(c.devicePathsFailed, prometheus.GaugeValue, failedPaths, dev.Name, dev.SysfsName)
140+
}
141+
142+
return nil
143+
}
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nodmmultipath
15+
16+
package collector
17+
18+
import (
19+
"io"
20+
"log/slog"
21+
"strings"
22+
"testing"
23+
24+
"github.com/prometheus/client_golang/prometheus"
25+
dto "github.com/prometheus/client_model/go"
26+
)
27+
28+
func TestDMMultipathMetrics(t *testing.T) {
29+
*procPath = "fixtures/proc"
30+
*sysPath = "fixtures/sys"
31+
32+
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
33+
coll, err := NewDMMultipathCollector(logger)
34+
if err != nil {
35+
t.Fatal(err)
36+
}
37+
38+
c := coll.(*dmMultipathCollector)
39+
40+
ch := make(chan prometheus.Metric, 200)
41+
if err := c.Update(ch); err != nil {
42+
t.Fatal(err)
43+
}
44+
close(ch)
45+
46+
metrics := make(map[string][]*dto.Metric)
47+
for m := range ch {
48+
d := &dto.Metric{}
49+
if err := m.Write(d); err != nil {
50+
t.Fatal(err)
51+
}
52+
desc := m.Desc().String()
53+
metrics[desc] = append(metrics[desc], d)
54+
}
55+
56+
assertGaugeValue(t, metrics, "device_active", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 1)
57+
assertGaugeValue(t, metrics, "device_active", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 1)
58+
assertGaugeValue(t, metrics, "device_size_bytes", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 53687091200)
59+
assertGaugeValue(t, metrics, `device_paths"`, labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 4)
60+
assertGaugeValue(t, metrics, `device_paths"`, labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 2)
61+
62+
// mpathA: sdi, sdj, sdk are running; sdl is offline → 3 active, 1 failed.
63+
assertGaugeValue(t, metrics, "device_paths_active", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 3)
64+
assertGaugeValue(t, metrics, "device_paths_failed", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 1)
65+
66+
// mpathB: sdm, sdn are both running → 2 active, 0 failed.
67+
assertGaugeValue(t, metrics, "device_paths_active", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 2)
68+
assertGaugeValue(t, metrics, "device_paths_failed", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 0)
69+
70+
assertGaugeValue(t, metrics, "path_state",
71+
labelMap{"device": "mpathA", "path": "sdi", "state": "running"}, 1)
72+
assertGaugeValue(t, metrics, "path_state",
73+
labelMap{"device": "mpathA", "path": "sdl", "state": "offline"}, 1)
74+
}
75+
76+
func TestDMMultipathNoDevices(t *testing.T) {
77+
*procPath = "fixtures/proc"
78+
*sysPath = t.TempDir()
79+
80+
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
81+
coll, err := NewDMMultipathCollector(logger)
82+
if err != nil {
83+
t.Fatal(err)
84+
}
85+
86+
c := coll.(*dmMultipathCollector)
87+
88+
ch := make(chan prometheus.Metric, 200)
89+
err = c.Update(ch)
90+
close(ch)
91+
92+
if err != ErrNoData {
93+
t.Fatalf("expected ErrNoData, got %v", err)
94+
}
95+
}
96+
97+
func TestIsPathActive(t *testing.T) {
98+
tests := []struct {
99+
state string
100+
active bool
101+
}{
102+
{"running", true},
103+
{"live", true},
104+
{"offline", false},
105+
{"blocked", false},
106+
{"transport-offline", false},
107+
{"dead", false},
108+
{"unknown", false},
109+
{"", false},
110+
}
111+
for _, tc := range tests {
112+
got := isPathActive(tc.state)
113+
if got != tc.active {
114+
t.Errorf("isPathActive(%q) = %v, want %v", tc.state, got, tc.active)
115+
}
116+
}
117+
}
118+
119+
// labelMap maps a label name to the value that label is expected to have.
type labelMap map[string]string
120+
121+
func assertGaugeValue(t *testing.T, metrics map[string][]*dto.Metric, metricSubstring string, labels labelMap, expected float64) {
122+
t.Helper()
123+
for desc, ms := range metrics {
124+
if !strings.Contains(desc, metricSubstring) {
125+
continue
126+
}
127+
for _, m := range ms {
128+
if matchLabels(m.GetLabel(), labels) {
129+
got := m.GetGauge().GetValue()
130+
if got != expected {
131+
t.Errorf("%s%v: got %v, want %v", metricSubstring, labels, got, expected)
132+
}
133+
return
134+
}
135+
}
136+
}
137+
t.Errorf("metric %s%v not found", metricSubstring, labels)
138+
}
139+
140+
func matchLabels(pairs []*dto.LabelPair, want labelMap) bool {
141+
if want == nil {
142+
return len(pairs) == 0
143+
}
144+
found := 0
145+
for _, lp := range pairs {
146+
if v, ok := want[lp.GetName()]; ok && v == lp.GetValue() {
147+
found++
148+
}
149+
}
150+
return found == len(want)
151+
}

0 commit comments

Comments
 (0)