Skip to content

Commit 812baac

Browse files
sradco and AI Assistant committed
collector: add dmmultipath collector for DM-multipath sysfs metrics
Add a new disabled-by-default collector that reads /sys/block/dm-* to discover Device Mapper multipath devices and expose path health metrics.

Multipath devices are identified by checking that dm/uuid starts with "mpath-", which distinguishes them from LVM or other DM device types.

The path state is reported as-is from /sys/block/<dev>/device/state, supporting both SCSI devices (running, offline, blocked, etc.) and NVMe devices (live, connecting, dead, etc.) without hardcoding a fixed set of states.

No special permissions are required — the collector reads only world-readable sysfs attributes.

Exposed metrics:
- node_dmmultipath_device_info
- node_dmmultipath_device_active
- node_dmmultipath_device_size_bytes
- node_dmmultipath_device_paths_{total,active,failed}
- node_dmmultipath_path_state

Signed-off-by: Shirly Radco <sradco@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent a1cbf81 commit 812baac

File tree

4 files changed

+647
-0
lines changed

4 files changed

+647
-0
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinf
191191
cgroups | A summary of the number of active and enabled cgroups | Linux
192192
cpu\_vulnerabilities | Exposes CPU vulnerability information from sysfs. | Linux
193193
devstat | Exposes device statistics | Dragonfly, FreeBSD
194+
dmmultipath | Exposes DM-multipath device and path metrics from `/sys/block/dm-*`. | Linux
194195
drm | Expose GPU metrics using sysfs / DRM, `amdgpu` is the only driver which exposes this information through DRM | Linux
195196
drbd | Exposes Distributed Replicated Block Device statistics (to version 8.4) | Linux
196197
ethtool | Exposes network interface information and network driver statistics equivalent to `ethtool`, `ethtool -S`, and `ethtool -i`. | Linux
@@ -339,6 +340,30 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### DM-Multipath Collector
344+
345+
The `dmmultipath` collector reads `/sys/block/dm-*` to discover Device Mapper
346+
multipath devices and expose path health metrics. It identifies multipath
347+
devices by checking that `dm/uuid` starts with `mpath-`, which distinguishes
348+
them from LVM or other DM device types.
349+
350+
No special permissions are required — the collector reads only world-readable
351+
sysfs attributes.
352+
353+
Enable it with `--collector.dmmultipath`.
354+
355+
#### Exposed metrics
356+
357+
| Metric | Type | Description |
358+
|--------|------|-------------|
359+
| `node_dmmultipath_device_info` | Gauge | Info metric with device name, sysfs name, and UUID (contains WWID for PV correlation). |
360+
| `node_dmmultipath_device_active` | Gauge | Whether the DM device is active (1) or suspended (0). |
361+
| `node_dmmultipath_device_size_bytes` | Gauge | Size of the DM device in bytes. |
362+
| `node_dmmultipath_device_paths_total` | Gauge | Total number of paths. |
363+
| `node_dmmultipath_device_paths_active` | Gauge | Number of paths in active state (SCSI `running` or NVMe `live`). |
364+
| `node_dmmultipath_device_paths_failed` | Gauge | Number of paths not in active state. |
365+
| `node_dmmultipath_path_state` | Gauge | Reports the underlying device state for each path, as read from `/sys/block/<dev>/device/state`. |
366+
342367
### Filtering enabled collectors
343368

344369
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/dmmultipath_linux.go

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nodmmultipath
15+
16+
package collector
17+
18+
import (
19+
"fmt"
20+
"log/slog"
21+
"os"
22+
"path/filepath"
23+
"strconv"
24+
"strings"
25+
26+
"github.com/prometheus/client_golang/prometheus"
27+
)
28+
29+
type dmMultipathDevice struct {
30+
Name string
31+
SysfsName string
32+
UUID string
33+
Suspended bool
34+
SizeBytes uint64
35+
Paths []dmMultipathPath
36+
}
37+
38+
// dmMultipathPath is a single underlying block device (path) of a
// DM-multipath device.
type dmMultipathPath struct {
	// Device is the block device name as listed in the slaves directory.
	Device string
	// State is the content of <sysfs>/block/<Device>/device/state, or
	// "unknown" when that attribute is empty or unreadable.
	State string
}
42+
43+
// isPathActive reports whether state names a healthy, usable path.
// Two values qualify: "running" (SCSI devices) and "live" (NVMe devices).
// The comparison is exact, so any other string — including the empty
// string — is treated as not active.
func isPathActive(state string) bool {
	switch state {
	case "running", "live":
		return true
	default:
		return false
	}
}
48+
49+
type dmMultipathCollector struct {
50+
logger *slog.Logger
51+
scanDevices func() ([]dmMultipathDevice, error)
52+
53+
deviceInfo *prometheus.Desc
54+
deviceActive *prometheus.Desc
55+
deviceSizeBytes *prometheus.Desc
56+
devicePathsTotal *prometheus.Desc
57+
devicePathsActive *prometheus.Desc
58+
devicePathsFailed *prometheus.Desc
59+
pathState *prometheus.Desc
60+
}
61+
62+
func init() {
63+
registerCollector("dmmultipath", defaultDisabled, NewDMMultipathCollector)
64+
}
65+
66+
// NewDMMultipathCollector returns a new Collector exposing Device Mapper
67+
// multipath device metrics from /sys/block/dm-*.
68+
func NewDMMultipathCollector(logger *slog.Logger) (Collector, error) {
69+
const subsystem = "dmmultipath"
70+
71+
deviceLabels := []string{"device"}
72+
73+
c := &dmMultipathCollector{
74+
logger: logger,
75+
deviceInfo: prometheus.NewDesc(
76+
prometheus.BuildFQName(namespace, subsystem, "device_info"),
77+
"Non-numeric information about a DM-multipath device.",
78+
[]string{"device", "sysfs_name", "uuid"}, nil,
79+
),
80+
deviceActive: prometheus.NewDesc(
81+
prometheus.BuildFQName(namespace, subsystem, "device_active"),
82+
"Whether the multipath device-mapper device is active (1) or suspended (0).",
83+
deviceLabels, nil,
84+
),
85+
deviceSizeBytes: prometheus.NewDesc(
86+
prometheus.BuildFQName(namespace, subsystem, "device_size_bytes"),
87+
"Size of the multipath device in bytes, read from /sys/block/<dm>/size.",
88+
deviceLabels, nil,
89+
),
90+
devicePathsTotal: prometheus.NewDesc(
91+
prometheus.BuildFQName(namespace, subsystem, "device_paths_total"),
92+
"Total number of paths for a multipath device.",
93+
deviceLabels, nil,
94+
),
95+
devicePathsActive: prometheus.NewDesc(
96+
prometheus.BuildFQName(namespace, subsystem, "device_paths_active"),
97+
"Number of paths in active state (SCSI running or NVMe live) for a multipath device.",
98+
deviceLabels, nil,
99+
),
100+
devicePathsFailed: prometheus.NewDesc(
101+
prometheus.BuildFQName(namespace, subsystem, "device_paths_failed"),
102+
"Number of paths not in active state for a multipath device.",
103+
deviceLabels, nil,
104+
),
105+
pathState: prometheus.NewDesc(
106+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
107+
"Reports the underlying device state for a multipath path, as read from /sys/block/<dev>/device/state.",
108+
[]string{"device", "path", "state"}, nil,
109+
),
110+
}
111+
112+
c.scanDevices = func() ([]dmMultipathDevice, error) {
113+
return scanDMMultipathDevices(*sysPath)
114+
}
115+
116+
return c, nil
117+
}
118+
119+
func (c *dmMultipathCollector) Update(ch chan<- prometheus.Metric) error {
120+
devices, err := c.scanDevices()
121+
if err != nil {
122+
return fmt.Errorf("failed to scan DM-multipath devices: %w", err)
123+
}
124+
125+
for _, dev := range devices {
126+
ch <- prometheus.MustNewConstMetric(c.deviceInfo, prometheus.GaugeValue, 1,
127+
dev.Name, dev.SysfsName, dev.UUID)
128+
129+
active := 0.0
130+
if !dev.Suspended {
131+
active = 1.0
132+
}
133+
ch <- prometheus.MustNewConstMetric(c.deviceActive, prometheus.GaugeValue, active, dev.Name)
134+
ch <- prometheus.MustNewConstMetric(c.deviceSizeBytes, prometheus.GaugeValue, float64(dev.SizeBytes), dev.Name)
135+
136+
var activePaths, failedPaths float64
137+
for _, p := range dev.Paths {
138+
if isPathActive(p.State) {
139+
activePaths++
140+
} else {
141+
failedPaths++
142+
}
143+
144+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, 1,
145+
dev.Name, p.Device, p.State)
146+
}
147+
148+
ch <- prometheus.MustNewConstMetric(c.devicePathsTotal, prometheus.GaugeValue, float64(len(dev.Paths)), dev.Name)
149+
ch <- prometheus.MustNewConstMetric(c.devicePathsActive, prometheus.GaugeValue, activePaths, dev.Name)
150+
ch <- prometheus.MustNewConstMetric(c.devicePathsFailed, prometheus.GaugeValue, failedPaths, dev.Name)
151+
}
152+
153+
return nil
154+
}
155+
156+
// scanDMMultipathDevices discovers DM-multipath devices by scanning
157+
// /sys/block/dm-* and filtering on dm/uuid prefix "mpath-".
158+
func scanDMMultipathDevices(sysfsBase string) ([]dmMultipathDevice, error) {
159+
blockDir := filepath.Join(sysfsBase, "block")
160+
161+
entries, err := os.ReadDir(blockDir)
162+
if err != nil {
163+
return nil, err
164+
}
165+
166+
var devices []dmMultipathDevice
167+
for _, entry := range entries {
168+
if !strings.HasPrefix(entry.Name(), "dm-") {
169+
continue
170+
}
171+
172+
dmDir := filepath.Join(blockDir, entry.Name())
173+
uuid := readBlockAttr(filepath.Join(dmDir, "dm", "uuid"))
174+
if !strings.HasPrefix(uuid, "mpath-") {
175+
continue
176+
}
177+
178+
name := readBlockAttr(filepath.Join(dmDir, "dm", "name"))
179+
if name == "" {
180+
name = entry.Name()
181+
}
182+
183+
suspended := readBlockAttr(filepath.Join(dmDir, "dm", "suspended")) == "1"
184+
185+
var sizeBytes uint64
186+
if sectors, err := strconv.ParseUint(readBlockAttr(filepath.Join(dmDir, "size")), 10, 64); err == nil {
187+
sizeBytes = sectors * uint64(unixSectorSize)
188+
}
189+
190+
paths := scanDMPaths(sysfsBase, filepath.Join(dmDir, "slaves"))
191+
192+
devices = append(devices, dmMultipathDevice{
193+
Name: name,
194+
SysfsName: entry.Name(),
195+
UUID: uuid,
196+
Suspended: suspended,
197+
SizeBytes: sizeBytes,
198+
Paths: paths,
199+
})
200+
}
201+
202+
return devices, nil
203+
}
204+
205+
func scanDMPaths(sysfsBase, slavesDir string) []dmMultipathPath {
206+
entries, err := os.ReadDir(slavesDir)
207+
if err != nil {
208+
return nil
209+
}
210+
211+
var paths []dmMultipathPath
212+
for _, entry := range entries {
213+
state := readBlockAttr(filepath.Join(sysfsBase, "block", entry.Name(), "device", "state"))
214+
if state == "" {
215+
state = "unknown"
216+
}
217+
paths = append(paths, dmMultipathPath{
218+
Device: entry.Name(),
219+
State: state,
220+
})
221+
}
222+
223+
return paths
224+
}
225+
226+
// readBlockAttr returns the whitespace-trimmed content of the sysfs
// attribute file at path. Read errors are deliberately not reported: any
// failure (missing file, permission denied, ...) yields the empty string,
// so callers cannot distinguish an empty attribute from an unreadable one.
func readBlockAttr(path string) string {
	if raw, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}

0 commit comments

Comments
 (0)