Skip to content

Commit 135710d

Browse files
authored
Add btrfs device error stats (#2193)
* Improve metrics filesystem scanning logic * Makes ioctl syscalls to load the device error stats. * Adds filesystem mountpoint labels to existing metrics for ease of use. Signed-off-by: Marcus Cobden <leth@users.noreply.github.com>
1 parent d7e89e7 commit 135710d

File tree

4 files changed

+251
-27
lines changed

4 files changed

+251
-27
lines changed

collector/btrfs_linux.go

Lines changed: 239 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,13 @@ package collector
1818

1919
import (
2020
"fmt"
21+
"path"
22+
"strings"
23+
"syscall"
2124

25+
dennwc "github.com/dennwc/btrfs"
2226
"github.com/go-kit/log"
27+
"github.com/go-kit/log/level"
2328
"github.com/prometheus/client_golang/prometheus"
2429
"github.com/prometheus/procfs/btrfs"
2530
)
@@ -52,34 +57,178 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) {
5257
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error {
5358
stats, err := c.fs.Stats()
5459
if err != nil {
55-
return fmt.Errorf("failed to retrieve Btrfs stats: %w", err)
60+
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err)
61+
}
62+
63+
ioctlStatsMap, err := c.getIoctlStats()
64+
if err != nil {
65+
level.Debug(c.logger).Log(
66+
"msg", "Error querying btrfs device stats with ioctl",
67+
"err", err)
68+
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats)
5669
}
5770

5871
for _, s := range stats {
59-
c.updateBtrfsStats(ch, s)
72+
// match up procfs and ioctl info by filesystem UUID (without dashes)
73+
var fsUUID = strings.Replace(s.UUID, "-", "", -1)
74+
ioctlStats := ioctlStatsMap[fsUUID]
75+
c.updateBtrfsStats(ch, s, ioctlStats)
6076
}
6177

6278
return nil
6379
}
6480

81+
// btrfsIoctlFsDevStats holds the values gathered through ioctl calls for a
// single device belonging to a Btrfs filesystem.
type btrfsIoctlFsDevStats struct {
	// Device path and device UUID.
	path, uuid string

	// Space accounting reported for the device.
	bytesUsed, totalBytes uint64

	// The error stats below match the following upstream lists:
	// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
	// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
	writeErrs, readErrs, flushErrs, corruptionErrs, generationErrs uint64
}
97+
98+
type btrfsIoctlFsStats struct {
99+
uuid string
100+
devices []btrfsIoctlFsDevStats
101+
}
102+
103+
func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) {
104+
// Instead of introducing more ioctl calls to scan for all btrfs
105+
// filesytems re-use our mount point utils to find known mounts
106+
mountsList, err := mountPointDetails(c.logger)
107+
if err != nil {
108+
return nil, err
109+
}
110+
111+
// track devices we have successfully scanned, by device path
112+
devicesDone := make(map[string]struct{})
113+
// filesystems scann results by UUID
114+
fsStats := make(map[string]*btrfsIoctlFsStats)
115+
116+
for _, mount := range mountsList {
117+
if mount.fsType != "btrfs" {
118+
continue
119+
}
120+
121+
if _, found := devicesDone[mount.device]; found {
122+
// We already found this filesystem by another mount point
123+
continue
124+
}
125+
126+
fs, err := dennwc.Open(mount.mountPoint, true)
127+
if err != nil {
128+
// failed to open this mount point, maybe we didn't have permission
129+
// maybe we'll find another mount point for this FS later
130+
level.Debug(c.logger).Log(
131+
"msg", "Error inspecting btrfs mountpoint",
132+
"mountPoint", mount.mountPoint,
133+
"err", err)
134+
continue
135+
}
136+
137+
fsInfo, err := fs.Info()
138+
if err != nil {
139+
// Failed to get the FS info for some reason,
140+
// perhaps it'll work with a different mount point
141+
level.Debug(c.logger).Log(
142+
"msg", "Error querying btrfs filesystem",
143+
"mountPoint", mount.mountPoint,
144+
"err", err)
145+
continue
146+
}
147+
148+
fsID := fsInfo.FSID.String()
149+
if _, found := fsStats[fsID]; found {
150+
// We already found this filesystem by another mount point
151+
continue
152+
}
153+
154+
deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo)
155+
if err != nil {
156+
level.Debug(c.logger).Log(
157+
"msg", "Error querying btrfs device stats",
158+
"mountPoint", mount.mountPoint,
159+
"err", err)
160+
continue
161+
}
162+
163+
devicesDone[mount.device] = struct{}{}
164+
fsStats[fsID] = &btrfsIoctlFsStats{
165+
uuid: fsID,
166+
devices: deviceStats,
167+
}
168+
}
169+
170+
return fsStats, nil
171+
}
172+
173+
func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) {
174+
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices)
175+
176+
for i := uint64(0); i <= fsInfo.MaxID; i++ {
177+
deviceInfo, err := fs.GetDevInfo(i)
178+
179+
if err != nil {
180+
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV {
181+
// device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this
182+
continue
183+
}
184+
return nil, err
185+
}
186+
187+
deviceStats, err := fs.GetDevStats(i)
188+
if err != nil {
189+
return nil, err
190+
}
191+
192+
devices = append(devices, btrfsIoctlFsDevStats{
193+
path: deviceInfo.Path,
194+
uuid: deviceInfo.UUID.String(),
195+
bytesUsed: deviceInfo.BytesUsed,
196+
totalBytes: deviceInfo.TotalBytes,
197+
198+
writeErrs: deviceStats.WriteErrs,
199+
readErrs: deviceStats.ReadErrs,
200+
flushErrs: deviceStats.FlushErrs,
201+
corruptionErrs: deviceStats.CorruptionErrs,
202+
generationErrs: deviceStats.GenerationErrs,
203+
})
204+
205+
if uint64(len(devices)) == fsInfo.NumDevices {
206+
break
207+
}
208+
}
209+
210+
return devices, nil
211+
}
212+
65213
// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
66214
type btrfsMetric struct {
67215
name string
216+
metricType prometheus.ValueType
68217
desc string
69218
value float64
70219
extraLabel []string
71220
extraLabelValue []string
72221
}
73222

74223
// updateBtrfsStats collects statistics for one Btrfs filesystem.
75-
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats) {
224+
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) {
76225
const subsystem = "btrfs"
77226

78227
// Basic information about the filesystem.
79228
devLabels := []string{"uuid"}
80229

81230
// Retrieve the metrics.
82-
metrics := c.getMetrics(s)
231+
metrics := c.getMetrics(s, ioctlStats)
83232

84233
// Convert all gathered metrics to Prometheus Metrics and add to channel.
85234
for _, m := range metrics {
@@ -99,46 +248,112 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.
99248

100249
ch <- prometheus.MustNewConstMetric(
101250
desc,
102-
prometheus.GaugeValue,
251+
m.metricType,
103252
m.value,
104253
labelValues...,
105254
)
106255
}
107256
}
108257

109258
// getMetrics returns metrics for the given Btrfs statistics.
110-
func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric {
259+
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric {
111260
metrics := []btrfsMetric{
112261
{
113262
name: "info",
114263
desc: "Filesystem information",
115264
value: 1,
265+
metricType: prometheus.GaugeValue,
116266
extraLabel: []string{"label"},
117267
extraLabelValue: []string{s.Label},
118268
},
119269
{
120-
name: "global_rsv_size_bytes",
121-
desc: "Size of global reserve.",
122-
value: float64(s.Allocation.GlobalRsvSize),
270+
name: "global_rsv_size_bytes",
271+
desc: "Size of global reserve.",
272+
metricType: prometheus.GaugeValue,
273+
value: float64(s.Allocation.GlobalRsvSize),
123274
},
124275
}
125276

126-
// Information about devices.
127-
for n, dev := range s.Devices {
128-
metrics = append(metrics, btrfsMetric{
129-
name: "device_size_bytes",
130-
desc: "Size of a device that is part of the filesystem.",
131-
value: float64(dev.Size),
132-
extraLabel: []string{"device"},
133-
extraLabelValue: []string{n},
134-
})
135-
}
136-
137277
// Information about data, metadata and system data.
138278
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...)
139279
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...)
140280
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...)
141281

282+
// Information about devices.
283+
if ioctlStats == nil {
284+
for n, dev := range s.Devices {
285+
metrics = append(metrics, btrfsMetric{
286+
name: "device_size_bytes",
287+
desc: "Size of a device that is part of the filesystem.",
288+
metricType: prometheus.GaugeValue,
289+
value: float64(dev.Size),
290+
extraLabel: []string{"device"},
291+
extraLabelValue: []string{n},
292+
})
293+
}
294+
return metrics
295+
}
296+
297+
for _, dev := range ioctlStats.devices {
298+
// trim the path prefix from the device name so the value should match
299+
// the value used in the fallback branch above.
300+
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
301+
_, device := path.Split(dev.path)
302+
303+
extraLabels := []string{"device", "btrfs_dev_uuid"}
304+
extraLabelValues := []string{device, dev.uuid}
305+
306+
metrics = append(metrics,
307+
btrfsMetric{
308+
name: "device_size_bytes",
309+
desc: "Size of a device that is part of the filesystem.",
310+
metricType: prometheus.GaugeValue,
311+
value: float64(dev.totalBytes),
312+
extraLabel: extraLabels,
313+
extraLabelValue: extraLabelValues,
314+
},
315+
// A bytes available metric is probably more useful than a
316+
// bytes used metric, because large numbers of bytes will
317+
// suffer from floating point representation issues
318+
// and we probably care more about the number when it's low anyway
319+
btrfsMetric{
320+
name: "device_unused_bytes",
321+
desc: "Unused bytes unused on a device that is part of the filesystem.",
322+
metricType: prometheus.GaugeValue,
323+
value: float64(dev.totalBytes - dev.bytesUsed),
324+
extraLabel: extraLabels,
325+
extraLabelValue: extraLabelValues,
326+
})
327+
328+
errorLabels := append([]string{"type"}, extraLabels...)
329+
values := []uint64{
330+
dev.writeErrs,
331+
dev.readErrs,
332+
dev.flushErrs,
333+
dev.corruptionErrs,
334+
dev.generationErrs,
335+
}
336+
btrfsErrorTypeNames := []string{
337+
"write",
338+
"read",
339+
"flush",
340+
"corruption",
341+
"generation",
342+
}
343+
344+
for i, errorType := range btrfsErrorTypeNames {
345+
metrics = append(metrics,
346+
btrfsMetric{
347+
name: "device_errors_total",
348+
desc: "Errors reported for the device",
349+
metricType: prometheus.CounterValue,
350+
value: float64(values[i]),
351+
extraLabel: errorLabels,
352+
extraLabelValue: append([]string{errorType}, extraLabelValues...),
353+
})
354+
}
355+
}
356+
142357
return metrics
143358
}
144359

@@ -148,6 +363,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats)
148363
{
149364
name: "reserved_bytes",
150365
desc: "Amount of space reserved for a data type",
366+
metricType: prometheus.GaugeValue,
151367
value: float64(s.ReservedBytes),
152368
extraLabel: []string{"block_group_type"},
153369
extraLabelValue: []string{a},
@@ -168,20 +384,23 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
168384
{
169385
name: "used_bytes",
170386
desc: "Amount of used space by a layout/data type",
387+
metricType: prometheus.GaugeValue,
171388
value: float64(s.UsedBytes),
172389
extraLabel: []string{"block_group_type", "mode"},
173390
extraLabelValue: []string{a, l},
174391
},
175392
{
176393
name: "size_bytes",
177394
desc: "Amount of space allocated for a layout/data type",
395+
metricType: prometheus.GaugeValue,
178396
value: float64(s.TotalBytes),
179397
extraLabel: []string{"block_group_type", "mode"},
180398
extraLabelValue: []string{a, l},
181399
},
182400
{
183401
name: "allocation_ratio",
184402
desc: "Data allocation ratio for a layout/data type",
403+
metricType: prometheus.GaugeValue,
185404
value: s.Ratio,
186405
extraLabel: []string{"block_group_type", "mode"},
187406
extraLabelValue: []string{a, l},

0 commit comments

Comments
 (0)