@@ -18,8 +18,13 @@ package collector
1818
1919import (
2020 "fmt"
21+ "path"
22+ "strings"
23+ "syscall"
2124
25+ dennwc "github.com/dennwc/btrfs"
2226 "github.com/go-kit/log"
27+ "github.com/go-kit/log/level"
2328 "github.com/prometheus/client_golang/prometheus"
2429 "github.com/prometheus/procfs/btrfs"
2530)
@@ -52,34 +57,178 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) {
5257func (c * btrfsCollector ) Update (ch chan <- prometheus.Metric ) error {
5358 stats , err := c .fs .Stats ()
5459 if err != nil {
55- return fmt .Errorf ("failed to retrieve Btrfs stats: %w" , err )
60+ return fmt .Errorf ("failed to retrieve Btrfs stats from procfs: %w" , err )
61+ }
62+
63+ ioctlStatsMap , err := c .getIoctlStats ()
64+ if err != nil {
65+ level .Debug (c .logger ).Log (
66+ "msg" , "Error querying btrfs device stats with ioctl" ,
67+ "err" , err )
68+ ioctlStatsMap = make (map [string ]* btrfsIoctlFsStats )
5669 }
5770
5871 for _ , s := range stats {
59- c .updateBtrfsStats (ch , s )
72+ // match up procfs and ioctl info by filesystem UUID (without dashes)
73+ var fsUUID = strings .Replace (s .UUID , "-" , "" , - 1 )
74+ ioctlStats := ioctlStatsMap [fsUUID ]
75+ c .updateBtrfsStats (ch , s , ioctlStats )
6076 }
6177
6278 return nil
6379}
6480
// btrfsIoctlFsDevStats holds the ioctl-derived information and error counters
// for a single device of a Btrfs filesystem.
type btrfsIoctlFsDevStats struct {
	// path is the device path as reported by the device info ioctl.
	path string
	// uuid is the string form of the device UUID.
	uuid string

	// Space accounting for the device, in bytes.
	bytesUsed, totalBytes uint64

	// The error stats below match the following upstream lists:
	// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
	// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
	writeErrs, readErrs, flushErrs, corruptionErrs, generationErrs uint64
}
97+
98+ type btrfsIoctlFsStats struct {
99+ uuid string
100+ devices []btrfsIoctlFsDevStats
101+ }
102+
103+ func (c * btrfsCollector ) getIoctlStats () (map [string ]* btrfsIoctlFsStats , error ) {
104+ // Instead of introducing more ioctl calls to scan for all btrfs
105+ // filesytems re-use our mount point utils to find known mounts
106+ mountsList , err := mountPointDetails (c .logger )
107+ if err != nil {
108+ return nil , err
109+ }
110+
111+ // track devices we have successfully scanned, by device path
112+ devicesDone := make (map [string ]struct {})
113+ // filesystems scann results by UUID
114+ fsStats := make (map [string ]* btrfsIoctlFsStats )
115+
116+ for _ , mount := range mountsList {
117+ if mount .fsType != "btrfs" {
118+ continue
119+ }
120+
121+ if _ , found := devicesDone [mount .device ]; found {
122+ // We already found this filesystem by another mount point
123+ continue
124+ }
125+
126+ fs , err := dennwc .Open (mount .mountPoint , true )
127+ if err != nil {
128+ // failed to open this mount point, maybe we didn't have permission
129+ // maybe we'll find another mount point for this FS later
130+ level .Debug (c .logger ).Log (
131+ "msg" , "Error inspecting btrfs mountpoint" ,
132+ "mountPoint" , mount .mountPoint ,
133+ "err" , err )
134+ continue
135+ }
136+
137+ fsInfo , err := fs .Info ()
138+ if err != nil {
139+ // Failed to get the FS info for some reason,
140+ // perhaps it'll work with a different mount point
141+ level .Debug (c .logger ).Log (
142+ "msg" , "Error querying btrfs filesystem" ,
143+ "mountPoint" , mount .mountPoint ,
144+ "err" , err )
145+ continue
146+ }
147+
148+ fsID := fsInfo .FSID .String ()
149+ if _ , found := fsStats [fsID ]; found {
150+ // We already found this filesystem by another mount point
151+ continue
152+ }
153+
154+ deviceStats , err := c .getIoctlDeviceStats (fs , & fsInfo )
155+ if err != nil {
156+ level .Debug (c .logger ).Log (
157+ "msg" , "Error querying btrfs device stats" ,
158+ "mountPoint" , mount .mountPoint ,
159+ "err" , err )
160+ continue
161+ }
162+
163+ devicesDone [mount .device ] = struct {}{}
164+ fsStats [fsID ] = & btrfsIoctlFsStats {
165+ uuid : fsID ,
166+ devices : deviceStats ,
167+ }
168+ }
169+
170+ return fsStats , nil
171+ }
172+
173+ func (c * btrfsCollector ) getIoctlDeviceStats (fs * dennwc.FS , fsInfo * dennwc.Info ) ([]btrfsIoctlFsDevStats , error ) {
174+ devices := make ([]btrfsIoctlFsDevStats , 0 , fsInfo .NumDevices )
175+
176+ for i := uint64 (0 ); i <= fsInfo .MaxID ; i ++ {
177+ deviceInfo , err := fs .GetDevInfo (i )
178+
179+ if err != nil {
180+ if errno , ok := err .(syscall.Errno ); ok && errno == syscall .ENODEV {
181+ // device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this
182+ continue
183+ }
184+ return nil , err
185+ }
186+
187+ deviceStats , err := fs .GetDevStats (i )
188+ if err != nil {
189+ return nil , err
190+ }
191+
192+ devices = append (devices , btrfsIoctlFsDevStats {
193+ path : deviceInfo .Path ,
194+ uuid : deviceInfo .UUID .String (),
195+ bytesUsed : deviceInfo .BytesUsed ,
196+ totalBytes : deviceInfo .TotalBytes ,
197+
198+ writeErrs : deviceStats .WriteErrs ,
199+ readErrs : deviceStats .ReadErrs ,
200+ flushErrs : deviceStats .FlushErrs ,
201+ corruptionErrs : deviceStats .CorruptionErrs ,
202+ generationErrs : deviceStats .GenerationErrs ,
203+ })
204+
205+ if uint64 (len (devices )) == fsInfo .NumDevices {
206+ break
207+ }
208+ }
209+
210+ return devices , nil
211+ }
212+
65213// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
66214type btrfsMetric struct {
67215 name string
216+ metricType prometheus.ValueType
68217 desc string
69218 value float64
70219 extraLabel []string
71220 extraLabelValue []string
72221}
73222
74223// updateBtrfsStats collects statistics for one Btrfs filesystem.
75- func (c * btrfsCollector ) updateBtrfsStats (ch chan <- prometheus.Metric , s * btrfs.Stats ) {
224+ func (c * btrfsCollector ) updateBtrfsStats (ch chan <- prometheus.Metric , s * btrfs.Stats , ioctlStats * btrfsIoctlFsStats ) {
76225 const subsystem = "btrfs"
77226
78227 // Basic information about the filesystem.
79228 devLabels := []string {"uuid" }
80229
81230 // Retrieve the metrics.
82- metrics := c .getMetrics (s )
231+ metrics := c .getMetrics (s , ioctlStats )
83232
84233 // Convert all gathered metrics to Prometheus Metrics and add to channel.
85234 for _ , m := range metrics {
@@ -99,46 +248,112 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.
99248
100249 ch <- prometheus .MustNewConstMetric (
101250 desc ,
102- prometheus . GaugeValue ,
251+ m . metricType ,
103252 m .value ,
104253 labelValues ... ,
105254 )
106255 }
107256}
108257
109258// getMetrics returns metrics for the given Btrfs statistics.
110- func (c * btrfsCollector ) getMetrics (s * btrfs.Stats ) []btrfsMetric {
259+ func (c * btrfsCollector ) getMetrics (s * btrfs.Stats , ioctlStats * btrfsIoctlFsStats ) []btrfsMetric {
111260 metrics := []btrfsMetric {
112261 {
113262 name : "info" ,
114263 desc : "Filesystem information" ,
115264 value : 1 ,
265+ metricType : prometheus .GaugeValue ,
116266 extraLabel : []string {"label" },
117267 extraLabelValue : []string {s .Label },
118268 },
119269 {
120- name : "global_rsv_size_bytes" ,
121- desc : "Size of global reserve." ,
122- value : float64 (s .Allocation .GlobalRsvSize ),
270+ name : "global_rsv_size_bytes" ,
271+ desc : "Size of global reserve." ,
272+ metricType : prometheus .GaugeValue ,
273+ value : float64 (s .Allocation .GlobalRsvSize ),
123274 },
124275 }
125276
126- // Information about devices.
127- for n , dev := range s .Devices {
128- metrics = append (metrics , btrfsMetric {
129- name : "device_size_bytes" ,
130- desc : "Size of a device that is part of the filesystem." ,
131- value : float64 (dev .Size ),
132- extraLabel : []string {"device" },
133- extraLabelValue : []string {n },
134- })
135- }
136-
137277 // Information about data, metadata and system data.
138278 metrics = append (metrics , c .getAllocationStats ("data" , s .Allocation .Data )... )
139279 metrics = append (metrics , c .getAllocationStats ("metadata" , s .Allocation .Metadata )... )
140280 metrics = append (metrics , c .getAllocationStats ("system" , s .Allocation .System )... )
141281
282+ // Information about devices.
283+ if ioctlStats == nil {
284+ for n , dev := range s .Devices {
285+ metrics = append (metrics , btrfsMetric {
286+ name : "device_size_bytes" ,
287+ desc : "Size of a device that is part of the filesystem." ,
288+ metricType : prometheus .GaugeValue ,
289+ value : float64 (dev .Size ),
290+ extraLabel : []string {"device" },
291+ extraLabelValue : []string {n },
292+ })
293+ }
294+ return metrics
295+ }
296+
297+ for _ , dev := range ioctlStats .devices {
298+ // trim the path prefix from the device name so the value should match
299+ // the value used in the fallback branch above.
300+ // e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
301+ _ , device := path .Split (dev .path )
302+
303+ extraLabels := []string {"device" , "btrfs_dev_uuid" }
304+ extraLabelValues := []string {device , dev .uuid }
305+
306+ metrics = append (metrics ,
307+ btrfsMetric {
308+ name : "device_size_bytes" ,
309+ desc : "Size of a device that is part of the filesystem." ,
310+ metricType : prometheus .GaugeValue ,
311+ value : float64 (dev .totalBytes ),
312+ extraLabel : extraLabels ,
313+ extraLabelValue : extraLabelValues ,
314+ },
315+ // A bytes available metric is probably more useful than a
316+ // bytes used metric, because large numbers of bytes will
317+ // suffer from floating point representation issues
318+ // and we probably care more about the number when it's low anyway
319+ btrfsMetric {
320+ name : "device_unused_bytes" ,
321+ desc : "Unused bytes unused on a device that is part of the filesystem." ,
322+ metricType : prometheus .GaugeValue ,
323+ value : float64 (dev .totalBytes - dev .bytesUsed ),
324+ extraLabel : extraLabels ,
325+ extraLabelValue : extraLabelValues ,
326+ })
327+
328+ errorLabels := append ([]string {"type" }, extraLabels ... )
329+ values := []uint64 {
330+ dev .writeErrs ,
331+ dev .readErrs ,
332+ dev .flushErrs ,
333+ dev .corruptionErrs ,
334+ dev .generationErrs ,
335+ }
336+ btrfsErrorTypeNames := []string {
337+ "write" ,
338+ "read" ,
339+ "flush" ,
340+ "corruption" ,
341+ "generation" ,
342+ }
343+
344+ for i , errorType := range btrfsErrorTypeNames {
345+ metrics = append (metrics ,
346+ btrfsMetric {
347+ name : "device_errors_total" ,
348+ desc : "Errors reported for the device" ,
349+ metricType : prometheus .CounterValue ,
350+ value : float64 (values [i ]),
351+ extraLabel : errorLabels ,
352+ extraLabelValue : append ([]string {errorType }, extraLabelValues ... ),
353+ })
354+ }
355+ }
356+
142357 return metrics
143358}
144359
@@ -148,6 +363,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats)
148363 {
149364 name : "reserved_bytes" ,
150365 desc : "Amount of space reserved for a data type" ,
366+ metricType : prometheus .GaugeValue ,
151367 value : float64 (s .ReservedBytes ),
152368 extraLabel : []string {"block_group_type" },
153369 extraLabelValue : []string {a },
@@ -168,20 +384,23 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
168384 {
169385 name : "used_bytes" ,
170386 desc : "Amount of used space by a layout/data type" ,
387+ metricType : prometheus .GaugeValue ,
171388 value : float64 (s .UsedBytes ),
172389 extraLabel : []string {"block_group_type" , "mode" },
173390 extraLabelValue : []string {a , l },
174391 },
175392 {
176393 name : "size_bytes" ,
177394 desc : "Amount of space allocated for a layout/data type" ,
395+ metricType : prometheus .GaugeValue ,
178396 value : float64 (s .TotalBytes ),
179397 extraLabel : []string {"block_group_type" , "mode" },
180398 extraLabelValue : []string {a , l },
181399 },
182400 {
183401 name : "allocation_ratio" ,
184402 desc : "Data allocation ratio for a layout/data type" ,
403+ metricType : prometheus .GaugeValue ,
185404 value : s .Ratio ,
186405 extraLabel : []string {"block_group_type" , "mode" },
187406 extraLabelValue : []string {a , l },
0 commit comments