Skip to content

Commit 9e72f4e

Browse files
committed
Add collector for PCIe devices with link information
The link status of PCIe devices sometimes changes: a link may be downgraded in speed or width, or a device may disappear entirely. This patch collects PCIe devices' link information to detect such failures. As a first step, this collector exports the following for each PCIe device:
- Device information (vendor_id, device_id, etc.)
- Parent PCIe device (e.g. PCIe bridge, PCIe switch)
- Link status (max_link_{speed|width}, current_link_{speed|width})

Signed-off-by: Naoki MATSUMOTO <m.naoki9911@gmail.com>
1 parent 7e801c9 commit 9e72f4e

File tree

1 file changed

+135
-0
lines changed

1 file changed

+135
-0
lines changed

collector/pci_device_linux.go

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// Copyright 2017-2019 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nopcidevice
15+
// +build !nopcidevice
16+
17+
package collector
18+
19+
import (
20+
"errors"
21+
"fmt"
22+
"log/slog"
23+
"os"
24+
25+
"github.com/prometheus/client_golang/prometheus"
26+
"github.com/prometheus/procfs/sysfs"
27+
)
28+
29+
type pciDeviceCollector struct {
30+
fs sysfs.FS
31+
metricDescs map[string]*prometheus.Desc
32+
logger *slog.Logger
33+
subsystem string
34+
}
35+
36+
func init() {
37+
registerCollector("pcidevice", defaultEnabled, NewPciDeviceCollector)
38+
}
39+
40+
// NewPciDeviceCollector returns a new Collector exposing PCI devices stats.
41+
func NewPciDeviceCollector(logger *slog.Logger) (Collector, error) {
42+
var i pciDeviceCollector
43+
var err error
44+
45+
i.fs, err = sysfs.NewFS(*sysPath)
46+
if err != nil {
47+
return nil, fmt.Errorf("failed to open sysfs: %w", err)
48+
}
49+
i.logger = logger
50+
51+
// Detailed description for all metrics.
52+
descriptions := map[string]string{
53+
"max_link_speed": "Value of maximum link speed (GT/s)",
54+
"max_link_width": "Value of maximum link width (number of lanes)",
55+
"current_link_speed": "Value of current link speed (GT/s)",
56+
"current_link_width": "Value of current link width (number of lanes)",
57+
}
58+
59+
i.metricDescs = make(map[string]*prometheus.Desc)
60+
i.subsystem = "pcidevice"
61+
62+
for metricName, description := range descriptions {
63+
i.metricDescs[metricName] = prometheus.NewDesc(
64+
prometheus.BuildFQName(namespace, i.subsystem, metricName),
65+
description,
66+
[]string{"segment", "bus", "device", "function"},
67+
nil,
68+
)
69+
}
70+
71+
return &i, nil
72+
}
73+
74+
func (c *pciDeviceCollector) pushMetric(ch chan<- prometheus.Metric, name string, value *float64, location sysfs.PciDeviceLocation, valueType prometheus.ValueType) {
75+
if value != nil {
76+
ch <- prometheus.MustNewConstMetric(c.metricDescs[name], valueType, *value, location.Strings()...)
77+
}
78+
}
79+
80+
func (c *pciDeviceCollector) Update(ch chan<- prometheus.Metric) error {
81+
devices, err := c.fs.PciDevices()
82+
if err != nil {
83+
if errors.Is(err, os.ErrNotExist) {
84+
c.logger.Debug("infiniband statistics not found, skipping")
85+
return ErrNoData
86+
}
87+
return fmt.Errorf("error obtaining InfiniBand class info: %w", err)
88+
}
89+
90+
for _, device := range devices {
91+
92+
// The format follows the definition in drivers/pci/pci-sysfs.c
93+
infos := [][]string{
94+
[]string{"class_id", fmt.Sprintf("0x%06x", device.Class)},
95+
[]string{"vendor_id", fmt.Sprintf("0x%04x", device.Device)},
96+
[]string{"subsystem_vendor_id", fmt.Sprintf("0x%04x", device.SubsystemVendor)},
97+
[]string{"subsystem_device_id", fmt.Sprintf("0x%04x", device.SubsystemDevice)},
98+
[]string{"revision", fmt.Sprintf("0x%02x", device.Revision)},
99+
}
100+
101+
labels := []string{}
102+
values := []string{}
103+
for i := range infos {
104+
labels = append(labels, infos[i][0])
105+
values = append(values, infos[i][1])
106+
}
107+
108+
// The device location is represented in separated format.
109+
labels = append(labels, []string{"segment", "bus", "device", "function"}...)
110+
values = append(values, device.Location.Strings()...)
111+
112+
labels = append(labels, []string{"parent_segment", "parent_bus", "parent_device", "parent_function"}...)
113+
if device.ParentLocation != nil {
114+
values = append(values, device.ParentLocation.Strings()...)
115+
} else {
116+
// TODO: is this ok?
117+
values = append(values, []string{"*", "*", "*", "*"}...)
118+
}
119+
120+
infoDesc := prometheus.NewDesc(
121+
prometheus.BuildFQName(namespace, c.subsystem, "info"),
122+
"Non-numeric data from /sys/bus/pci/devices/<location>, value is always 1.",
123+
labels,
124+
nil,
125+
)
126+
ch <- prometheus.MustNewConstMetric(infoDesc, prometheus.GaugeValue, 1.0, values...)
127+
128+
c.pushMetric(ch, "max_link_speed", device.MaxLinkSpeed, device.Location, prometheus.GaugeValue)
129+
c.pushMetric(ch, "max_link_width", device.MaxLinkWidth, device.Location, prometheus.GaugeValue)
130+
c.pushMetric(ch, "current_link_speed", device.CurrentLinkSpeed, device.Location, prometheus.GaugeValue)
131+
c.pushMetric(ch, "current_link_width", device.CurrentLinkWidth, device.Location, prometheus.GaugeValue)
132+
}
133+
134+
return nil
135+
}

0 commit comments

Comments
 (0)