Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 55 additions & 8 deletions pkg/rdmametrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"strings"
"unicode"

rawEthtool "github.com/safchain/ethtool"
"github.com/vishvananda/netlink"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
Expand Down Expand Up @@ -87,7 +88,8 @@ var (
type GetObservable func(string) (metric.Int64ObservableCounter, bool)

type EthtoolImpl struct {
Stats func(netIfName string) ([]oteltype.Metrics, error)
Stats func(netIfName string) ([]oteltype.Metrics, error)
BusInfo func(string) (string, error)
}

type NetlinkImpl struct {
Expand Down Expand Up @@ -134,7 +136,7 @@ func Register(ctx context.Context, meter metric.Meter, cache podownercache.Cache
log: log,
ch: make(chan struct{}, 10),
waitToRegisterMetrics: make(map[string]struct{}),
ethtool: EthtoolImpl{Stats: ethtool.Stats},
ethtool: EthtoolImpl{Stats: ethtool.Stats, BusInfo: rawEthtool.BusInfo},
netlinkImpl: NetlinkImpl{
RdmaLinkList: netlink.RdmaLinkList,
LinkList: netlink.LinkList,
Expand Down Expand Up @@ -247,13 +249,13 @@ func (e *exporter) Callback(ctx context.Context, observer metric.Observer) error
unRegistrationMetric = append(unRegistrationMetric, key)
return nil, false
}
nodeGuidNetDeviceNameMap, err := getNodeGuidNetDeviceNameMap(e.netlinkImpl)
vfToPfNameMap, err := getVfToPfNameMap()
if err != nil {
e.log.Error("failed to get node guid net device name map", zap.Error(err))
return fmt.Errorf("failed to get node guid net device name map: %w", err)
e.log.Error("failed to get vf to pf name map", zap.Error(err))
return fmt.Errorf("failed to get vf to pf name map: %w", err)
}
for _, netns := range list {
if err := e.processNetNS(netns, nodeGuidNetDeviceNameMap, observer, getObservable); err != nil {
if err := e.processNetNS(netns, vfToPfNameMap, observer, getObservable); err != nil {
e.log.Error("failed to process net ns", zap.String("net_ns_id", netns.ID), zap.Error(err))
continue
}
Expand Down Expand Up @@ -297,7 +299,7 @@ func (e *exporter) updateUnregisteredMetrics(unRegistrationMetric []string) {
}

func (e *exporter) processNetNS(netns NetnsItem,
nodeGuidNetDeviceNameMap map[string]string,
vfToPfNameMap map[string]string,
observer metric.Observer, getObservable GetObservable) error {

list := make([]oteltype.Metrics, 0)
Expand Down Expand Up @@ -336,7 +338,11 @@ func (e *exporter) processNetNS(netns NetnsItem,
attribute.String("sys_image_guid", deviceInfo.SysImageGuid),
attribute.Bool("is_root", deviceInfo.IsRoot),
}
if rdmaParentName, ok := nodeGuidNetDeviceNameMap[deviceInfo.SysImageGuid]; ok {
busInfo, err := e.ethtool.BusInfo(deviceInfo.NetDevName)
if err != nil {
return fmt.Errorf("failed to get ethtool bus info: %w", err)
}
if rdmaParentName, ok := vfToPfNameMap[busInfo]; ok {
commonLabels = append(commonLabels, attribute.String("rdma_parent_name", rdmaParentName))
}

Expand Down Expand Up @@ -466,6 +472,47 @@ func getNodeGuidNetDeviceNameMap(nl NetlinkImpl) (map[string]string, error) {
return res, nil
}

func getVfToPfNameMap() (map[string]string, error) {
return getVfToPfNameMapAt("/sys/devices")
}

func getVfToPfNameMapAt(devicesRoot string) (map[string]string, error) {
result := make(map[string]string)

pciDirs, err := filepath.Glob(filepath.Join(devicesRoot, "pci*"))
if err != nil {
return result, fmt.Errorf("failed to list pci devices: %w", err)
}

for _, pciRoot := range pciDirs {
err := filepath.WalkDir(pciRoot, func(path string, d os.DirEntry, err error) error {
if err != nil || d == nil || !d.IsDir() {
return nil
}

physfnPath := filepath.Join(path, "physfn")
if _, err := os.Lstat(physfnPath); err == nil {
vfPci := filepath.Base(path)

pfRealPath, err := filepath.EvalSymlinks(physfnPath)
if err != nil {
return nil
}
pfNetDir := filepath.Join(pfRealPath, "net")
if entries, err := os.ReadDir(pfNetDir); err == nil && len(entries) > 0 {
result[vfPci] = entries[0].Name()
}
}
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to walk pci device %s: %w", pciRoot, err)
}
}

return result, nil
}

func getIfnameNetDevMap(nl NetlinkImpl) (map[string]RDMADevice, error) {
netList, err := nl.LinkList()
if err != nil {
Expand Down
200 changes: 192 additions & 8 deletions pkg/rdmametrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"net"
"os"
"path/filepath"
"reflect"
"runtime"
"testing"
Expand Down Expand Up @@ -689,14 +690,19 @@ func TestProcessNetNS(t *testing.T) {
return toRun()
},
exec: fakeExec,
ethtool: EthtoolImpl{Stats: func(netIfName string) ([]oteltype.Metrics, error) {
return []oteltype.Metrics{
{
Name: "vport_speed",
Value: int64(400000),
},
}, nil
}},
ethtool: EthtoolImpl{
Stats: func(netIfName string) ([]oteltype.Metrics, error) {
return []oteltype.Metrics{
{
Name: "vport_speed",
Value: int64(400000),
},
}, nil
},
BusInfo: func(s string) (string, error) {
return "0000:65:00.0", nil
},
},
log: logutils.Logger.Named("rdma-metrics-exporter"),
}
observer := noop.Observer{}
Expand Down Expand Up @@ -1098,3 +1104,181 @@ func TestExtractSrcIPStringIndex(t *testing.T) {
})
}
}

func TestGetVfToPfNameMapAt(t *testing.T) {
tests := []struct {
name string
setup func(tmpDir string) string
expected map[string]string
}{
{
name: "empty directory",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
_ = os.MkdirAll(devicesRoot, 0755)
return devicesRoot
},
expected: map[string]string{},
},
{
name: "single VF",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")
vfPath := filepath.Join(pciRoot, "0000:00:02.1")
pfPath := filepath.Join(pciRoot, "0000:00:02.0")
pfNetDir := filepath.Join(pfPath, "net")

_ = os.MkdirAll(vfPath, 0755)
_ = os.MkdirAll(pfNetDir, 0755)
_ = os.MkdirAll(filepath.Join(pfNetDir, "eth0"), 0755)
_ = os.Symlink(pfPath, filepath.Join(vfPath, "physfn"))

return devicesRoot
},
expected: map[string]string{"0000:00:02.1": "eth0"},
},
{
name: "multiple VFs and PFs on multiple buses",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")

// 总线1: PF1有2个VF
pci1Root := filepath.Join(devicesRoot, "pci0000:00")
pf1Path := filepath.Join(pci1Root, "0000:00:02.0")
pf1NetDir := filepath.Join(pf1Path, "net")
_ = os.MkdirAll(pf1NetDir, 0755)
_ = os.MkdirAll(filepath.Join(pf1NetDir, "eth0"), 0755)

vf1Path := filepath.Join(pci1Root, "0000:00:02.1")
vf2Path := filepath.Join(pci1Root, "0000:00:02.2")
_ = os.MkdirAll(vf1Path, 0755)
_ = os.MkdirAll(vf2Path, 0755)
_ = os.Symlink(pf1Path, filepath.Join(vf1Path, "physfn"))
_ = os.Symlink(pf1Path, filepath.Join(vf2Path, "physfn"))

// 总线2: PF2有1个VF
pci2Root := filepath.Join(devicesRoot, "pci0001:00")
pf2Path := filepath.Join(pci2Root, "0001:00:03.0")
pf2NetDir := filepath.Join(pf2Path, "net")
_ = os.MkdirAll(pf2NetDir, 0755)
_ = os.MkdirAll(filepath.Join(pf2NetDir, "eth1"), 0755)

vf3Path := filepath.Join(pci2Root, "0001:00:03.1")
_ = os.MkdirAll(vf3Path, 0755)
_ = os.Symlink(pf2Path, filepath.Join(vf3Path, "physfn"))

return devicesRoot
},
expected: map[string]string{
"0000:00:02.1": "eth0",
"0000:00:02.2": "eth0",
"0001:00:03.1": "eth1",
},
},
{
name: "VF without physfn link",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")
devicePath := filepath.Join(pciRoot, "0000:00:02.0")
_ = os.MkdirAll(devicePath, 0755)
return devicesRoot
},
expected: map[string]string{},
},
{
name: "VF with broken symlink",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")
vfPath := filepath.Join(pciRoot, "0000:00:02.1")
_ = os.MkdirAll(vfPath, 0755)
_ = os.Symlink(filepath.Join(tmpDir, "nonexistent"), filepath.Join(vfPath, "physfn"))
return devicesRoot
},
expected: map[string]string{},
},
{
name: "PF without net directory",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")
vfPath := filepath.Join(pciRoot, "0000:00:02.1")
pfPath := filepath.Join(pciRoot, "0000:00:02.0")
_ = os.MkdirAll(vfPath, 0755)
_ = os.MkdirAll(pfPath, 0755)
_ = os.Symlink(pfPath, filepath.Join(vfPath, "physfn"))
return devicesRoot
},
expected: map[string]string{},
},
{
name: "mixed valid and invalid VFs",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")

// 有效的VF
validVfPath := filepath.Join(pciRoot, "0000:00:02.1")
validPfPath := filepath.Join(pciRoot, "0000:00:02.0")
validPfNetDir := filepath.Join(validPfPath, "net")
_ = os.MkdirAll(validVfPath, 0755)
_ = os.MkdirAll(validPfNetDir, 0755)
_ = os.MkdirAll(filepath.Join(validPfNetDir, "eth0"), 0755)
_ = os.Symlink(validPfPath, filepath.Join(validVfPath, "physfn"))

// 无效的VF(损坏的符号链接)
invalidVfPath := filepath.Join(pciRoot, "0000:00:03.1")
_ = os.MkdirAll(invalidVfPath, 0755)
_ = os.Symlink(filepath.Join(tmpDir, "nonexistent"), filepath.Join(invalidVfPath, "physfn"))

return devicesRoot
},
expected: map[string]string{"0000:00:02.1": "eth0"},
},
{
name: "nested PCI directories",
setup: func(tmpDir string) string {
devicesRoot := filepath.Join(tmpDir, "devices")
pciRoot := filepath.Join(devicesRoot, "pci0000:00")
nestedVfPath := filepath.Join(pciRoot, "0000:00:01.0", "0000:01:00.1")
pfPath := filepath.Join(pciRoot, "0000:01:00.0")
pfNetDir := filepath.Join(pfPath, "net")

_ = os.MkdirAll(nestedVfPath, 0755)
_ = os.MkdirAll(pfNetDir, 0755)
_ = os.MkdirAll(filepath.Join(pfNetDir, "eth0"), 0755)
_ = os.Symlink(pfPath, filepath.Join(nestedVfPath, "physfn"))

return devicesRoot
},
expected: map[string]string{"0000:01:00.1": "eth0"},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tmpDir := t.TempDir()
devicesRoot := tt.setup(tmpDir)

result, err := getVfToPfNameMapAt(devicesRoot)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

if len(result) != len(tt.expected) {
t.Fatalf("expected %d entries, got %d. result: %v, expected: %v",
len(tt.expected), len(result), result, tt.expected)
}

for vf, expectedPf := range tt.expected {
if actualPf, ok := result[vf]; !ok {
t.Errorf("expected VF %s in result, but not found", vf)
} else if actualPf != expectedPf {
t.Errorf("VF %s: expected PF %s, got %s", vf, expectedPf, actualPf)
}
}
})
}
}
Loading