Skip to content

Commit e4311d8

Browse files
flbla and agaudreault authored
feat: add name and labels in cluster metrics (#17870) (#18453)
Signed-off-by: flbla <[email protected]>
Signed-off-by: Alexandre Gaudreault <[email protected]>
Co-authored-by: Alexandre Gaudreault <[email protected]>
1 parent e147247 commit e4311d8

File tree

9 files changed

+200
-39
lines changed

9 files changed

+200
-39
lines changed

cmd/argocd-application-controller/commands/argocd_application_controller.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ func NewCommand() *cobra.Command {
7272
metricsCacheExpiration time.Duration
7373
metricsAplicationLabels []string
7474
metricsAplicationConditions []string
75+
metricsClusterLabels []string
7576
kubectlParallelismLimit int64
7677
cacheSource func() (*appstatecache.Cache, error)
7778
redisClient *redis.Client
@@ -202,6 +203,7 @@ func NewCommand() *cobra.Command {
202203
metricsCacheExpiration,
203204
metricsAplicationLabels,
204205
metricsAplicationConditions,
206+
metricsClusterLabels,
205207
kubectlParallelismLimit,
206208
persistResourceHealth,
207209
clusterSharding,
@@ -272,6 +274,7 @@ func NewCommand() *cobra.Command {
272274
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
273275
command.Flags().StringSliceVar(&metricsAplicationLabels, "metrics-application-labels", []string{}, "List of Application labels that will be added to the argocd_application_labels metric")
274276
command.Flags().StringSliceVar(&metricsAplicationConditions, "metrics-application-conditions", []string{}, "List of Application conditions that will be added to the argocd_application_conditions metric")
277+
command.Flags().StringSliceVar(&metricsClusterLabels, "metrics-cluster-labels", []string{}, "List of Cluster labels that will be added to the argocd_cluster_labels metric")
275278
command.Flags().StringVar(&otlpAddress, "otlp-address", env.StringFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_ADDRESS", ""), "OpenTelemetry collector address to send traces to")
276279
command.Flags().BoolVar(&otlpInsecure, "otlp-insecure", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_INSECURE", true), "OpenTelemetry collector insecure mode")
277280
command.Flags().StringToStringVar(&otlpHeaders, "otlp-headers", env.ParseStringToStringFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_HEADERS", map[string]string{}, ","), "List of OpenTelemetry collector extra headers sent with traces, headers are comma-separated key-value pairs(e.g. key1=value1,key2=value2)")

controller/appcontroller.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -139,6 +139,7 @@ type ApplicationController struct {
139139
refreshRequestedApps map[string]CompareWith
140140
refreshRequestedAppsMutex *sync.Mutex
141141
metricsServer *metrics.MetricsServer
142+
metricsClusterLabels []string
142143
kubectlSemaphore *semaphore.Weighted
143144
clusterSharding sharding.ClusterShardingCache
144145
projByNameCache sync.Map
@@ -173,6 +174,7 @@ func NewApplicationController(
173174
metricsCacheExpiration time.Duration,
174175
metricsApplicationLabels []string,
175176
metricsApplicationConditions []string,
177+
metricsClusterLabels []string,
176178
kubectlParallelismLimit int64,
177179
persistResourceHealth bool,
178180
clusterSharding sharding.ClusterShardingCache,
@@ -218,6 +220,7 @@ func NewApplicationController(
218220
applicationNamespaces: applicationNamespaces,
219221
dynamicClusterDistributionEnabled: dynamicClusterDistributionEnabled,
220222
ignoreNormalizerOpts: ignoreNormalizerOpts,
223+
metricsClusterLabels: metricsClusterLabels,
221224
}
222225
if hydratorEnabled {
223226
ctrl.hydrator = hydrator.NewHydrator(&ctrl, appResyncPeriod, commitClientset)
@@ -857,8 +860,8 @@ func (ctrl *ApplicationController) Run(ctx context.Context, statusProcessors int
857860
defer ctrl.appHydrateQueue.ShutDown()
858861
defer ctrl.hydrationQueue.ShutDown()
859862

860-
ctrl.metricsServer.RegisterClustersInfoSource(ctx, ctrl.stateCache)
861863
ctrl.RegisterClusterSecretUpdater(ctx)
864+
ctrl.metricsServer.RegisterClustersInfoSource(ctx, ctrl.stateCache, ctrl.db, ctrl.metricsClusterLabels)
862865

863866
go ctrl.appInformer.Run(ctx.Done())
864867
go ctrl.projInformer.Run(ctx.Done())

controller/appcontroller_test.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,7 @@ func newFakeControllerWithResync(data *fakeData, appResyncPeriod time.Duration,
177177
data.metricsCacheExpiration,
178178
[]string{},
179179
[]string{},
180+
[]string{},
180181
0,
181182
true,
182183
nil,

controller/metrics/clustercollector.go

Lines changed: 118 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -6,21 +6,27 @@ import (
66
"time"
77

88
"github.com/argoproj/gitops-engine/pkg/cache"
9-
109
"github.com/prometheus/client_golang/prometheus"
10+
log "github.com/sirupsen/logrus"
11+
12+
argoappv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
13+
metricsutil "github.com/argoproj/argo-cd/v3/util/metrics"
1114
)
1215

1316
const (
1417
metricsCollectionInterval = 30 * time.Second
18+
metricsCollectionTimeout = 10 * time.Second
1519
)
1620

1721
var (
1822
descClusterDefaultLabels = []string{"server"}
1923

24+
descClusterLabels *prometheus.Desc
25+
2026
descClusterInfo = prometheus.NewDesc(
2127
"argocd_cluster_info",
2228
"Information about cluster.",
23-
append(descClusterDefaultLabels, "k8s_version"),
29+
append(descClusterDefaultLabels, "k8s_version", "name"),
2430
nil,
2531
)
2632
descClusterCacheResources = prometheus.NewDesc(
@@ -53,26 +59,99 @@ type HasClustersInfo interface {
5359
GetClustersInfo() []cache.ClusterInfo
5460
}
5561

62+
type ClusterLister func(ctx context.Context) (*argoappv1.ClusterList, error)
63+
5664
type clusterCollector struct {
57-
infoSource HasClustersInfo
58-
info []cache.ClusterInfo
59-
lock sync.Mutex
65+
infoSource HasClustersInfo
66+
lock sync.RWMutex
67+
clusterLabels []string
68+
clusterLister ClusterLister
69+
70+
latestInfo []*clusterData
71+
}
72+
73+
type clusterData struct {
74+
info *cache.ClusterInfo
75+
cluster *argoappv1.Cluster
6076
}
6177

62-
func (c *clusterCollector) Run(ctx context.Context) {
78+
func NewClusterCollector(ctx context.Context, source HasClustersInfo, clusterLister ClusterLister, clusterLabels []string) prometheus.Collector {
79+
if len(clusterLabels) > 0 {
80+
normalizedClusterLabels := metricsutil.NormalizeLabels("label", clusterLabels)
81+
descClusterLabels = prometheus.NewDesc(
82+
"argocd_cluster_labels",
83+
"Argo Cluster labels converted to Prometheus labels",
84+
append(append(descClusterDefaultLabels, "name"), normalizedClusterLabels...),
85+
nil,
86+
)
87+
}
88+
89+
collector := &clusterCollector{
90+
infoSource: source,
91+
clusterLabels: clusterLabels,
92+
clusterLister: clusterLister,
93+
lock: sync.RWMutex{},
94+
}
95+
96+
collector.setClusterData()
97+
go collector.run(ctx)
98+
99+
return collector
100+
}
101+
102+
func (c *clusterCollector) run(ctx context.Context) {
63103
//nolint:staticcheck // FIXME: complains about SA1015
64104
tick := time.Tick(metricsCollectionInterval)
65105
for {
66106
select {
67107
case <-ctx.Done():
68108
case <-tick:
69-
info := c.infoSource.GetClustersInfo()
109+
c.setClusterData()
110+
}
111+
}
112+
}
113+
114+
func (c *clusterCollector) setClusterData() {
115+
if clusterData, err := c.getClusterData(); err == nil {
116+
c.lock.Lock()
117+
c.latestInfo = clusterData
118+
c.lock.Unlock()
119+
} else {
120+
log.Warnf("error collecting cluster metrics: %v", err)
121+
}
122+
}
123+
124+
func (c *clusterCollector) getClusterData() ([]*clusterData, error) {
125+
clusterDatas := []*clusterData{}
126+
clusterInfos := c.infoSource.GetClustersInfo()
127+
128+
ctx, cancel := context.WithTimeout(context.Background(), metricsCollectionTimeout)
129+
defer cancel()
130+
clusters, err := c.clusterLister(ctx)
131+
if err != nil {
132+
return nil, err
133+
}
70134

71-
c.lock.Lock()
72-
c.info = info
73-
c.lock.Unlock()
135+
clusterMap := map[string]*argoappv1.Cluster{}
136+
for i, cluster := range clusters.Items {
137+
clusterMap[cluster.Server] = &clusters.Items[i]
138+
}
139+
140+
// Base the cluster data on the ClusterInfo because it only contains the
141+
// clusters managed by this controller instance
142+
for i, info := range clusterInfos {
143+
cluster, ok := clusterMap[info.Server]
144+
if !ok {
145+
// This should not happen, but we cannot emit incomplete metrics, so we skip this cluster
146+
log.WithField("server", info.Server).Warnf("could find cluster for metrics collection")
147+
continue
74148
}
149+
clusterDatas = append(clusterDatas, &clusterData{
150+
info: &clusterInfos[i],
151+
cluster: cluster,
152+
})
75153
}
154+
return clusterDatas, nil
76155
}
77156

78157
// Describe implements the prometheus.Collector interface
@@ -82,20 +161,41 @@ func (c *clusterCollector) Describe(ch chan<- *prometheus.Desc) {
82161
ch <- descClusterAPIs
83162
ch <- descClusterCacheAgeSeconds
84163
ch <- descClusterConnectionStatus
164+
if len(c.clusterLabels) > 0 {
165+
ch <- descClusterLabels
166+
}
85167
}
86168

87169
func (c *clusterCollector) Collect(ch chan<- prometheus.Metric) {
170+
c.lock.RLock()
171+
latestInfo := c.latestInfo
172+
c.lock.RUnlock()
173+
88174
now := time.Now()
89-
for _, c := range c.info {
90-
defaultValues := []string{c.Server}
91-
ch <- prometheus.MustNewConstMetric(descClusterInfo, prometheus.GaugeValue, 1, append(defaultValues, c.K8SVersion)...)
92-
ch <- prometheus.MustNewConstMetric(descClusterCacheResources, prometheus.GaugeValue, float64(c.ResourcesCount), defaultValues...)
93-
ch <- prometheus.MustNewConstMetric(descClusterAPIs, prometheus.GaugeValue, float64(c.APIsCount), defaultValues...)
175+
for _, clusterData := range latestInfo {
176+
info := clusterData.info
177+
name := clusterData.cluster.Name
178+
labels := clusterData.cluster.Labels
179+
180+
defaultValues := []string{info.Server}
181+
ch <- prometheus.MustNewConstMetric(descClusterInfo, prometheus.GaugeValue, 1, append(defaultValues, info.K8SVersion, name)...)
182+
ch <- prometheus.MustNewConstMetric(descClusterCacheResources, prometheus.GaugeValue, float64(info.ResourcesCount), defaultValues...)
183+
ch <- prometheus.MustNewConstMetric(descClusterAPIs, prometheus.GaugeValue, float64(info.APIsCount), defaultValues...)
94184
cacheAgeSeconds := -1
95-
if c.LastCacheSyncTime != nil {
96-
cacheAgeSeconds = int(now.Sub(*c.LastCacheSyncTime).Seconds())
185+
if info.LastCacheSyncTime != nil {
186+
cacheAgeSeconds = int(now.Sub(*info.LastCacheSyncTime).Seconds())
97187
}
98188
ch <- prometheus.MustNewConstMetric(descClusterCacheAgeSeconds, prometheus.GaugeValue, float64(cacheAgeSeconds), defaultValues...)
99-
ch <- prometheus.MustNewConstMetric(descClusterConnectionStatus, prometheus.GaugeValue, boolFloat64(c.SyncError == nil), append(defaultValues, c.K8SVersion)...)
189+
ch <- prometheus.MustNewConstMetric(descClusterConnectionStatus, prometheus.GaugeValue, boolFloat64(info.SyncError == nil), append(defaultValues, info.K8SVersion)...)
190+
191+
if len(c.clusterLabels) > 0 && labels != nil {
192+
labelValues := []string{}
193+
labelValues = append(labelValues, info.Server, name)
194+
for _, desiredLabel := range c.clusterLabels {
195+
value := labels[desiredLabel]
196+
labelValues = append(labelValues, value)
197+
}
198+
ch <- prometheus.MustNewConstMetric(descClusterLabels, prometheus.GaugeValue, 1, labelValues...)
199+
}
100200
}
101201
}

controller/metrics/clustercollector_test.go

Lines changed: 42 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,36 @@ import (
55
"testing"
66

77
gitopsCache "github.com/argoproj/gitops-engine/pkg/cache"
8+
"github.com/stretchr/testify/mock"
9+
10+
dbmocks "github.com/argoproj/argo-cd/v3/util/db/mocks"
11+
12+
"github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
813
)
914

1015
func TestMetricClusterConnectivity(t *testing.T) {
16+
db := dbmocks.ArgoDB{}
17+
cluster1 := v1alpha1.Cluster{Name: "cluster1", Server: "server1", Labels: map[string]string{"env": "dev", "team": "team1"}}
18+
cluster2 := v1alpha1.Cluster{Name: "cluster2", Server: "server2", Labels: map[string]string{"env": "staging", "team": "team2"}}
19+
cluster3 := v1alpha1.Cluster{Name: "cluster3", Server: "server3", Labels: map[string]string{"env": "production", "team": "team3"}}
20+
clusterList := &v1alpha1.ClusterList{Items: []v1alpha1.Cluster{cluster1, cluster2, cluster3}}
21+
db.On("ListClusters", mock.Anything).Return(clusterList, nil)
22+
1123
type testCases struct {
1224
testCombination
13-
skip bool
14-
description string
15-
metricLabels []string
16-
clustersInfo []gitopsCache.ClusterInfo
25+
skip bool
26+
description string
27+
metricLabels []string
28+
clusterLabels []string
29+
clustersInfo []gitopsCache.ClusterInfo
1730
}
31+
1832
cases := []testCases{
1933
{
20-
description: "metric will have value 1 if connected with the cluster",
21-
skip: false,
22-
metricLabels: []string{"non-existing"},
34+
description: "metric will have value 1 if connected with the cluster",
35+
skip: false,
36+
metricLabels: []string{"non-existing"},
37+
clusterLabels: []string{"env"},
2338
testCombination: testCombination{
2439
applications: []string{fakeApp},
2540
responseContains: `
@@ -36,9 +51,10 @@ argocd_cluster_connection_status{k8s_version="1.21",server="server1"} 1
3651
},
3752
},
3853
{
39-
description: "metric will have value 0 if not connected with the cluster",
40-
skip: false,
41-
metricLabels: []string{"non-existing"},
54+
description: "metric will have value 0 if not connected with the cluster",
55+
skip: false,
56+
metricLabels: []string{"non-existing"},
57+
clusterLabels: []string{"env"},
4258
testCombination: testCombination{
4359
applications: []string{fakeApp},
4460
responseContains: `
@@ -55,16 +71,27 @@ argocd_cluster_connection_status{k8s_version="1.21",server="server1"} 0
5571
},
5672
},
5773
{
58-
description: "will have one metric per cluster",
59-
skip: false,
60-
metricLabels: []string{"non-existing"},
74+
description: "will have one metric per cluster",
75+
skip: false,
76+
metricLabels: []string{"non-existing"},
77+
clusterLabels: []string{"env", "team"},
6178
testCombination: testCombination{
6279
applications: []string{fakeApp},
6380
responseContains: `
6481
# TYPE argocd_cluster_connection_status gauge
6582
argocd_cluster_connection_status{k8s_version="1.21",server="server1"} 1
6683
argocd_cluster_connection_status{k8s_version="1.21",server="server2"} 1
6784
argocd_cluster_connection_status{k8s_version="1.21",server="server3"} 1
85+
86+
# TYPE argocd_cluster_info gauge
87+
argocd_cluster_info{k8s_version="1.21",name="cluster1",server="server1"} 1
88+
argocd_cluster_info{k8s_version="1.21",name="cluster2",server="server2"} 1
89+
argocd_cluster_info{k8s_version="1.21",name="cluster3",server="server3"} 1
90+
91+
# TYPE argocd_cluster_labels gauge
92+
argocd_cluster_labels{label_env="dev",label_team="team1",name="cluster1",server="server1"} 1
93+
argocd_cluster_labels{label_env="staging",label_team="team2",name="cluster2",server="server2"} 1
94+
argocd_cluster_labels{label_env="production",label_team="team3",name="cluster3",server="server3"} 1
6895
`,
6996
},
7097
clustersInfo: []gitopsCache.ClusterInfo{
@@ -95,7 +122,9 @@ argocd_cluster_connection_status{k8s_version="1.21",server="server3"} 1
95122
FakeAppYAMLs: c.applications,
96123
ExpectedResponse: c.responseContains,
97124
AppLabels: c.metricLabels,
125+
ClusterLabels: c.clusterLabels,
98126
ClustersInfo: c.clustersInfo,
127+
ClusterLister: db.ListClusters,
99128
}
100129
runTest(t, cfg)
101130
}

controller/metrics/metrics.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/argoproj/argo-cd/v3/common"
2121
argoappv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
2222
applister "github.com/argoproj/argo-cd/v3/pkg/client/listers/application/v1alpha1"
23+
"github.com/argoproj/argo-cd/v3/util/db"
2324
"github.com/argoproj/argo-cd/v3/util/git"
2425
"github.com/argoproj/argo-cd/v3/util/healthz"
2526
metricsutil "github.com/argoproj/argo-cd/v3/util/metrics"
@@ -222,9 +223,8 @@ func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFil
222223
}, nil
223224
}
224225

225-
func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo) {
226-
collector := &clusterCollector{infoSource: source}
227-
go collector.Run(ctx)
226+
func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo, db db.ArgoDB, clusterLabels []string) {
227+
collector := NewClusterCollector(ctx, source, db.ListClusters, clusterLabels)
228228
m.registry.MustRegister(collector)
229229
}
230230

0 commit comments

Comments
 (0)