Skip to content

Commit a2361bf

Browse files
authored
fix: add cooldown to prevent resetting autoheal exp backoff preemptively (cherry-pick #23057) (#23188)
Signed-off-by: Soumya Ghosh Dastidar <[email protected]>
1 parent 5ad281e commit a2361bf

16 files changed

+104
-9
lines changed

cmd/argocd-application-controller/commands/argocd_application_controller.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ func NewCommand() *cobra.Command {
6464
selfHealBackoffTimeoutSeconds int
6565
selfHealBackoffFactor int
6666
selfHealBackoffCapSeconds int
67+
selfHealBackoffCooldownSeconds int
6768
syncTimeout int
6869
statusProcessors int
6970
operationProcessors int
@@ -196,6 +197,7 @@ func NewCommand() *cobra.Command {
196197
time.Duration(appResyncJitter)*time.Second,
197198
time.Duration(selfHealTimeoutSeconds)*time.Second,
198199
selfHealBackoff,
200+
time.Duration(selfHealBackoffCooldownSeconds)*time.Second,
199201
time.Duration(syncTimeout)*time.Second,
200202
time.Duration(repoErrorGracePeriod)*time.Second,
201203
metricsPort,
@@ -266,6 +268,7 @@ func NewCommand() *cobra.Command {
266268
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
267269
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
268270
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
271+
command.Flags().IntVar(&selfHealBackoffCooldownSeconds, "self-heal-backoff-cooldown-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS", 330, 0, math.MaxInt32), "Specifies period of time the app needs to stay synced before the self heal backoff can reset")
269272
command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).")
270273
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
271274
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")

controller/appcontroller.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"k8s.io/client-go/kubernetes"
4242
"k8s.io/client-go/tools/cache"
4343
"k8s.io/client-go/util/workqueue"
44+
"k8s.io/utils/ptr"
4445

4546
commitclient "github.com/argoproj/argo-cd/v2/commitserver/apiclient"
4647
"github.com/argoproj/argo-cd/v2/common"
@@ -135,6 +136,7 @@ type ApplicationController struct {
135136
statusRefreshJitter time.Duration
136137
selfHealTimeout time.Duration
137138
selfHealBackOff *wait.Backoff
139+
selfHealBackoffCooldown time.Duration
138140
syncTimeout time.Duration
139141
db db.ArgoDB
140142
settingsMgr *settings_util.SettingsManager
@@ -169,6 +171,7 @@ func NewApplicationController(
169171
appResyncJitter time.Duration,
170172
selfHealTimeout time.Duration,
171173
selfHealBackoff *wait.Backoff,
174+
selfHealBackoffCooldown time.Duration,
172175
syncTimeout time.Duration,
173176
repoErrorGracePeriod time.Duration,
174177
metricsPort int,
@@ -214,6 +217,7 @@ func NewApplicationController(
214217
settingsMgr: settingsMgr,
215218
selfHealTimeout: selfHealTimeout,
216219
selfHealBackOff: selfHealBackoff,
220+
selfHealBackoffCooldown: selfHealBackoffCooldown,
217221
syncTimeout: syncTimeout,
218222
clusterSharding: clusterSharding,
219223
projByNameCache: sync.Map{},
@@ -2234,17 +2238,22 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application, alread
22342238
return true, time.Duration(0)
22352239
}
22362240

2237-
// Reset counter if the prior sync was successful OR if the revision has changed
2238-
if !alreadyAttempted || app.Status.Sync.Status == appv1.SyncStatusCodeSynced {
2241+
var timeSinceOperation *time.Duration
2242+
if app.Status.OperationState.FinishedAt != nil {
2243+
timeSinceOperation = ptr.To(time.Since(app.Status.OperationState.FinishedAt.Time))
2244+
}
2245+
2246+
// Reset counter if the prior sync was successful and the cooldown period is over OR if the revision has changed
2247+
if !alreadyAttempted || (timeSinceOperation != nil && *timeSinceOperation >= ctrl.selfHealBackoffCooldown && app.Status.Sync.Status == appv1.SyncStatusCodeSynced) {
22392248
app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = 0
22402249
}
22412250

22422251
var retryAfter time.Duration
22432252
if ctrl.selfHealBackOff == nil {
2244-
if app.Status.OperationState.FinishedAt == nil {
2253+
if timeSinceOperation == nil {
22452254
retryAfter = ctrl.selfHealTimeout
22462255
} else {
2247-
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
2256+
retryAfter = ctrl.selfHealTimeout - *timeSinceOperation
22482257
}
22492258
} else {
22502259
backOff := *ctrl.selfHealBackOff
@@ -2254,10 +2263,11 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application, alread
22542263
for i := 0; i < steps; i++ {
22552264
delay = backOff.Step()
22562265
}
2257-
if app.Status.OperationState.FinishedAt == nil {
2266+
2267+
if timeSinceOperation == nil {
22582268
retryAfter = delay
22592269
} else {
2260-
retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time)
2270+
retryAfter = delay - *timeSinceOperation
22612271
}
22622272
}
22632273
return retryAfter <= 0, retryAfter

controller/appcontroller_test.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ func newFakeControllerWithResync(data *fakeData, appResyncPeriod time.Duration,
171171
time.Second,
172172
time.Minute,
173173
nil,
174+
time.Minute,
174175
0,
175176
time.Second*10,
176177
common.DefaultPortArgoCDMetrics,
@@ -2618,10 +2619,18 @@ func TestSelfHealExponentialBackoff(t *testing.T) {
26182619
alreadyAttempted: false,
26192620
expectedAttempts: 0,
26202621
syncStatus: v1alpha1.SyncStatusCodeOutOfSync,
2621-
}, {
2622+
}, { // backoff will not reset as finished time isn't >= cooldown
26222623
attempts: 6,
2623-
finishedAt: nil,
2624-
expectedDuration: 0,
2624+
finishedAt: ptr.To(metav1.Now()),
2625+
expectedDuration: 120 * time.Second,
2626+
shouldSelfHeal: false,
2627+
alreadyAttempted: true,
2628+
expectedAttempts: 6,
2629+
syncStatus: v1alpha1.SyncStatusCodeSynced,
2630+
}, { // backoff will reset as finished time is >= cooldown
2631+
attempts: 40,
2632+
finishedAt: &metav1.Time{Time: time.Now().Add(-(1 * time.Minute))},
2633+
expectedDuration: -60 * time.Second,
26252634
shouldSelfHeal: true,
26262635
alreadyAttempted: true,
26272636
expectedAttempts: 0,

docs/operator-manual/server-commands/argocd-application-controller.md

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ spec:
115115
name: argocd-cmd-params-cm
116116
key: controller.self.heal.backoff.cap.seconds
117117
optional: true
118+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS
119+
valueFrom:
120+
configMapKeyRef:
121+
name: argocd-cmd-params-cm
122+
key: controller.self.heal.backoff.cooldown.seconds
123+
optional: true
118124
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
119125
valueFrom:
120126
configMapKeyRef:

manifests/base/application-controller/argocd-application-controller-statefulset.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,12 @@ spec:
118118
name: argocd-cmd-params-cm
119119
key: controller.self.heal.backoff.cap.seconds
120120
optional: true
121+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS
122+
valueFrom:
123+
configMapKeyRef:
124+
name: argocd-cmd-params-cm
125+
key: controller.self.heal.backoff.cooldown.seconds
126+
optional: true
121127
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
122128
valueFrom:
123129
configMapKeyRef:

manifests/core-install-with-hydrator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24932,6 +24932,12 @@ spec:
2493224932
key: controller.self.heal.backoff.cap.seconds
2493324933
name: argocd-cmd-params-cm
2493424934
optional: true
24935+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS
24936+
valueFrom:
24937+
configMapKeyRef:
24938+
key: controller.self.heal.backoff.cooldown.seconds
24939+
name: argocd-cmd-params-cm
24940+
optional: true
2493524941
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
2493624942
valueFrom:
2493724943
configMapKeyRef:

manifests/core-install.yaml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

manifests/ha/install-with-hydrator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26925,6 +26925,12 @@ spec:
2692526925
key: controller.self.heal.backoff.cap.seconds
2692626926
name: argocd-cmd-params-cm
2692726927
optional: true
26928+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS
26929+
valueFrom:
26930+
configMapKeyRef:
26931+
key: controller.self.heal.backoff.cooldown.seconds
26932+
name: argocd-cmd-params-cm
26933+
optional: true
2692826934
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
2692926935
valueFrom:
2693026936
configMapKeyRef:

manifests/ha/install.yaml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)