Skip to content

Commit 9e25f53

Browse files
committed
Add k0scontrolplane healthcheck-remediation
Signed-off-by: Adrian Pedriza <adripedriza@gmail.com>
1 parent aef0261 commit 9e25f53

File tree

8 files changed

+597
-38
lines changed

8 files changed

+597
-38
lines changed

.github/workflows/go.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ jobs:
160160
- check-capi-controlplane-docker-tunneling-proxy
161161
- check-capi-controlplane-docker-worker
162162
- check-capi-docker-machine-change-template
163+
- check-capi-controlplane-remediation
163164
- check-capi-remote-machine-template-update
164165
- check-capi-docker-machine-template-update
165166
- check-capi-docker-machine-template-update-recreate

api/controlplane/v1beta1/k0s_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ const (
3939
const (
4040
// ControlPlaneReadyCondition documents the status of the control plane
4141
ControlPlaneReadyCondition clusterv1.ConditionType = "ControlPlaneReady"
42+
43+
// RemediationInProgressAnnotation is used to keep track that a remediation is in progress,
44+
// and more specifically it tracks that the system is in between having deleted an unhealthy machine
45+
// and recreating its replacement.
46+
RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
4247
)
4348

4449
// +kubebuilder:object:root=true

internal/controller/controlplane/helper.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,18 @@ func (c *K0sController) createMachine(ctx context.Context, name string, cluster
5252
}
5353
_ = ctrl.SetControllerReference(kcp, machine, c.Scheme)
5454

55-
return machine, c.Client.Patch(ctx, machine, client.Apply, &client.PatchOptions{
55+
err = c.Client.Patch(ctx, machine, client.Apply, &client.PatchOptions{
5656
FieldManager: "k0smotron",
5757
})
58+
if err != nil {
59+
return machine, err
60+
}
61+
62+
// Remove the annotation tracking that a remediation is in progress.
63+
// A remediation is completed when the replacement machine has been created above.
64+
delete(kcp.Annotations, cpv1beta1.RemediationInProgressAnnotation)
65+
66+
return machine, nil
5867
}
5968

6069
func (c *K0sController) deleteMachine(ctx context.Context, name string, kcp *cpv1beta1.K0sControlPlane) error {

internal/controller/controlplane/k0s_controlplane_controller.go

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"strings"
2727
"time"
2828

29+
"github.com/go-logr/logr"
2930
"github.com/google/uuid"
3031
autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2"
3132
"github.com/k0sproject/k0smotron/internal/controller/util"
@@ -263,6 +264,11 @@ func (c *K0sController) reconcile(ctx context.Context, cluster *clusterv1.Cluste
263264
return fmt.Errorf("error reconciling kubeconfig secret: %w", err)
264265
}
265266

267+
err = c.reconcileUnhealthyMachines(ctx, cluster, kcp)
268+
if err != nil {
269+
return err
270+
}
271+
266272
err = c.reconcileMachines(ctx, cluster, kcp)
267273
if err != nil {
268274
return err
@@ -406,58 +412,66 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv
406412

407413
if len(machineNamesToDelete) > 0 {
408414
logger.Info("Found machines to delete", "count", len(machineNamesToDelete))
409-
kubeClient, err := c.getKubeClient(ctx, cluster)
410-
if err != nil {
411-
return fmt.Errorf("error getting cluster client set for deletion: %w", err)
412-
}
413415

414416
// Remove the oldest machine and wait for the machine to be deleted to avoid etcd issues
415-
machine := machines.Filter(func(m *clusterv1.Machine) bool {
417+
machineToDelete := machines.Filter(func(m *clusterv1.Machine) bool {
416418
return machineNamesToDelete[m.Name]
417419
}).Oldest()
418-
logger.Info("Found oldest machine to delete", "machine", machine.Name)
419-
if machine.Status.Phase == string(clusterv1.MachinePhaseDeleting) {
420-
logger.Info("Machine is being deleted, waiting for it to be deleted", "machine", machine.Name)
420+
logger.Info("Found oldest machine to delete", "machine", machineToDelete.Name)
421+
if machineToDelete.Status.Phase == string(clusterv1.MachinePhaseDeleting) {
422+
logger.Info("Machine is being deleted, waiting for it to be deleted", "machine", machineToDelete.Name)
421423
return fmt.Errorf("waiting for previous machine to be deleted")
422424
}
423425

424-
name := machine.Name
425-
426-
waitCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
427-
defer cancel()
428-
err = wait.PollUntilContextCancel(waitCtx, 10*time.Second, true, func(fctx context.Context) (bool, error) {
429-
if err := c.markChildControlNodeToLeave(fctx, name, kubeClient); err != nil {
430-
return false, fmt.Errorf("error marking controlnode to leave: %w", err)
431-
}
432-
433-
ok, err := c.checkMachineLeft(fctx, name, kubeClient)
434-
if err != nil {
435-
logger.Error(err, "Error checking machine left", "machine", name)
436-
}
437-
return ok, err
438-
})
426+
err := c.runMachineDeletionSequence(ctx, logger, cluster, kcp, machineToDelete)
439427
if err != nil {
440-
return fmt.Errorf("error checking machine left: %w", err)
428+
return err
441429
}
442430

443-
if err := c.deleteControlNode(ctx, name, kubeClient); err != nil {
444-
return fmt.Errorf("error deleting controlnode: %w", err)
445-
}
431+
logger.Info("Deleted machine", "machine", machineToDelete.Name)
432+
}
433+
return nil
434+
}
446435

447-
if err := c.deleteBootstrapConfig(ctx, name, kcp); err != nil {
448-
return fmt.Errorf("error deleting machine from template: %w", err)
449-
}
436+
func (c *K0sController) runMachineDeletionSequence(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0sControlPlane, machine *clusterv1.Machine) error {
437+
kubeClient, err := c.getKubeClient(ctx, cluster)
438+
if err != nil {
439+
return fmt.Errorf("error getting cluster client set for deletion: %w", err)
440+
}
450441

451-
if err := c.deleteMachineFromTemplate(ctx, name, cluster, kcp); err != nil {
452-
return fmt.Errorf("error deleting machine from template: %w", err)
442+
waitCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
443+
defer cancel()
444+
err = wait.PollUntilContextCancel(waitCtx, 10*time.Second, true, func(fctx context.Context) (bool, error) {
445+
if err := c.markChildControlNodeToLeave(fctx, machine.Name, kubeClient); err != nil {
446+
return false, fmt.Errorf("error marking controlnode to leave: %w", err)
453447
}
454448

455-
if err := c.deleteMachine(ctx, name, kcp); err != nil {
456-
return fmt.Errorf("error deleting machine from template: %w", err)
449+
ok, err := c.checkMachineLeft(fctx, machine.Name, kubeClient)
450+
if err != nil {
451+
logger.Error(err, "Error checking machine left", "machine", machine.Name)
457452
}
453+
return ok, err
454+
})
455+
if err != nil {
456+
return fmt.Errorf("error checking machine left: %w", err)
457+
}
458458

459-
logger.Info("Deleted machine", "machine", name)
459+
if err := c.deleteControlNode(ctx, machine.Name, kubeClient); err != nil {
460+
return fmt.Errorf("error deleting controlnode: %w", err)
460461
}
462+
463+
if err := c.deleteBootstrapConfig(ctx, machine.Name, kcp); err != nil {
464+
return fmt.Errorf("error deleting bootstrap config: %w", err)
465+
}
466+
467+
if err := c.deleteMachineFromTemplate(ctx, machine.Name, cluster, kcp); err != nil {
468+
return fmt.Errorf("error deleting machine from template: %w", err)
469+
}
470+
471+
if err := c.deleteMachine(ctx, machine.Name, kcp); err != nil {
472+
return fmt.Errorf("error deleting machine: %w", err)
473+
}
474+
461475
return nil
462476
}
463477

internal/controller/controlplane/k0smotron_controlplane_controller.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3434
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3535
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
36-
"sigs.k8s.io/cluster-api/util"
3736
capiutil "sigs.k8s.io/cluster-api/util"
3837
"sigs.k8s.io/cluster-api/util/annotations"
3938
"sigs.k8s.io/cluster-api/util/secret"
@@ -273,7 +272,7 @@ func (c *K0smotronController) reconcile(ctx context.Context, cluster *clusterv1.
273272

274273
func (c *K0smotronController) ensureCertificates(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0smotronControlPlane) error {
275274
certificates := secret.NewCertificatesForInitialControlPlane(&bootstrapv1.ClusterConfiguration{})
276-
return certificates.LookupOrGenerate(ctx, c.Client, util.ObjectKey(cluster), *metav1.NewControllerRef(kcp, cpv1beta1.GroupVersion.WithKind("K0smotronControlPlane")))
275+
return certificates.LookupOrGenerate(ctx, c.Client, capiutil.ObjectKey(cluster), *metav1.NewControllerRef(kcp, cpv1beta1.GroupVersion.WithKind("K0smotronControlPlane")))
277276
}
278277

279278
// SetupWithManager sets up the controller with the Manager.
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controlplane
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1"
24+
"github.com/pkg/errors"
25+
kerrors "k8s.io/apimachinery/pkg/util/errors"
26+
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
27+
"sigs.k8s.io/cluster-api/util/annotations"
28+
"sigs.k8s.io/cluster-api/util/collections"
29+
"sigs.k8s.io/cluster-api/util/conditions"
30+
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/client"
32+
)
33+
34+
func (c *K0sController) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0sControlPlane) (retErr error) {
35+
log := ctrl.LoggerFrom(ctx)
36+
37+
machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster, collections.ControlPlaneMachines(cluster.Name))
38+
if err != nil {
39+
return fmt.Errorf("failed to filter machines for control plane: %w", err)
40+
}
41+
42+
healthyMachines := machines.Filter(isHealthy)
43+
44+
// cleanup pending remediation actions not completed if the underlying machine is now back to healthy.
45+
// machines to be sanitized has the following conditions:
46+
//
47+
// HealthCheckSucceeded=True (current machine's state is Health)
48+
// AND
49+
// OwnerRemediated=False (machine was marked as unhealthy previously)
50+
err = c.sanitizeHealthyMachines(ctx, healthyMachines)
51+
if err != nil {
52+
return err
53+
}
54+
if _, ok := kcp.Annotations[cpv1beta1.RemediationInProgressAnnotation]; ok {
55+
log.Info("Another remediation is already in progress. Skipping remediation.")
56+
return nil
57+
}
58+
59+
// retrieve machines marked as unhealthy by MHC controller
60+
unhealthyMachines := machines.Filter(collections.HasUnhealthyCondition)
61+
62+
// no unhealthy machines to remediate. Reconciliation can move on to the next stage.
63+
if len(unhealthyMachines) == 0 {
64+
return nil
65+
}
66+
machineToBeRemediated := unhealthyMachines.Oldest()
67+
68+
if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
69+
log.Info("Machine to remediate is being deleted.")
70+
return nil
71+
}
72+
log = log.WithValues("Machine", machineToBeRemediated)
73+
// Always patch the machine to be remediated conditions in order to inform about remediation state.
74+
defer func() {
75+
derr := c.Status().Patch(ctx, machineToBeRemediated, client.Merge)
76+
if derr != nil {
77+
log.Error(derr, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)
78+
if retErr == nil {
79+
retErr = errors.Wrapf(derr, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
80+
}
81+
return
82+
}
83+
}()
84+
// Ensure that the cluster remains available during and after the remediation process. The remediation must not
85+
// compromise the cluster's ability to serve workloads or cause disruption to the control plane's functionality.
86+
if kcp.Status.Ready {
87+
// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
88+
if !(machines.Len() > 1) {
89+
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "replicas", machines.Len())
90+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
91+
return nil
92+
}
93+
94+
// The cluster MUST NOT have healthy machines still being provisioned. This rule prevents KCP taking actions while the cluster is in a transitional state.
95+
if isProvisioningHealthyMachine(healthyMachines) {
96+
log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation")
97+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation")
98+
99+
return nil
100+
}
101+
102+
// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
103+
if len(machines.Filter(collections.HasDeletionTimestamp)) > 0 {
104+
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
105+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
106+
return nil
107+
}
108+
}
109+
110+
// After checks, remediation can be carried out.
111+
112+
if err := c.runMachineDeletionSequence(ctx, log, cluster, kcp, machineToBeRemediated); err != nil {
113+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
114+
return errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
115+
}
116+
log.Info("Remediated unhealthy machine, another new machine should take its place soon.")
117+
118+
// Mark controlplane to track that remediation is in progress and do not proceed until machine is gone.
119+
// This annotation is removed when new controlplane creates a new machine.
120+
annotations.AddAnnotations(kcp, map[string]string{
121+
cpv1beta1.RemediationInProgressAnnotation: "true",
122+
})
123+
124+
return nil
125+
}
126+
127+
func isHealthy(machine *clusterv1.Machine) bool {
128+
if machine == nil {
129+
return false
130+
}
131+
return conditions.IsTrue(machine, clusterv1.MachineHealthCheckSucceededCondition)
132+
}
133+
134+
func hasNode(machine *clusterv1.Machine) bool {
135+
if machine == nil {
136+
return false
137+
}
138+
return machine.Status.NodeRef != nil
139+
}
140+
141+
func isProvisioningHealthyMachine(healthyMachines collections.Machines) bool {
142+
return len(healthyMachines.Filter(collections.Not(hasNode))) > 0
143+
}
144+
145+
func (c *K0sController) sanitizeHealthyMachines(ctx context.Context, healthyMachines collections.Machines) error {
146+
log := ctrl.LoggerFrom(ctx)
147+
148+
errList := []error{}
149+
for _, m := range healthyMachines {
150+
if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && m.DeletionTimestamp.IsZero() {
151+
152+
conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)
153+
154+
err := c.Status().Patch(ctx, m, client.Merge)
155+
if err != nil {
156+
log.Error(err, "Failed to patch control plane Machine to clean machine's unhealthy condition", "Machine", m.Name)
157+
errList = append(errList, errors.Wrapf(err, "failed to patch control plane Machine %s to clean machine's unhealthy condition", m.Name))
158+
}
159+
}
160+
}
161+
if len(errList) > 0 {
162+
return kerrors.NewAggregate(errList)
163+
}
164+
165+
return nil
166+
}

inttest/Makefile.variables

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ smoketests := \
2121
check-capi-controlplane-docker-worker \
2222
check-capi-controlplane-docker-tunneling \
2323
check-capi-controlplane-docker-tunneling-proxy \
24+
check-capi-controlplane-remediation \
2425
check-monitoring \
2526
check-capi-docker-machinedeployment \
2627
check-capi-docker-clusterclass \

0 commit comments

Comments
 (0)