|
| 1 | +/* |
| 2 | +Copyright 2024. |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +package controlplane |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + |
| 23 | + cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1" |
| 24 | + "github.com/pkg/errors" |
| 25 | + kerrors "k8s.io/apimachinery/pkg/util/errors" |
| 26 | + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" |
| 27 | + "sigs.k8s.io/cluster-api/util/annotations" |
| 28 | + "sigs.k8s.io/cluster-api/util/collections" |
| 29 | + "sigs.k8s.io/cluster-api/util/conditions" |
| 30 | + ctrl "sigs.k8s.io/controller-runtime" |
| 31 | + "sigs.k8s.io/controller-runtime/pkg/client" |
| 32 | +) |
| 33 | + |
| 34 | +func (c *K0sController) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0sControlPlane) (retErr error) { |
| 35 | + log := ctrl.LoggerFrom(ctx) |
| 36 | + |
| 37 | + machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster, collections.ControlPlaneMachines(cluster.Name)) |
| 38 | + if err != nil { |
| 39 | + return fmt.Errorf("failed to filter machines for control plane: %w", err) |
| 40 | + } |
| 41 | + |
| 42 | + healthyMachines := machines.Filter(isHealthy) |
| 43 | + |
| 44 | + // cleanup pending remediation actions not completed if the underlying machine is now back to healthy. |
| 45 | + // machines to be sanitized has the following conditions: |
| 46 | + // |
| 47 | + // HealthCheckSucceeded=True (current machine's state is Health) |
| 48 | + // AND |
| 49 | + // OwnerRemediated=False (machine was marked as unhealthy previously) |
| 50 | + err = c.sanitizeHealthyMachines(ctx, healthyMachines) |
| 51 | + if err != nil { |
| 52 | + return err |
| 53 | + } |
| 54 | + if _, ok := kcp.Annotations[cpv1beta1.RemediationInProgressAnnotation]; ok { |
| 55 | + log.Info("Another remediation is already in progress. Skipping remediation.") |
| 56 | + return nil |
| 57 | + } |
| 58 | + |
| 59 | + // retrieve machines marked as unheathy by MHC controller |
| 60 | + unhealthyMachines := machines.Filter(collections.HasUnhealthyCondition) |
| 61 | + |
| 62 | + // no unhealthy machines to remediate. Reconciliation can move on to the next stage. |
| 63 | + if len(unhealthyMachines) == 0 { |
| 64 | + return nil |
| 65 | + } |
| 66 | + machineToBeRemediated := unhealthyMachines.Oldest() |
| 67 | + |
| 68 | + if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() { |
| 69 | + log.Info("Machine to remediate is being deleted.") |
| 70 | + return nil |
| 71 | + } |
| 72 | + log = log.WithValues("Machine", machineToBeRemediated) |
| 73 | + // Always patch the machine to be remediated conditions in order to inform about remediation state. |
| 74 | + defer func() { |
| 75 | + derr := c.Status().Patch(ctx, machineToBeRemediated, client.Merge) |
| 76 | + if derr != nil { |
| 77 | + log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name) |
| 78 | + if retErr == nil { |
| 79 | + retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name) |
| 80 | + } |
| 81 | + return |
| 82 | + } |
| 83 | + }() |
| 84 | + // Ensure that the cluster remains available during and after the remediation process. The remediation must not |
| 85 | + // compromise the cluster's ability to serve workloads or cause disruption to the control plane's functionality. |
| 86 | + if kcp.Status.Ready { |
| 87 | + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. |
| 88 | + if !(machines.Len() > 1) { |
| 89 | + log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "replicas", machines.Len()) |
| 90 | + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") |
| 91 | + return nil |
| 92 | + } |
| 93 | + |
| 94 | + // The cluster MUST NOT have healthy machines still being provisioned. This rule prevents KCP taking actions while the cluster is in a transitional state. |
| 95 | + if isProvisioningHealthyMachine(healthyMachines) { |
| 96 | + log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation") |
| 97 | + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation") |
| 98 | + |
| 99 | + return nil |
| 100 | + } |
| 101 | + |
| 102 | + // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. |
| 103 | + if len(machines.Filter(collections.HasDeletionTimestamp)) > 0 { |
| 104 | + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") |
| 105 | + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") |
| 106 | + return nil |
| 107 | + } |
| 108 | + } |
| 109 | + |
| 110 | + // After checks, remediation can be carried out. |
| 111 | + |
| 112 | + if err := c.runMachineDeletionSequence(ctx, log, cluster, kcp, machineToBeRemediated); err != nil { |
| 113 | + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) |
| 114 | + return errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) |
| 115 | + } |
| 116 | + log.Info("Remediated unhealthy machine, another new machine should take its place soon.") |
| 117 | + |
| 118 | + // Mark controlplane to track that remediation is in progress and do not proceed until machine is gone. |
| 119 | + // This annotation is removed when new controlplane creates a new machine. |
| 120 | + annotations.AddAnnotations(kcp, map[string]string{ |
| 121 | + cpv1beta1.RemediationInProgressAnnotation: "true", |
| 122 | + }) |
| 123 | + |
| 124 | + return nil |
| 125 | +} |
| 126 | + |
| 127 | +func isHealthy(machine *clusterv1.Machine) bool { |
| 128 | + if machine == nil { |
| 129 | + return false |
| 130 | + } |
| 131 | + return conditions.IsTrue(machine, clusterv1.MachineHealthCheckSucceededCondition) |
| 132 | +} |
| 133 | + |
| 134 | +func hasNode(machine *clusterv1.Machine) bool { |
| 135 | + if machine == nil { |
| 136 | + return false |
| 137 | + } |
| 138 | + return machine.Status.NodeRef != nil |
| 139 | +} |
| 140 | + |
| 141 | +func isProvisioningHealthyMachine(healthyMachines collections.Machines) bool { |
| 142 | + return len(healthyMachines.Filter(collections.Not(hasNode))) > 0 |
| 143 | +} |
| 144 | + |
| 145 | +func (c *K0sController) sanitizeHealthyMachines(ctx context.Context, healthyMachines collections.Machines) error { |
| 146 | + log := ctrl.LoggerFrom(ctx) |
| 147 | + |
| 148 | + errList := []error{} |
| 149 | + for _, m := range healthyMachines { |
| 150 | + if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && m.DeletionTimestamp.IsZero() { |
| 151 | + |
| 152 | + conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition) |
| 153 | + |
| 154 | + err := c.Status().Patch(ctx, m, client.Merge) |
| 155 | + if err != nil { |
| 156 | + log.Error(err, "Failed to patch control plane Machine to clean machine's unhealthy condition", "Machine", m.Name) |
| 157 | + errList = append(errList, errors.Wrapf(err, "failed to patch control plane Machine %s to clean machine's unhelthy condition", m.Name)) |
| 158 | + } |
| 159 | + } |
| 160 | + } |
| 161 | + if len(errList) > 0 { |
| 162 | + return kerrors.NewAggregate(errList) |
| 163 | + } |
| 164 | + |
| 165 | + return nil |
| 166 | +} |
0 commit comments