Skip to content

Commit ba8a84c

Browse files
tjorri and alpeb authored
fix(destination): GetProfile requests targeting pods directly should return endpoint data for running (not necessarily ready) pods (#13557)
* fix(destination): GetProfile requests targeting pods directly should return endpoint data for running (not necessarily ready) pods. Requiring Pods to pass readiness checks before allowing Pod-to-Pod communication disrupts communication in, e.g., clustered systems that require Pods to communicate with each other before they reach the ready state and accept inbound traffic. Relaxed the requirement and modified the workload watcher to only require that a Pod exists and is in the Running phase. Reproduced the issue with the test setup described in #13247. Fixes #13247. --------- Signed-off-by: Tuomo <[email protected]> Co-authored-by: Alejandro Pedraza <[email protected]>
1 parent 767391e commit ba8a84c

File tree

1 file changed

+21
-19
lines changed

1 file changed

+21
-19
lines changed

controller/api/destination/watcher/workload_watcher.go

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -656,26 +656,37 @@ func (wp *workloadPublisher) unsubscribe(listener WorkloadUpdateListener) {
656656
}
657657

658658
// updatePod creates an Address instance for the given pod, that is passed to
659-
// the listener's Update() method, only if the pod's readiness state has
659+
// the listener's Update() method, only if the pod's running state has
660660
// changed. If the passed pod is nil, it means the pod (still referred to in
661661
// wp.pod) has been deleted.
662+
// Note that we care only about the running state instead of a stronger
663+
// requirement on readiness state because this is used in the context of
664+
// _endpoint_ profile subscriptions, as opposed to _service_ profile
665+
// subscriptions. The former is used when calling GetProfile for a specific
666+
// pod, usually when hitting instances of a StatefulSet, with IPs possibly
667+
// derived from a headless service. An example of this is a Cassandra cluster,
668+
// where a new node won't become ready until it's connected from other members
669+
// of the cluster. For such connections to work inside the mesh, we need
670+
// GetProfile to return the endpoint profile for the pod, even if it's not
671+
// ready.
672+
// See https://github.com/linkerd/linkerd2/issues/13247
662673
func (wp *workloadPublisher) updatePod(pod *corev1.Pod) {
663674
wp.mu.Lock()
664675
defer wp.mu.Unlock()
665676

666-
// pod wasn't ready or there was no backing pod - check if passed pod is ready
677+
// pod wasn't running or there was no backing pod - check if passed pod is running
667678
if wp.addr.Pod == nil {
668679
if pod == nil {
669680
wp.log.Trace("Pod deletion event already consumed - ignore")
670681
return
671682
}
672683

673-
if !isRunningAndReady(pod) {
674-
wp.log.Tracef("Pod %s.%s not ready - ignore", pod.Name, pod.Namespace)
684+
if !isRunning(pod) {
685+
wp.log.Tracef("Pod %s.%s not running - ignore", pod.Name, pod.Namespace)
675686
return
676687
}
677688

678-
wp.log.Debugf("Pod %s.%s became ready", pod.Name, pod.Namespace)
689+
wp.log.Debugf("Pod %s.%s started running", pod.Name, pod.Namespace)
679690
wp.addr.Pod = pod
680691

681692
// Fill in ownership.
@@ -705,9 +716,9 @@ func (wp *workloadPublisher) updatePod(pod *corev1.Pod) {
705716
return
706717
}
707718

708-
// backing pod becoming unready or getting deleted
709-
if pod == nil || !isRunningAndReady(pod) {
710-
wp.log.Debugf("Pod %s.%s deleted or it became unready - remove", wp.addr.Pod.Name, wp.addr.Pod.Namespace)
719+
// backing pod stopped running or was deleted
720+
if pod == nil || !isRunning(pod) {
721+
wp.log.Debugf("Pod %s.%s deleted or it stopped running - remove", wp.addr.Pod.Name, wp.addr.Pod.Namespace)
711722
wp.addr.Pod = nil
712723
wp.addr.OwnerKind = ""
713724
wp.addr.OwnerName = ""
@@ -828,15 +839,6 @@ func isNamedInExternalWorkload(pr string, ew *ext.ExternalWorkload) (int32, bool
828839
return 0, false
829840
}
830841

831-
func isRunningAndReady(pod *corev1.Pod) bool {
832-
if pod == nil || pod.Status.Phase != corev1.PodRunning {
833-
return false
834-
}
835-
for _, condition := range pod.Status.Conditions {
836-
if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue {
837-
return true
838-
}
839-
}
840-
841-
return false
842+
func isRunning(pod *corev1.Pod) bool {
843+
return pod != nil && pod.Status.Phase == corev1.PodRunning
842844
}

0 commit comments

Comments
 (0)