Skip to content

Commit 24bc71a

Browse files
committed
fix: Detect pod configuration errors early instead of timeout
- Fail fast on missing ConfigMaps/Secrets within seconds of detection
- Show actionable error messages instead of generic timeout failures
- Add early detection for CreateContainerConfigError and related failures

Signed-off-by: Vincent Demeester <vdemeest@redhat.com>
1 parent 220bb9f commit 24bc71a

File tree

4 files changed

+259
-69
lines changed

4 files changed

+259
-69
lines changed

pkg/apis/pipeline/v1/taskrun_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,10 @@ const (
212212
TaskRunReasonResolvingStepActionRef = "ResolvingStepActionRef"
213213
// TaskRunReasonImagePullFailed is the reason set when the step of a task fails due to image not being pulled
214214
TaskRunReasonImagePullFailed TaskRunReason = "TaskRunImagePullFailed"
215+
// TaskRunReasonCreateContainerConfigError is the reason set when the step of a task fails due to config error (e.g., missing ConfigMap or Secret)
216+
TaskRunReasonCreateContainerConfigError TaskRunReason = "CreateContainerConfigError"
217+
// TaskRunReasonPodCreationFailed is the reason set when the pod backing the TaskRun fails to be created (e.g., CreateContainerError)
218+
TaskRunReasonPodCreationFailed TaskRunReason = "PodCreationFailed"
215219
// TaskRunReasonResultLargerThanAllowedLimit is the reason set when one of the results exceeds its maximum allowed limit of 1 KB
216220
TaskRunReasonResultLargerThanAllowedLimit TaskRunReason = "TaskRunResultLargerThanAllowedLimit"
217221
// TaskRunReasonStopSidecarFailed indicates that the sidecar is not properly stopped.

pkg/pod/status.go

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -827,28 +827,43 @@ func IsPodExceedingNodeResources(pod *corev1.Pod) bool {
827827
return false
828828
}
829829

830-
// isPodHitConfigError returns true if the Pod's status undicates there are config error raised
831-
func isPodHitConfigError(pod *corev1.Pod) bool {
830+
// hasContainerWaitingReason checks if any container (init or regular) is waiting with a reason
831+
// that matches the provided predicate function
832+
func hasContainerWaitingReason(pod *corev1.Pod, predicate func(corev1.ContainerStateWaiting) bool) bool {
833+
// Check init containers first
834+
for _, containerStatus := range pod.Status.InitContainerStatuses {
835+
if containerStatus.State.Waiting != nil && predicate(*containerStatus.State.Waiting) {
836+
return true
837+
}
838+
}
839+
// Check regular containers
832840
for _, containerStatus := range pod.Status.ContainerStatuses {
833-
if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == ReasonCreateContainerConfigError {
834-
// for subPath directory creation errors, we want to allow recovery
835-
if strings.Contains(containerStatus.State.Waiting.Message, "failed to create subPath directory") {
836-
return false
837-
}
841+
if containerStatus.State.Waiting != nil && predicate(*containerStatus.State.Waiting) {
838842
return true
839843
}
840844
}
841845
return false
842846
}
843847

848+
// isPodHitConfigError returns true if the Pod's status indicates there are config error raised
849+
func isPodHitConfigError(pod *corev1.Pod) bool {
850+
return hasContainerWaitingReason(pod, func(waiting corev1.ContainerStateWaiting) bool {
851+
if waiting.Reason != ReasonCreateContainerConfigError {
852+
return false
853+
}
854+
// for subPath directory creation errors, we want to allow recovery
855+
if strings.Contains(waiting.Message, "failed to create subPath directory") {
856+
return false
857+
}
858+
return true
859+
})
860+
}
861+
844862
// isPullImageError returns true if the Pod's status indicates there are any error when pulling image
845863
func isPullImageError(pod *corev1.Pod) bool {
846-
for _, containerStatus := range pod.Status.ContainerStatuses {
847-
if containerStatus.State.Waiting != nil && isImageErrorReason(containerStatus.State.Waiting.Reason) {
848-
return true
849-
}
850-
}
851-
return false
864+
return hasContainerWaitingReason(pod, func(waiting corev1.ContainerStateWaiting) bool {
865+
return isImageErrorReason(waiting.Reason)
866+
})
852867
}
853868

854869
func isImageErrorReason(reason string) bool {

pkg/reconciler/taskrun/taskrun.go

Lines changed: 99 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -97,16 +97,26 @@ type Reconciler struct {
9797
tracerProvider trace.TracerProvider
9898
}
9999

100-
const ImagePullBackOff = "ImagePullBackOff"
100+
const (
101+
ImagePullBackOff = "ImagePullBackOff"
102+
InvalidImageName = "InvalidImageName" // Invalid image reference
103+
CreateContainerConfigError = "CreateContainerConfigError" // Missing ConfigMap/Secret, invalid env vars, etc.
104+
CreateContainerError = "CreateContainerError" // Other container creation failures
105+
ErrImagePull = "ErrImagePull" // Initial image pull failure
106+
)
101107

102108
var (
103109
// Check that our Reconciler implements taskrunreconciler.Interface
104110
_ taskrunreconciler.Interface = (*Reconciler)(nil)
105111

106112
// Pod failure reasons that trigger failure of the TaskRun
113+
// Note: ErrImagePull is intentionally not included as it's a transient state
114+
// that Kubernetes will automatically retry before transitioning to ImagePullBackOff
107115
podFailureReasons = map[string]struct{}{
108-
ImagePullBackOff: {},
109-
"InvalidImageName": {},
116+
ImagePullBackOff: {},
117+
InvalidImageName: {},
118+
CreateContainerConfigError: {},
119+
CreateContainerError: {},
110120
}
111121
)
112122

@@ -247,66 +257,101 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
247257
}
248258

249259
func (c *Reconciler) checkPodFailed(ctx context.Context, tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
250-
imagePullBackOffTimeoutPodConditions := []string{string(corev1.PodInitialized), "PodReadyToStartContainers"}
251260
for _, step := range tr.Status.Steps {
252-
if step.Waiting != nil {
253-
if _, found := podFailureReasons[step.Waiting.Reason]; found {
254-
if step.Waiting.Reason == ImagePullBackOff {
255-
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
256-
// only attempt to recover from the imagePullBackOff if specified
257-
if imagePullBackOffTimeOut.Seconds() != 0 {
258-
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
259-
if err != nil {
260-
message := fmt.Sprintf(`the step %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, step.Name, tr.Name, step.ImageID, err)
261-
return true, v1.TaskRunReasonImagePullFailed, message
262-
}
263-
for _, condition := range p.Status.Conditions {
264-
// check the pod condition to get the time when the pod was ready to start containers / initialized.
265-
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
266-
if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
267-
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
268-
return false, "", ""
269-
}
270-
}
271-
}
272-
}
273-
}
274-
image := step.ImageID
275-
message := fmt.Sprintf(`the step %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, step.Name, tr.Name, image, step.Waiting.Message)
276-
return true, v1.TaskRunReasonImagePullFailed, message
277-
}
261+
if step.Waiting == nil {
262+
continue
263+
}
264+
265+
if _, found := podFailureReasons[step.Waiting.Reason]; !found {
266+
continue
267+
}
268+
269+
failed, reason, message := c.checkContainerFailure(
270+
ctx,
271+
tr,
272+
step.Waiting,
273+
step.Name,
274+
step.ImageID,
275+
"step",
276+
)
277+
if failed {
278+
return true, reason, message
278279
}
279280
}
281+
280282
for _, sidecar := range tr.Status.Sidecars {
281-
if sidecar.Waiting != nil {
282-
if _, found := podFailureReasons[sidecar.Waiting.Reason]; found {
283-
if sidecar.Waiting.Reason == ImagePullBackOff {
284-
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
285-
// only attempt to recover from the imagePullBackOff if specified
286-
if imagePullBackOffTimeOut.Seconds() != 0 {
287-
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
288-
if err != nil {
289-
message := fmt.Sprintf(`the sidecar %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, sidecar.Name, tr.Name, sidecar.ImageID, err)
290-
return true, v1.TaskRunReasonImagePullFailed, message
291-
}
292-
for _, condition := range p.Status.Conditions {
293-
// check the pod condition to get the time when the pod was ready to start containers / initialized.
294-
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
295-
if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
296-
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
297-
return false, "", ""
298-
}
299-
}
300-
}
283+
if sidecar.Waiting == nil {
284+
continue
285+
}
286+
287+
if _, found := podFailureReasons[sidecar.Waiting.Reason]; !found {
288+
continue
289+
}
290+
291+
failed, reason, message := c.checkContainerFailure(
292+
ctx,
293+
tr,
294+
sidecar.Waiting,
295+
sidecar.Name,
296+
sidecar.ImageID,
297+
"sidecar",
298+
)
299+
if failed {
300+
return true, reason, message
301+
}
302+
}
303+
304+
return false, "", ""
305+
}
306+
307+
func (c *Reconciler) checkContainerFailure(
308+
ctx context.Context,
309+
tr *v1.TaskRun,
310+
waiting *corev1.ContainerStateWaiting,
311+
name,
312+
imageID,
313+
containerType string,
314+
) (bool, v1.TaskRunReason, string) {
315+
if waiting.Reason == ImagePullBackOff {
316+
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
317+
// only attempt to recover from the imagePullBackOff if specified
318+
if imagePullBackOffTimeOut.Seconds() != 0 {
319+
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
320+
if err != nil {
321+
message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. Failed to get pod with error: "%s."`, containerType, name, tr.Name, imageID, err)
322+
return true, v1.TaskRunReasonImagePullFailed, message
323+
}
324+
imagePullBackOffTimeoutPodConditions := []string{string(corev1.PodInitialized), "PodReadyToStartContainers"}
325+
for _, condition := range p.Status.Conditions {
326+
// check the pod condition to get the time when the pod was ready to start containers / initialized.
327+
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
328+
if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
329+
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
330+
return false, "", ""
301331
}
302332
}
303-
image := sidecar.ImageID
304-
message := fmt.Sprintf(`the sidecar %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, sidecar.Name, tr.Name, image, sidecar.Waiting.Message)
305-
return true, v1.TaskRunReasonImagePullFailed, message
306333
}
307334
}
335+
// ImagePullBackOff timeout exceeded or not configured
336+
message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, containerType, name, tr.Name, imageID, waiting.Message)
337+
return true, v1.TaskRunReasonImagePullFailed, message
308338
}
309-
return false, "", ""
339+
340+
// Handle CreateContainerConfigError (missing ConfigMap/Secret, invalid env vars, etc.)
341+
if waiting.Reason == CreateContainerConfigError {
342+
message := fmt.Sprintf(`the %s %q in TaskRun %q failed to start. The pod errored with the message: "%s."`, containerType, name, tr.Name, waiting.Message)
343+
return true, v1.TaskRunReasonCreateContainerConfigError, message
344+
}
345+
346+
// Handle InvalidImageName (unrecoverable error)
347+
if waiting.Reason == InvalidImageName {
348+
message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, containerType, name, tr.Name, imageID, waiting.Message)
349+
return true, v1.TaskRunReasonImagePullFailed, message
350+
}
351+
352+
// Handle CreateContainerError and other generic failures
353+
message := fmt.Sprintf(`the %s %q in TaskRun %q failed to start. The pod errored with the message: "%s."`, containerType, name, tr.Name, waiting.Message)
354+
return true, v1.TaskRunReasonPodCreationFailed, message
310355
}
311356

312357
func (c *Reconciler) durationAndCountMetrics(ctx context.Context, tr *v1.TaskRun, beforeCondition *apis.Condition) {

pkg/reconciler/taskrun/taskrun_test.go

Lines changed: 128 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2846,8 +2846,8 @@ status:
28462846
}
28472847
// the error message includes the error if the pod is not found
28482848
if tc.podNotFound {
2849-
expectedStatus.Message = fmt.Sprintf(`the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s."`, tc.failure, stepNumber, tc.message)
2850-
wantEvents[1] = fmt.Sprintf(`Warning Failed the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s.`, tc.failure, stepNumber, tc.message)
2849+
expectedStatus.Message = fmt.Sprintf(`the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever". Failed to get pod with error: "%s."`, tc.failure, stepNumber, tc.message)
2850+
wantEvents[1] = fmt.Sprintf(`Warning Failed the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever". Failed to get pod with error: "%s.`, tc.failure, stepNumber, tc.message)
28512851
}
28522852
condition := newTr.Status.GetCondition(apis.ConditionSucceeded)
28532853
if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" {
@@ -2862,6 +2862,132 @@ status:
28622862
}
28632863
}
28642864

2865+
func TestReconcileContainerFailures(t *testing.T) {
2866+
testCases := []struct {
2867+
name string
2868+
reason string
2869+
message string
2870+
containerType string
2871+
expectedReason v1.TaskRunReason
2872+
}{{
2873+
name: "CreateContainerConfigError for step - missing configmap",
2874+
reason: "CreateContainerConfigError",
2875+
message: "configmap \"config-for-testing\" not found",
2876+
containerType: "step",
2877+
expectedReason: "CreateContainerConfigError",
2878+
}, {
2879+
name: "CreateContainerConfigError for sidecar - missing secret",
2880+
reason: "CreateContainerConfigError",
2881+
message: "secret \"secret-for-testing\" not found",
2882+
containerType: "sidecar",
2883+
expectedReason: "CreateContainerConfigError",
2884+
}, {
2885+
name: "CreateContainerError for step",
2886+
reason: "CreateContainerError",
2887+
message: "failed to create container",
2888+
containerType: "step",
2889+
expectedReason: "PodCreationFailed",
2890+
}, {
2891+
name: "CreateContainerError for sidecar",
2892+
reason: "CreateContainerError",
2893+
message: "failed to create container",
2894+
containerType: "sidecar",
2895+
expectedReason: "PodCreationFailed",
2896+
}, {
2897+
name: "InvalidImageName for step",
2898+
reason: "InvalidImageName",
2899+
message: "invalid image reference",
2900+
containerType: "step",
2901+
expectedReason: "TaskRunImagePullFailed",
2902+
}, {
2903+
name: "InvalidImageName for sidecar",
2904+
reason: "InvalidImageName",
2905+
message: "invalid image reference",
2906+
containerType: "sidecar",
2907+
expectedReason: "TaskRunImagePullFailed",
2908+
}}
2909+
2910+
for _, tc := range testCases {
2911+
t.Run(tc.name, func(t *testing.T) {
2912+
taskRun := parse.MustParseV1TaskRun(t, `
2913+
metadata:
2914+
name: test-container-failure
2915+
namespace: foo
2916+
spec:
2917+
taskSpec:
2918+
sidecars:
2919+
- image: busybox
2920+
steps:
2921+
- image: alpine
2922+
status:
2923+
podName: "test-pod"
2924+
sidecars:
2925+
- container: sidecar-busybox
2926+
name: busybox
2927+
imageID: docker.io/library/busybox:latest
2928+
steps:
2929+
- container: step-alpine
2930+
name: alpine
2931+
imageID: docker.io/library/alpine:latest
2932+
`)
2933+
2934+
// Set the waiting state based on container type
2935+
if tc.containerType == "step" {
2936+
taskRun.Status.Steps[0].Waiting = &corev1.ContainerStateWaiting{
2937+
Reason: tc.reason,
2938+
Message: tc.message,
2939+
}
2940+
} else {
2941+
taskRun.Status.Sidecars[0].Waiting = &corev1.ContainerStateWaiting{
2942+
Reason: tc.reason,
2943+
Message: tc.message,
2944+
}
2945+
}
2946+
2947+
d := test.Data{
2948+
TaskRuns: []*v1.TaskRun{taskRun},
2949+
}
2950+
2951+
testAssets, cancel := getTaskRunController(t, d)
2952+
defer cancel()
2953+
2954+
// Reconcile the TaskRun
2955+
if err := testAssets.Controller.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil {
2956+
t.Fatalf("Unexpected error reconciling TaskRun: %v", err)
2957+
}
2958+
2959+
// Verify the TaskRun failed with the expected reason
2960+
reconciledTr, err := testAssets.Clients.Pipeline.TektonV1().TaskRuns(taskRun.Namespace).Get(
2961+
testAssets.Ctx, taskRun.Name, metav1.GetOptions{},
2962+
)
2963+
if err != nil {
2964+
t.Fatalf("Failed to get reconciled TaskRun: %v", err)
2965+
}
2966+
2967+
condition := reconciledTr.Status.GetCondition(apis.ConditionSucceeded)
2968+
if condition == nil {
2969+
t.Fatal("TaskRun should have a Succeeded condition")
2970+
}
2971+
2972+
if condition.Status != corev1.ConditionFalse {
2973+
t.Errorf("Expected TaskRun to fail, but status is: %v", condition.Status)
2974+
}
2975+
2976+
if condition.Reason != string(tc.expectedReason) {
2977+
t.Errorf("Expected reason %q, got %q", tc.expectedReason, condition.Reason)
2978+
}
2979+
2980+
if !strings.Contains(condition.Message, tc.message) {
2981+
t.Errorf("Expected message to contain %q, got: %q", tc.message, condition.Message)
2982+
}
2983+
2984+
if !strings.Contains(condition.Message, tc.containerType) {
2985+
t.Errorf("Expected message to mention container type %q, got: %q", tc.containerType, condition.Message)
2986+
}
2987+
})
2988+
}
2989+
}
2990+
28652991
func TestReconcileWithTimeoutDisabled(t *testing.T) {
28662992
type testCase struct {
28672993
name string

0 commit comments

Comments (0)