4 changes: 4 additions & 0 deletions pkg/apis/pipeline/v1/taskrun_types.go
@@ -212,6 +212,10 @@ const (
TaskRunReasonResolvingStepActionRef = "ResolvingStepActionRef"
// TaskRunReasonImagePullFailed is the reason set when the step of a task fails due to image not being pulled
TaskRunReasonImagePullFailed TaskRunReason = "TaskRunImagePullFailed"
// TaskRunReasonCreateContainerConfigError is the reason set when the step of a task fails due to a config error (e.g., missing ConfigMap or Secret)
TaskRunReasonCreateContainerConfigError TaskRunReason = "CreateContainerConfigError"
// TaskRunReasonPodCreationFailed is the reason set when the pod backing the TaskRun fails to be created (e.g., CreateContainerError)
TaskRunReasonPodCreationFailed TaskRunReason = "PodCreationFailed"
// TaskRunReasonResultLargerThanAllowedLimit is the reason set when one of the results exceeds its maximum allowed limit of 1 KB
TaskRunReasonResultLargerThanAllowedLimit TaskRunReason = "TaskRunResultLargerThanAllowedLimit"
// TaskRunReasonStopSidecarFailed indicates that the sidecar is not properly stopped.
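For orientation, here is a minimal sketch (not part of the diff) of how a caller might branch on the new reasons once a TaskRun's Succeeded condition is False. It assumes a *v1.TaskRun named tr; the condition helpers come from knative.dev/pkg/apis, and the comments on each case are illustrative:

cond := tr.Status.GetCondition(apis.ConditionSucceeded)
if cond != nil && cond.IsFalse() {
	switch v1.TaskRunReason(cond.Reason) {
	case v1.TaskRunReasonCreateContainerConfigError:
		// a referenced ConfigMap/Secret is missing or an env var is invalid:
		// fix the referenced resource, then retry the run
	case v1.TaskRunReasonPodCreationFailed:
		// generic container-creation failure (e.g., CreateContainerError)
	case v1.TaskRunReasonImagePullFailed:
		// invalid image name, or ImagePullBackOff past the configured timeout
	}
}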
41 changes: 28 additions & 13 deletions pkg/pod/status.go
@@ -827,28 +827,43 @@ func IsPodExceedingNodeResources(pod *corev1.Pod) bool {
return false
}

-// isPodHitConfigError returns true if the Pod's status undicates there are config error raised
-func isPodHitConfigError(pod *corev1.Pod) bool {
+// hasContainerWaitingReason checks if any container (init or regular) is waiting with a reason
+// that matches the provided predicate function
+func hasContainerWaitingReason(pod *corev1.Pod, predicate func(corev1.ContainerStateWaiting) bool) bool {
// Check init containers first
for _, containerStatus := range pod.Status.InitContainerStatuses {
if containerStatus.State.Waiting != nil && predicate(*containerStatus.State.Waiting) {
return true
}
}
// Check regular containers
for _, containerStatus := range pod.Status.ContainerStatuses {
-if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == ReasonCreateContainerConfigError {
-// for subPath directory creation errors, we want to allow recovery
-if strings.Contains(containerStatus.State.Waiting.Message, "failed to create subPath directory") {
-return false
-}
+if containerStatus.State.Waiting != nil && predicate(*containerStatus.State.Waiting) {
return true
}
}
return false
}

// isPodHitConfigError returns true if the Pod's status indicates a config error was raised
func isPodHitConfigError(pod *corev1.Pod) bool {
return hasContainerWaitingReason(pod, func(waiting corev1.ContainerStateWaiting) bool {
if waiting.Reason != ReasonCreateContainerConfigError {
return false
}
// for subPath directory creation errors, we want to allow recovery
if strings.Contains(waiting.Message, "failed to create subPath directory") {
return false
}
return true
})
}
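As a usage illustration (a sketch, not part of the diff), any package-local caller or test can express a new waiting-reason check by passing a predicate; the pod fixture here is hypothetical:

pod := &corev1.Pod{
	Status: corev1.PodStatus{
		ContainerStatuses: []corev1.ContainerStatus{{
			State: corev1.ContainerState{
				Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"},
			},
		}},
	},
}
backingOff := hasContainerWaitingReason(pod, func(w corev1.ContainerStateWaiting) bool {
	return w.Reason == "ImagePullBackOff"
}) // true, since a regular container is waiting with that reason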

// isPullImageError returns true if the Pod's status indicates an error occurred while pulling the image
func isPullImageError(pod *corev1.Pod) bool {
-for _, containerStatus := range pod.Status.ContainerStatuses {
-if containerStatus.State.Waiting != nil && isImageErrorReason(containerStatus.State.Waiting.Reason) {
-return true
-}
-}
-return false
+return hasContainerWaitingReason(pod, func(waiting corev1.ContainerStateWaiting) bool {
+return isImageErrorReason(waiting.Reason)
+})
}

func isImageErrorReason(reason string) bool {
153 changes: 99 additions & 54 deletions pkg/reconciler/taskrun/taskrun.go
@@ -97,16 +97,26 @@ type Reconciler struct {
tracerProvider trace.TracerProvider
}

-const ImagePullBackOff = "ImagePullBackOff"
+const (
+ImagePullBackOff = "ImagePullBackOff"
+InvalidImageName = "InvalidImageName" // Invalid image reference
+CreateContainerConfigError = "CreateContainerConfigError" // Missing ConfigMap/Secret, invalid env vars, etc.
+CreateContainerError = "CreateContainerError" // Other container creation failures
+ErrImagePull = "ErrImagePull" // Initial image pull failure
+)

var (
// Check that our Reconciler implements taskrunreconciler.Interface
_ taskrunreconciler.Interface = (*Reconciler)(nil)

// Pod failure reasons that trigger failure of the TaskRun
// Note: ErrImagePull is intentionally not included as it's a transient state
// that Kubernetes will automatically retry before transitioning to ImagePullBackOff
podFailureReasons = map[string]struct{}{
-ImagePullBackOff: {},
-"InvalidImageName": {},
+ImagePullBackOff: {},
+InvalidImageName: {},
+CreateContainerConfigError: {},
+CreateContainerError: {},
}
)
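To make the gating concrete (a sketch, not part of the diff): membership in podFailureReasons is what makes a waiting reason fatal, so ErrImagePull stays non-fatal until the kubelet escalates it to ImagePullBackOff:

if _, fatal := podFailureReasons[ErrImagePull]; !fatal {
	// transient: the kubelet retries the pull before moving to ImagePullBackOff
}
if _, fatal := podFailureReasons[ImagePullBackOff]; fatal {
	// fatal once the configured ImagePullBackOff timeout (if any) has elapsed
}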

@@ -247,66 +257,101 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
}

func (c *Reconciler) checkPodFailed(ctx context.Context, tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
-imagePullBackOffTimeoutPodConditions := []string{string(corev1.PodInitialized), "PodReadyToStartContainers"}
for _, step := range tr.Status.Steps {
-if step.Waiting != nil {
-if _, found := podFailureReasons[step.Waiting.Reason]; found {
-if step.Waiting.Reason == ImagePullBackOff {
-imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
-// only attempt to recover from the imagePullBackOff if specified
-if imagePullBackOffTimeOut.Seconds() != 0 {
-p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
-if err != nil {
-message := fmt.Sprintf(`the step %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, step.Name, tr.Name, step.ImageID, err)
-return true, v1.TaskRunReasonImagePullFailed, message
-}
-for _, condition := range p.Status.Conditions {
-// check the pod condition to get the time when the pod was ready to start containers / initialized.
-// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
-if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
-if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
-return false, "", ""
-}
-}
-}
-}
-}
-image := step.ImageID
-message := fmt.Sprintf(`the step %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, step.Name, tr.Name, image, step.Waiting.Message)
-return true, v1.TaskRunReasonImagePullFailed, message
-}
-}
+if step.Waiting == nil {
+continue
+}
+
+if _, found := podFailureReasons[step.Waiting.Reason]; !found {
+continue
+}
+
+failed, reason, message := c.checkContainerFailure(
+ctx,
+tr,
+step.Waiting,
+step.Name,
+step.ImageID,
+"step",
+)
+if failed {
+return true, reason, message
+}
}

for _, sidecar := range tr.Status.Sidecars {
-if sidecar.Waiting != nil {
-if _, found := podFailureReasons[sidecar.Waiting.Reason]; found {
-if sidecar.Waiting.Reason == ImagePullBackOff {
-imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
-// only attempt to recover from the imagePullBackOff if specified
-if imagePullBackOffTimeOut.Seconds() != 0 {
-p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
-if err != nil {
-message := fmt.Sprintf(`the sidecar %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, sidecar.Name, tr.Name, sidecar.ImageID, err)
-return true, v1.TaskRunReasonImagePullFailed, message
-}
-for _, condition := range p.Status.Conditions {
-// check the pod condition to get the time when the pod was ready to start containers / initialized.
-// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
-if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
-if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
-return false, "", ""
-}
-}
-}
-}
-}
-image := sidecar.ImageID
-message := fmt.Sprintf(`the sidecar %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, sidecar.Name, tr.Name, image, sidecar.Waiting.Message)
-return true, v1.TaskRunReasonImagePullFailed, message
-}
-}
+if sidecar.Waiting == nil {
+continue
+}
+
+if _, found := podFailureReasons[sidecar.Waiting.Reason]; !found {
+continue
+}
+
+failed, reason, message := c.checkContainerFailure(
+ctx,
+tr,
+sidecar.Waiting,
+sidecar.Name,
+sidecar.ImageID,
+"sidecar",
+)
+if failed {
+return true, reason, message
+}
}

return false, "", ""
}

+func (c *Reconciler) checkContainerFailure(
+ctx context.Context,
+tr *v1.TaskRun,
+waiting *corev1.ContainerStateWaiting,
+name,
+imageID,
+containerType string,
+) (bool, v1.TaskRunReason, string) {
+if waiting.Reason == ImagePullBackOff {
+imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
+// only attempt to recover from the imagePullBackOff if specified
+if imagePullBackOffTimeOut.Seconds() != 0 {
+p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
+if err != nil {
+message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. Failed to get pod with error: "%s."`, containerType, name, tr.Name, imageID, err)
+return true, v1.TaskRunReasonImagePullFailed, message
+}
+imagePullBackOffTimeoutPodConditions := []string{string(corev1.PodInitialized), "PodReadyToStartContainers"}
+for _, condition := range p.Status.Conditions {
+// check the pod condition to get the time when the pod was ready to start containers / initialized.
+// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
+if slices.Contains(imagePullBackOffTimeoutPodConditions, string(condition.Type)) {
+if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
+return false, "", ""
+}
+}
+}
+}
+// ImagePullBackOff timeout exceeded or not configured
+message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, containerType, name, tr.Name, imageID, waiting.Message)
+return true, v1.TaskRunReasonImagePullFailed, message
+}
+
+// Handle CreateContainerConfigError (missing ConfigMap/Secret, invalid env vars, etc.)
+if waiting.Reason == CreateContainerConfigError {
+message := fmt.Sprintf(`the %s %q in TaskRun %q failed to start. The pod errored with the message: "%s."`, containerType, name, tr.Name, waiting.Message)
+return true, v1.TaskRunReasonCreateContainerConfigError, message
+}
+
+// Handle InvalidImageName (unrecoverable error)
+if waiting.Reason == InvalidImageName {
+message := fmt.Sprintf(`the %s %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, containerType, name, tr.Name, imageID, waiting.Message)
+return true, v1.TaskRunReasonImagePullFailed, message
+}
+
+// Handle CreateContainerError and other generic failures
+message := fmt.Sprintf(`the %s %q in TaskRun %q failed to start. The pod errored with the message: "%s."`, containerType, name, tr.Name, waiting.Message)
+return true, v1.TaskRunReasonPodCreationFailed, message
+}

func (c *Reconciler) durationAndCountMetrics(ctx context.Context, tr *v1.TaskRun, beforeCondition *apis.Condition) {
130 changes: 128 additions & 2 deletions pkg/reconciler/taskrun/taskrun_test.go
@@ -2846,8 +2846,8 @@ status:
}
// the error message includes the error if the pod is not found
if tc.podNotFound {
-expectedStatus.Message = fmt.Sprintf(`the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s."`, tc.failure, stepNumber, tc.message)
-wantEvents[1] = fmt.Sprintf(`Warning Failed the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s.`, tc.failure, stepNumber, tc.message)
+expectedStatus.Message = fmt.Sprintf(`the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever". Failed to get pod with error: "%s."`, tc.failure, stepNumber, tc.message)
+wantEvents[1] = fmt.Sprintf(`Warning Failed the %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever". Failed to get pod with error: "%s.`, tc.failure, stepNumber, tc.message)
}
condition := newTr.Status.GetCondition(apis.ConditionSucceeded)
if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" {
@@ -2862,6 +2862,132 @@ status:
}
}

func TestReconcileContainerFailures(t *testing.T) {
testCases := []struct {
name string
reason string
message string
containerType string
expectedReason v1.TaskRunReason
}{{
name: "CreateContainerConfigError for step - missing configmap",
reason: "CreateContainerConfigError",
message: "configmap \"config-for-testing\" not found",
containerType: "step",
expectedReason: "CreateContainerConfigError",
}, {
name: "CreateContainerConfigError for sidecar - missing secret",
reason: "CreateContainerConfigError",
message: "secret \"secret-for-testing\" not found",
containerType: "sidecar",
expectedReason: "CreateContainerConfigError",
}, {
name: "CreateContainerError for step",
reason: "CreateContainerError",
message: "failed to create container",
containerType: "step",
expectedReason: "PodCreationFailed",
}, {
name: "CreateContainerError for sidecar",
reason: "CreateContainerError",
message: "failed to create container",
containerType: "sidecar",
expectedReason: "PodCreationFailed",
}, {
name: "InvalidImageName for step",
reason: "InvalidImageName",
message: "invalid image reference",
containerType: "step",
expectedReason: "TaskRunImagePullFailed",
}, {
name: "InvalidImageName for sidecar",
reason: "InvalidImageName",
message: "invalid image reference",
containerType: "sidecar",
expectedReason: "TaskRunImagePullFailed",
}}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
taskRun := parse.MustParseV1TaskRun(t, `
metadata:
name: test-container-failure
namespace: foo
spec:
taskSpec:
sidecars:
- image: busybox
steps:
- image: alpine
status:
podName: "test-pod"
sidecars:
- container: sidecar-busybox
name: busybox
imageID: docker.io/library/busybox:latest
steps:
- container: step-alpine
name: alpine
imageID: docker.io/library/alpine:latest
`)

// Set the waiting state based on container type
if tc.containerType == "step" {
taskRun.Status.Steps[0].Waiting = &corev1.ContainerStateWaiting{
Reason: tc.reason,
Message: tc.message,
}
} else {
taskRun.Status.Sidecars[0].Waiting = &corev1.ContainerStateWaiting{
Reason: tc.reason,
Message: tc.message,
}
}

d := test.Data{
TaskRuns: []*v1.TaskRun{taskRun},
}

testAssets, cancel := getTaskRunController(t, d)
defer cancel()

// Reconcile the TaskRun
if err := testAssets.Controller.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil {
t.Fatalf("Unexpected error reconciling TaskRun: %v", err)
}

// Verify the TaskRun failed with the expected reason
reconciledTr, err := testAssets.Clients.Pipeline.TektonV1().TaskRuns(taskRun.Namespace).Get(
testAssets.Ctx, taskRun.Name, metav1.GetOptions{},
)
if err != nil {
t.Fatalf("Failed to get reconciled TaskRun: %v", err)
}

condition := reconciledTr.Status.GetCondition(apis.ConditionSucceeded)
if condition == nil {
t.Fatal("TaskRun should have a Succeeded condition")
}

if condition.Status != corev1.ConditionFalse {
t.Errorf("Expected TaskRun to fail, but status is: %v", condition.Status)
}

if condition.Reason != string(tc.expectedReason) {
t.Errorf("Expected reason %q, got %q", tc.expectedReason, condition.Reason)
}

if !strings.Contains(condition.Message, tc.message) {
t.Errorf("Expected message to contain %q, got: %q", tc.message, condition.Message)
}

if !strings.Contains(condition.Message, tc.containerType) {
t.Errorf("Expected message to mention container type %q, got: %q", tc.containerType, condition.Message)
}
})
}
}

func TestReconcileWithTimeoutDisabled(t *testing.T) {
type testCase struct {
name string