Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
// Check if the TaskRun has timed out; if it is, this will set its status
// accordingly.
if tr.HasTimedOut(ctx, c.Clock) {
// Before failing the TaskRun, ensure step statuses are populated from the pod
// This prevents a race condition where the timeout occurs before pod status is fetched
if err := c.updateStepStatusesFromPod(ctx, tr); err != nil {
logger.Warnf("Failed to update step statuses from pod before timeout: %v", err)
}
message := fmt.Sprintf("TaskRun %q failed to finish within %q", tr.Name, tr.GetTimeout(ctx))
err := c.failTaskRun(ctx, tr, v1.TaskRunReasonTimedOut, message)
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
Expand Down Expand Up @@ -821,6 +826,37 @@ func (c *Reconciler) failTaskRun(ctx context.Context, tr *v1.TaskRun, reason v1.
return nil
}

// updateStepStatusesFromPod fetches the pod and updates step statuses in the TaskRun
// This is called before failing a TaskRun to ensure step statuses are populated
func (c *Reconciler) updateStepStatusesFromPod(ctx context.Context, tr *v1.TaskRun) error {
logger := logging.FromContext(ctx)

// If there's no pod yet, nothing to update
if tr.Status.PodName == "" {
return nil
}

// Fetch the pod
pod, err := c.podLister.Pods(tr.Namespace).Get(tr.Status.PodName)
if k8serrors.IsNotFound(err) {
// Pod doesn't exist yet, nothing to update
return nil
} else if err != nil {
return err
}

// Update step statuses from pod using the existing MakeTaskRunStatus function
// This ensures consistency with the normal reconciliation path
status, err := podconvert.MakeTaskRunStatus(ctx, logger, *tr, pod, c.KubeClientSet, tr.Status.TaskSpec)
if err != nil {
return err
}

// Only update the Steps field to avoid overwriting other status fields
tr.Status.Steps = status.Steps
return nil
}

// terminateStepsInPod updates step states for TaskRun on TaskRun object since pod has been deleted for cancel or timeout
func terminateStepsInPod(tr *v1.TaskRun, taskRunReason v1.TaskRunReason) {
for i, step := range tr.Status.Steps {
Expand Down
3 changes: 3 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,9 @@ spec:
name: test-task
status:
podName: the-pod
taskSpec:
steps:
- image: foo
`)
taskRun.Status.StartTime = &metav1.Time{Time: startTime}
d := test.Data{
Expand Down
120 changes: 120 additions & 0 deletions pkg/reconciler/taskrun/taskrun_timeout_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package taskrun

import (
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/tektoncd/pipeline/pkg/apis/config"
Expand All @@ -37,6 +38,125 @@ import (
_ "knative.dev/pkg/system/testing" // Setup system.Namespace()
)

// TestReconcileTimeoutWithEmptyStepStatus tests that when a TaskRun times out
// and tr.Status.Steps is empty (race condition), the reconciler populates
// step statuses from the pod before terminating them.
func TestReconcileTimeoutWithEmptyStepStatus(t *testing.T) {
task := parse.MustParseV1Task(t, `
metadata:
name: test-task
namespace: foo
spec:
steps:
- name: step1
image: busybox
- name: step2
image: busybox
`)

taskRun := parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-timeout-empty-steps
namespace: foo
spec:
taskRef:
name: test-task
timeout: 1s
status:
conditions:
- status: Unknown
type: Succeeded
startTime: "2021-12-31T23:59:58Z"
podName: test-taskrun-timeout-empty-steps-pod
`)
// In a real scenario, TaskSpec would have been set during pod creation in a previous reconcile
taskRun.Status.TaskSpec = &task.Spec

// Create a pod with running step containers
// This simulates a pod that exists but hasn't had its status synced to the TaskRun yet
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Namespace: "foo",
Name: "test-taskrun-timeout-empty-steps-pod",
},
Status: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "step-step1",
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Time{Time: testClock.Now().Add(-30 * time.Second)},
},
},
},
{
Name: "step-step2",
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "PodInitializing",
},
},
},
},
},
}

d := test.Data{
TaskRuns: []*v1.TaskRun{taskRun},
Tasks: []*v1.Task{task},
Pods: []*corev1.Pod{pod},
}

testAssets, cancel := getTaskRunController(t, d)
defer cancel()
c := testAssets.Controller
clients := testAssets.Clients

if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil {
t.Fatalf("Unexpected error when reconciling timed out TaskRun: %v", err)
}

reconciledRun, err := clients.Pipeline.TektonV1().TaskRuns("foo").Get(testAssets.Ctx, "test-taskrun-timeout-empty-steps", metav1.GetOptions{})
if err != nil {
t.Fatalf("Error getting reconciled TaskRun: %v", err)
}

// Verify the TaskRun is marked as timed out
condition := reconciledRun.Status.GetCondition(apis.ConditionSucceeded)
if condition == nil || condition.Status != corev1.ConditionFalse {
t.Errorf("Expected TaskRun to be failed, got condition: %v", condition)
}
if condition != nil && condition.Reason != v1.TaskRunReasonTimedOut.String() {
t.Errorf("Expected TaskRun reason to be %s, got %s", v1.TaskRunReasonTimedOut.String(), condition.Reason)
}

// Verify that step statuses are populated and terminated
if len(reconciledRun.Status.Steps) == 0 {
t.Fatal("Expected step statuses to be populated, but got empty Steps array")
}

if len(reconciledRun.Status.Steps) != 2 {
t.Fatalf("Expected 2 steps, got %d", len(reconciledRun.Status.Steps))
}

// Verify all steps are terminated with TimedOut reason
for i, step := range reconciledRun.Status.Steps {
if step.Terminated == nil {
t.Errorf("Step %d (%s) should have Terminated status, but it's nil", i, step.Name)
continue
}
if step.Terminated.Reason != v1.TaskRunReasonTimedOut.String() {
t.Errorf("Step %d (%s) should have reason %s, got %s",
i, step.Name, v1.TaskRunReasonTimedOut.String(), step.Terminated.Reason)
}
if step.TerminationReason != v1.TaskRunReasonTimedOut.String() {
t.Errorf("Step %d (%s) should have TerminationReason %s, got %s",
i, step.Name, v1.TaskRunReasonTimedOut.String(), step.TerminationReason)
}
}
}

func TestFailTaskRun_Timeout(t *testing.T) {
testCases := []struct {
name string
Expand Down
Loading