diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 27c8f852f79..5e37114d386 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -184,6 +184,11 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon // Check if the TaskRun has timed out; if it is, this will set its status // accordingly. if tr.HasTimedOut(ctx, c.Clock) { + // Before failing the TaskRun, ensure step statuses are populated from the pod + // This prevents a race condition where the timeout occurs before pod status is fetched + if err := c.updateStepStatusesFromPod(ctx, tr); err != nil { + logger.Warnf("Failed to update step statuses from pod before timeout: %v", err) + } message := fmt.Sprintf("TaskRun %q failed to finish within %q", tr.Name, tr.GetTimeout(ctx)) err := c.failTaskRun(ctx, tr, v1.TaskRunReasonTimedOut, message) return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err) @@ -821,6 +826,37 @@ func (c *Reconciler) failTaskRun(ctx context.Context, tr *v1.TaskRun, reason v1. return nil } +// updateStepStatusesFromPod fetches the pod and updates step statuses in the TaskRun +// This is called before failing a TaskRun to ensure step statuses are populated +func (c *Reconciler) updateStepStatusesFromPod(ctx context.Context, tr *v1.TaskRun) error { + logger := logging.FromContext(ctx) + + // If there's no pod yet, nothing to update + if tr.Status.PodName == "" { + return nil + } + + // Fetch the pod + pod, err := c.podLister.Pods(tr.Namespace).Get(tr.Status.PodName) + if k8serrors.IsNotFound(err) { + // Pod doesn't exist yet, nothing to update + return nil + } else if err != nil { + return err + } + + // Update step statuses from pod using the existing MakeTaskRunStatus function + // This ensures consistency with the normal reconciliation path + status, err := podconvert.MakeTaskRunStatus(ctx, logger, *tr, pod, c.KubeClientSet, tr.Status.TaskSpec) + if err != nil { + return err + } + + // Only update the Steps field to avoid overwriting other status fields + tr.Status.Steps = status.Steps + return nil +} + // terminateStepsInPod updates step states for TaskRun on TaskRun object since pod has been deleted for cancel or timeout func terminateStepsInPod(tr *v1.TaskRun, taskRunReason v1.TaskRunReason) { for i, step := range tr.Status.Steps { diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index 5d8c44f930f..69e60638ebf 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -1345,6 +1345,9 @@ spec: name: test-task status: podName: the-pod + taskSpec: + steps: + - image: foo `) taskRun.Status.StartTime = &metav1.Time{Time: startTime} d := test.Data{ diff --git a/pkg/reconciler/taskrun/taskrun_timeout_test.go b/pkg/reconciler/taskrun/taskrun_timeout_test.go index 3d1a8c7fb47..712979a63d7 100644 --- a/pkg/reconciler/taskrun/taskrun_timeout_test.go +++ b/pkg/reconciler/taskrun/taskrun_timeout_test.go @@ -18,6 +18,7 @@ package taskrun import ( "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/tektoncd/pipeline/pkg/apis/config" @@ -37,6 +38,125 @@ import ( _ "knative.dev/pkg/system/testing" // Setup system.Namespace() ) +// TestReconcileTimeoutWithEmptyStepStatus tests that when a TaskRun times out +// and tr.Status.Steps is empty (race condition), the reconciler populates +// step statuses from the pod before terminating them. +func TestReconcileTimeoutWithEmptyStepStatus(t *testing.T) { + task := parse.MustParseV1Task(t, ` +metadata: + name: test-task + namespace: foo +spec: + steps: + - name: step1 + image: busybox + - name: step2 + image: busybox +`) + + taskRun := parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-timeout-empty-steps + namespace: foo +spec: + taskRef: + name: test-task + timeout: 1s +status: + conditions: + - status: Unknown + type: Succeeded + startTime: "2021-12-31T23:59:58Z" + podName: test-taskrun-timeout-empty-steps-pod +`) + // In a real scenario, TaskSpec would have been set during pod creation in a previous reconcile + taskRun.Status.TaskSpec = &task.Spec + + // Create a pod with running step containers + // This simulates a pod that exists but hasn't had its status synced to the TaskRun yet + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "foo", + Name: "test-taskrun-timeout-empty-steps-pod", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "step-step1", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Time{Time: testClock.Now().Add(-30 * time.Second)}, + }, + }, + }, + { + Name: "step-step2", + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "PodInitializing", + }, + }, + }, + }, + }, + } + + d := test.Data{ + TaskRuns: []*v1.TaskRun{taskRun}, + Tasks: []*v1.Task{task}, + Pods: []*corev1.Pod{pod}, + } + + testAssets, cancel := getTaskRunController(t, d) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + + if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil { + t.Fatalf("Unexpected error when reconciling timed out TaskRun: %v", err) + } + + reconciledRun, err := clients.Pipeline.TektonV1().TaskRuns("foo").Get(testAssets.Ctx, "test-taskrun-timeout-empty-steps", metav1.GetOptions{}) + if err != nil { + t.Fatalf("Error getting reconciled TaskRun: %v", err) + } + + // Verify the TaskRun is marked as timed out + condition := reconciledRun.Status.GetCondition(apis.ConditionSucceeded) + if condition == nil || condition.Status != corev1.ConditionFalse { + t.Errorf("Expected TaskRun to be failed, got condition: %v", condition) + } + if condition != nil && condition.Reason != v1.TaskRunReasonTimedOut.String() { + t.Errorf("Expected TaskRun reason to be %s, got %s", v1.TaskRunReasonTimedOut.String(), condition.Reason) + } + + // Verify that step statuses are populated and terminated + if len(reconciledRun.Status.Steps) == 0 { + t.Fatal("Expected step statuses to be populated, but got empty Steps array") + } + + if len(reconciledRun.Status.Steps) != 2 { + t.Fatalf("Expected 2 steps, got %d", len(reconciledRun.Status.Steps)) + } + + // Verify all steps are terminated with TimedOut reason + for i, step := range reconciledRun.Status.Steps { + if step.Terminated == nil { + t.Errorf("Step %d (%s) should have Terminated status, but it's nil", i, step.Name) + continue + } + if step.Terminated.Reason != v1.TaskRunReasonTimedOut.String() { + t.Errorf("Step %d (%s) should have reason %s, got %s", + i, step.Name, v1.TaskRunReasonTimedOut.String(), step.Terminated.Reason) + } + if step.TerminationReason != v1.TaskRunReasonTimedOut.String() { + t.Errorf("Step %d (%s) should have TerminationReason %s, got %s", + i, step.Name, v1.TaskRunReasonTimedOut.String(), step.TerminationReason) + } + } +} + func TestFailTaskRun_Timeout(t *testing.T) { testCases := []struct { name string