Skip to content

Commit 9732652

Browse files
committed
fix: Populate step statuses before TaskRun timeout handling
This prevent some race condition where timeout fires before pod status fetch. TestTaskRunTimeout validates that steps are terminated, but it can be not populated from time to time (in my test 4/5 times out of 25). Signed-off-by: Vincent Demeester <vdemeest@redhat.com>
1 parent b1f8f62 commit 9732652

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

pkg/reconciler/taskrun/taskrun.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,11 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
184184
// Check if the TaskRun has timed out; if it is, this will set its status
185185
// accordingly.
186186
if tr.HasTimedOut(ctx, c.Clock) {
187+
// Before failing the TaskRun, ensure step statuses are populated from the pod
188+
// This prevents a race condition where the timeout occurs before pod status is fetched
189+
if err := c.updateStepStatusesFromPod(ctx, tr); err != nil {
190+
logger.Warnf("Failed to update step statuses from pod before timeout: %v", err)
191+
}
187192
message := fmt.Sprintf("TaskRun %q failed to finish within %q", tr.Name, tr.GetTimeout(ctx))
188193
err := c.failTaskRun(ctx, tr, v1.TaskRunReasonTimedOut, message)
189194
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
@@ -821,6 +826,37 @@ func (c *Reconciler) failTaskRun(ctx context.Context, tr *v1.TaskRun, reason v1.
821826
return nil
822827
}
823828

829+
// updateStepStatusesFromPod fetches the pod and updates step statuses in the TaskRun
830+
// This is called before failing a TaskRun to ensure step statuses are populated
831+
func (c *Reconciler) updateStepStatusesFromPod(ctx context.Context, tr *v1.TaskRun) error {
832+
logger := logging.FromContext(ctx)
833+
834+
// If there's no pod yet, nothing to update
835+
if tr.Status.PodName == "" {
836+
return nil
837+
}
838+
839+
// Fetch the pod
840+
pod, err := c.podLister.Pods(tr.Namespace).Get(tr.Status.PodName)
841+
if k8serrors.IsNotFound(err) {
842+
// Pod doesn't exist yet, nothing to update
843+
return nil
844+
} else if err != nil {
845+
return err
846+
}
847+
848+
// Update step statuses from pod using the existing MakeTaskRunStatus function
849+
// This ensures consistency with the normal reconciliation path
850+
status, err := podconvert.MakeTaskRunStatus(ctx, logger, *tr, pod, c.KubeClientSet, tr.Status.TaskSpec)
851+
if err != nil {
852+
return err
853+
}
854+
855+
// Only update the Steps field to avoid overwriting other status fields
856+
tr.Status.Steps = status.Steps
857+
return nil
858+
}
859+
824860
// terminateStepsInPod updates step states for TaskRun on TaskRun object since pod has been deleted for cancel or timeout
825861
func terminateStepsInPod(tr *v1.TaskRun, taskRunReason v1.TaskRunReason) {
826862
for i, step := range tr.Status.Steps {

pkg/reconciler/taskrun/taskrun_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,6 +1345,9 @@ spec:
13451345
name: test-task
13461346
status:
13471347
podName: the-pod
1348+
taskSpec:
1349+
steps:
1350+
- image: foo
13481351
`)
13491352
taskRun.Status.StartTime = &metav1.Time{Time: startTime}
13501353
d := test.Data{

pkg/reconciler/taskrun/taskrun_timeout_test.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package taskrun
1818

1919
import (
2020
"testing"
21+
"time"
2122

2223
"github.com/google/go-cmp/cmp"
2324
"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -37,6 +38,125 @@ import (
3738
_ "knative.dev/pkg/system/testing" // Setup system.Namespace()
3839
)
3940

41+
// TestReconcileTimeoutWithEmptyStepStatus tests that when a TaskRun times out
42+
// and tr.Status.Steps is empty (race condition), the reconciler populates
43+
// step statuses from the pod before terminating them.
44+
func TestReconcileTimeoutWithEmptyStepStatus(t *testing.T) {
45+
task := parse.MustParseV1Task(t, `
46+
metadata:
47+
name: test-task
48+
namespace: foo
49+
spec:
50+
steps:
51+
- name: step1
52+
image: busybox
53+
- name: step2
54+
image: busybox
55+
`)
56+
57+
taskRun := parse.MustParseV1TaskRun(t, `
58+
metadata:
59+
name: test-taskrun-timeout-empty-steps
60+
namespace: foo
61+
spec:
62+
taskRef:
63+
name: test-task
64+
timeout: 1s
65+
status:
66+
conditions:
67+
- status: Unknown
68+
type: Succeeded
69+
startTime: "2021-12-31T23:59:58Z"
70+
podName: test-taskrun-timeout-empty-steps-pod
71+
`)
72+
// In a real scenario, TaskSpec would have been set during pod creation in a previous reconcile
73+
taskRun.Status.TaskSpec = &task.Spec
74+
75+
// Create a pod with running step containers
76+
// This simulates a pod that exists but hasn't had its status synced to the TaskRun yet
77+
pod := &corev1.Pod{
78+
ObjectMeta: metav1.ObjectMeta{
79+
Namespace: "foo",
80+
Name: "test-taskrun-timeout-empty-steps-pod",
81+
},
82+
Status: corev1.PodStatus{
83+
Phase: corev1.PodRunning,
84+
ContainerStatuses: []corev1.ContainerStatus{
85+
{
86+
Name: "step-step1",
87+
State: corev1.ContainerState{
88+
Running: &corev1.ContainerStateRunning{
89+
StartedAt: metav1.Time{Time: testClock.Now().Add(-30 * time.Second)},
90+
},
91+
},
92+
},
93+
{
94+
Name: "step-step2",
95+
State: corev1.ContainerState{
96+
Waiting: &corev1.ContainerStateWaiting{
97+
Reason: "PodInitializing",
98+
},
99+
},
100+
},
101+
},
102+
},
103+
}
104+
105+
d := test.Data{
106+
TaskRuns: []*v1.TaskRun{taskRun},
107+
Tasks: []*v1.Task{task},
108+
Pods: []*corev1.Pod{pod},
109+
}
110+
111+
testAssets, cancel := getTaskRunController(t, d)
112+
defer cancel()
113+
c := testAssets.Controller
114+
clients := testAssets.Clients
115+
116+
if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil {
117+
t.Fatalf("Unexpected error when reconciling timed out TaskRun: %v", err)
118+
}
119+
120+
reconciledRun, err := clients.Pipeline.TektonV1().TaskRuns("foo").Get(testAssets.Ctx, "test-taskrun-timeout-empty-steps", metav1.GetOptions{})
121+
if err != nil {
122+
t.Fatalf("Error getting reconciled TaskRun: %v", err)
123+
}
124+
125+
// Verify the TaskRun is marked as timed out
126+
condition := reconciledRun.Status.GetCondition(apis.ConditionSucceeded)
127+
if condition == nil || condition.Status != corev1.ConditionFalse {
128+
t.Errorf("Expected TaskRun to be failed, got condition: %v", condition)
129+
}
130+
if condition != nil && condition.Reason != v1.TaskRunReasonTimedOut.String() {
131+
t.Errorf("Expected TaskRun reason to be %s, got %s", v1.TaskRunReasonTimedOut.String(), condition.Reason)
132+
}
133+
134+
// Verify that step statuses are populated and terminated
135+
if len(reconciledRun.Status.Steps) == 0 {
136+
t.Fatal("Expected step statuses to be populated, but got empty Steps array")
137+
}
138+
139+
if len(reconciledRun.Status.Steps) != 2 {
140+
t.Fatalf("Expected 2 steps, got %d", len(reconciledRun.Status.Steps))
141+
}
142+
143+
// Verify all steps are terminated with TimedOut reason
144+
for i, step := range reconciledRun.Status.Steps {
145+
if step.Terminated == nil {
146+
t.Errorf("Step %d (%s) should have Terminated status, but it's nil", i, step.Name)
147+
continue
148+
}
149+
if step.Terminated.Reason != v1.TaskRunReasonTimedOut.String() {
150+
t.Errorf("Step %d (%s) should have reason %s, got %s",
151+
i, step.Name, v1.TaskRunReasonTimedOut.String(), step.Terminated.Reason)
152+
}
153+
if step.TerminationReason != v1.TaskRunReasonTimedOut.String() {
154+
t.Errorf("Step %d (%s) should have TerminationReason %s, got %s",
155+
i, step.Name, v1.TaskRunReasonTimedOut.String(), step.TerminationReason)
156+
}
157+
}
158+
}
159+
40160
func TestFailTaskRun_Timeout(t *testing.T) {
41161
testCases := []struct {
42162
name string

0 commit comments

Comments
 (0)