Skip to content

Commit d8f7b93

Browse files
pramodbindal authored and tekton-robot committed
fix: preserve Pods for timed-out TaskRuns when 'keep-pod-on-cancel' is true
When a TaskRun times out, the default behaviour is to delete the corresponding Pod, which leaves no way to debug the Pod later. With this fix, the Pod is not deleted when the TaskRun times out and the feature flag 'keep-pod-on-cancel' is set to true. This is the same behaviour as for cancelled TaskRuns. Signed-off-by: Pramod Bindal <prbindal@redhat.com> Signed-off-by: Pramod Bindal <prbindal@redhat.com>
1 parent 8c81ef8 commit d8f7b93

File tree

2 files changed

+198
-2
lines changed

2 files changed

+198
-2
lines changed

pkg/reconciler/taskrun/taskrun.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -807,8 +807,8 @@ func (c *Reconciler) failTaskRun(ctx context.Context, tr *v1.TaskRun, reason v1.
807807
terminateStepsInPod(tr, reason)
808808

809809
var err error
810-
if reason == v1.TaskRunReasonCancelled && (config.FromContextOrDefaults(ctx).FeatureFlags.EnableKeepPodOnCancel) {
811-
logger.Infof("Canceling task run %q by entrypoint", tr.Name)
810+
if (reason == v1.TaskRunReasonCancelled || reason == v1.TaskRunReasonTimedOut) && (config.FromContextOrDefaults(ctx).FeatureFlags.EnableKeepPodOnCancel) {
811+
logger.Infof("Canceling task run %q by entrypoint, Reason: %s", tr.Name, reason)
812812
err = podconvert.CancelPod(ctx, c.KubeClientSet, tr.Namespace, tr.Status.PodName)
813813
} else {
814814
err = c.KubeClientSet.CoreV1().Pods(tr.Namespace).Delete(ctx, tr.Status.PodName, metav1.DeleteOptions{})
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
/*
2+
Copyright 2025 The Tekton Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package taskrun
18+
19+
import (
20+
"testing"
21+
22+
"github.com/google/go-cmp/cmp"
23+
"github.com/tektoncd/pipeline/pkg/apis/config"
24+
v1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
25+
"github.com/tektoncd/pipeline/pkg/reconciler/volumeclaim"
26+
"go.opentelemetry.io/otel/trace"
27+
28+
_ "github.com/tektoncd/pipeline/pkg/taskrunmetrics/fake"
29+
"github.com/tektoncd/pipeline/test"
30+
"github.com/tektoncd/pipeline/test/diff"
31+
"github.com/tektoncd/pipeline/test/parse"
32+
33+
corev1 "k8s.io/api/core/v1"
34+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
35+
"knative.dev/pkg/apis"
36+
"knative.dev/pkg/system"
37+
_ "knative.dev/pkg/system/testing" // Setup system.Namespace()
38+
)
39+
40+
func TestFailTaskRun_Timeout(t *testing.T) {
41+
testCases := []struct {
42+
name string
43+
taskRun *v1.TaskRun
44+
pod *corev1.Pod
45+
reason v1.TaskRunReason
46+
message string
47+
featureFlags map[string]string
48+
expectedStatus apis.Condition
49+
expectedPods []corev1.Pod
50+
}{
51+
{
52+
name: "taskrun-timeout-keep-pod-on-cancel",
53+
taskRun: parse.MustParseV1TaskRun(t, `
54+
metadata:
55+
name: test-taskrun-run-timeout
56+
namespace: foo
57+
spec:
58+
taskRef:
59+
name: test-task
60+
timeout: 10s
61+
status:
62+
startTime: "2000-01-01T01:01:01Z"
63+
conditions:
64+
- status: Unknown
65+
type: Succeeded
66+
podName: foo-is-bar
67+
steps:
68+
- running:
69+
startedAt: "2022-01-01T00:00:00Z"
70+
`),
71+
pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
72+
Namespace: "foo",
73+
Name: "foo-is-bar",
74+
Annotations: map[string]string{
75+
"test": "test value",
76+
},
77+
}},
78+
featureFlags: map[string]string{
79+
config.KeepPodOnCancel: "true",
80+
},
81+
reason: v1.TaskRunReasonTimedOut,
82+
message: "TaskRun test-taskrun-run-timeout failed to finish within 10s",
83+
expectedStatus: apis.Condition{
84+
Type: apis.ConditionSucceeded,
85+
Status: corev1.ConditionFalse,
86+
Reason: v1.TaskRunReasonTimedOut.String(),
87+
Message: "TaskRun test-taskrun-run-timeout failed to finish within 10s",
88+
},
89+
expectedPods: []corev1.Pod{{ObjectMeta: metav1.ObjectMeta{
90+
Namespace: "foo",
91+
Name: "foo-is-bar",
92+
Annotations: map[string]string{
93+
"test": "test value",
94+
"tekton.dev/cancel": "CANCEL",
95+
},
96+
}}},
97+
},
98+
{
99+
name: "taskrun-timeout-keep-pod-on-cancel-false",
100+
taskRun: parse.MustParseV1TaskRun(t, `
101+
metadata:
102+
name: test-taskrun-run-timeout
103+
namespace: foo
104+
spec:
105+
taskRef:
106+
name: test-task
107+
timeout: 10s
108+
status:
109+
startTime: "2000-01-01T01:01:01Z"
110+
conditions:
111+
- status: Unknown
112+
type: Succeeded
113+
podName: foo-is-bar
114+
steps:
115+
- running:
116+
startedAt: "2022-01-01T00:00:00Z"
117+
`),
118+
pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
119+
Namespace: "foo",
120+
Name: "foo-is-bar",
121+
Annotations: map[string]string{
122+
"test": "test value",
123+
},
124+
}},
125+
featureFlags: map[string]string{
126+
config.KeepPodOnCancel: "false",
127+
},
128+
reason: v1.TaskRunReasonTimedOut,
129+
message: "TaskRun test-taskrun-run-timeout failed to finish within 10s",
130+
expectedStatus: apis.Condition{
131+
Type: apis.ConditionSucceeded,
132+
Status: corev1.ConditionFalse,
133+
Reason: v1.TaskRunReasonTimedOut.String(),
134+
Message: "TaskRun test-taskrun-run-timeout failed to finish within 10s",
135+
},
136+
}}
137+
138+
for _, tc := range testCases {
139+
t.Run(tc.name, func(t *testing.T) {
140+
d := test.Data{
141+
TaskRuns: []*v1.TaskRun{tc.taskRun},
142+
ConfigMaps: []*corev1.ConfigMap{
143+
{
144+
ObjectMeta: metav1.ObjectMeta{
145+
Name: config.GetFeatureFlagsConfigName(),
146+
Namespace: system.Namespace(),
147+
},
148+
Data: tc.featureFlags,
149+
},
150+
},
151+
Pods: []*corev1.Pod{
152+
tc.pod,
153+
},
154+
}
155+
156+
testAssets, cancel := getTaskRunController(t, d)
157+
defer cancel()
158+
159+
c := &Reconciler{
160+
KubeClientSet: testAssets.Clients.Kube,
161+
PipelineClientSet: testAssets.Clients.Pipeline,
162+
Clock: testClock,
163+
taskRunLister: testAssets.Informers.TaskRun.Lister(),
164+
limitrangeLister: testAssets.Informers.LimitRange.Lister(),
165+
cloudEventClient: testAssets.Clients.CloudEvents,
166+
metrics: nil, // Not used
167+
entrypointCache: nil, // Not used
168+
pvcHandler: volumeclaim.NewPVCHandler(testAssets.Clients.Kube, testAssets.Logger),
169+
tracerProvider: trace.NewNoopTracerProvider(),
170+
}
171+
ctx := testAssets.Ctx
172+
173+
ff, _ := config.NewFeatureFlagsFromMap(tc.featureFlags)
174+
175+
ctx = config.ToContext(ctx, &config.Config{
176+
FeatureFlags: ff,
177+
})
178+
179+
if err := c.failTaskRun(ctx, tc.taskRun, tc.reason, tc.message); err != nil {
180+
t.Errorf("fail timeout test: %v", err)
181+
}
182+
183+
if d := cmp.Diff(&tc.expectedStatus, tc.taskRun.Status.GetCondition(apis.ConditionSucceeded), ignoreLastTransitionTime); d != "" {
184+
t.Fatal(diff.PrintWantGot(d))
185+
}
186+
187+
pods, err := c.KubeClientSet.CoreV1().Pods(tc.pod.Namespace).List(ctx, metav1.ListOptions{})
188+
if err != nil {
189+
t.Fatal("Error while fetching pod: "+tc.pod.Name, err.Error())
190+
}
191+
if d := cmp.Diff(tc.expectedPods, pods.Items); d != "" {
192+
t.Fatal(diff.PrintWantGot(d))
193+
}
194+
})
195+
}
196+
}

0 commit comments

Comments
 (0)