Skip to content

Commit 2aa3e22

Browse files
committed
adding efficient polling to waitForStepsToFinish
The current waitForStepsToFinish implementation is a classic busy-wait. It checks for file existence without any sleep, resulting in a high CPU usage. Adding a profile with a unit test to show that almost all time is spent in system calls with a high total sample count. This led to execssive CPU usage by the sidecar even when just waiting. The function now sleeps 100ms between checks, drastically reducing the frequency. The sidecar now uses minimal CPU while waiting. Signed-off-by: Priti Desai <[email protected]>
1 parent d504890 commit 2aa3e22

File tree

8 files changed

+185
-5
lines changed

8 files changed

+185
-5
lines changed

config/config-defaults.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,8 @@ data:
149149
# limits:
150150
# memory: "256Mi"
151151
# cpu: "500m"
152+
153+
# default-sidecar-log-polling-interval specifies the polling interval for the Tekton sidecar log results container.
154+
# This controls how frequently the sidecar checks for step completion files.
155+
# Example values: "100ms", "500ms", "1s"
156+
default-sidecar-log-polling-interval: "100ms"

internal/sidecarlogresults/sidecarlogresults.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"os"
2727
"path/filepath"
2828
"strings"
29+
"time"
2930

3031
"github.com/tektoncd/pipeline/pkg/apis/config"
3132
"github.com/tektoncd/pipeline/pkg/apis/pipeline"
@@ -74,7 +75,7 @@ func encode(w io.Writer, v any) error {
7475
return json.NewEncoder(w).Encode(v)
7576
}
7677

77-
func waitForStepsToFinish(runDir string) error {
78+
func waitForStepsToFinish(runDir string, sleepInterval time.Duration) error {
7879
steps := make(map[string]bool)
7980
files, err := os.ReadDir(runDir)
8081
if err != nil {
@@ -103,6 +104,9 @@ func waitForStepsToFinish(runDir string) error {
103104
return err
104105
}
105106
}
107+
if sleepInterval > 0 {
108+
time.Sleep(sleepInterval)
109+
}
106110
}
107111
return nil
108112
}
@@ -143,7 +147,15 @@ func readResults(resultsDir, resultFile, stepName string, resultType SidecarLogR
143147
// in their results path and prints them in a structured way to its
144148
// stdout so that the reconciler can parse those logs.
145149
func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames []string, stepResultsDir string, stepResults map[string][]string) error {
146-
if err := waitForStepsToFinish(runDir); err != nil {
150+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
151+
if intervalStr == "" {
152+
intervalStr = "100ms"
153+
}
154+
interval, err := time.ParseDuration(intervalStr)
155+
if err != nil {
156+
interval = 100 * time.Millisecond
157+
}
158+
if err := waitForStepsToFinish(runDir, interval); err != nil {
147159
return fmt.Errorf("error while waiting for the steps to finish %w", err)
148160
}
149161
results := make(chan SidecarLogResult)
@@ -205,7 +217,15 @@ func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames [
205217
// If the provenance file exists, the function extracts artifact information, formats it into a
206218
// JSON string, and encodes it for output alongside relevant metadata (step name, artifact type).
207219
func LookForArtifacts(w io.Writer, names []string, runDir string) error {
208-
if err := waitForStepsToFinish(runDir); err != nil {
220+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
221+
if intervalStr == "" {
222+
intervalStr = "100ms"
223+
}
224+
interval, err := time.ParseDuration(intervalStr)
225+
if err != nil {
226+
interval = 100 * time.Millisecond
227+
}
228+
if err := waitForStepsToFinish(runDir, interval); err != nil {
209229
return err
210230
}
211231

internal/sidecarlogresults/sidecarlogresults_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ import (
2323
"fmt"
2424
"os"
2525
"path/filepath"
26+
"runtime/pprof"
2627
"sort"
2728
"strings"
2829
"testing"
30+
"time"
2931

3032
"github.com/google/go-cmp/cmp"
3133
v1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
@@ -608,6 +610,66 @@ func TestExtractStepAndResultFromSidecarResultName_Error(t *testing.T) {
608610
}
609611
}
610612

613+
// TestWaitForStepsToFinish_Profile ensures that waitForStepsToFinish correctly waits for all step output files to appear before returning
614+
// The test creates a file called cpu.prof and starts Go's CPU profiler
615+
// A temporary directory is created to simulate the Tekton step run directory.
616+
// The test creates a large number of subdirectories e.g. step0, step1, ..., each representing a step in a TaskRun
617+
// A goroutine is started that, one by one, writes an out file in each step directory, with a small delay between each
618+
// The test calls the function and waits for it to complete and the profile is saved for later analysis
619+
// This is helpful to compare the impact of code changes, provides a reproducible way to profile and optimize the function waitForStepsToFinish
620+
func TestWaitForStepsToFinish_Profile(t *testing.T) {
621+
f, err := os.Create("cpu.prof")
622+
if err != nil {
623+
t.Fatalf("could not create CPU profile: %v", err)
624+
}
625+
defer func(f *os.File) {
626+
err := f.Close()
627+
if err != nil {
628+
return
629+
}
630+
}(f)
631+
err = pprof.StartCPUProfile(f)
632+
if err != nil {
633+
return
634+
}
635+
defer pprof.StopCPUProfile()
636+
637+
// Setup: create a temp runDir with many fake step files
638+
runDir := t.TempDir()
639+
stepCount := 100
640+
for i := range stepCount {
641+
dir := filepath.Join(runDir, fmt.Sprintf("step%d", i))
642+
err := os.MkdirAll(dir, 0755)
643+
if err != nil {
644+
return
645+
}
646+
}
647+
648+
// Simulate steps finishing one by one with a delay
649+
go func() {
650+
for i := range stepCount {
651+
file := filepath.Join(runDir, fmt.Sprintf("step%d", i), "out")
652+
err := os.WriteFile(file, []byte("done"), 0644)
653+
if err != nil {
654+
return
655+
}
656+
time.Sleep(10 * time.Millisecond)
657+
}
658+
}()
659+
660+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
661+
if intervalStr == "" {
662+
intervalStr = "100ms"
663+
}
664+
interval, err := time.ParseDuration(intervalStr)
665+
if err != nil {
666+
interval = 100 * time.Millisecond
667+
}
668+
if err := waitForStepsToFinish(runDir, interval); err != nil {
669+
t.Fatalf("waitForStepsToFinish failed: %v", err)
670+
}
671+
}
672+
611673
func TestLookForArtifacts(t *testing.T) {
612674
base := basicArtifacts()
613675
modified := base.DeepCopy()

pkg/apis/config/default.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ const (
5454
// Default maximum resolution timeout used by the resolution controller before timing out when exceeded
5555
DefaultMaximumResolutionTimeout = 1 * time.Minute
5656

57+
DefaultSidecarLogPollingInterval = 100 * time.Millisecond
58+
5759
defaultTimeoutMinutesKey = "default-timeout-minutes"
5860
defaultServiceAccountKey = "default-service-account"
5961
defaultManagedByLabelValueKey = "default-managed-by-label-value"
@@ -67,6 +69,7 @@ const (
6769
defaultContainerResourceRequirementsKey = "default-container-resource-requirements"
6870
defaultImagePullBackOffTimeout = "default-imagepullbackoff-timeout"
6971
defaultMaximumResolutionTimeout = "default-maximum-resolution-timeout"
72+
defaultSidecarLogPollingIntervalKey = "default-sidecar-log-polling-interval"
7073
)
7174

7275
// DefaultConfig holds all the default configurations for the config.
@@ -88,6 +91,10 @@ type Defaults struct {
8891
DefaultContainerResourceRequirements map[string]corev1.ResourceRequirements
8992
DefaultImagePullBackOffTimeout time.Duration
9093
DefaultMaximumResolutionTimeout time.Duration
94+
// DefaultSidecarLogPollingInterval specifies how frequently (as a time.Duration) the Tekton sidecar log results container polls for step completion files.
95+
// This value is loaded from the 'sidecar-log-polling-interval' key in the config-defaults ConfigMap.
96+
// It is used to control the responsiveness and resource usage of the sidecar in both production and test environments.
97+
DefaultSidecarLogPollingInterval time.Duration
9198
}
9299

93100
// GetDefaultsConfigName returns the name of the configmap containing all
@@ -120,6 +127,7 @@ func (cfg *Defaults) Equals(other *Defaults) bool {
120127
other.DefaultResolverType == cfg.DefaultResolverType &&
121128
other.DefaultImagePullBackOffTimeout == cfg.DefaultImagePullBackOffTimeout &&
122129
other.DefaultMaximumResolutionTimeout == cfg.DefaultMaximumResolutionTimeout &&
130+
other.DefaultSidecarLogPollingInterval == cfg.DefaultSidecarLogPollingInterval &&
123131
reflect.DeepEqual(other.DefaultForbiddenEnv, cfg.DefaultForbiddenEnv)
124132
}
125133

@@ -134,6 +142,7 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
134142
DefaultResolverType: DefaultResolverTypeValue,
135143
DefaultImagePullBackOffTimeout: DefaultImagePullBackOffTimeout,
136144
DefaultMaximumResolutionTimeout: DefaultMaximumResolutionTimeout,
145+
DefaultSidecarLogPollingInterval: DefaultSidecarLogPollingInterval,
137146
}
138147

139148
if defaultTimeoutMin, ok := cfgMap[defaultTimeoutMinutesKey]; ok {
@@ -220,6 +229,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
220229
tc.DefaultMaximumResolutionTimeout = timeout
221230
}
222231

232+
if defaultSidecarPollingInterval, ok := cfgMap[defaultSidecarLogPollingIntervalKey]; ok {
233+
interval, err := time.ParseDuration(defaultSidecarPollingInterval)
234+
if err != nil {
235+
return nil, fmt.Errorf("failed parsing default config %q", defaultSidecarPollingInterval)
236+
}
237+
tc.DefaultSidecarLogPollingInterval = interval
238+
}
239+
223240
return &tc, nil
224241
}
225242

pkg/apis/config/default_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
4646
DefaultResolverType: "git",
4747
DefaultImagePullBackOffTimeout: time.Duration(5) * time.Second,
4848
DefaultMaximumResolutionTimeout: 1 * time.Minute,
49+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
4950
},
5051
fileName: config.GetDefaultsConfigName(),
5152
},
@@ -67,6 +68,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
6768
DefaultMaxMatrixCombinationsCount: 256,
6869
DefaultImagePullBackOffTimeout: 0,
6970
DefaultMaximumResolutionTimeout: 1 * time.Minute,
71+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
7072
},
7173
fileName: "config-defaults-with-pod-template",
7274
},
@@ -91,6 +93,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
9193
DefaultMaxMatrixCombinationsCount: 256,
9294
DefaultImagePullBackOffTimeout: 0,
9395
DefaultMaximumResolutionTimeout: 1 * time.Minute,
96+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
9497
},
9598
},
9699
{
@@ -104,6 +107,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
104107
DefaultMaxMatrixCombinationsCount: 256,
105108
DefaultImagePullBackOffTimeout: 0,
106109
DefaultMaximumResolutionTimeout: 1 * time.Minute,
110+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
107111
},
108112
},
109113
{
@@ -120,6 +124,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
120124
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
121125
DefaultImagePullBackOffTimeout: 0,
122126
DefaultMaximumResolutionTimeout: 1 * time.Minute,
127+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
123128
},
124129
},
125130
{
@@ -133,6 +138,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
133138
DefaultForbiddenEnv: []string{"TEKTON_POWER_MODE", "TEST_ENV", "TEST_TEKTON"},
134139
DefaultImagePullBackOffTimeout: time.Duration(15) * time.Second,
135140
DefaultMaximumResolutionTimeout: 1 * time.Minute,
141+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
136142
},
137143
},
138144
{
@@ -146,6 +152,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
146152
DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{},
147153
DefaultImagePullBackOffTimeout: 0,
148154
DefaultMaximumResolutionTimeout: 1 * time.Minute,
155+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
149156
},
150157
},
151158
{
@@ -162,6 +169,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
162169
DefaultMaxMatrixCombinationsCount: 256,
163170
DefaultImagePullBackOffTimeout: 0,
164171
DefaultMaximumResolutionTimeout: 1 * time.Minute,
172+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
165173
DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{
166174
config.ResourceRequirementDefaultContainerKey: {
167175
Requests: corev1.ResourceList{
@@ -219,6 +227,7 @@ func TestNewDefaultsFromEmptyConfigMap(t *testing.T) {
219227
DefaultMaxMatrixCombinationsCount: 256,
220228
DefaultImagePullBackOffTimeout: 0,
221229
DefaultMaximumResolutionTimeout: 1 * time.Minute,
230+
DefaultSidecarLogPollingInterval: 100 * time.Millisecond,
222231
}
223232
verifyConfigFileWithExpectedConfig(t, DefaultsConfigEmptyName, expectedConfig)
224233
}
@@ -417,6 +426,51 @@ func TestEquals(t *testing.T) {
417426
}
418427
}
419428

429+
func TestSidecarLogPollingIntervalParsing(t *testing.T) {
430+
cases := []struct {
431+
name string
432+
data map[string]string
433+
expected time.Duration
434+
wantErr bool
435+
}{
436+
{
437+
name: "valid interval",
438+
data: map[string]string{"default-sidecar-log-polling-interval": "42ms"},
439+
expected: 42 * time.Millisecond,
440+
wantErr: false,
441+
},
442+
{
443+
name: "invalid interval",
444+
data: map[string]string{"default-sidecar-log-polling-interval": "notaduration"},
445+
expected: 0,
446+
wantErr: true,
447+
},
448+
{
449+
name: "not set (default)",
450+
data: map[string]string{},
451+
expected: 100 * time.Millisecond,
452+
wantErr: false,
453+
},
454+
}
455+
for _, tc := range cases {
456+
t.Run(tc.name, func(t *testing.T) {
457+
cfg, err := config.NewDefaultsFromMap(tc.data)
458+
if tc.wantErr {
459+
if err == nil {
460+
t.Errorf("expected error, got nil")
461+
}
462+
return
463+
}
464+
if err != nil {
465+
t.Fatalf("unexpected error: %v", err)
466+
}
467+
if cfg.DefaultSidecarLogPollingInterval != tc.expected {
468+
t.Errorf("got %v, want %v", cfg.DefaultSidecarLogPollingInterval, tc.expected)
469+
}
470+
})
471+
}
472+
}
473+
420474
func verifyConfigFileWithExpectedConfig(t *testing.T, fileName string, expectedConfig *config.Defaults) {
421475
t.Helper()
422476
cm := test.ConfigMapFromTestFile(t, fileName)

pkg/pod/pod.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"path/filepath"
2626
"strconv"
2727
"strings"
28+
"time"
2829

2930
"github.com/tektoncd/pipeline/internal/artifactref"
3031
"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -203,10 +204,11 @@ func (b *Builder) Build(ctx context.Context, taskRun *v1.TaskRun, taskSpec v1.Ta
203204
}
204205

205206
windows := usesWindows(taskRun)
207+
pollingInterval := config.FromContextOrDefaults(ctx).Defaults.DefaultSidecarLogPollingInterval
206208
if sidecarLogsResultsEnabled {
207209
if taskSpec.Results != nil || artifactsPathReferenced(steps) {
208210
// create a results sidecar
209-
resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows)
211+
resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows, pollingInterval)
210212
if err != nil {
211213
return nil, err
212214
}
@@ -618,7 +620,7 @@ func entrypointInitContainer(image string, steps []v1.Step, securityContext Secu
618620
// whether it will run on a windows node, and whether the sidecar should include a security context
619621
// that will allow it to run in namespaces with "restricted" pod security admission.
620622
// It will also provide arguments to the binary that allow it to surface the step results.
621-
func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool) (v1.Sidecar, error) {
623+
func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool, pollingInterval time.Duration) (v1.Sidecar, error) {
622624
names := make([]string, 0, len(taskSpec.Results))
623625
for _, r := range taskSpec.Results {
624626
names = append(names, r.Name)
@@ -660,6 +662,12 @@ func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext Se
660662
Name: pipeline.ReservedResultsSidecarName,
661663
Image: image,
662664
Command: command,
665+
Env: []corev1.EnvVar{
666+
{
667+
Name: "SIDECAR_LOG_POLLING_INTERVAL",
668+
Value: pollingInterval.String(),
669+
},
670+
},
663671
}
664672

665673
if securityContext.SetSecurityContext {

0 commit comments

Comments
 (0)