Skip to content
Open
2 changes: 1 addition & 1 deletion go/flags/endtoend/vtcombo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Flags:
--grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups.
--grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin.
--health-check-interval duration Interval between health checks (default 20s)
--healthcheck-retry-delay duration health check retry delay (default 2ms)
--healthcheck-retry-delay duration health check retry delay (default 5s)
--healthcheck-timeout duration the health check timeout period (default 1m0s)
--heartbeat-enable If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks.
--heartbeat-interval duration How frequently to read and write replication heartbeat. (default 1s)
Expand Down
2 changes: 1 addition & 1 deletion go/flags/endtoend/vtgate.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ Flags:
--grpc-use-effective-callerid If set, and SSL is not used, will set the immediate caller id from the effective caller id's principal.
--grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups.
--grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin.
--healthcheck-retry-delay duration health check retry delay (default 2ms)
--healthcheck-retry-delay duration health check retry delay (default 5s)
--healthcheck-timeout duration the health check timeout period (default 1m0s)
-h, --help help for vtgate
--keep-logs duration keep logs for this long (using ctime) (zero to keep forever)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package newfeaturetest

import (
"strings"
"testing"
"time"

Expand Down Expand Up @@ -113,7 +114,15 @@ func testExecuteErrorWhileTabletIsNotServing(t *testing.T, conn *mysql.Conn, clu
if !assert.ErrorContains(t, err, "VT15001") {
return
}
if !assert.ErrorContains(t, err, vterrors.WrongTablet) {
// Depending on whether vtgate has re-established its health stream to the
// restarted tablet by the time the query runs, the underlying reason is
// either the tablet rejecting the query as the wrong type, or vtgate not
// yet having a live connection to it. Both are valid VT15001 transaction
// errors against a no-longer-serving pinned tablet.
if !assert.True(t,
strings.Contains(err.Error(), vterrors.WrongTablet) ||
strings.Contains(err.Error(), "is either down or nonexistent"),
"unexpected VT15001 reason: %v", err) {
return
}

Expand Down
72 changes: 71 additions & 1 deletion go/vt/discovery/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,76 @@ func TestHealthCheckStreamError(t *testing.T) {
assert.Empty(t, a, "wrong result, expected empty list")
}

// TestHealthCheckRetryDelayIsBounded verifies that repeated stream failures do
// not cause the retry delay to grow. After multiple consecutive errors the
// tablet should still be rediscovered within a short, bounded window once it
// recovers, rather than the ever-growing window the old exponential backoff
// produced. Regression test for https://github.com/vitessio/vitess/issues/19894.
func TestHealthCheckRetryDelayIsBounded(t *testing.T) {
ctx := utils.LeakCheckContext(t)

ts := memorytopo.NewServer(ctx, "cell")
defer ts.Close()
hc := createTestHc(ctx, ts)
// Use a short but measurable retry delay so we can assert timing. With
// jitter the actual interval stays within [7.5ms, 12.5ms].
hc.retryDelay = 10 * time.Millisecond
defer hc.Close()

tablet := createTestTablet(0, "cell", "a")
// input is buffered so the recovery send below does not block until the
// healthcheck goroutine has already slept and reconnected. See the timing
// comment where the healthy response is sent.
input := make(chan *querypb.StreamHealthResponse, 1)
resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsBounded")
fc := createFakeConn(tablet, input)
fc.errCh = make(chan error)
hc.AddTablet(tablet)

// Drain the initial not-serving notification from AddTablet.
<-resultChan

// Send multiple consecutive stream errors to simulate a prolonged outage
// where the tablet is unreachable for many retry cycles.
const numErrors = 8 // with exponential backoff the post-recovery sleep would reach ~1.28s (10ms*2^7)
for range numErrors {
fc.errCh <- errors.New("connection refused")
<-resultChan // drain the not-serving update
Comment on lines +368 to +380
}

// The tablet recovers. Build the healthy response up front so the timing
// window below covers only the reconnect delay, not test bookkeeping.
shr := &querypb.StreamHealthResponse{
TabletAlias: tablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
Serving: true,
PrimaryTermStartTimestamp: 0,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
}

// Because input is buffered, this send returns immediately instead of
// blocking until the healthcheck goroutine has finished its retry sleep and
// reconnected. That is what lets us start timing before the sleep and
// actually measure it. With an unbuffered channel the send would absorb the
// sleep, and the test would pass even if exponential backoff were restored.
start := time.Now()
input <- shr
Comment on lines +388 to +399

Copilot AI Apr 27, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

input <- shr can block until fakeConn.StreamHealth is running again (because input/hcChan is unbuffered). That means the test’s 100ms timeout starts after the reconnect happens, so it won’t actually catch an exponential-backoff regression. Make the health stream channel buffered (e.g. size 1) or send shr asynchronously and start the timer before/while sending so the assertion includes the retry sleep time.

Copilot uses AI. Check for mistakes.

// The fixed retry delay (~10ms +/- jitter) rediscovers the tablet within a
// single cycle. The old exponential backoff would sleep ~1.28s after
// numErrors failures (10ms * 2^7), so the 400ms bound sits far above the fixed
// interval (no CI-timing flakiness) yet well below the backoff value, failing
// clearly if the delay regrows. The 5s arm only trips on a true hang.
select {
case result := <-resultChan:
assert.True(t, result.Serving, "tablet should be serving after recovery")
assert.Less(t, time.Since(start), 400*time.Millisecond,
"rediscovery took too long after recovery; retry delay may be growing exponentially")
Comment on lines +398 to +410
case <-time.After(5 * time.Second):
require.Fail(t, "tablet was not rediscovered after recovery")
}
}

// TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type
func TestHealthCheckErrorOnPrimary(t *testing.T) {
ctx := utils.LeakCheckContext(t)
Expand Down Expand Up @@ -668,7 +738,7 @@ func TestHealthCheckTimeout(t *testing.T) {
fc.resetCanceledFlag()
input <- shr

// wait for the exponential backoff to wear off and health monitoring to resume.
// wait for the retry delay to pass and health monitoring to resume.
result = <-resultChan
mustMatch(t, want, result, "Wrong TabletHealth data")
}
Expand Down
32 changes: 22 additions & 10 deletions go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package discovery
import (
"context"
"fmt"
"math/rand/v2"
"strings"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -243,7 +244,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
// Initialize error counter
hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0)

retryDelay := hc.retryDelay
for {
streamCtx, streamCancel := context.WithCancel(thc.ctx)

Expand Down Expand Up @@ -275,8 +275,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {

// Read stream health responses.
err := thc.stream(streamCtx, func(shr *query.StreamHealthResponse) error {
// We received a message. Reset the back-off.
retryDelay = hc.retryDelay
// Don't block on send to avoid deadlocks.
select {
case servingStatus <- shr.Serving:
Expand Down Expand Up @@ -318,20 +316,34 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {

// Streaming RPC failed e.g. because vttablet was restarted or took too long.
// Sleep until the next retry is up or the context is done/canceled.
// We use a fixed retry interval with jitter instead of exponential backoff
// so that vtgate rediscovers recovered tablets promptly without the fleet
// stampeding the tablet in lockstep after a shared outage. See #19894.
select {
case <-thc.ctx.Done():
return
case <-time.After(retryDelay):
// Exponentially back-off to prevent tight-loop.
retryDelay *= 2
// Limit the retry delay backoff to the health check timeout
if retryDelay > hc.healthCheckTimeout {
retryDelay = hc.healthCheckTimeout
}
case <-time.After(retryInterval(hc.retryDelay)):
}
Comment on lines 317 to 326

Copilot AI Apr 27, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the switch to a fixed retry interval, retryDelay now directly controls how aggressively vtgate reconnects after stream failures. However, vtgate’s --healthcheck-retry-delay default is currently 2ms (see go/vt/vtgate/vtgate.go and generated flags docs), which would cause very tight reconnect loops and time.After allocations during outages. Consider updating the vtgate default to a saner value (e.g. discovery’s 5s default) and/or enforcing a reasonable minimum retryDelay here to avoid runaway retries.

Copilot uses AI. Check for mistakes.
Comment on lines 322 to 326
}
}

// retryInterval returns the configured retry delay with +/-25% jitter applied.
// The jitter de-synchronizes reconnection attempts across vtgate instances so a
// fleet recovering from a shared tablet outage does not stampede the tablet at
// the same instant, while never growing the interval the way exponential
// backoff did. See #19894.
func retryInterval(base time.Duration) time.Duration {
// half is the jitter span passed to rand.Int64N, which panics on n <= 0. For
// a non-positive base, or one so small that base/2 truncates to zero (sub-2ns),
// there is no room to jitter, so return the base unchanged.
half := base / 2
if half <= 0 {
return base
}
// Spread uniformly within [base-base/4, base+base/4).
return base - base/4 + time.Duration(rand.Int64N(int64(half)))
}
Comment on lines +335 to +345
Comment on lines +335 to +345

func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) {
thc.logger.Warningf("tablet %v healthcheck stream error: %v", thc.Tablet, err)
thc.setServingState(false, err.Error())
Expand Down
62 changes: 62 additions & 0 deletions go/vt/discovery/tablet_health_check_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
Copyright 2026 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package discovery

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
)

// TestRetryInterval verifies that the retry interval stays within a bounded
// jitter window around the configured base delay. The jitter de-synchronizes
// reconnection attempts across vtgate instances so that a fleet recovering from
// a shared outage does not stampede a tablet at the same instant (#19894),
// while never growing the interval the way exponential backoff did.
func TestRetryInterval(t *testing.T) {
const base = 5 * time.Second
lower := time.Duration(float64(base) * 0.75)
upper := time.Duration(float64(base) * 1.25)
Comment on lines +32 to +34

seen := make(map[time.Duration]struct{})
for range 1000 {
got := retryInterval(base)
assert.GreaterOrEqual(t, got, lower, "interval must not drop below 75% of base")
assert.LessOrEqual(t, got, upper, "interval must not exceed 125% of base")
seen[got] = struct{}{}
}

// Jitter must actually vary; a single repeated value would re-synchronize
// the fleet and defeat the purpose.
assert.Greater(t, len(seen), 1, "retry interval should vary across calls")
}

// TestRetryIntervalNonPositive verifies that a zero or negative base delay is
// returned unchanged, so the jitter math never produces a panic or a negative
// sleep duration.
func TestRetryIntervalNonPositive(t *testing.T) {
assert.Equal(t, time.Duration(0), retryInterval(0))
assert.Equal(t, -time.Second, retryInterval(-time.Second))
}

// TestRetryIntervalTinyBase verifies that a positive base too small to jitter
// (base/2 truncates to zero) is returned unchanged rather than panicking inside
// rand.Int64N, which rejects a non-positive argument.
func TestRetryIntervalTinyBase(t *testing.T) {
assert.Equal(t, time.Nanosecond, retryInterval(time.Nanosecond))
}
6 changes: 4 additions & 2 deletions go/vt/vtgate/vtgate.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,10 @@ var (
preventCrossKeyspaceReads bool
enableShardRouting bool

// healthCheckRetryDelay is the time to wait before retrying healthcheck
healthCheckRetryDelay = 2 * time.Millisecond
// healthCheckRetryDelay is the fixed interval between healthcheck reconnection
// attempts. Since it is now a steady-state interval rather than a backoff seed,
// it defaults to discovery's 5s instead of a few milliseconds.
healthCheckRetryDelay = discovery.DefaultHealthCheckRetryDelay
Comment on lines +90 to +93
// healthCheckTimeout is the timeout on the RPC call to tablets
healthCheckTimeout = time.Minute

Expand Down
Loading