diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 860ff5c62fa..2d06cc309b2 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -170,7 +170,7 @@ Flags: --grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups. --grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin. --health-check-interval duration Interval between health checks (default 20s) - --healthcheck-retry-delay duration health check retry delay (default 2ms) + --healthcheck-retry-delay duration health check retry delay (default 5s) --healthcheck-timeout duration the health check timeout period (default 1m0s) --heartbeat-enable If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks. --heartbeat-interval duration How frequently to read and write replication heartbeat. (default 1s) diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt index ba928b978d6..c0bf837d8da 100644 --- a/go/flags/endtoend/vtgate.txt +++ b/go/flags/endtoend/vtgate.txt @@ -100,7 +100,7 @@ Flags: --grpc-use-effective-callerid If set, and SSL is not used, will set the immediate caller id from the effective caller id's principal. --grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups. --grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin. - --healthcheck-retry-delay duration health check retry delay (default 2ms) + --healthcheck-retry-delay duration health check retry delay (default 5s) --healthcheck-timeout duration the health check timeout period (default 1m0s) -h, --help help for vtgate --keep-logs duration keep logs for this long (using ctime) (zero to keep forever) diff --git a/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go b/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go index b217d99a1ae..7fc58b54d21 100644 --- a/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go +++ b/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go @@ -17,6 +17,7 @@ limitations under the License. package newfeaturetest import ( + "strings" "testing" "time" @@ -113,7 +114,15 @@ func testExecuteErrorWhileTabletIsNotServing(t *testing.T, conn *mysql.Conn, clu if !assert.ErrorContains(t, err, "VT15001") { return } - if !assert.ErrorContains(t, err, vterrors.WrongTablet) { + // Depending on whether vtgate has re-established its health stream to the + // restarted tablet by the time the query runs, the underlying reason is + // either the tablet rejecting the query as the wrong type, or vtgate not + // yet having a live connection to it. Both are valid VT15001 transaction + // errors against a no-longer-serving pinned tablet. + if !assert.True(t, + strings.Contains(err.Error(), vterrors.WrongTablet) || + strings.Contains(err.Error(), "is either down or nonexistent"), + "unexpected VT15001 reason: %v", err) { return } diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 7ae4cbb2e57..9ed9193c74e 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -343,6 +343,76 @@ func TestHealthCheckStreamError(t *testing.T) { assert.Empty(t, a, "wrong result, expected empty list") } +// TestHealthCheckRetryDelayIsBounded verifies that repeated stream failures do +// not cause the retry delay to grow. After multiple consecutive errors the +// tablet should still be rediscovered within a short, bounded window once it +// recovers, rather than the ever-growing window the old exponential backoff +// produced. Regression test for https://github.com/vitessio/vitess/issues/19894. +func TestHealthCheckRetryDelayIsBounded(t *testing.T) { + ctx := utils.LeakCheckContext(t) + + ts := memorytopo.NewServer(ctx, "cell") + defer ts.Close() + hc := createTestHc(ctx, ts) + // Use a short but measurable retry delay so we can assert timing. With + // jitter the actual interval stays within [7.5ms, 12.5ms]. + hc.retryDelay = 10 * time.Millisecond + defer hc.Close() + + tablet := createTestTablet(0, "cell", "a") + // input is buffered so the recovery send below does not block until the + // healthcheck goroutine has already slept and reconnected. See the timing + // comment where the healthy response is sent. + input := make(chan *querypb.StreamHealthResponse, 1) + resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsBounded") + fc := createFakeConn(tablet, input) + fc.errCh = make(chan error) + hc.AddTablet(tablet) + + // Drain the initial not-serving notification from AddTablet. + <-resultChan + + // Send multiple consecutive stream errors to simulate a prolonged outage + // where the tablet is unreachable for many retry cycles. + const numErrors = 8 // with exponential backoff the post-recovery sleep would reach ~1.28s (10ms*2^7) + for range numErrors { + fc.errCh <- errors.New("connection refused") + <-resultChan // drain the not-serving update + } + + // The tablet recovers. Build the healthy response up front so the timing + // window below covers only the reconnect delay, not test bookkeeping. + shr := &querypb.StreamHealthResponse{ + TabletAlias: tablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, + Serving: true, + PrimaryTermStartTimestamp: 0, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, + } + + // Because input is buffered, this send returns immediately instead of + // blocking until the healthcheck goroutine has finished its retry sleep and + // reconnected. That is what lets us start timing before the sleep and + // actually measure it. With an unbuffered channel the send would absorb the + // sleep, and the test would pass even if exponential backoff were restored. + start := time.Now() + input <- shr + + // The fixed retry delay (~10ms +/- jitter) rediscovers the tablet within a + // single cycle. The old exponential backoff would sleep ~1.28s after + // numErrors failures (10ms * 2^7), so the 400ms bound sits far above the fixed + // interval (no CI-timing flakiness) yet well below the backoff value, failing + // clearly if the delay regrows. The 5s arm only trips on a true hang. + select { + case result := <-resultChan: + assert.True(t, result.Serving, "tablet should be serving after recovery") + assert.Less(t, time.Since(start), 400*time.Millisecond, + "rediscovery took too long after recovery; retry delay may be growing exponentially") + case <-time.After(5 * time.Second): + require.Fail(t, "tablet was not rediscovered after recovery") + } +} + // TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type func TestHealthCheckErrorOnPrimary(t *testing.T) { ctx := utils.LeakCheckContext(t) @@ -668,7 +738,7 @@ func TestHealthCheckTimeout(t *testing.T) { fc.resetCanceledFlag() input <- shr - // wait for the exponential backoff to wear off and health monitoring to resume. + // wait for the retry delay to pass and health monitoring to resume. result = <-resultChan mustMatch(t, want, result, "Wrong TabletHealth data") } diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 4c6e569cfc5..b12340a7145 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -19,6 +19,7 @@ package discovery import ( "context" "fmt" + "math/rand/v2" "strings" "sync" "sync/atomic" @@ -243,7 +244,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Initialize error counter hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0) - retryDelay := hc.retryDelay for { streamCtx, streamCancel := context.WithCancel(thc.ctx) @@ -275,8 +275,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Read stream health responses. err := thc.stream(streamCtx, func(shr *query.StreamHealthResponse) error { - // We received a message. Reset the back-off. - retryDelay = hc.retryDelay // Don't block on send to avoid deadlocks. select { case servingStatus <- shr.Serving: @@ -318,20 +316,34 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Streaming RPC failed e.g. because vttablet was restarted or took too long. // Sleep until the next retry is up or the context is done/canceled. + // We use a fixed retry interval with jitter instead of exponential backoff + // so that vtgate rediscovers recovered tablets promptly without the fleet + // stampeding the tablet in lockstep after a shared outage. See #19894. select { case <-thc.ctx.Done(): return - case <-time.After(retryDelay): - // Exponentially back-off to prevent tight-loop. - retryDelay *= 2 - // Limit the retry delay backoff to the health check timeout - if retryDelay > hc.healthCheckTimeout { - retryDelay = hc.healthCheckTimeout - } + case <-time.After(retryInterval(hc.retryDelay)): } } } +// retryInterval returns the configured retry delay with +/-25% jitter applied. +// The jitter de-synchronizes reconnection attempts across vtgate instances so a +// fleet recovering from a shared tablet outage does not stampede the tablet at +// the same instant, while never growing the interval the way exponential +// backoff did. See #19894. +func retryInterval(base time.Duration) time.Duration { + // half is the jitter span passed to rand.Int64N, which panics on n <= 0. For + // a non-positive base, or one so small that base/2 truncates to zero (sub-2ns), + // there is no room to jitter, so return the base unchanged. + half := base / 2 + if half <= 0 { + return base + } + // Spread uniformly within [base-base/4, base+base/4). + return base - base/4 + time.Duration(rand.Int64N(int64(half))) +} + func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) { thc.logger.Warningf("tablet %v healthcheck stream error: %v", thc.Tablet, err) thc.setServingState(false, err.Error()) diff --git a/go/vt/discovery/tablet_health_check_test.go b/go/vt/discovery/tablet_health_check_test.go new file mode 100644 index 00000000000..63ce59283b7 --- /dev/null +++ b/go/vt/discovery/tablet_health_check_test.go @@ -0,0 +1,62 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package discovery + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +// TestRetryInterval verifies that the retry interval stays within a bounded +// jitter window around the configured base delay. The jitter de-synchronizes +// reconnection attempts across vtgate instances so that a fleet recovering from +// a shared outage does not stampede a tablet at the same instant (#19894), +// while never growing the interval the way exponential backoff did. +func TestRetryInterval(t *testing.T) { + const base = 5 * time.Second + lower := time.Duration(float64(base) * 0.75) + upper := time.Duration(float64(base) * 1.25) + + seen := make(map[time.Duration]struct{}) + for range 1000 { + got := retryInterval(base) + assert.GreaterOrEqual(t, got, lower, "interval must not drop below 75% of base") + assert.LessOrEqual(t, got, upper, "interval must not exceed 125% of base") + seen[got] = struct{}{} + } + + // Jitter must actually vary; a single repeated value would re-synchronize + // the fleet and defeat the purpose. + assert.Greater(t, len(seen), 1, "retry interval should vary across calls") +} + +// TestRetryIntervalNonPositive verifies that a zero or negative base delay is +// returned unchanged, so the jitter math never produces a panic or a negative +// sleep duration. +func TestRetryIntervalNonPositive(t *testing.T) { + assert.Equal(t, time.Duration(0), retryInterval(0)) + assert.Equal(t, -time.Second, retryInterval(-time.Second)) +} + +// TestRetryIntervalTinyBase verifies that a positive base too small to jitter +// (base/2 truncates to zero) is returned unchanged rather than panicking inside +// rand.Int64N, which rejects a non-positive argument. +func TestRetryIntervalTinyBase(t *testing.T) { + assert.Equal(t, time.Nanosecond, retryInterval(time.Nanosecond)) +} diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index b2ee573e3b4..8ca37b5074b 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -87,8 +87,10 @@ var ( preventCrossKeyspaceReads bool enableShardRouting bool - // healthCheckRetryDelay is the time to wait before retrying healthcheck - healthCheckRetryDelay = 2 * time.Millisecond + // healthCheckRetryDelay is the fixed interval between healthcheck reconnection + // attempts. Since it is now a steady-state interval rather than a backoff seed, + // it defaults to discovery's 5s instead of a few milliseconds. + healthCheckRetryDelay = discovery.DefaultHealthCheckRetryDelay // healthCheckTimeout is the timeout on the RPC call to tablets healthCheckTimeout = time.Minute