From 22c7e5c3d1d3fe7c3a20d960566baf5b18d156f5 Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Mon, 27 Apr 2026 21:15:17 +0900 Subject: [PATCH 1/8] vtgate: use fixed retry interval for health check reconnection Replace exponential backoff in checkConn with a fixed retry interval (default 5s). Previously, the retry delay doubled on each failure up to healthCheckTimeout (default 60s), causing vtgates to take up to 60s to rediscover tablets after prolonged outages. Fixes #19894 Signed-off-by: Gwanho Kim --- go/vt/discovery/tablet_health_check.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 4c6e569cfc5..7d01d0251c4 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -318,16 +318,12 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Streaming RPC failed e.g. because vttablet was restarted or took too long. // Sleep until the next retry is up or the context is done/canceled. + // We use a fixed retry interval instead of exponential backoff so that + // vtgate rediscovers recovered tablets promptly. See #19894. select { case <-thc.ctx.Done(): return case <-time.After(retryDelay): - // Exponentially back-off to prevent tight-loop. - retryDelay *= 2 - // Limit the retry delay backoff to the health check timeout - if retryDelay > hc.healthCheckTimeout { - retryDelay = hc.healthCheckTimeout - } } } } From ebb9781d15fbf61e9e7c7583ac2f1fde1c250e0d Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Mon, 27 Apr 2026 21:18:56 +0900 Subject: [PATCH 2/8] vtgate: add regression test for fixed health check retry interval Add TestHealthCheckRetryDelayIsFixed to verify that repeated stream failures do not cause the retry delay to grow. The test sends multiple consecutive errors and asserts that the tablet is still rediscovered within a short, fixed window once it recovers. Also update a stale comment referencing exponential backoff in TestHealthCheckTimeout. Signed-off-by: Gwanho Kim --- go/vt/discovery/healthcheck_test.go | 54 ++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 05eb8ed39d5..b37beaf0286 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -345,6 +345,58 @@ func TestHealthCheckStreamError(t *testing.T) { assert.Empty(t, a, "wrong result, expected empty list") } +// TestHealthCheckRetryDelayIsFixed verifies that repeated stream failures do not +// cause the retry delay to grow. After multiple consecutive errors the tablet +// should still be rediscovered within a short, fixed window once it recovers. +// Regression test for https://github.com/vitessio/vitess/issues/19894. +func TestHealthCheckRetryDelayIsFixed(t *testing.T) { + ctx := utils.LeakCheckContext(t) + + ts := memorytopo.NewServer(ctx, "cell") + defer ts.Close() + hc := createTestHc(ctx, ts) + // Use a short but measurable retry delay so we can assert timing. + hc.retryDelay = 10 * time.Millisecond + defer hc.Close() + + tablet := createTestTablet(0, "cell", "a") + input := make(chan *querypb.StreamHealthResponse) + resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsFixed") + fc := createFakeConn(tablet, input) + fc.errCh = make(chan error) + hc.AddTablet(tablet) + + // Drain the initial not-serving notification from AddTablet. + <-resultChan + + // Send multiple consecutive stream errors to simulate a prolonged outage + // where the tablet is unreachable for many retry cycles. + const numErrors = 6 // with exponential backoff this would reach 320ms (10*2^5) + for range numErrors { + fc.errCh <- errors.New("connection refused") + <-resultChan // drain the not-serving update + } + + // Now the tablet recovers. Send a healthy response. + shr := &querypb.StreamHealthResponse{ + TabletAlias: tablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, + Serving: true, + PrimaryTermStartTimestamp: 0, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, + } + input <- shr + + // With a fixed 10ms retry delay, rediscovery should happen well within 100ms. + // With the old exponential backoff (10ms * 2^6 = 640ms cap), this would time out. + select { + case result := <-resultChan: + assert.True(t, result.Serving, "tablet should be serving after recovery") + case <-time.After(100 * time.Millisecond): + t.Fatal("tablet was not rediscovered promptly after recovery; retry delay may be growing exponentially") + } +} + // TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type func TestHealthCheckErrorOnPrimary(t *testing.T) { ctx := utils.LeakCheckContext(t) @@ -670,7 +722,7 @@ func TestHealthCheckTimeout(t *testing.T) { fc.resetCanceledFlag() input <- shr - // wait for the exponential backoff to wear off and health monitoring to resume. + // wait for the retry delay to pass and health monitoring to resume. result = <-resultChan mustMatch(t, want, result, "Wrong TabletHealth data") } From 87052ab2a299f8cf2ce1b735fd9299dd0f431f4c Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Mon, 1 Jun 2026 21:04:55 +0900 Subject: [PATCH 3/8] discovery: jitter the health check retry interval Replace the fixed retry interval in checkConn with the same base delay plus +/-25% jitter, and remove the now-dead local retryDelay variable and its no-op reset. A purely fixed interval re-synchronizes vtgate instances that started retrying together during a shared outage, so they would all reconnect to a recovered tablet in lockstep. Jittering the interval spreads those reconnects out while still rediscovering recovered tablets promptly. See #19894. Signed-off-by: Gwanho Kim --- go/vt/discovery/tablet_health_check.go | 24 ++++++--- go/vt/discovery/tablet_health_check_test.go | 55 +++++++++++++++++++++ 2 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 go/vt/discovery/tablet_health_check_test.go diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 7d01d0251c4..436117bf7dc 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -19,6 +19,7 @@ package discovery import ( "context" "fmt" + "math/rand/v2" "strings" "sync" "sync/atomic" @@ -243,7 +244,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Initialize error counter hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0) - retryDelay := hc.retryDelay for { streamCtx, streamCancel := context.WithCancel(thc.ctx) @@ -275,8 +275,6 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Read stream health responses. err := thc.stream(streamCtx, func(shr *query.StreamHealthResponse) error { - // We received a message. Reset the back-off. - retryDelay = hc.retryDelay // Don't block on send to avoid deadlocks. select { case servingStatus <- shr.Serving: @@ -318,16 +316,30 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // Streaming RPC failed e.g. because vttablet was restarted or took too long. // Sleep until the next retry is up or the context is done/canceled. - // We use a fixed retry interval instead of exponential backoff so that - // vtgate rediscovers recovered tablets promptly. See #19894. + // We use a fixed retry interval with jitter instead of exponential backoff + // so that vtgate rediscovers recovered tablets promptly without the fleet + // stampeding the tablet in lockstep after a shared outage. See #19894. select { case <-thc.ctx.Done(): return - case <-time.After(retryDelay): + case <-time.After(retryInterval(hc.retryDelay)): } } } +// retryInterval returns the configured retry delay with +/-25% jitter applied. +// The jitter de-synchronizes reconnection attempts across vtgate instances so a +// fleet recovering from a shared tablet outage does not stampede the tablet at +// the same instant, while never growing the interval the way exponential +// backoff did. See #19894. +func retryInterval(base time.Duration) time.Duration { + if base <= 0 { + return base + } + // Spread uniformly within [base-base/4, base+base/4). + return base - base/4 + time.Duration(rand.Int64N(int64(base/2))) +} + func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) { thc.logger.Warningf("tablet %v healthcheck stream error: %v", thc.Tablet, err) thc.setServingState(false, err.Error()) diff --git a/go/vt/discovery/tablet_health_check_test.go b/go/vt/discovery/tablet_health_check_test.go new file mode 100644 index 00000000000..95ff94b475c --- /dev/null +++ b/go/vt/discovery/tablet_health_check_test.go @@ -0,0 +1,55 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package discovery + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +// TestRetryInterval verifies that the retry interval stays within a bounded +// jitter window around the configured base delay. The jitter de-synchronizes +// reconnection attempts across vtgate instances so that a fleet recovering from +// a shared outage does not stampede a tablet at the same instant (#19894), +// while never growing the interval the way exponential backoff did. +func TestRetryInterval(t *testing.T) { + const base = 5 * time.Second + lower := time.Duration(float64(base) * 0.75) + upper := time.Duration(float64(base) * 1.25) + + seen := make(map[time.Duration]struct{}) + for range 1000 { + got := retryInterval(base) + assert.GreaterOrEqual(t, got, lower, "interval must not drop below 75% of base") + assert.LessOrEqual(t, got, upper, "interval must not exceed 125% of base") + seen[got] = struct{}{} + } + + // Jitter must actually vary; a single repeated value would re-synchronize + // the fleet and defeat the purpose. + assert.Greater(t, len(seen), 1, "retry interval should vary across calls") +} + +// TestRetryIntervalNonPositive verifies that a zero or negative base delay is +// returned unchanged, so the jitter math never produces a panic or a negative +// sleep duration. +func TestRetryIntervalNonPositive(t *testing.T) { + assert.Equal(t, time.Duration(0), retryInterval(0)) + assert.Equal(t, -time.Second, retryInterval(-time.Second)) +} From dce956b303079d82d7b76198e66e3fe947196ebd Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Mon, 1 Jun 2026 21:05:11 +0900 Subject: [PATCH 4/8] discovery: harden retry-delay regression test for jitter Rename TestHealthCheckRetryDelayIsFixed to TestHealthCheckRetryDelayIsBounded since the interval now carries jitter rather than being strictly fixed, and update its comments accordingly. Replace t.Fatal with require.Fail per the project test conventions, and widen the timeout to 30s: the success path returns almost immediately, so a healthy run never approaches it, while a regression that re-grows the delay still trips it without CI flakiness. Signed-off-by: Gwanho Kim --- go/vt/discovery/healthcheck_test.go | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index b37beaf0286..136e326d163 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -345,23 +345,25 @@ func TestHealthCheckStreamError(t *testing.T) { assert.Empty(t, a, "wrong result, expected empty list") } -// TestHealthCheckRetryDelayIsFixed verifies that repeated stream failures do not -// cause the retry delay to grow. After multiple consecutive errors the tablet -// should still be rediscovered within a short, fixed window once it recovers. -// Regression test for https://github.com/vitessio/vitess/issues/19894. -func TestHealthCheckRetryDelayIsFixed(t *testing.T) { +// TestHealthCheckRetryDelayIsBounded verifies that repeated stream failures do +// not cause the retry delay to grow. After multiple consecutive errors the +// tablet should still be rediscovered within a short, bounded window once it +// recovers, rather than the ever-growing window the old exponential backoff +// produced. Regression test for https://github.com/vitessio/vitess/issues/19894. +func TestHealthCheckRetryDelayIsBounded(t *testing.T) { ctx := utils.LeakCheckContext(t) ts := memorytopo.NewServer(ctx, "cell") defer ts.Close() hc := createTestHc(ctx, ts) - // Use a short but measurable retry delay so we can assert timing. + // Use a short but measurable retry delay so we can assert timing. With + // jitter the actual interval stays within [7.5ms, 12.5ms]. hc.retryDelay = 10 * time.Millisecond defer hc.Close() tablet := createTestTablet(0, "cell", "a") input := make(chan *querypb.StreamHealthResponse) - resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsFixed") + resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsBounded") fc := createFakeConn(tablet, input) fc.errCh = make(chan error) hc.AddTablet(tablet) @@ -387,13 +389,16 @@ func TestHealthCheckRetryDelayIsFixed(t *testing.T) { } input <- shr - // With a fixed 10ms retry delay, rediscovery should happen well within 100ms. - // With the old exponential backoff (10ms * 2^6 = 640ms cap), this would time out. + // With the bounded ~10ms retry delay, rediscovery happens almost immediately + // (within a couple of jittered cycles). With the old exponential backoff + // (10ms * 2^6 = 640ms cap) it would lag far behind. The timeout is generous + // so a healthy run never flakes; only a regression that re-grows the delay + // would approach it. select { case result := <-resultChan: assert.True(t, result.Serving, "tablet should be serving after recovery") - case <-time.After(100 * time.Millisecond): - t.Fatal("tablet was not rediscovered promptly after recovery; retry delay may be growing exponentially") + case <-time.After(30 * time.Second): + require.Fail(t, "tablet was not rediscovered promptly after recovery; retry delay may be growing exponentially") } } From 2ac3c004f3fd560d98979d548259e263e4529878 Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Thu, 4 Jun 2026 23:17:45 +0900 Subject: [PATCH 5/8] discovery: measure reconnect interval in retry-delay regression test Signed-off-by: Gwanho Kim --- go/vt/discovery/healthcheck_test.go | 30 ++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 136e326d163..28df303fd7d 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -362,7 +362,10 @@ func TestHealthCheckRetryDelayIsBounded(t *testing.T) { defer hc.Close() tablet := createTestTablet(0, "cell", "a") - input := make(chan *querypb.StreamHealthResponse) + // input is buffered so the recovery send below does not block until the + // healthcheck goroutine has already slept and reconnected. See the timing + // comment where the healthy response is sent. + input := make(chan *querypb.StreamHealthResponse, 1) resultChan := hc.Subscribe("TestHealthCheckRetryDelayIsBounded") fc := createFakeConn(tablet, input) fc.errCh = make(chan error) @@ -379,7 +382,8 @@ func TestHealthCheckRetryDelayIsBounded(t *testing.T) { <-resultChan // drain the not-serving update } - // Now the tablet recovers. Send a healthy response. + // The tablet recovers. Build the healthy response up front so the timing + // window below covers only the reconnect delay, not test bookkeeping. shr := &querypb.StreamHealthResponse{ TabletAlias: tablet.Alias, Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, @@ -387,18 +391,26 @@ func TestHealthCheckRetryDelayIsBounded(t *testing.T) { PrimaryTermStartTimestamp: 0, RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, } + + // Because input is buffered, this send returns immediately instead of + // blocking until the healthcheck goroutine has finished its retry sleep and + // reconnected. That is what lets us start timing before the sleep and + // actually measure it. With an unbuffered channel the send would absorb the + // sleep, and the test would pass even if exponential backoff were restored. + start := time.Now() input <- shr - // With the bounded ~10ms retry delay, rediscovery happens almost immediately - // (within a couple of jittered cycles). With the old exponential backoff - // (10ms * 2^6 = 640ms cap) it would lag far behind. The timeout is generous - // so a healthy run never flakes; only a regression that re-grows the delay - // would approach it. + // The fixed retry delay (~10ms +/- jitter) rediscovers the tablet within a + // single cycle. The old exponential backoff would sleep ~320ms after + // numErrors failures (10ms * 2^5), so a regression that regrows the delay + // blows past this bound. The 5s arm only trips on a true hang. select { case result := <-resultChan: assert.True(t, result.Serving, "tablet should be serving after recovery") - case <-time.After(30 * time.Second): - require.Fail(t, "tablet was not rediscovered promptly after recovery; retry delay may be growing exponentially") + assert.Less(t, time.Since(start), 100*time.Millisecond, + "rediscovery took too long after recovery; retry delay may be growing exponentially") + case <-time.After(5 * time.Second): + require.Fail(t, "tablet was not rediscovered after recovery") } } From f591d6df3fe3e49f95fcd9807fcff9b21821398f Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Thu, 4 Jun 2026 23:17:52 +0900 Subject: [PATCH 6/8] vtgate: default health check retry delay to fixed 5s interval Signed-off-by: Gwanho Kim --- go/flags/endtoend/vtcombo.txt | 2 +- go/flags/endtoend/vtgate.txt | 2 +- go/vt/vtgate/vtgate.go | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index d6c25ca837b..827a71f6895 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -168,7 +168,7 @@ Flags: --grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups. --grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin. --health-check-interval duration Interval between health checks (default 20s) - --healthcheck-retry-delay duration health check retry delay (default 2ms) + --healthcheck-retry-delay duration health check retry delay (default 5s) --healthcheck-timeout duration the health check timeout period (default 1m0s) --heartbeat-enable If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks. --heartbeat-interval duration How frequently to read and write replication heartbeat. (default 1s) diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt index f56fc262ef1..4e0ded2fbe1 100644 --- a/go/flags/endtoend/vtgate.txt +++ b/go/flags/endtoend/vtgate.txt @@ -100,7 +100,7 @@ Flags: --grpc-use-effective-callerid If set, and SSL is not used, will set the immediate caller id from the effective caller id's principal. --grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups. --grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin. - --healthcheck-retry-delay duration health check retry delay (default 2ms) + --healthcheck-retry-delay duration health check retry delay (default 5s) --healthcheck-timeout duration the health check timeout period (default 1m0s) -h, --help help for vtgate --jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index d2c38ac50b7..3f9bb3d5062 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -84,8 +84,10 @@ var ( noScatter bool enableShardRouting bool - // healthCheckRetryDelay is the time to wait before retrying healthcheck - healthCheckRetryDelay = 2 * time.Millisecond + // healthCheckRetryDelay is the fixed interval between healthcheck reconnection + // attempts. Since it is now a steady-state interval rather than a backoff seed, + // it defaults to discovery's 5s instead of a few milliseconds. + healthCheckRetryDelay = discovery.DefaultHealthCheckRetryDelay // healthCheckTimeout is the timeout on the RPC call to tablets healthCheckTimeout = time.Minute From 7868952e5fd7944d8093ea2fbaff35ccbf5b55a5 Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Thu, 4 Jun 2026 23:35:51 +0900 Subject: [PATCH 7/8] discovery: guard retryInterval jitter and de-flake its regression test Signed-off-by: Gwanho Kim --- go/vt/discovery/healthcheck_test.go | 11 ++++++----- go/vt/discovery/tablet_health_check.go | 8 ++++++-- go/vt/discovery/tablet_health_check_test.go | 7 +++++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 28df303fd7d..e8bc4114859 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -376,7 +376,7 @@ func TestHealthCheckRetryDelayIsBounded(t *testing.T) { // Send multiple consecutive stream errors to simulate a prolonged outage // where the tablet is unreachable for many retry cycles. - const numErrors = 6 // with exponential backoff this would reach 320ms (10*2^5) + const numErrors = 8 // with exponential backoff the post-recovery sleep would reach ~1.28s (10ms*2^7) for range numErrors { fc.errCh <- errors.New("connection refused") <-resultChan // drain the not-serving update @@ -401,13 +401,14 @@ func TestHealthCheckRetryDelayIsBounded(t *testing.T) { input <- shr // The fixed retry delay (~10ms +/- jitter) rediscovers the tablet within a - // single cycle. The old exponential backoff would sleep ~320ms after - // numErrors failures (10ms * 2^5), so a regression that regrows the delay - // blows past this bound. The 5s arm only trips on a true hang. + // single cycle. The old exponential backoff would sleep ~1.28s after + // numErrors failures (10ms * 2^7), so the 400ms bound sits far above the fixed + // interval (no CI-timing flakiness) yet well below the backoff value, failing + // clearly if the delay regrows. The 5s arm only trips on a true hang. select { case result := <-resultChan: assert.True(t, result.Serving, "tablet should be serving after recovery") - assert.Less(t, time.Since(start), 100*time.Millisecond, + assert.Less(t, time.Since(start), 400*time.Millisecond, "rediscovery took too long after recovery; retry delay may be growing exponentially") case <-time.After(5 * time.Second): require.Fail(t, "tablet was not rediscovered after recovery") diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 436117bf7dc..b12340a7145 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -333,11 +333,15 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { // the same instant, while never growing the interval the way exponential // backoff did. See #19894. func retryInterval(base time.Duration) time.Duration { - if base <= 0 { + // half is the jitter span passed to rand.Int64N, which panics on n <= 0. For + // a non-positive base, or one so small that base/2 truncates to zero (sub-2ns), + // there is no room to jitter, so return the base unchanged. + half := base / 2 + if half <= 0 { return base } // Spread uniformly within [base-base/4, base+base/4). - return base - base/4 + time.Duration(rand.Int64N(int64(base/2))) + return base - base/4 + time.Duration(rand.Int64N(int64(half))) } func (thc *tabletHealthCheck) closeConnection(ctx context.Context, err error) { diff --git a/go/vt/discovery/tablet_health_check_test.go b/go/vt/discovery/tablet_health_check_test.go index 95ff94b475c..63ce59283b7 100644 --- a/go/vt/discovery/tablet_health_check_test.go +++ b/go/vt/discovery/tablet_health_check_test.go @@ -53,3 +53,10 @@ func TestRetryIntervalNonPositive(t *testing.T) { assert.Equal(t, time.Duration(0), retryInterval(0)) assert.Equal(t, -time.Second, retryInterval(-time.Second)) } + +// TestRetryIntervalTinyBase verifies that a positive base too small to jitter +// (base/2 truncates to zero) is returned unchanged rather than panicking inside +// rand.Int64N, which rejects a non-positive argument. +func TestRetryIntervalTinyBase(t *testing.T) { + assert.Equal(t, time.Nanosecond, retryInterval(time.Nanosecond)) +} From 393ffeac909b83e6d83173c77eaa4e5c7426236e Mon Sep 17 00:00:00 2001 From: Gwanho Kim Date: Sat, 13 Jun 2026 17:00:28 +0900 Subject: [PATCH 8/8] endtoend: accept down-or-nonexistent error in reparent open-tx test The fixed 5s health check retry interval can leave vtgate without a reconnected stream when the DML runs, so accept "is either down or nonexistent" alongside "wrong tablet type" as the VT15001 reason. Signed-off-by: Gwanho Kim --- .../newfeaturetest/reparent_with_open_tx_test.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go b/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go index 6d845fc41cd..f8a20d1726c 100644 --- a/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go +++ b/go/test/endtoend/reparent/newfeaturetest/reparent_with_open_tx_test.go @@ -18,6 +18,7 @@ package newfeaturetest import ( "context" + "strings" "testing" "time" @@ -96,7 +97,15 @@ func testExecuteErrorWhileTabletIsNotServing(t *testing.T, conn *mysql.Conn, clu <-tabletNotServing _, err := conn.ExecuteFetch(utils.GetInsertMultipleValuesQuery(idx, idx+1, idx+2, idx+3), 0, false) require.ErrorContains(t, err, "VT15001") - require.ErrorContains(t, err, vterrors.WrongTablet) + // Depending on whether vtgate has re-established its health stream to the + // restarted tablet by the time the query runs, the underlying reason is + // either the tablet rejecting the query as the wrong type, or vtgate not + // yet having a live connection to it. Both are valid VT15001 transaction + // errors against a no-longer-serving pinned tablet. + require.True(t, + strings.Contains(err.Error(), vterrors.WrongTablet) || + strings.Contains(err.Error(), "is either down or nonexistent"), + "unexpected VT15001 reason: %v", err) // Subsequent queries after a VT15001 should start returning a VT09032 error until we issue a ROLLBACK _, err = conn.ExecuteFetch("select * from vt_insert_test", 1, false)