From 721e63ee19d367b0a0ba243657aef804963b0b0d Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 4 Mar 2026 13:11:44 -0500 Subject: [PATCH 01/27] vttablet: handle applier metadata init failures in relay-log recovery `handleRelayLogError` currently retries replication restart for known recoverable metadata-init failures (relay log info and master info). MySQL can also return: ``` Replica failed to initialize applier metadata structure from the repository ``` This change treats that error as part of the same recoverable class and handles it by triggering `RestartReplication` (STOP REPLICA, RESET REPLICA, START REPLICA). Signed-off-by: Mohamed Hamza --- .../vttablet/tabletmanager/rpc_replication.go | 33 +- .../tabletmanager/rpc_replication_test.go | 63 ++++ .../testlib/planned_reparent_shard_test.go | 330 ++++++++++-------- go/vt/wrangler/testlib/reparent_utils_test.go | 85 +++-- 4 files changed, 328 insertions(+), 183 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 7d9d36a94d9..7ca73c5d4fb 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "runtime" + "slices" "strings" "time" @@ -1237,6 +1238,35 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy return nil } +// Known MySQL replication metadata initialization failures that can be repaired +// by restarting replication. +const ( + relayLogInfoInitializationError = "Replica failed to initialize relay log info structure from the repository" + masterInfoInitializationError = "Could not initialize master info structure" + applierMetadataInitializationError = "Replica failed to initialize applier metadata structure from the repository" +) + +// recoverableReplicationInitializationErrors enumerates the error substrings we +// treat as recoverable through RestartReplication. 
+var recoverableReplicationInitializationErrors = []string{ + relayLogInfoInitializationError, + masterInfoInitializationError, + applierMetadataInitializationError, +} + +// isRecoverableReplicationInitializationError returns true if err contains one +// of the known recoverable metadata initialization failures. +func isRecoverableReplicationInitializationError(err error) bool { + if err == nil { + return false + } + + errMessage := err.Error() + return slices.ContainsFunc(recoverableReplicationInitializationErrors, func(s string) bool { + return strings.Contains(errMessage, s) + }) +} + // handleRelayLogError resets replication of the instance. // This is required because sometimes MySQL gets stuck due to improper initialization of // master info structure or related failures and throws errors like @@ -1247,8 +1277,7 @@ func (tm *TabletManager) handleRelayLogError(ctx context.Context, err error) err // Replica failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START REPLICA // see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 // The same fix also works for https://github.com/vitessio/vitess/issues/10955. 
- if strings.Contains(err.Error(), "Replica failed to initialize relay log info structure from the repository") || - strings.Contains(err.Error(), "Could not initialize master info structure") { + if isRecoverableReplicationInitializationError(err) { // Stop, reset and start replication again to resolve this error if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil { return err diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 31da3abb732..4624d78ec2f 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -18,6 +18,7 @@ package tabletmanager import ( "context" + "errors" "sync/atomic" "testing" "time" @@ -326,3 +327,65 @@ func TestUndoDemotePrimaryStateChange(t *testing.T) { require.NoError(t, err) require.False(t, isReadOnly) } + +func TestHandleRelayLogError(t *testing.T) { + testCases := []struct { + name string + inputErr error + shouldRestart bool + }{ + { + name: "relay log info error", + inputErr: errors.New(relayLogInfoInitializationError), + shouldRestart: true, + }, + { + name: "master info error", + inputErr: errors.New(masterInfoInitializationError), + shouldRestart: true, + }, + { + name: "applier metadata error", + inputErr: errors.New(applierMetadataInitializationError), + shouldRestart: true, + }, + { + name: "unrelated error", + inputErr: errors.New("unexpected replication failure"), + shouldRestart: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + if tc.shouldRestart { + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + } + + tablet := newTestTablet(t, 100, "ks", "0", nil) + tm := &TabletManager{ + MysqlDaemon: fakeMysqlDaemon, + tabletAlias: tablet.Alias, + tmState: &tmState{ + displayState: displayState{ + tablet: 
tablet, + }, + }, + } + + err := tm.handleRelayLogError(context.Background(), tc.inputErr) + if tc.shouldRestart { + require.NoError(t, err) + } else { + require.ErrorIs(t, err, tc.inputErr) + } + + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + }) + } +} diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 3b7daa76de3..1c738ed3eaf 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -593,80 +593,98 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { } func TestPlannedReparentShardRelayLogError(t *testing.T) { - delay := discovery.GetTabletPickerRetryDelay() - defer func() { - discovery.SetTabletPickerRetryDelay(delay) - }() - discovery.SetTabletPickerRetryDelay(5 * time.Millisecond) - - ctx := t.Context() - ts := memorytopo.NewServer(ctx, "cell1") - wr := wrangler.New(vtenv.NewTestEnv(), logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) - vp := NewVtctlPipe(ctx, t, ts) - defer vp.Close() - - // Create a primary, a couple good replicas - primary := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_PRIMARY, nil) - goodReplica1 := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) - - // old primary - primary.FakeMysqlDaemon.ReadOnly = false - primary.FakeMysqlDaemon.Replicating = false - primary.FakeMysqlDaemon.ReplicationStatusError = mysql.ErrNotReplica - primary.FakeMysqlDaemon.SetPrimaryPositionLocked(replication.Position{ - GTIDSet: replication.MariadbGTIDSet{ - 7: replication.MariadbGTID{ - Domain: 7, - Server: 123, - Sequence: 990, - }, + relayErrors := []struct { + name string + message string + }{ + { + name: "relay log info", + message: "Replica failed to initialize relay log info structure from the repository", + }, + { + name: "applier metadata", + message: "Replica failed to initialize applier metadata structure from the 
repository", }, - }) - primary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", } - primary.StartActionLoop(t, wr) - defer primary.StopActionLoop(t) - primary.TM.QueryServiceControl.(*tabletservermock.Controller).SetQueryServiceEnabledForTests(true) - // goodReplica1 is replicating - goodReplica1.FakeMysqlDaemon.ReadOnly = true - goodReplica1.FakeMysqlDaemon.Replicating = true - goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) - goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 3 statements come from tablet startup - "STOP REPLICA", - "FAKE SET SOURCE", - "START REPLICA", - // simulate error that will trigger a call to RestartReplication - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - "START REPLICA", + for _, relayError := range relayErrors { + t.Run(relayError.name, func(t *testing.T) { + delay := discovery.GetTabletPickerRetryDelay() + defer func() { + discovery.SetTabletPickerRetryDelay(delay) + }() + discovery.SetTabletPickerRetryDelay(5 * time.Millisecond) + + ctx := t.Context() + ts := memorytopo.NewServer(ctx, "cell1") + wr := wrangler.New(vtenv.NewTestEnv(), logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) + vp := NewVtctlPipe(ctx, t, ts) + defer vp.Close() + + // Create a primary, a couple good replicas + primary := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_PRIMARY, nil) + goodReplica1 := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) + + // old primary + primary.FakeMysqlDaemon.ReadOnly = false + primary.FakeMysqlDaemon.Replicating = false + primary.FakeMysqlDaemon.ReplicationStatusError = mysql.ErrNotReplica + primary.FakeMysqlDaemon.SetPrimaryPositionLocked(replication.Position{ + GTIDSet: replication.MariadbGTIDSet{ + 7: 
replication.MariadbGTID{ + Domain: 7, + Server: 123, + Sequence: 990, + }, + }, + }) + primary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", + } + primary.StartActionLoop(t, wr) + defer primary.StopActionLoop(t) + primary.TM.QueryServiceControl.(*tabletservermock.Controller).SetQueryServiceEnabledForTests(true) + + // goodReplica1 is replicating + goodReplica1.FakeMysqlDaemon.ReadOnly = true + goodReplica1.FakeMysqlDaemon.Replicating = true + goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // These 3 statements come from tablet startup + "STOP REPLICA", + "FAKE SET SOURCE", + "START REPLICA", + // simulate error that will trigger a call to RestartReplication + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + "START REPLICA", + } + goodReplica1.StartActionLoop(t, wr) + goodReplica1.FakeMysqlDaemon.StopReplicationError = errors.New(relayError.message) + defer goodReplica1.StopActionLoop(t) + + // run PlannedReparentShard + err := vp.Run([]string{ + "PlannedReparentShard", "--wait_replicas_timeout", "10s", "--keyspace_shard", primary.Tablet.Keyspace + "/" + primary.Tablet.Shard, "--new_primary", + topoproto.TabletAliasString(primary.Tablet.Alias), + }) + require.NoError(t, err) + // check what was run + err = primary.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err) + err = goodReplica1.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err) + + assert.False(t, primary.FakeMysqlDaemon.ReadOnly, "primary.FakeMysqlDaemon.ReadOnly set") + assert.True(t, goodReplica1.FakeMysqlDaemon.ReadOnly, "goodReplica1.FakeMysqlDaemon.ReadOnly not set") + assert.True(t, primary.TM.QueryServiceControl.IsServing(), "primary...QueryServiceControl not 
serving") + + // verify the old primary was told to start replicating (and not + // the replica that wasn't replicating in the first place) + assert.True(t, goodReplica1.FakeMysqlDaemon.Replicating, "goodReplica1.FakeMysqlDaemon.Replicating not set") + }) } - goodReplica1.StartActionLoop(t, wr) - goodReplica1.FakeMysqlDaemon.StopReplicationError = errors.New("Replica failed to initialize relay log info structure from the repository") - defer goodReplica1.StopActionLoop(t) - - // run PlannedReparentShard - err := vp.Run([]string{ - "PlannedReparentShard", "--wait_replicas_timeout", "10s", "--keyspace_shard", primary.Tablet.Keyspace + "/" + primary.Tablet.Shard, "--new_primary", - topoproto.TabletAliasString(primary.Tablet.Alias), - }) - require.NoError(t, err) - // check what was run - err = primary.FakeMysqlDaemon.CheckSuperQueryList() - require.NoError(t, err) - err = goodReplica1.FakeMysqlDaemon.CheckSuperQueryList() - require.NoError(t, err) - - assert.False(t, primary.FakeMysqlDaemon.ReadOnly, "primary.FakeMysqlDaemon.ReadOnly set") - assert.True(t, goodReplica1.FakeMysqlDaemon.ReadOnly, "goodReplica1.FakeMysqlDaemon.ReadOnly not set") - assert.True(t, primary.TM.QueryServiceControl.IsServing(), "primary...QueryServiceControl not serving") - - // verify the old primary was told to start replicating (and not - // the replica that wasn't replicating in the first place) - assert.True(t, goodReplica1.FakeMysqlDaemon.Replicating, "goodReplica1.FakeMysqlDaemon.Replicating not set") } // TestPlannedReparentShardRelayLogErrorStartReplication is similar to @@ -674,87 +692,105 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { // is not replicating to start with (IO_Thread is not running) and we // simulate an error from the attempt to start replication func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { - delay := discovery.GetTabletPickerRetryDelay() - defer func() { - discovery.SetTabletPickerRetryDelay(delay) - }() - 
discovery.SetTabletPickerRetryDelay(5 * time.Millisecond) - - ctx := t.Context() - ts := memorytopo.NewServer(ctx, "cell1") - wr := wrangler.New(vtenv.NewTestEnv(), logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) - vp := NewVtctlPipe(ctx, t, ts) - defer vp.Close() - - // Create a primary, a couple good replicas - primary := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_PRIMARY, nil) - goodReplica1 := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) - reparenttestutil.SetKeyspaceDurability(context.Background(), t, ts, "test_keyspace", policy.DurabilitySemiSync) - - // old primary - primary.FakeMysqlDaemon.ReadOnly = false - primary.FakeMysqlDaemon.Replicating = false - primary.FakeMysqlDaemon.ReplicationStatusError = mysql.ErrNotReplica - primary.FakeMysqlDaemon.SetPrimaryPositionLocked(replication.Position{ - GTIDSet: replication.MariadbGTIDSet{ - 7: replication.MariadbGTID{ - Domain: 7, - Server: 123, - Sequence: 990, - }, + relayErrors := []struct { + name string + message string + }{ + { + name: "relay log info", + message: "Replica failed to initialize relay log info structure from the repository", + }, + { + name: "applier metadata", + message: "Replica failed to initialize applier metadata structure from the repository", }, - }) - primary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", } - primary.StartActionLoop(t, wr) - defer primary.StopActionLoop(t) - primary.TM.QueryServiceControl.(*tabletservermock.Controller).SetQueryServiceEnabledForTests(true) - // goodReplica1 is not replicating - goodReplica1.FakeMysqlDaemon.ReadOnly = true - goodReplica1.FakeMysqlDaemon.Replicating = true - goodReplica1.FakeMysqlDaemon.IOThreadRunning = false - goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, 
topoproto.MysqlAddr(primary.Tablet)) - goodReplica1.FakeMysqlDaemon.CurrentSourceHost = primary.Tablet.MysqlHostname - goodReplica1.FakeMysqlDaemon.CurrentSourcePort = primary.Tablet.MysqlPort - goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // simulate error that will trigger a call to RestartReplication - // These 3 statements come from tablet startup - "STOP REPLICA", - "FAKE SET SOURCE", - "START REPLICA", - // In SetReplicationSource, we find that the source host and port was already set correctly, - // So we try to stop and start replication. The first STOP REPLICA comes from there - "STOP REPLICA", - // During the START REPLICA call, we find a relay log error, so we try to restart replication. - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", + for _, relayError := range relayErrors { + t.Run(relayError.name, func(t *testing.T) { + delay := discovery.GetTabletPickerRetryDelay() + defer func() { + discovery.SetTabletPickerRetryDelay(delay) + }() + discovery.SetTabletPickerRetryDelay(5 * time.Millisecond) + + ctx := t.Context() + ts := memorytopo.NewServer(ctx, "cell1") + wr := wrangler.New(vtenv.NewTestEnv(), logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) + vp := NewVtctlPipe(ctx, t, ts) + defer vp.Close() + + // Create a primary, a couple good replicas + primary := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_PRIMARY, nil) + goodReplica1 := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) + reparenttestutil.SetKeyspaceDurability(context.Background(), t, ts, "test_keyspace", policy.DurabilitySemiSync) + + // old primary + primary.FakeMysqlDaemon.ReadOnly = false + primary.FakeMysqlDaemon.Replicating = false + primary.FakeMysqlDaemon.ReplicationStatusError = mysql.ErrNotReplica + primary.FakeMysqlDaemon.SetPrimaryPositionLocked(replication.Position{ + GTIDSet: replication.MariadbGTIDSet{ + 7: replication.MariadbGTID{ + Domain: 7, + Server: 123, + Sequence: 990, + }, + }, + }) + 
primary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, primary_alias, replication_position) VALUES", + } + primary.StartActionLoop(t, wr) + defer primary.StopActionLoop(t) + primary.TM.QueryServiceControl.(*tabletservermock.Controller).SetQueryServiceEnabledForTests(true) + + // goodReplica1 is not replicating + goodReplica1.FakeMysqlDaemon.ReadOnly = true + goodReplica1.FakeMysqlDaemon.Replicating = true + goodReplica1.FakeMysqlDaemon.IOThreadRunning = false + goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs = append(goodReplica1.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + goodReplica1.FakeMysqlDaemon.CurrentSourceHost = primary.Tablet.MysqlHostname + goodReplica1.FakeMysqlDaemon.CurrentSourcePort = primary.Tablet.MysqlPort + goodReplica1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // simulate error that will trigger a call to RestartReplication + // These 3 statements come from tablet startup + "STOP REPLICA", + "FAKE SET SOURCE", + "START REPLICA", + // In SetReplicationSource, we find that the source host and port was already set correctly, + // So we try to stop and start replication. The first STOP REPLICA comes from there + "STOP REPLICA", + // During the START REPLICA call, we find a relay log error, so we try to restart replication. 
+ "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + goodReplica1.StartActionLoop(t, wr) + goodReplica1.FakeMysqlDaemon.StartReplicationError = errors.New(relayError.message) + defer goodReplica1.StopActionLoop(t) + + // run PlannedReparentShard + err := vp.Run([]string{ + "PlannedReparentShard", "--wait_replicas_timeout", "10s", "--keyspace_shard", primary.Tablet.Keyspace + "/" + primary.Tablet.Shard, "--new_primary", + topoproto.TabletAliasString(primary.Tablet.Alias), + }) + require.NoError(t, err) + // check what was run + err = primary.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err) + err = goodReplica1.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err) + + assert.False(t, primary.FakeMysqlDaemon.ReadOnly, "primary.FakeMysqlDaemon.ReadOnly set") + assert.True(t, goodReplica1.FakeMysqlDaemon.ReadOnly, "goodReplica1.FakeMysqlDaemon.ReadOnly not set") + assert.True(t, primary.TM.QueryServiceControl.IsServing(), "primary...QueryServiceControl not serving") + + // verify the old primary was told to start replicating (and not + // the replica that wasn't replicating in the first place) + assert.True(t, goodReplica1.FakeMysqlDaemon.Replicating, "goodReplica1.FakeMysqlDaemon.Replicating not set") + }) } - goodReplica1.StartActionLoop(t, wr) - goodReplica1.FakeMysqlDaemon.StartReplicationError = errors.New("Replica failed to initialize relay log info structure from the repository") - defer goodReplica1.StopActionLoop(t) - - // run PlannedReparentShard - err := vp.Run([]string{ - "PlannedReparentShard", "--wait_replicas_timeout", "10s", "--keyspace_shard", primary.Tablet.Keyspace + "/" + primary.Tablet.Shard, "--new_primary", - topoproto.TabletAliasString(primary.Tablet.Alias), - }) - require.NoError(t, err) - // check what was run - err = primary.FakeMysqlDaemon.CheckSuperQueryList() - require.NoError(t, err) - err = goodReplica1.FakeMysqlDaemon.CheckSuperQueryList() - require.NoError(t, err) - - assert.False(t, 
primary.FakeMysqlDaemon.ReadOnly, "primary.FakeMysqlDaemon.ReadOnly set") - assert.True(t, goodReplica1.FakeMysqlDaemon.ReadOnly, "goodReplica1.FakeMysqlDaemon.ReadOnly not set") - assert.True(t, primary.TM.QueryServiceControl.IsServing(), "primary...QueryServiceControl not serving") - - // verify the old primary was told to start replicating (and not - // the replica that wasn't replicating in the first place) - assert.True(t, goodReplica1.FakeMysqlDaemon.Replicating, "goodReplica1.FakeMysqlDaemon.Replicating not set") } // TestPlannedReparentShardPromoteReplicaFail simulates a failure of the PromoteReplica call diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index c28954b9146..9550f01cafa 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -212,41 +212,58 @@ func TestSetReplicationSource(t *testing.T) { primary.StartActionLoop(t, wr) defer primary.StopActionLoop(t) - // test when we receive a relay log error while starting replication - t.Run("Relay log error", func(t *testing.T) { - replica := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) - // replica loop - // We have to set the settings as replicating. Otherwise, - // the replication manager intervenes and tries to fix replication, - // which ends up making this test unpredictable. - replica.FakeMysqlDaemon.Replicating = true - replica.FakeMysqlDaemon.IOThreadRunning = true - replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) - replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - // These 3 statements come from tablet startup - "STOP REPLICA", - "FAKE SET SOURCE", - "START REPLICA", - // We stop and reset the replication parameters because of relay log issues. 
- "STOP REPLICA", - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - } - replica.StartActionLoop(t, wr) - defer replica.StopActionLoop(t) - - // Set the correct error message that indicates we have received a relay log error. - replica.FakeMysqlDaemon.StartReplicationError = errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log") - // run ReparentTablet - err = wr.SetReplicationSource(ctx, replica.Tablet) - require.NoError(t, err, "SetReplicationSource failed") + relayErrors := []struct { + name string + message string + uid uint32 + }{ + { + name: "master info relay error", + message: "ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log", + uid: 2, + }, + { + name: "applier metadata relay error", + message: "Replica failed to initialize applier metadata structure from the repository", + uid: 5, + }, + } - // check what was run - err = replica.FakeMysqlDaemon.CheckSuperQueryList() - require.NoError(t, err, "CheckSuperQueryList failed") - checkSemiSyncEnabled(t, false, true, replica) - }) + for _, relayError := range relayErrors { + t.Run(relayError.name, func(t *testing.T) { + replica := NewFakeTablet(t, wr, "cell1", relayError.uid, topodatapb.TabletType_REPLICA, nil) + // replica loop + // We have to set the settings as replicating. Otherwise, + // the replication manager intervenes and tries to fix replication, + // which ends up making this test unpredictable. 
+ replica.FakeMysqlDaemon.Replicating = true + replica.FakeMysqlDaemon.IOThreadRunning = true + replica.FakeMysqlDaemon.SetReplicationSourceInputs = append(replica.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(primary.Tablet)) + replica.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + // These 3 statements come from tablet startup + "STOP REPLICA", + "FAKE SET SOURCE", + "START REPLICA", + // We stop and reset the replication parameters because of relay log issues. + "STOP REPLICA", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + replica.StartActionLoop(t, wr) + defer replica.StopActionLoop(t) + + // Set the correct error message that indicates we have received a relay log error. + replica.FakeMysqlDaemon.StartReplicationError = errors.New(relayError.message) + err := wr.SetReplicationSource(ctx, replica.Tablet) + require.NoError(t, err, "SetReplicationSource failed") + + // check what was run + err = replica.FakeMysqlDaemon.CheckSuperQueryList() + require.NoError(t, err, "CheckSuperQueryList failed") + checkSemiSyncEnabled(t, false, true, replica) + }) + } t.Run("Errant GTIDs on the replica", func(t *testing.T) { replica := NewFakeTablet(t, wr, "cell1", 4, topodatapb.TabletType_REPLICA, nil) From 2357fb899466efc30c9b121d14d8d0e0422f7625 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 11 Mar 2026 10:07:25 -0400 Subject: [PATCH 02/27] tabletmanager: use sqlerror for replication init failures Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 3 ++ .../vttablet/tabletmanager/rpc_replication.go | 33 +++++-------------- .../tabletmanager/rpc_replication_test.go | 19 ++++++++--- .../testlib/planned_reparent_shard_test.go | 29 ++++++++-------- go/vt/wrangler/testlib/reparent_utils_test.go | 22 ++++++------- 5 files changed, 52 insertions(+), 54 deletions(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 9a89ee5fc0b..4d40df3d508 100644 --- a/go/mysql/sqlerror/constants.go 
+++ b/go/mysql/sqlerror/constants.go @@ -105,6 +105,7 @@ const ( ERDupUnique = ErrorCode(1169) ERRequiresPrimaryKey = ErrorCode(1173) ERCantDoThisDuringAnTransaction = ErrorCode(1179) + ERMasterInfo = ErrorCode(1201) ERReadOnlyTransaction = ErrorCode(1207) ERCannotAddForeign = ErrorCode(1215) ERNoReferencedRow = ErrorCode(1216) @@ -128,6 +129,8 @@ const ( ERSourceHasPurgedRequiredGtids = ErrorCode(1789) ERInnodbIndexCorrupt = ErrorCode(1817) ERDupIndex = ErrorCode(1831) + ERReplicaCMInitRepository = ErrorCode(1871) + ERReplicaAMInitRepository = ErrorCode(1872) ERInnodbReadOnly = ErrorCode(1874) ERVectorConversion = ErrorCode(6138) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 7ca73c5d4fb..c908dea90bd 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -20,8 +20,6 @@ import ( "context" "fmt" "runtime" - "slices" - "strings" "time" "vitess.io/vitess/go/mysql" @@ -1238,33 +1236,18 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy return nil } -// Known MySQL replication metadata initialization failures that can be repaired -// by restarting replication. -const ( - relayLogInfoInitializationError = "Replica failed to initialize relay log info structure from the repository" - masterInfoInitializationError = "Could not initialize master info structure" - applierMetadataInitializationError = "Replica failed to initialize applier metadata structure from the repository" -) - -// recoverableReplicationInitializationErrors enumerates the error substrings we -// treat as recoverable through RestartReplication. 
-var recoverableReplicationInitializationErrors = []string{ - relayLogInfoInitializationError, - masterInfoInitializationError, - applierMetadataInitializationError, -} - -// isRecoverableReplicationInitializationError returns true if err contains one -// of the known recoverable metadata initialization failures. func isRecoverableReplicationInitializationError(err error) bool { - if err == nil { + sqlErr, ok := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) + if !ok || sqlErr == nil { return false } - errMessage := err.Error() - return slices.ContainsFunc(recoverableReplicationInitializationErrors, func(s string) bool { - return strings.Contains(errMessage, s) - }) + switch sqlErr.Number() { + case sqlerror.ERMasterInfo, sqlerror.ERReplicaCMInitRepository, sqlerror.ERReplicaAMInitRepository: + return true + default: + return false + } } // handleRelayLogError resets replication of the instance. diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 4624d78ec2f..81882f62b30 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -26,6 +26,7 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/sync/semaphore" + "vitess.io/vitess/go/mysql/sqlerror" "vitess.io/vitess/go/protoutil" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/mysqlctl" @@ -335,20 +336,30 @@ func TestHandleRelayLogError(t *testing.T) { shouldRestart bool }{ { - name: "relay log info error", - inputErr: errors.New(relayLogInfoInitializationError), + name: "relay log info repository error", + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), shouldRestart: true, }, { name: "master info error", - inputErr: errors.New(masterInfoInitializationError), + inputErr: sqlerror.NewSQLError(sqlerror.ERMasterInfo, 
sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log"), + shouldRestart: true, + }, + { + name: "connection metadata repository error", + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaCMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), shouldRestart: true, }, { name: "applier metadata error", - inputErr: errors.New(applierMetadataInitializationError), + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), shouldRestart: true, }, + { + name: "applier metadata message with wrong errno", + inputErr: sqlerror.NewSQLError(sqlerror.ERUnknownError, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), + shouldRestart: false, + }, { name: "unrelated error", inputErr: errors.New("unexpected replication failure"), diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 1c738ed3eaf..6161f1ae5cd 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -23,6 +23,7 @@ import ( "time" "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/mysql/sqlerror" "vitess.io/vitess/go/vt/mysqlctl" "vitess.io/vitess/go/vt/vtctl/reparentutil/policy" "vitess.io/vitess/go/vt/vtenv" @@ -594,16 +595,16 @@ func TestPlannedReparentShardWaitForPositionTimeout(t *testing.T) { func TestPlannedReparentShardRelayLogError(t *testing.T) { relayErrors := []struct { - name string - message string + name string + err error }{ { - name: "relay log info", - message: "Replica failed to initialize relay log info structure from the repository", + name: "relay log info", + err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, 
sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { - name: "applier metadata", - message: "Replica failed to initialize applier metadata structure from the repository", + name: "applier metadata", + err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), }, } @@ -661,7 +662,7 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { "START REPLICA", } goodReplica1.StartActionLoop(t, wr) - goodReplica1.FakeMysqlDaemon.StopReplicationError = errors.New(relayError.message) + goodReplica1.FakeMysqlDaemon.StopReplicationError = relayError.err defer goodReplica1.StopActionLoop(t) // run PlannedReparentShard @@ -693,16 +694,16 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { // simulate an error from the attempt to start replication func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { relayErrors := []struct { - name string - message string + name string + err error }{ { - name: "relay log info", - message: "Replica failed to initialize relay log info structure from the repository", + name: "relay log info", + err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { - name: "applier metadata", - message: "Replica failed to initialize applier metadata structure from the repository", + name: "applier metadata", + err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), }, } @@ -767,7 +768,7 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { "START REPLICA", } goodReplica1.StartActionLoop(t, wr) - goodReplica1.FakeMysqlDaemon.StartReplicationError = errors.New(relayError.message) + 
goodReplica1.FakeMysqlDaemon.StartReplicationError = relayError.err defer goodReplica1.StopActionLoop(t) // run PlannedReparentShard diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index 9550f01cafa..b770d9bf214 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -18,13 +18,13 @@ package testlib import ( "context" - "errors" "testing" "time" "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/mysql/sqlerror" "vitess.io/vitess/go/vt/discovery" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" @@ -213,19 +213,19 @@ func TestSetReplicationSource(t *testing.T) { defer primary.StopActionLoop(t) relayErrors := []struct { - name string - message string - uid uint32 + name string + err error + uid uint32 }{ { - name: "master info relay error", - message: "ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log", - uid: 2, + name: "master info relay error", + err: sqlerror.NewSQLError(sqlerror.ERMasterInfo, sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log"), + uid: 2, }, { - name: "applier metadata relay error", - message: "Replica failed to initialize applier metadata structure from the repository", - uid: 5, + name: "applier metadata relay error", + err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), + uid: 5, }, } @@ -254,7 +254,7 @@ func TestSetReplicationSource(t *testing.T) { defer replica.StopActionLoop(t) // Set the correct error message that indicates we have received a relay log error. 
- replica.FakeMysqlDaemon.StartReplicationError = errors.New(relayError.message) + replica.FakeMysqlDaemon.StartReplicationError = relayError.err err := wr.SetReplicationSource(ctx, replica.Tablet) require.NoError(t, err, "SetReplicationSource failed") From f5209b4940103024431bd8f226ef4a1aa62ad74d Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 11 Mar 2026 12:53:02 -0400 Subject: [PATCH 03/27] tabletmanager: clarify replication init recovery helper Signed-off-by: Mohamed Hamza --- .../vttablet/tabletmanager/rpc_replication.go | 43 +++++++++++-------- .../tabletmanager/rpc_replication_test.go | 4 +- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index c908dea90bd..756498cafde 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -19,6 +19,7 @@ package tabletmanager import ( "context" "fmt" + "log/slog" "runtime" "time" @@ -949,7 +950,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { - if err := tm.handleRelayLogError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { return err } } @@ -957,12 +958,12 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA // The address is correct. 
We need to restart replication so that any semi-sync changes if any // are taken into account if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRelayLogError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { return err } } if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRelayLogError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { return err } } @@ -1236,32 +1237,38 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy return nil } +// recoverableReplicationInitializationErrorCodes is the set of replication initialization error +// codes that can be recovered from by restarting replication. +var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ + sqlerror.ERMasterInfo: {}, + sqlerror.ERReplicaCMInitRepository: {}, + sqlerror.ERReplicaAMInitRepository: {}, +} + +// isRecoverableReplicationInitializationError reports whether an error can be recovered from by +// restarting replication. func isRecoverableReplicationInitializationError(err error) bool { sqlErr, ok := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) if !ok || sqlErr == nil { return false } - switch sqlErr.Number() { - case sqlerror.ERMasterInfo, sqlerror.ERReplicaCMInitRepository, sqlerror.ERReplicaAMInitRepository: - return true - default: - return false - } + _, ok = recoverableReplicationInitializationErrorCodes[sqlErr.Number()] + return ok } -// handleRelayLogError resets replication of the instance. 
-// This is required because sometimes MySQL gets stuck due to improper initialization of -// master info structure or related failures and throws errors like -// ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log -// These errors can only be resolved by resetting the replication, otherwise START REPLICA fails. -func (tm *TabletManager) handleRelayLogError(ctx context.Context, err error) error { - // attempt to fix this error: - // Replica failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START REPLICA +// handleRecoverableReplicationInitializationError repairs recoverable replication initialization +// failures by restarting replication. +func (tm *TabletManager) handleRecoverableReplicationInitializationError(ctx context.Context, err error) error { + // Attempt to self-heal by restarting replication when initialization fails. // see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 // The same fix also works for https://github.com/vitessio/vitess/issues/10955. 
if isRecoverableReplicationInitializationError(err) { - // Stop, reset and start replication again to resolve this error + log.Warn( + "Encountered recoverable replication initialization error, restarting replication", + slog.Any("error", err), + ) + if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil { return err } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 81882f62b30..c64685efbb6 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -329,7 +329,7 @@ func TestUndoDemotePrimaryStateChange(t *testing.T) { require.False(t, isReadOnly) } -func TestHandleRelayLogError(t *testing.T) { +func TestHandleRecoverableReplicationInitializationError(t *testing.T) { testCases := []struct { name string inputErr error @@ -389,7 +389,7 @@ func TestHandleRelayLogError(t *testing.T) { }, } - err := tm.handleRelayLogError(context.Background(), tc.inputErr) + err := tm.handleRecoverableReplicationInitializationError(context.Background(), tc.inputErr) if tc.shouldRestart { require.NoError(t, err) } else { From 871bdfd100825a85b7c33808603298de8d485891 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 11 Mar 2026 13:13:20 -0400 Subject: [PATCH 04/27] no abbreviation Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 80 +++++++++---------- .../vttablet/tabletmanager/rpc_replication.go | 6 +- .../tabletmanager/rpc_replication_test.go | 6 +- .../testlib/planned_reparent_shard_test.go | 8 +- go/vt/wrangler/testlib/reparent_utils_test.go | 2 +- 5 files changed, 51 insertions(+), 51 deletions(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 4d40df3d508..fc637715cba 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -92,46 +92,46 @@ const ( ERBinlogCreateRoutineNeedSuper = ErrorCode(1419) // failed precondition - ERNoDb = 
ErrorCode(1046) - ERNoSuchIndex = ErrorCode(1082) - ERCantDropFieldOrKey = ErrorCode(1091) - ERTableNotLockedForWrite = ErrorCode(1099) - ERTableNotLocked = ErrorCode(1100) - ERTooBigSelect = ErrorCode(1104) - ERTableAccessDenied = ErrorCode(1142) - ERNotAllowedCommand = ErrorCode(1148) - ERTooLongString = ErrorCode(1162) - ERDelayedInsertTableLocked = ErrorCode(1165) - ERDupUnique = ErrorCode(1169) - ERRequiresPrimaryKey = ErrorCode(1173) - ERCantDoThisDuringAnTransaction = ErrorCode(1179) - ERMasterInfo = ErrorCode(1201) - ERReadOnlyTransaction = ErrorCode(1207) - ERCannotAddForeign = ErrorCode(1215) - ERNoReferencedRow = ErrorCode(1216) - ERRowIsReferenced = ErrorCode(1217) - ERCantUpdateWithReadLock = ErrorCode(1223) - ERNoDefault = ErrorCode(1230) - ERMasterFatalReadingBinlog = ErrorCode(1236) - EROperandColumns = ErrorCode(1241) - ERSubqueryNo1Row = ErrorCode(1242) - ERUnknownStmtHandler = ErrorCode(1243) - ERWarnDataOutOfRange = ErrorCode(1264) - ERNonUpdateableTable = ErrorCode(1288) - ERFeatureDisabled = ErrorCode(1289) - EROptionPreventsStatement = ErrorCode(1290) - ERDuplicatedValueInType = ErrorCode(1291) - ERSPDoesNotExist = ErrorCode(1305) - ERNoDefaultForField = ErrorCode(1364) - ErSPNotVarArg = ErrorCode(1414) - ERRowIsReferenced2 = ErrorCode(1451) - ErNoReferencedRow2 = ErrorCode(1452) - ERSourceHasPurgedRequiredGtids = ErrorCode(1789) - ERInnodbIndexCorrupt = ErrorCode(1817) - ERDupIndex = ErrorCode(1831) - ERReplicaCMInitRepository = ErrorCode(1871) - ERReplicaAMInitRepository = ErrorCode(1872) - ERInnodbReadOnly = ErrorCode(1874) + ERNoDb = ErrorCode(1046) + ERNoSuchIndex = ErrorCode(1082) + ERCantDropFieldOrKey = ErrorCode(1091) + ERTableNotLockedForWrite = ErrorCode(1099) + ERTableNotLocked = ErrorCode(1100) + ERTooBigSelect = ErrorCode(1104) + ERTableAccessDenied = ErrorCode(1142) + ERNotAllowedCommand = ErrorCode(1148) + ERTooLongString = ErrorCode(1162) + ERDelayedInsertTableLocked = ErrorCode(1165) + ERDupUnique = ErrorCode(1169) + 
ERRequiresPrimaryKey = ErrorCode(1173) + ERCantDoThisDuringAnTransaction = ErrorCode(1179) + ERMasterInfo = ErrorCode(1201) + ERReadOnlyTransaction = ErrorCode(1207) + ERCannotAddForeign = ErrorCode(1215) + ERNoReferencedRow = ErrorCode(1216) + ERRowIsReferenced = ErrorCode(1217) + ERCantUpdateWithReadLock = ErrorCode(1223) + ERNoDefault = ErrorCode(1230) + ERMasterFatalReadingBinlog = ErrorCode(1236) + EROperandColumns = ErrorCode(1241) + ERSubqueryNo1Row = ErrorCode(1242) + ERUnknownStmtHandler = ErrorCode(1243) + ERWarnDataOutOfRange = ErrorCode(1264) + ERNonUpdateableTable = ErrorCode(1288) + ERFeatureDisabled = ErrorCode(1289) + EROptionPreventsStatement = ErrorCode(1290) + ERDuplicatedValueInType = ErrorCode(1291) + ERSPDoesNotExist = ErrorCode(1305) + ERNoDefaultForField = ErrorCode(1364) + ErSPNotVarArg = ErrorCode(1414) + ERRowIsReferenced2 = ErrorCode(1451) + ErNoReferencedRow2 = ErrorCode(1452) + ERSourceHasPurgedRequiredGtids = ErrorCode(1789) + ERInnodbIndexCorrupt = ErrorCode(1817) + ERDupIndex = ErrorCode(1831) + ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) + ERReplicaApplierMetadataInitRepository = ErrorCode(1872) + ERInnodbReadOnly = ErrorCode(1874) ERVectorConversion = ErrorCode(6138) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 756498cafde..4f9d310e41d 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -1240,9 +1240,9 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy // recoverableReplicationInitializationErrorCodes is the set of replication initialization error // codes that can be recovered from by restarting replication. 
var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ - sqlerror.ERMasterInfo: {}, - sqlerror.ERReplicaCMInitRepository: {}, - sqlerror.ERReplicaAMInitRepository: {}, + sqlerror.ERMasterInfo: {}, + sqlerror.ERReplicaConnectionMetadataInitRepository: {}, + sqlerror.ERReplicaApplierMetadataInitRepository: {}, } // isRecoverableReplicationInitializationError reports whether an error can be recovered from by diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index c64685efbb6..2ed0c9d7943 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -337,7 +337,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }{ { name: "relay log info repository error", - inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), shouldRestart: true, }, { @@ -347,12 +347,12 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }, { name: "connection metadata repository error", - inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaCMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), shouldRestart: true, }, { name: "applier metadata error", - inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the 
repository"), + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), shouldRestart: true, }, { diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 6161f1ae5cd..6f9ac165aea 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -600,11 +600,11 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { }{ { name: "relay log info", - err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { name: "applier metadata", - err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), }, } @@ -699,11 +699,11 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { }{ { name: "relay log info", - err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { name: "applier metadata", - err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize 
applier metadata structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), }, } diff --git a/go/vt/wrangler/testlib/reparent_utils_test.go b/go/vt/wrangler/testlib/reparent_utils_test.go index b770d9bf214..dc03fbf5dd5 100644 --- a/go/vt/wrangler/testlib/reparent_utils_test.go +++ b/go/vt/wrangler/testlib/reparent_utils_test.go @@ -224,7 +224,7 @@ func TestSetReplicationSource(t *testing.T) { }, { name: "applier metadata relay error", - err: sqlerror.NewSQLError(sqlerror.ERReplicaAMInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), uid: 5, }, } From 554e615385f01a8c418591bea0fb5c6734d9ec7e Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 11 Mar 2026 13:16:22 -0400 Subject: [PATCH 05/27] change helper name Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication.go | 10 +++++----- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 4f9d310e41d..08fd8dd35ee 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -950,7 +950,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { // This handles both changing the address and starting replication. 
if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { - if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { return err } } @@ -958,12 +958,12 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA // The address is correct. We need to restart replication so that any semi-sync changes if any // are taken into account if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { return err } } if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationInitializationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { return err } } @@ -1257,9 +1257,9 @@ func isRecoverableReplicationInitializationError(err error) bool { return ok } -// handleRecoverableReplicationInitializationError repairs recoverable replication initialization +// handleRecoverableReplicationError repairs recoverable replication initialization // failures by restarting replication. -func (tm *TabletManager) handleRecoverableReplicationInitializationError(ctx context.Context, err error) error { +func (tm *TabletManager) handleRecoverableReplicationError(ctx context.Context, err error) error { // Attempt to self-heal by restarting replication when initialization fails. // see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 // The same fix also works for https://github.com/vitessio/vitess/issues/10955. 
diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 2ed0c9d7943..ea7a716a5a5 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -389,7 +389,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }, } - err := tm.handleRecoverableReplicationInitializationError(context.Background(), tc.inputErr) + err := tm.handleRecoverableReplicationError(context.Background(), tc.inputErr) if tc.shouldRestart { require.NoError(t, err) } else { From f213844c0f2ccda11792ce4b622befafe45ab597 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 11 Mar 2026 13:30:24 -0400 Subject: [PATCH 06/27] Add missing PRS relay error test cases Signed-off-by: Mohamed Hamza --- .../testlib/planned_reparent_shard_test.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 6f9ac165aea..47cd8060dfd 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -602,6 +602,14 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { name: "relay log info", err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, + { + name: "master info", + err: sqlerror.NewSQLError(sqlerror.ERMasterInfo, sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log"), + }, + { + name: "connection metadata", + err: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), + }, { name: "applier metadata", err: 
sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), @@ -701,6 +709,14 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { name: "relay log info", err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, + { + name: "master info", + err: sqlerror.NewSQLError(sqlerror.ERMasterInfo, sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log"), + }, + { + name: "connection metadata", + err: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), + }, { name: "applier metadata", err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), From b151ec20c619e03462a8bf9b6342ebed4e49f6dd Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Fri, 13 Mar 2026 13:18:44 -0400 Subject: [PATCH 07/27] rerun ci Signed-off-by: Mohamed Hamza From 86eafef064afe7b01a87f9599ab173373a179b1e Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Mon, 16 Mar 2026 11:06:19 -0400 Subject: [PATCH 08/27] handleRecoverableReplicationInitError Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication.go | 10 +++++----- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 08fd8dd35ee..fb6c16ba7f3 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -950,7 +950,7 
@@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { - if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { return err } } @@ -958,12 +958,12 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA // The address is correct. We need to restart replication so that any semi-sync changes if any // are taken into account if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { return err } } if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationError(ctx, err); err != nil { + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { return err } } @@ -1257,9 +1257,9 @@ func isRecoverableReplicationInitializationError(err error) bool { return ok } -// handleRecoverableReplicationError repairs recoverable replication initialization +// handleRecoverableReplicationInitError repairs recoverable replication initialization // failures by restarting replication. -func (tm *TabletManager) handleRecoverableReplicationError(ctx context.Context, err error) error { +func (tm *TabletManager) handleRecoverableReplicationInitError(ctx context.Context, err error) error { // Attempt to self-heal by restarting replication when initialization fails. 
// see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 // The same fix also works for https://github.com/vitessio/vitess/issues/10955. diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index ea7a716a5a5..f34159cd922 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -389,7 +389,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }, } - err := tm.handleRecoverableReplicationError(context.Background(), tc.inputErr) + err := tm.handleRecoverableReplicationInitError(context.Background(), tc.inputErr) if tc.shouldRestart { require.NoError(t, err) } else { From 2691e6e4ef2fa98ebfa6e0df358ee3b21b529ba2 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Mon, 16 Mar 2026 12:40:08 -0400 Subject: [PATCH 09/27] add further handling Signed-off-by: Mohamed Hamza --- .../vttablet/tabletmanager/rpc_replication.go | 20 +++- .../tabletmanager/rpc_replication_test.go | 111 ++++++++++++++++++ 2 files changed, 127 insertions(+), 4 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index fb6c16ba7f3..f151527c84d 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -306,7 +306,7 @@ func (tm *TabletManager) StartReplication(ctx context.Context, semiSync bool) er if err := tm.fixSemiSync(ctx, tm.Tablet().Type, semiSyncAction); err != nil { return err } - return tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) + return tm.startReplicationRecoverable(ctx) } // RestartReplication will stop replication and then start it again @@ -335,7 +335,7 @@ func (tm *TabletManager) RestartReplication(ctx context.Context, semiSync bool) } // Start replication - return tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) + return 
tm.startReplicationRecoverable(ctx) } // StartReplicationUntilAfter will start the replication and let it catch up @@ -525,7 +525,9 @@ func (tm *TabletManager) InitReplica(ctx context.Context, parent *topodatapb.Tab return err } if err := tm.MysqlDaemon.SetReplicationSource(ctx, ti.MysqlHostname, ti.MysqlPort, 0, false, true); err != nil { - return err + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { + return err + } } // wait until we get the replicated row, or our context times out @@ -1231,12 +1233,22 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { return vterrors.Wrap(err, "failed to StopReplication") } - if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { + if err := tm.startReplicationRecoverable(ctx); err != nil { return vterrors.Wrap(err, "failed to StartReplication") } return nil } +// startReplicationRecoverable starts replication and handles recoverable errors by resetting replication. +func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error { + if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { + return err + } + } + return nil +} + // recoverableReplicationInitializationErrorCodes is the set of replication initialization error // codes that can be recovered from by restarting replication. 
var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index f34159cd922..584e52a2835 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -38,6 +38,28 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) +func newTestReplicationTM(tablet *topodatapb.Tablet, mysqlDaemon *mysqlctl.FakeMysqlDaemon, ts *topo.Server) *TabletManager { + waitForGrantsComplete := make(chan struct{}) + close(waitForGrantsComplete) + + return &TabletManager{ + actionSema: semaphore.NewWeighted(1), + TopoServer: ts, + MysqlDaemon: mysqlDaemon, + tabletAlias: tablet.Alias, + _waitForGrantsComplete: waitForGrantsComplete, + tmState: &tmState{ + displayState: displayState{ + tablet: tablet, + }, + }, + } +} + +func recoverableReplicationInitErrorForTests() error { + return sqlerror.NewSQLError(sqlerror.ERMasterInfo, sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log") +} + // TestWaitForGrantsToHaveApplied tests that waitForGrantsToHaveApplied only succeeds after waitForDBAGrants has been called. func TestWaitForGrantsToHaveApplied(t *testing.T) { tm := &TabletManager{ @@ -400,3 +422,92 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }) } } + +// TestStartReplicationRecoversFromRecoverableReplicationInitError verifies StartReplication self-heals recoverable init failures. 
+func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + err := tm.StartReplication(context.Background(), false) + require.NoError(t, err) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} + +// TestRestartReplicationRecoversFromRecoverableReplicationInitializationError verifies RestartReplication self-heals recoverable init failures. +func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + err := tm.RestartReplication(context.Background(), false) + require.NoError(t, err) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} + +// TestFixSemiSyncAndReplicationRecoversFromRecoverableReplicationInitializationError verifies semi-sync restart path self-heals recoverable init failures. 
+func TestFixSemiSyncAndReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.Replicating = true + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + err := tm.fixSemiSyncAndReplication(context.Background(), topodatapb.TabletType_REPLICA, SemiSyncActionUnset) + require.NoError(t, err) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} + +// TestInitReplicaRecoversFromRecoverableReplicationInitializationError verifies InitReplica self-heals recoverable init failures from SetReplicationSource(startReplicationAfter=true). +func TestInitReplicaRecoversFromRecoverableReplicationInitializationError(t *testing.T) { + ctx := context.Background() + ts := memorytopo.NewServer(ctx, "cell1") + + _, err := ts.GetOrCreateShard(ctx, "ks", "0") + require.NoError(t, err) + + parent := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 200, + }, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "mysql-primary", + MysqlPort: 3306, + } + require.NoError(t, ts.CreateTablet(ctx, parent)) + + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} + fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "FAKE RESET BINARY LOGS AND GTIDS", + "FAKE SET GLOBAL gtid_purged", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, ts) + err = tm.InitReplica(ctx, parent.Alias, "", 0, false) + require.NoError(t, err) + 
require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} From ce99c0fad47863c577e6d8aa8c50206ebc8152a4 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 18 Mar 2026 14:02:32 -0400 Subject: [PATCH 10/27] Document reused MySQL replication init errnos MySQL 8.0.33 reused error numbers 1871 and 1872 for connection and applier metadata initialization failures, while older versions used the same numbers for master and relay-log repository initialization failures. This documents that version split in `constants.go` and next to `recoverableReplicationInitializationErrorCodes` so the mixed errnos in recovery code are clearly intentional. Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 77 ++++++++++--------- .../vttablet/tabletmanager/rpc_replication.go | 4 +- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index fc637715cba..279e75034d8 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -92,43 +92,46 @@ const ( ERBinlogCreateRoutineNeedSuper = ErrorCode(1419) // failed precondition - ERNoDb = ErrorCode(1046) - ERNoSuchIndex = ErrorCode(1082) - ERCantDropFieldOrKey = ErrorCode(1091) - ERTableNotLockedForWrite = ErrorCode(1099) - ERTableNotLocked = ErrorCode(1100) - ERTooBigSelect = ErrorCode(1104) - ERTableAccessDenied = ErrorCode(1142) - ERNotAllowedCommand = ErrorCode(1148) - ERTooLongString = ErrorCode(1162) - ERDelayedInsertTableLocked = ErrorCode(1165) - ERDupUnique = ErrorCode(1169) - ERRequiresPrimaryKey = ErrorCode(1173) - ERCantDoThisDuringAnTransaction = ErrorCode(1179) - ERMasterInfo = ErrorCode(1201) - ERReadOnlyTransaction = ErrorCode(1207) - ERCannotAddForeign = ErrorCode(1215) - ERNoReferencedRow = ErrorCode(1216) - ERRowIsReferenced = ErrorCode(1217) - ERCantUpdateWithReadLock = ErrorCode(1223) - ERNoDefault = ErrorCode(1230) - ERMasterFatalReadingBinlog = ErrorCode(1236) - EROperandColumns = ErrorCode(1241) - ERSubqueryNo1Row = ErrorCode(1242) - ERUnknownStmtHandler =
ErrorCode(1243) - ERWarnDataOutOfRange = ErrorCode(1264) - ERNonUpdateableTable = ErrorCode(1288) - ERFeatureDisabled = ErrorCode(1289) - EROptionPreventsStatement = ErrorCode(1290) - ERDuplicatedValueInType = ErrorCode(1291) - ERSPDoesNotExist = ErrorCode(1305) - ERNoDefaultForField = ErrorCode(1364) - ErSPNotVarArg = ErrorCode(1414) - ERRowIsReferenced2 = ErrorCode(1451) - ErNoReferencedRow2 = ErrorCode(1452) - ERSourceHasPurgedRequiredGtids = ErrorCode(1789) - ERInnodbIndexCorrupt = ErrorCode(1817) - ERDupIndex = ErrorCode(1831) + ERNoDb = ErrorCode(1046) + ERNoSuchIndex = ErrorCode(1082) + ERCantDropFieldOrKey = ErrorCode(1091) + ERTableNotLockedForWrite = ErrorCode(1099) + ERTableNotLocked = ErrorCode(1100) + ERTooBigSelect = ErrorCode(1104) + ERTableAccessDenied = ErrorCode(1142) + ERNotAllowedCommand = ErrorCode(1148) + ERTooLongString = ErrorCode(1162) + ERDelayedInsertTableLocked = ErrorCode(1165) + ERDupUnique = ErrorCode(1169) + ERRequiresPrimaryKey = ErrorCode(1173) + ERCantDoThisDuringAnTransaction = ErrorCode(1179) + ERMasterInfo = ErrorCode(1201) + ERReadOnlyTransaction = ErrorCode(1207) + ERCannotAddForeign = ErrorCode(1215) + ERNoReferencedRow = ErrorCode(1216) + ERRowIsReferenced = ErrorCode(1217) + ERCantUpdateWithReadLock = ErrorCode(1223) + ERNoDefault = ErrorCode(1230) + ERMasterFatalReadingBinlog = ErrorCode(1236) + EROperandColumns = ErrorCode(1241) + ERSubqueryNo1Row = ErrorCode(1242) + ERUnknownStmtHandler = ErrorCode(1243) + ERWarnDataOutOfRange = ErrorCode(1264) + ERNonUpdateableTable = ErrorCode(1288) + ERFeatureDisabled = ErrorCode(1289) + EROptionPreventsStatement = ErrorCode(1290) + ERDuplicatedValueInType = ErrorCode(1291) + ERSPDoesNotExist = ErrorCode(1305) + ERNoDefaultForField = ErrorCode(1364) + ErSPNotVarArg = ErrorCode(1414) + ERRowIsReferenced2 = ErrorCode(1451) + ErNoReferencedRow2 = ErrorCode(1452) + ERSourceHasPurgedRequiredGtids = ErrorCode(1789) + ERInnodbIndexCorrupt = ErrorCode(1817) + ERDupIndex = ErrorCode(1831) + 
// MySQL reused 1871/1872 in 8.0.33+: older versions reported master/relay-log + // repository init failures under these numbers, while newer versions report + // connection/applier metadata init failures. ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) ERReplicaApplierMetadataInitRepository = ErrorCode(1872) ERInnodbReadOnly = ErrorCode(1874) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index f151527c84d..cd71e7f3919 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -1250,7 +1250,9 @@ func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error } // recoverableReplicationInitializationErrorCodes is the set of replication initialization error -// codes that can be recovered from by restarting replication. +// codes that can be recovered from by restarting replication. MySQL reused 1871/1872 across +// versions, so these numeric errnos intentionally cover both the older master/relay-log names and +// the newer connection/applier metadata names. var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ sqlerror.ERMasterInfo: {}, sqlerror.ERReplicaConnectionMetadataInitRepository: {}, From 7f335e2820955c34214dfc1d1418f4d7f91965a2 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 18 Mar 2026 14:02:57 -0400 Subject: [PATCH 11/27] Add mysqlctl-wrapped replication init error test Replication init recovery depends on `sqlerror.NewSQLErrorFromError` parsing the mysqlctl-style `(errno N) (sqlstate XXXXX)` wrapper returned from `ExecuteSuperQueryListConn`. This adds a regression case for `ExecuteFetch(START REPLICA) failed: ... (errno 1201) (sqlstate HY000)` so `handleRecoverableReplicationInitError` keeps self-healing through that wrapped error format. 
Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 584e52a2835..27e8aa2ca2f 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -382,6 +382,11 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { inputErr: sqlerror.NewSQLError(sqlerror.ERUnknownError, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), shouldRestart: false, }, + { + name: "mysqlctl wrapped master info error", + inputErr: errors.New("ExecuteFetch(START REPLICA) failed: Could not initialize master info structure; more error messages can be found in the MySQL error log (errno 1201) (sqlstate HY000)"), + shouldRestart: true, + }, { name: "unrelated error", inputErr: errors.New("unexpected replication failure"), From 9ee99803cec5cedca6c54a1db570dffcb14aff74 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 18 Mar 2026 14:57:22 -0400 Subject: [PATCH 12/27] sqlerror: parse native MySQL error strings Native MySQL errors such as ERROR 1201 (HY000): ... were not converted into SQLError, so replication recovery no longer recognized that format after switching from substring matching to errno-based handling. This changes NewSQLErrorFromError to extract native MySQL `ERROR <errno> (<sqlstate>): <message>` strings in addition to the existing Vitess-wrapped (errno ...) (sqlstate ...) format, and adds shared-layer and tabletmanager regression coverage for both forms.
Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/sql_error.go | 24 ++++++++++++++++--- go/mysql/sqlerror/sql_error_test.go | 10 ++++++++ .../tabletmanager/rpc_replication_test.go | 5 ++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/go/mysql/sqlerror/sql_error.go b/go/mysql/sqlerror/sql_error.go index 0fbf421797d..21f668446f6 100644 --- a/go/mysql/sqlerror/sql_error.go +++ b/go/mysql/sqlerror/sql_error.go @@ -154,10 +154,23 @@ func (se *SQLError) VtRpcErrorCode() vtrpcpb.Code { } } -var errExtract = regexp.MustCompile(`\(errno ([0-9]*)\) \(sqlstate ([0-9a-zA-Z]{5})\)`) +var ( + // errExtract matches Vitess-wrapped SQL errors ending with `(errno ) (sqlstate )`. + errExtract = regexp.MustCompile(`\(errno ([0-9]*)\) \(sqlstate ([0-9a-zA-Z]{5})\)`) -// NewSQLErrorFromError returns a *SQLError from the provided error. -// If it's not the right type, it still tries to get it from a regexp. + // nativeErrExtract matches native MySQL server errors using `ERROR (): `. + nativeErrExtract = regexp.MustCompile(`ERROR ([0-9]*) \(([0-9a-zA-Z]{5})\):`) +) + +// NewSQLErrorFromError returns a SQLError from the provided error. +// +// - If err already is a SQLError, it returns err unchanged. +// - If err is a Vitess error with a mapped MySQL code, it returns the converted SQLError. +// - If err contains a Vitess-wrapped `(errno ) (sqlstate )` suffix, it extracts that code and state. +// - If err contains a native MySQL `ERROR (): ` string, it extracts that code and state. +// - Otherwise, it maps the Vitess error code to a MySQL error code when one is defined and returns +// a generic SQLError with that code and err.Error() as the message. +// // Notes about the `error` return type: // The function really returns *SQLError or `nil`. Seemingly, the function could just return // `*SQLError` type. However, it really must return `error`. 
The reason is the way `golang` @@ -189,6 +202,11 @@ func NewSQLErrorFromError(err error) error { return extractSQLErrorFromMessage(match, msg) } + match = nativeErrExtract.FindStringSubmatch(msg) + if len(match) >= 2 { + return extractSQLErrorFromMessage(match, msg) + } + return mapToSQLErrorFromErrorCode(err, msg) } diff --git a/go/mysql/sqlerror/sql_error_test.go b/go/mysql/sqlerror/sql_error_test.go index cf2d3a5f4bd..9b88803f29a 100644 --- a/go/mysql/sqlerror/sql_error_test.go +++ b/go/mysql/sqlerror/sql_error_test.go @@ -155,6 +155,16 @@ func TestNewSQLErrorFromError(t *testing.T) { num: ERNoDb, ss: SSNoDB, }, + { + err: errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log"), + num: ERMasterInfo, + ss: SSUnknownSQLState, + }, + { + err: errors.New("ERROR 1872 (HY000): Replica failed to initialize applier metadata structure from the repository"), + num: ERReplicaApplierMetadataInitRepository, + ss: SSUnknownSQLState, + }, { err: errors.New("just some random text here"), num: ERUnknownError, diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 27e8aa2ca2f..4f85782c74b 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -387,6 +387,11 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { inputErr: errors.New("ExecuteFetch(START REPLICA) failed: Could not initialize master info structure; more error messages can be found in the MySQL error log (errno 1201) (sqlstate HY000)"), shouldRestart: true, }, + { + name: "native mysql master info error", + inputErr: errors.New("ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log"), + shouldRestart: true, + }, { name: "unrelated error", inputErr: errors.New("unexpected replication failure"), From 
6e33a9a4ebcfa08c0313767370e242830d60e994 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 18 Mar 2026 15:02:08 -0400 Subject: [PATCH 13/27] Update sql_error.go Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/sql_error.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/mysql/sqlerror/sql_error.go b/go/mysql/sqlerror/sql_error.go index 21f668446f6..e0707cc67cd 100644 --- a/go/mysql/sqlerror/sql_error.go +++ b/go/mysql/sqlerror/sql_error.go @@ -162,9 +162,9 @@ var ( nativeErrExtract = regexp.MustCompile(`ERROR ([0-9]*) \(([0-9a-zA-Z]{5})\):`) ) -// NewSQLErrorFromError returns a SQLError from the provided error. +// NewSQLErrorFromError returns a *SQLError from the provided error. // -// - If err already is a SQLError, it returns err unchanged. +// - If err already is a *SQLError, it returns err unchanged. // - If err is a Vitess error with a mapped MySQL code, it returns the converted SQLError. // - If err contains a Vitess-wrapped `(errno ) (sqlstate )` suffix, it extracts that code and state. // - If err contains a native MySQL `ERROR (): ` string, it extracts that code and state. From 875517d2a090fe2c1d1872d5aee7275b1449c75e Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 18 Mar 2026 15:08:35 -0400 Subject: [PATCH 14/27] Clarify MySQL 1871/1872 reassignment The errno comments around `1871` and `1872` were imprecise about how MySQL changed those codes across versions. This rewrites the comments to state that MySQL used those numbers for master-info and relay-log-info initialization errors through 8.0.32, then reassigned them in 8.0.33 to connection-metadata and applier-metadata initialization errors. 
Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 6 +++--- go/vt/vttablet/tabletmanager/rpc_replication.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 279e75034d8..18125632a61 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -129,9 +129,9 @@ const ( ERSourceHasPurgedRequiredGtids = ErrorCode(1789) ERInnodbIndexCorrupt = ErrorCode(1817) ERDupIndex = ErrorCode(1831) - // MySQL reused 1871/1872 in 8.0.33+: older versions reported master/relay-log - // repository init failures under these numbers, while newer versions report - // connection/applier metadata init failures. + // MySQL used 1871/1872 for master-info and relay-log-info initialization + // errors through 8.0.32, and reassigned those numbers in 8.0.33 to + // connection-metadata and applier-metadata initialization errors. ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) ERReplicaApplierMetadataInitRepository = ErrorCode(1872) ERInnodbReadOnly = ErrorCode(1874) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index cd71e7f3919..c7570688f35 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -1250,9 +1250,9 @@ func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error } // recoverableReplicationInitializationErrorCodes is the set of replication initialization error -// codes that can be recovered from by restarting replication. MySQL reused 1871/1872 across -// versions, so these numeric errnos intentionally cover both the older master/relay-log names and -// the newer connection/applier metadata names. +// codes that can be recovered from by restarting replication. 
MySQL used 1871/1872 for master-info +// and relay-log-info initialization errors through 8.0.32, and reassigned those numbers in 8.0.33 +// to connection-metadata and applier-metadata initialization errors. var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ sqlerror.ERMasterInfo: {}, sqlerror.ERReplicaConnectionMetadataInitRepository: {}, From 7f5c352a74c2b89b5a7f898ffdaed65eef518c29 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Thu, 19 Mar 2026 13:00:47 -0400 Subject: [PATCH 15/27] tabletmanager: recover startup replication init errors Replica startup initialization still wrapped recoverable SetReplicationSource(..., true, true) failures directly, so a tablet coming up as a replica could fail on the same metadata-init errors the other replication paths now self-heal from. This routes initializeReplication through handleRecoverableReplicationInitError for recoverable SetReplicationSource failures, adds a startup regression test, and reuses the recoverable init error helper across the replication tests. 
Signed-off-by: Mohamed Hamza --- .../tabletmanager/rpc_replication_test.go | 10 +-- go/vt/vttablet/tabletmanager/tm_init.go | 4 +- go/vt/vttablet/tabletmanager/tm_init_test.go | 79 +++++++++++++++++++ 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 4f85782c74b..e0486d43066 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -56,7 +56,7 @@ func newTestReplicationTM(tablet *topodatapb.Tablet, mysqlDaemon *mysqlctl.FakeM } } -func recoverableReplicationInitErrorForTests() error { +func recoverableReplicationInitError() error { return sqlerror.NewSQLError(sqlerror.ERMasterInfo, sqlerror.SSUnknownSQLState, "Could not initialize master info structure; more error messages can be found in the MySQL error log") } @@ -436,7 +436,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { // TestStartReplicationRecoversFromRecoverableReplicationInitError verifies StartReplication self-heals recoverable init failures. func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) - fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitError() fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP REPLICA", "RESET REPLICA", @@ -452,7 +452,7 @@ func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing. // TestRestartReplicationRecoversFromRecoverableReplicationInitializationError verifies RestartReplication self-heals recoverable init failures. 
func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) - fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitError() fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP REPLICA", "STOP REPLICA", @@ -470,7 +470,7 @@ func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError func TestFixSemiSyncAndReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) fakeMysqlDaemon.Replicating = true - fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitError() fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP REPLICA", "STOP REPLICA", @@ -507,7 +507,7 @@ func TestInitReplicaRecoversFromRecoverableReplicationInitializationError(t *tes fakeMysqlDaemon := newTestMysqlDaemon(t, 1) fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} - fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitErrorForTests() + fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitError() fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "FAKE RESET BINARY LOGS AND GTIDS", "FAKE SET GLOBAL gtid_purged", diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index d0695e46939..c4e60dd57a8 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -1136,7 +1136,9 @@ func (tm *TabletManager) initializeReplication(ctx context.Context, tabletType t } if err := tm.MysqlDaemon.SetReplicationSource(ctx, currentPrimary.MysqlHostname, currentPrimary.MysqlPort, 0, true, true); err != nil { - return "", vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed") + if err := 
tm.handleRecoverableReplicationInitError(ctx, err); err != nil { + return "", vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed") + } } return primaryStatus.Position, nil diff --git a/go/vt/vttablet/tabletmanager/tm_init_test.go b/go/vt/vttablet/tabletmanager/tm_init_test.go index 99b760c6cc8..685c091d53a 100644 --- a/go/vt/vttablet/tabletmanager/tm_init_test.go +++ b/go/vt/vttablet/tabletmanager/tm_init_test.go @@ -24,10 +24,12 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/sync/semaphore" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/mysql/fakesqldb" + "vitess.io/vitess/go/mysql/replication" "vitess.io/vitess/go/protoutil" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/test/utils" @@ -40,6 +42,8 @@ import ( "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" "vitess.io/vitess/go/vt/topotools" + "vitess.io/vitess/go/vt/vtctl/reparentutil/policy" + "vitess.io/vitess/go/vt/vtctl/reparentutil/reparenttestutil" "vitess.io/vitess/go/vt/vttablet/tabletmanager/semisyncmonitor" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" "vitess.io/vitess/go/vt/vttablet/tabletservermock" @@ -1248,3 +1252,78 @@ func TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip(t *testing.T) { // Should be PRIMARY due to checkPrimaryShip logic assert.Equal(t, topodatapb.TabletType_PRIMARY, ti.Type) } + +// TestInitReplicationRecovery verifies replica startup initialization self-heals recoverable +// init failures returned from SetReplicationSource. 
+func TestInitReplicationRecovery(t *testing.T) { + ctx := t.Context() + ts := memorytopo.NewServer(ctx, "cell1") + tablet := newTestTablet(t, 1, "ks", "0", nil) + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + + tm := &TabletManager{ + actionSema: semaphore.NewWeighted(1), + BatchCtx: ctx, + TopoServer: ts, + MysqlDaemon: fakeMysqlDaemon, + tmc: newFakeTMClient(), + tabletAlias: tablet.Alias, + _waitForGrantsComplete: make(chan struct{}), + tmState: &tmState{ + displayState: displayState{ + tablet: tablet, + }, + }, + } + close(tm._waitForGrantsComplete) + + _, err := ts.GetOrCreateShard(ctx, "ks", "0") + require.NoError(t, err) + require.NoError(t, ts.CreateTablet(ctx, tablet)) + + reparenttestutil.SetKeyspaceDurability(ctx, t, ts, "ks", policy.DurabilityNone) + + primary := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 2, + }, + Hostname: "primary-host", + PortMap: map[string]int32{ + "vt": 1234, + "grpc": 3456, + }, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "mysql-primary", + MysqlPort: 3306, + } + require.NoError(t, ts.CreateTablet(ctx, primary)) + _, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { + si.PrimaryAlias = primary.Alias + return nil + }) + require.NoError(t, err) + + pos, err := replication.ParsePosition(gtidFlavor, gtidPosition) + require.NoError(t, err) + + // Make SetReplicationSource return a recoverable init error and expect the + // startup path to self-heal by restarting replication. 
+ fakeMysqlDaemon.SetPrimaryPositionLocked(pos) + fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} + fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitError() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + // initializeReplication should now succeed and return the primary position + // after routing the recoverable error through RestartReplication. + gotPosition, err := tm.initializeReplication(ctx, topodatapb.TabletType_REPLICA) + require.NoError(t, err) + require.Equal(t, fmt.Sprintf("%s/%s", gtidFlavor, gtidPosition), gotPosition) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} From 9742cf159b337ed6651dea342725f07ed30f4ece Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Thu, 19 Mar 2026 13:37:40 -0400 Subject: [PATCH 16/27] Use `t.Context()` in relay recovery tests The new relay recovery test cases used `context.Background()`, which misses the test-scoped context requested in review. This switches the new `rpc_replication_test.go` and planned reparent test cases to `t.Context()`. 
Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 10 +++++----- go/vt/wrangler/testlib/planned_reparent_shard_test.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index e0486d43066..045129880b9 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -421,7 +421,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }, } - err := tm.handleRecoverableReplicationInitError(context.Background(), tc.inputErr) + err := tm.handleRecoverableReplicationInitError(t.Context(), tc.inputErr) if tc.shouldRestart { require.NoError(t, err) } else { @@ -444,7 +444,7 @@ func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing. } tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) - err := tm.StartReplication(context.Background(), false) + err := tm.StartReplication(t.Context(), false) require.NoError(t, err) require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } @@ -461,7 +461,7 @@ func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError } tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) - err := tm.RestartReplication(context.Background(), false) + err := tm.RestartReplication(t.Context(), false) require.NoError(t, err) require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } @@ -479,14 +479,14 @@ func TestFixSemiSyncAndReplicationRecoversFromRecoverableReplicationInitializati } tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) - err := tm.fixSemiSyncAndReplication(context.Background(), topodatapb.TabletType_REPLICA, SemiSyncActionUnset) + err := tm.fixSemiSyncAndReplication(t.Context(), topodatapb.TabletType_REPLICA, SemiSyncActionUnset) require.NoError(t, 
err) require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } // TestInitReplicaRecoversFromRecoverableReplicationInitializationError verifies InitReplica self-heals recoverable init failures from SetReplicationSource(startReplicationAfter=true). func TestInitReplicaRecoversFromRecoverableReplicationInitializationError(t *testing.T) { - ctx := context.Background() + ctx := t.Context() ts := memorytopo.NewServer(ctx, "cell1") _, err := ts.GetOrCreateShard(ctx, "ks", "0") diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 47cd8060dfd..730d7d46b40 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -740,7 +740,7 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { // Create a primary, a couple good replicas primary := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_PRIMARY, nil) goodReplica1 := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, nil) - reparenttestutil.SetKeyspaceDurability(context.Background(), t, ts, "test_keyspace", policy.DurabilitySemiSync) + reparenttestutil.SetKeyspaceDurability(t.Context(), t, ts, "test_keyspace", policy.DurabilitySemiSync) // old primary primary.FakeMysqlDaemon.ReadOnly = false From d23a352fe92c1573315aa4a15f076063ba9b770e Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:14:27 -0400 Subject: [PATCH 17/27] tabletmanager: make `SetReplicationSource` recover safely Recovering every 1201/1871/1872 failure from `SetReplicationSource` with `RestartReplication` is unsafe because MySQL can raise those errors before the new source is applied. That can restart replication on the old source, while the narrower split in this branch also drops recovery for the running-replica reparent path. 
This changes tabletmanager to route source setup through `setReplicationSourceRecoverable`, which always applies the source without implicitly starting replication, repairs recoverable running-replica source-change failures by restarting and reapplying the requested source, and then handles the final `START REPLICA` step separately. The tests now cover init, running-replica source changes, and the non-running replica error path. Signed-off-by: Mohamed Hamza --- go/vt/mysqlctl/fakemysqldaemon.go | 7 + go/vt/vttablet/tabletmanager/restore.go | 2 +- .../vttablet/tabletmanager/rpc_replication.go | 86 ++++++-- .../tabletmanager/rpc_replication_test.go | 199 +++++++++++++++--- go/vt/vttablet/tabletmanager/tm_init.go | 6 +- go/vt/vttablet/tabletmanager/tm_init_test.go | 10 +- 6 files changed, 257 insertions(+), 53 deletions(-) diff --git a/go/vt/mysqlctl/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon.go index 6d5c13ee47e..415cbbb1a95 100644 --- a/go/vt/mysqlctl/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon.go @@ -148,6 +148,9 @@ type FakeMysqlDaemon struct { // SetReplicationSourceError is used by SetReplicationSource. SetReplicationSourceError error + // SetReplicationSourceFunc overrides SetReplicationSource when it is set. + SetReplicationSourceFunc func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error + // StopReplicationError error is used by StopReplication. StopReplicationError error @@ -549,6 +552,10 @@ func (fmd *FakeMysqlDaemon) SetReplicationPosition(ctx context.Context, pos repl // SetReplicationSource is part of the MysqlDaemon interface. 
func (fmd *FakeMysqlDaemon) SetReplicationSource(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { + if fmd.SetReplicationSourceFunc != nil { + return fmd.SetReplicationSourceFunc(ctx, host, port, heartbeatInterval, stopReplicationBefore, startReplicationAfter) + } + input := fmt.Sprintf("%v:%v", host, port) found := false for _, sourceInput := range fmd.SetReplicationSourceInputs { diff --git a/go/vt/vttablet/tabletmanager/restore.go b/go/vt/vttablet/tabletmanager/restore.go index bc15fcad6bc..691969c0057 100644 --- a/go/vt/vttablet/tabletmanager/restore.go +++ b/go/vt/vttablet/tabletmanager/restore.go @@ -352,7 +352,7 @@ func (tm *TabletManager) disableReplication(ctx context.Context) error { return vterrors.Wrap(err, "failed to reset replication") } - if err := tm.MysqlDaemon.SetReplicationSource(ctx, "//", 0, 0, false, true); err != nil { + if err := tm.setReplicationSourceRecoverable(ctx, "//", 0, 0, false, true); err != nil { return vterrors.Wrap(err, "failed to disable replication") } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index c7570688f35..9fb4d3570ea 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -524,10 +524,9 @@ func (tm *TabletManager) InitReplica(ctx context.Context, parent *topodatapb.Tab if err := tm.MysqlDaemon.SetReplicationPosition(ctx, pos); err != nil { return err } - if err := tm.MysqlDaemon.SetReplicationSource(ctx, ti.MysqlHostname, ti.MysqlPort, 0, false, true); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } + + if err := tm.setReplicationSourceRecoverable(ctx, ti.MysqlHostname, ti.MysqlPort, 0, false, true); err != nil { + return err } // wait until we get the replicated row, or our context times out @@ -950,11 +949,8 @@ func (tm *TabletManager) 
setReplicationSourceLocked(ctx context.Context, parentA } } if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { - // This handles both changing the address and starting replication. - if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } + if err := tm.setReplicationSourceRecoverable(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { + return err } } else if shouldbeReplicating { // The address is correct. We need to restart replication so that any semi-sync changes if any @@ -1241,14 +1237,78 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy // startReplicationRecoverable starts replication and handles recoverable errors by resetting replication. func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error { - if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } + err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) + if err == nil { + return nil } + + // Try to recover from the error. + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { + return err + } + return nil } +// setReplicationSourceRecoverable configures the requested replication source and optionally starts +// replication afterward. If possible, certain errors are recovered by restarting replication. +func (tm *TabletManager) setReplicationSourceRecoverable(ctx context.Context, host string, port int32, heartbeatInterval float64, wasReplicating bool, shouldStartReplication bool) error { + // Create a helper to set the replication without starting replication afterward. This is used so we can better + // handle errors in each stage. 
+ setReplicationSource := func(stopReplicationBefore bool) error { + return tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, stopReplicationBefore, false) + } + + // Let's first try to apply the requested source without starting replication. If the replica was replicating + // before, we tell the helper to stop replication first. + err := setReplicationSource(wasReplicating) + if err == nil { + // If we succeeded, let's start replication but only if it was requested. + if !shouldStartReplication { + return nil + } + + return tm.startReplicationRecoverable(ctx) + } + + // Next, if the error is not one of the recoverable ones, return it. + if !isRecoverableReplicationInitializationError(err) { + return err + } + + // Recovery is performed by restarting replication. If the replica was not previously replicating, + // let's not continue with the recovery so that we don't inadvertently enable replication. + if !wasReplicating { + return err + } + + log.Warn( + "Encountered recoverable replication initialization error while changing replication source, restarting "+ + "replication and reapplying source", + slog.String("source_host", host), + slog.Int("source_port", int(port)), + slog.Any("error", err), + ) + + // Recover from the error by restarting replication. + if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil { + return err + } + + // Now that we've recovered, let's try setting the replication source again. Since we've just + // restarted replication, we tell the helper to stop replication beforehand. + if err := setReplicationSource(true); err != nil { + return err + } + + // The replication source has finally been set. Let's also start replication if it was requested. 
+ if !shouldStartReplication { + return nil + } + + return tm.startReplicationRecoverable(ctx) +} + // recoverableReplicationInitializationErrorCodes is the set of replication initialization error // codes that can be recovered from by restarting replication. MySQL used 1871/1872 for master-info // and relay-log-info initialization errors through 8.0.32, and reassigned those numbers in 8.0.33 diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 045129880b9..46b8d08c083 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -19,6 +19,7 @@ package tabletmanager import ( "context" "errors" + "fmt" "sync/atomic" "testing" "time" @@ -484,40 +485,174 @@ func TestFixSemiSyncAndReplicationRecoversFromRecoverableReplicationInitializati require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } -// TestInitReplicaRecoversFromRecoverableReplicationInitializationError verifies InitReplica self-heals recoverable init failures from SetReplicationSource(startReplicationAfter=true). -func TestInitReplicaRecoversFromRecoverableReplicationInitializationError(t *testing.T) { - ctx := t.Context() - ts := memorytopo.NewServer(ctx, "cell1") +func TestSetReplicationSourceRecovery(t *testing.T) { + t.Run("InitReplica recovers from start replication error", func(t *testing.T) { + ctx := t.Context() + ts := memorytopo.NewServer(ctx, "cell1") - _, err := ts.GetOrCreateShard(ctx, "ks", "0") - require.NoError(t, err) + // Create a shard with a primary that InitReplica will point to. 
+ _, err := ts.GetOrCreateShard(ctx, "ks", "0") + require.NoError(t, err) - parent := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "cell1", - Uid: 200, - }, - Keyspace: "ks", - Shard: "0", - Type: topodatapb.TabletType_PRIMARY, - MysqlHostname: "mysql-primary", - MysqlPort: 3306, - } - require.NoError(t, ts.CreateTablet(ctx, parent)) + parent := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 200, + }, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "mysql-primary", + MysqlPort: 3306, + } + require.NoError(t, ts.CreateTablet(ctx, parent)) + + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + + // Let the source change succeed, then fail the explicit START REPLICA so + // the recovery path is exercised after the source is already configured. + fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitError() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "FAKE RESET BINARY LOGS AND GTIDS", + "FAKE SET GLOBAL gtid_purged", + "FAKE SET SOURCE", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, ts) + + // InitReplica should recover the start failure and still complete. 
+ err = tm.InitReplica(ctx, parent.Alias, "", 0, false) + require.NoError(t, err) + require.Equal(t, "mysql-primary", fakeMysqlDaemon.CurrentSourceHost) + require.EqualValues(t, 3306, fakeMysqlDaemon.CurrentSourcePort) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + }) - fakeMysqlDaemon := newTestMysqlDaemon(t, 1) - fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} - fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitError() - fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "FAKE RESET BINARY LOGS AND GTIDS", - "FAKE SET GLOBAL gtid_purged", - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - } + t.Run("SetReplicationSource recovers on source change for running replica", func(t *testing.T) { + ctx := t.Context() + ts := memorytopo.NewServer(ctx, "cell1") + + tablet := newTestTablet(t, 100, "ks", "0", nil) + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + + // Start from a running replica that still points at the old primary. + fakeMysqlDaemon.Replicating = true + fakeMysqlDaemon.CurrentSourceHost = "mysql-old-primary" + fakeMysqlDaemon.CurrentSourcePort = 3305 + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + "STOP REPLICA", + "FAKE SET SOURCE", + "START REPLICA", + } + + setSourceCalls := 0 + + // Fail the first source-change attempt after the internal STOP REPLICA. + // The second attempt should succeed after recovery has cleared the broken + // metadata and the source should end up on the requested primary. 
+ fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { + setSourceCalls++ + + require.Equal(t, "mysql-new-primary", host) + require.EqualValues(t, 3306, port) + require.Zero(t, heartbeatInterval) + require.True(t, stopReplicationBefore) + require.False(t, startReplicationAfter) + + if setSourceCalls == 1 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"STOP REPLICA"})) + return recoverableReplicationInitError() + } - tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, ts) - err = tm.InitReplica(ctx, parent.Alias, "", 0, false) - require.NoError(t, err) - require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + if setSourceCalls == 2 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"STOP REPLICA", "FAKE SET SOURCE"})) + + fakeMysqlDaemon.CurrentSourceHost = host + fakeMysqlDaemon.CurrentSourcePort = port + + return nil + } + + return fmt.Errorf("unexpected SetReplicationSource call %d", setSourceCalls) + } + + tm := &TabletManager{ + actionSema: semaphore.NewWeighted(1), + BatchCtx: ctx, + TopoServer: ts, + MysqlDaemon: fakeMysqlDaemon, + tmc: newFakeTMClient(), + tabletAlias: tablet.Alias, + _waitForGrantsComplete: make(chan struct{}), + tmState: &tmState{ + displayState: displayState{ + tablet: tablet, + }, + }, + } + close(tm._waitForGrantsComplete) + + // Register both the replica and the new primary in topo. 
+ _, err := ts.GetOrCreateShard(ctx, "ks", "0") + require.NoError(t, err) + require.NoError(t, ts.CreateTablet(ctx, tablet)) + + parent := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 200, + }, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "mysql-new-primary", + MysqlPort: 3306, + } + require.NoError(t, ts.CreateTablet(ctx, parent)) + + // SetReplicationSource should recover the source-change error, then + // leave the replica configured for the new primary. + err = tm.SetReplicationSource(ctx, parent.Alias, 0, "", false, false, 0) + require.NoError(t, err) + + require.Equal(t, 2, setSourceCalls) + require.Equal(t, "mysql-new-primary", fakeMysqlDaemon.CurrentSourceHost) + require.EqualValues(t, 3306, fakeMysqlDaemon.CurrentSourcePort) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + }) + + t.Run("non-running replica returns recoverable source error directly", func(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + + setSourceCalls := 0 + + // When replication was not already running, the helper should not try to + // recover a source-change failure because recovery would start replication + // as a side effect. + fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { + setSourceCalls++ + + require.Equal(t, "mysql-new-primary", host) + require.EqualValues(t, 3306, port) + require.False(t, stopReplicationBefore) + require.False(t, startReplicationAfter) + return recoverableReplicationInitError() + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + + // The original error should be returned unchanged in this case. 
+ err := tm.setReplicationSourceRecoverable(t.Context(), "mysql-new-primary", 3306, 0, false, false) + require.ErrorContains(t, err, "Could not initialize master info structure") + require.Equal(t, 1, setSourceCalls) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + }) } diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index c4e60dd57a8..2b4cac6504c 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -1135,10 +1135,8 @@ func (tm *TabletManager) initializeReplication(ctx context.Context, tabletType t return "", vterrors.New(vtrpc.Code_FAILED_PRECONDITION, fmt.Sprintf("Errant GTID detected - %s; Primary GTID - %s, Replica GTID - %s", errantGtid, primaryPosition, replicaPos.String())) } - if err := tm.MysqlDaemon.SetReplicationSource(ctx, currentPrimary.MysqlHostname, currentPrimary.MysqlPort, 0, true, true); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return "", vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed") - } + if err := tm.setReplicationSourceRecoverable(ctx, currentPrimary.MysqlHostname, currentPrimary.MysqlPort, 0, true, true); err != nil { + return "", vterrors.Wrap(err, "failed to configure replication source") } return primaryStatus.Position, nil diff --git a/go/vt/vttablet/tabletmanager/tm_init_test.go b/go/vt/vttablet/tabletmanager/tm_init_test.go index 685c091d53a..ef82d034d49 100644 --- a/go/vt/vttablet/tabletmanager/tm_init_test.go +++ b/go/vt/vttablet/tabletmanager/tm_init_test.go @@ -1309,12 +1309,16 @@ func TestInitReplicationRecovery(t *testing.T) { pos, err := replication.ParsePosition(gtidFlavor, gtidPosition) require.NoError(t, err) - // Make SetReplicationSource return a recoverable init error and expect the - // startup path to self-heal by restarting replication. 
+ // Make StartReplication return a recoverable init error and expect the + // startup path to self-heal by restarting replication. SetReplicationSource + // is called with startReplicationAfter=false so recovery only applies to + // the separate StartReplication call. fakeMysqlDaemon.SetPrimaryPositionLocked(pos) fakeMysqlDaemon.SetReplicationSourceInputs = []string{"mysql-primary:3306"} - fakeMysqlDaemon.SetReplicationSourceError = recoverableReplicationInitError() + fakeMysqlDaemon.StartReplicationError = recoverableReplicationInitError() fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "FAKE SET SOURCE", "STOP REPLICA", "RESET REPLICA", "START REPLICA", From 0a9d9fb23080f327ee9c386bf47d14e0722e4a18 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:26:45 -0400 Subject: [PATCH 18/27] sqlerror: add aliases for versioned replication errnos MySQL reused errnos 1871 and 1872 for different replication metadata initialization failures across versions, but we only exposed the 8.0.33+ names. This makes older relay-log and master-info test cases read as if they were using the wrong constants. This adds `ERReplicaMasterInfoInitRepository` and `ERReplicaRelayLogInfoInitRepository` as aliases for the pre-8.0.33 meanings while keeping the existing 8.0.33+ names. The shared comment now states explicitly that these errnos are version-dependent. Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 8c5078750b6..8ba8c3e329b 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -131,7 +131,10 @@ const ( ERDupIndex = ErrorCode(1831) // MySQL used 1871/1872 for master-info and relay-log-info initialization // errors through 8.0.32, and reassigned those numbers in 8.0.33 to - // connection-metadata and applier-metadata initialization errors. 
+ // connection-metadata and applier-metadata initialization errors. These + // errnos therefore map to different metadata types depending on version. + ERReplicaMasterInfoInitRepository = ErrorCode(1871) + ERReplicaRelayLogInfoInitRepository = ErrorCode(1872) ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) ERReplicaApplierMetadataInitRepository = ErrorCode(1872) ERInnodbReadOnly = ErrorCode(1874) From 2350f61cbbbeaa31a5d1c3f951d2463790f52447 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:27:55 -0400 Subject: [PATCH 19/27] wrangler: clarify relay-log recovery tests The recovery logic keys off errnos, so the relay-log subtests were using the 8.0.33+ applier-metadata constant with pre-8.0.33 relay-log messages and duplicated the same 1872 coverage. That made the test intent harder to follow. This switches the relay-log cases to `ERReplicaRelayLogInfoInitRepository` and drops the redundant extra 1872 cases in tabletmanager and wrangler test tables. The remaining cases still cover the versioned messages without implying the wrong metadata type. 
Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 7 +------ .../wrangler/testlib/planned_reparent_shard_test.go | 12 ++---------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 46b8d08c083..2b4bc3ba912 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -360,7 +360,7 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { }{ { name: "relay log info repository error", - inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaRelayLogInfoInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), shouldRestart: true, }, { @@ -373,11 +373,6 @@ func TestHandleRecoverableReplicationInitializationError(t *testing.T) { inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), shouldRestart: true, }, - { - name: "applier metadata error", - inputErr: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), - shouldRestart: true, - }, { name: "applier metadata message with wrong errno", inputErr: sqlerror.NewSQLError(sqlerror.ERUnknownError, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 730d7d46b40..8e844777712 100644 --- 
a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -600,7 +600,7 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { }{ { name: "relay log info", - err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaRelayLogInfoInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { name: "master info", @@ -610,10 +610,6 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { name: "connection metadata", err: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the repository"), }, - { - name: "applier metadata", - err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), - }, } for _, relayError := range relayErrors { @@ -707,7 +703,7 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { }{ { name: "relay log info", - err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), + err: sqlerror.NewSQLError(sqlerror.ERReplicaRelayLogInfoInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize relay log info structure from the repository"), }, { name: "master info", @@ -717,10 +713,6 @@ func TestPlannedReparentShardRelayLogErrorStartReplication(t *testing.T) { name: "connection metadata", err: sqlerror.NewSQLError(sqlerror.ERReplicaConnectionMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize connection metadata structure from the 
repository"), }, - { - name: "applier metadata", - err: sqlerror.NewSQLError(sqlerror.ERReplicaApplierMetadataInitRepository, sqlerror.SSUnknownSQLState, "Replica failed to initialize applier metadata structure from the repository"), - }, } for _, relayError := range relayErrors { From dac71ba47276c0997f5f35cfb5fb50ea2e4fc466 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:30:39 -0400 Subject: [PATCH 20/27] Update constants.go Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 8ba8c3e329b..1f2d8c0d4d1 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -129,6 +129,7 @@ const ( ERSourceHasPurgedRequiredGtids = ErrorCode(1789) ERInnodbIndexCorrupt = ErrorCode(1817) ERDupIndex = ErrorCode(1831) + // MySQL used 1871/1872 for master-info and relay-log-info initialization // errors through 8.0.32, and reassigned those numbers in 8.0.33 to // connection-metadata and applier-metadata initialization errors. 
These @@ -137,6 +138,7 @@ const ( ERReplicaRelayLogInfoInitRepository = ErrorCode(1872) ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) ERReplicaApplierMetadataInitRepository = ErrorCode(1872) + ERInnodbReadOnly = ErrorCode(1874) ERVectorConversion = ErrorCode(6138) From 22ff699fbf69765f77e92d3995ae1accfe8c4752 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:36:16 -0400 Subject: [PATCH 21/27] Update rpc_replication.go Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 9fb4d3570ea..1f3438b8b06 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -949,6 +949,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA } } if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { + // This handles both changing the address and starting replication. if err := tm.setReplicationSourceRecoverable(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { return err } From 3ee8dc7e695e063fb6b63fb5e29adb0ba272b73e Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:33:52 -0400 Subject: [PATCH 22/27] tabletmanager: use `startReplicationRecoverable` in reparent restart path The reparent branch that restarts replication after a no-op source check still called `StartReplication` inline and routed the error through `handleRecoverableReplicationInitError` manually. That duplicated the helper we already use everywhere else for the same start recovery behavior. This switches that branch to call `startReplicationRecoverable` directly so the explicit START REPLICA recovery stays in one place. 
Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 1f3438b8b06..0b6e507968f 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -961,10 +961,8 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA return err } } - if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } + if err := tm.startReplicationRecoverable(ctx); err != nil { + return err } } From 4762624b01d831c54ace1f20e9de3a601e1dc4e3 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 10:34:36 -0400 Subject: [PATCH 23/27] tabletmanager: add `stopReplicationRecoverable` The reparent restart branch still open-coded a recoverable `STOP REPLICA` path while start recovery had already been pulled into a helper. That left the stop handling inconsistent and made the restart path harder to read. This adds `stopReplicationRecoverable`, uses it where the reparent restart flow already had identical recoverable-stop behavior, and adds a focused unit test for the new helper. 
Signed-off-by: Mohamed Hamza --- .../vttablet/tabletmanager/rpc_replication.go | 22 +++++++++++++++---- .../tabletmanager/rpc_replication_test.go | 15 +++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 0b6e507968f..a8c94b9f8bd 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -956,10 +956,8 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA } else if shouldbeReplicating { // The address is correct. We need to restart replication so that any semi-sync changes if any // are taken into account - if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } + if err := tm.stopReplicationRecoverable(ctx); err != nil { + return err } if err := tm.startReplicationRecoverable(ctx); err != nil { return err @@ -1234,6 +1232,22 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy return nil } +// stopReplicationRecoverable stops replication and handles recoverable errors +// by resetting replication metadata. +func (tm *TabletManager) stopReplicationRecoverable(ctx context.Context) error { + err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()) + if err == nil { + return nil + } + + // Try to recover from the error. + if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { + return err + } + + return nil +} + // startReplicationRecoverable starts replication and handles recoverable errors by resetting replication. 
func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error { err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 2b4bc3ba912..48943f3d200 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -445,6 +445,21 @@ func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing. require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } +func TestStopReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.StopReplicationError = recoverableReplicationInitError() + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "RESET REPLICA", + "START REPLICA", + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + err := tm.stopReplicationRecoverable(t.Context()) + require.NoError(t, err) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} + // TestRestartReplicationRecoversFromRecoverableReplicationInitializationError verifies RestartReplication self-heals recoverable init failures. func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) From b925a4b7c3c67189f1c333ca186e26cef7b8e54f Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 12:12:42 -0400 Subject: [PATCH 24/27] tabletmanager: recover `SetReplicationSource` with `RESET REPLICA ALL` `SetReplicationSource` source-change failures were still handled with restart-style recovery, which can resume the old source before the requested one is known to be stored. The same recovery shape was also awkward for `STOP REPLICA`, where a recoverable stop failure could return success with replication running. 
This changes `setReplicationSourceRecoverable` to repair recoverable source-change errors by `ResetReplicationParameters`, reapply the requested source, and only start replication when requested. It also stops attempting recoverable handling for `STOP REPLICA`, updates the helper comments, and adds regression coverage for running and non-running replicas. Signed-off-by: Mohamed Hamza --- .../vttablet/tabletmanager/rpc_replication.go | 75 ++++-------- .../tabletmanager/rpc_replication_test.go | 112 +++++++++++++----- 2 files changed, 107 insertions(+), 80 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index a8c94b9f8bd..dcbfde4c2c6 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -955,8 +955,10 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA } } else if shouldbeReplicating { // The address is correct. We need to restart replication so that any semi-sync changes if any - // are taken into account - if err := tm.stopReplicationRecoverable(ctx); err != nil { + // are taken into account. We don't attempt to recover from the known recoverable errors here + // because recovery requires running `STOP REPLICA` in order to reset the replication metadata. + // If we error the first time, we're likely to error the second time as well. + if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { return err } if err := tm.startReplicationRecoverable(ctx); err != nil { @@ -1232,22 +1234,6 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy return nil } -// stopReplicationRecoverable stops replication and handles recoverable errors -// by resetting replication metadata. 
-func (tm *TabletManager) stopReplicationRecoverable(ctx context.Context) error { - err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()) - if err == nil { - return nil - } - - // Try to recover from the error. - if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { - return err - } - - return nil -} - // startReplicationRecoverable starts replication and handles recoverable errors by resetting replication. func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error { err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) @@ -1264,19 +1250,14 @@ func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error } // setReplicationSourceRecoverable configures the requested replication source and optionally starts -// replication afterward. If possible, certain errors are recovered by restarting replication. +// replication afterward. When possible, certain errors are recovered by reinitializing replication +// metadata. func (tm *TabletManager) setReplicationSourceRecoverable(ctx context.Context, host string, port int32, heartbeatInterval float64, wasReplicating bool, shouldStartReplication bool) error { - // Create a helper to set the replication without starting replication afterward. This is used so we can better - // handle errors in each stage. - setReplicationSource := func(stopReplicationBefore bool) error { - return tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, stopReplicationBefore, false) - } - - // Let's first try to apply the requested source without starting replication. If the replica was replicating - // before, we tell the helper to stop replication first. - err := setReplicationSource(wasReplicating) + // Let's first try to apply the requested source without starting replication afterwards. If the + // replica was replicating before, we stop replication first. 
+ err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, false) if err == nil { - // If we succeeded, let's start replication but only if it was requested. + // We succeeded, let's start replication but only if it was requested. if !shouldStartReplication { return nil } @@ -1284,48 +1265,42 @@ func (tm *TabletManager) setReplicationSourceRecoverable(ctx context.Context, ho return tm.startReplicationRecoverable(ctx) } - // Next, if the error is not one of the recoverable ones, return it. + // We hit an error. If the error is not one of the recoverable ones, we can't recover and should return it. if !isRecoverableReplicationInitializationError(err) { return err } - // Recovery is performed by restarting replication. If the replica was not previously replicating, - // let's not continue with the recovery so that we don't inadvertently enable replication. - if !wasReplicating { - return err - } - log.Warn( - "Encountered recoverable replication initialization error while changing replication source, restarting "+ - "replication and reapplying source", + "Encountered recoverable replication initialization error while changing replication source, resetting "+ + "replication parameters and reapplying source", slog.String("source_host", host), slog.Int("source_port", int(port)), slog.Any("error", err), ) - // Recover from the error by restarting replication. - if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil { + // Recover from the error by reinitializing replication metadata through `RESET REPLICA ALL`. + if err := tm.MysqlDaemon.ResetReplicationParameters(ctx); err != nil { return err } - // Now that we've recovered, let's try setting the replication source again. Since we've just - // restarted replication, we tell the helper to stop replication beforehand. - if err := setReplicationSource(true); err != nil { + // Now that we've reinitialized the replication metadata, try setting the source again. 
+ if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, false, false); err != nil { return err } // The replication source has finally been set. Let's also start replication if it was requested. - if !shouldStartReplication { - return nil + if shouldStartReplication { + return tm.startReplicationRecoverable(ctx) } - return tm.startReplicationRecoverable(ctx) + return nil } // recoverableReplicationInitializationErrorCodes is the set of replication initialization error -// codes that can be recovered from by restarting replication. MySQL used 1871/1872 for master-info -// and relay-log-info initialization errors through 8.0.32, and reassigned those numbers in 8.0.33 -// to connection-metadata and applier-metadata initialization errors. +// codes that can be recovered from by reinitializing replication metadata. +// MySQL used 1871/1872 for master-info and relay-log-info initialization errors +// through 8.0.32, and reassigned those numbers in 8.0.33 to connection-metadata +// and applier-metadata initialization errors. var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ sqlerror.ERMasterInfo: {}, sqlerror.ERReplicaConnectionMetadataInitRepository: {}, @@ -1333,7 +1308,7 @@ var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]stru } // isRecoverableReplicationInitializationError reports whether an error can be recovered from by -// restarting replication. +// reinitializing replication metadata. 
func isRecoverableReplicationInitializationError(err error) bool { sqlErr, ok := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) if !ok || sqlErr == nil { diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 48943f3d200..07b0fbced10 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -445,21 +445,6 @@ func TestStartReplicationRecoversFromRecoverableReplicationInitError(t *testing. require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) } -func TestStopReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) { - fakeMysqlDaemon := newTestMysqlDaemon(t, 1) - fakeMysqlDaemon.StopReplicationError = recoverableReplicationInitError() - fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - } - - tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) - err := tm.stopReplicationRecoverable(t.Context()) - require.NoError(t, err) - require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) -} - // TestRestartReplicationRecoversFromRecoverableReplicationInitializationError verifies RestartReplication self-heals recoverable init failures. func TestRestartReplicationRecoversFromRecoverableReplicationInitializationError(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) @@ -555,10 +540,7 @@ func TestSetReplicationSourceRecovery(t *testing.T) { fakeMysqlDaemon.CurrentSourcePort = 3305 fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "STOP REPLICA", - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - "STOP REPLICA", + "FAKE RESET REPLICA ALL", "FAKE SET SOURCE", "START REPLICA", } @@ -567,23 +549,24 @@ func TestSetReplicationSourceRecovery(t *testing.T) { // Fail the first source-change attempt after the internal STOP REPLICA. 
// The second attempt should succeed after recovery has cleared the broken - // metadata and the source should end up on the requested primary. + // metadata and reapplied the requested source. fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { setSourceCalls++ require.Equal(t, "mysql-new-primary", host) require.EqualValues(t, 3306, port) require.Zero(t, heartbeatInterval) - require.True(t, stopReplicationBefore) require.False(t, startReplicationAfter) if setSourceCalls == 1 { + require.True(t, stopReplicationBefore) require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"STOP REPLICA"})) return recoverableReplicationInitError() } if setSourceCalls == 2 { - require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"STOP REPLICA", "FAKE SET SOURCE"})) + require.False(t, stopReplicationBefore) + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"})) fakeMysqlDaemon.CurrentSourceHost = host fakeMysqlDaemon.CurrentSourcePort = port @@ -639,14 +622,19 @@ func TestSetReplicationSourceRecovery(t *testing.T) { require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) }) - t.Run("non-running replica returns recoverable source error directly", func(t *testing.T) { + t.Run("non-running replica reapplies source after recoverable source error", func(t *testing.T) { fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.CurrentSourceHost = "mysql-old-primary" + fakeMysqlDaemon.CurrentSourcePort = 3305 + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "FAKE RESET REPLICA ALL", + "FAKE SET SOURCE", + } setSourceCalls := 0 - // When replication was not already running, the helper should not try to - // recover a source-change failure because recovery would start replication - // as a side effect. 
+ // When replication was not running, recovery should clear any stale source + // settings and reapply the requested source without starting replication. fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { setSourceCalls++ @@ -654,15 +642,79 @@ func TestSetReplicationSourceRecovery(t *testing.T) { require.EqualValues(t, 3306, port) require.False(t, stopReplicationBefore) require.False(t, startReplicationAfter) - return recoverableReplicationInitError() + + if setSourceCalls == 1 { + return recoverableReplicationInitError() + } + + if setSourceCalls == 2 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"})) + + fakeMysqlDaemon.CurrentSourceHost = host + fakeMysqlDaemon.CurrentSourcePort = port + + return nil + } + + return fmt.Errorf("unexpected SetReplicationSource call %d", setSourceCalls) } tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) - // The original error should be returned unchanged in this case. 
err := tm.setReplicationSourceRecoverable(t.Context(), "mysql-new-primary", 3306, 0, false, false) - require.ErrorContains(t, err, "Could not initialize master info structure") - require.Equal(t, 1, setSourceCalls) + require.NoError(t, err) + require.Equal(t, 2, setSourceCalls) + require.Equal(t, "mysql-new-primary", fakeMysqlDaemon.CurrentSourceHost) + require.EqualValues(t, 3306, fakeMysqlDaemon.CurrentSourcePort) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) + }) + + t.Run("non-running replica with start requested reapplies source and starts replication", func(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.CurrentSourceHost = "mysql-old-primary" + fakeMysqlDaemon.CurrentSourcePort = 3305 + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "FAKE RESET REPLICA ALL", + "FAKE SET SOURCE", + "START REPLICA", + } + + setSourceCalls := 0 + + // A source-change failure can happen before the new source is applied. + // Recovery should clear the old source settings, reapply the requested + // source, and only then start replication. 
+ fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { + setSourceCalls++ + + require.Equal(t, "mysql-new-primary", host) + require.EqualValues(t, 3306, port) + require.False(t, stopReplicationBefore) + require.False(t, startReplicationAfter) + + if setSourceCalls == 1 { + return recoverableReplicationInitError() + } + + if setSourceCalls == 2 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"})) + + fakeMysqlDaemon.CurrentSourceHost = host + fakeMysqlDaemon.CurrentSourcePort = port + + return nil + } + + return fmt.Errorf("unexpected SetReplicationSource call %d", setSourceCalls) + } + + tm := newTestReplicationTM(newTestTablet(t, 100, "ks", "0", nil), fakeMysqlDaemon, nil) + + err := tm.setReplicationSourceRecoverable(t.Context(), "mysql-new-primary", 3306, 0, false, true) + require.NoError(t, err) + require.Equal(t, 2, setSourceCalls) + require.Equal(t, "mysql-new-primary", fakeMysqlDaemon.CurrentSourceHost) + require.EqualValues(t, 3306, fakeMysqlDaemon.CurrentSourcePort) require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) }) } From a34cbbde985baa93180c7d4fa455f01b2f250142 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 12:24:30 -0400 Subject: [PATCH 25/27] tabletmanager: stop before `RESET REPLICA ALL` recovery MySQL requires the replica SQL and I/O threads to be stopped before `RESET REPLICA [ALL]`, but the new `SetReplicationSource` recovery path could reach `ResetReplicationParameters` immediately after a failed source-change attempt on a running replica. That left the recovery logic depending on an unstated assumption about the failed `SetReplicationSource(..., stopReplicationBefore=true)` call. 
This now issues an explicit `StopReplication` before `ResetReplicationParameters` when the replica was running, and updates the running-replica recovery test to expect that extra stop. Signed-off-by: Mohamed Hamza --- go/vt/vttablet/tabletmanager/rpc_replication.go | 11 ++++++++++- go/vt/vttablet/tabletmanager/rpc_replication_test.go | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index dcbfde4c2c6..f58a381742a 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -1278,7 +1278,16 @@ func (tm *TabletManager) setReplicationSourceRecoverable(ctx context.Context, ho slog.Any("error", err), ) - // Recover from the error by reinitializing replication metadata through `RESET REPLICA ALL`. + // If the replica was running when the source-change attempt failed, stop it + // explicitly before resetting replication metadata. + if wasReplicating { + if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { + return err + } + } + + // Recover from the error by reinitializing replication metadata through + // `RESET REPLICA ALL`. 
if err := tm.MysqlDaemon.ResetReplicationParameters(ctx); err != nil { return err } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index 07b0fbced10..c7a9e67c1fb 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -539,6 +539,7 @@ func TestSetReplicationSourceRecovery(t *testing.T) { fakeMysqlDaemon.CurrentSourceHost = "mysql-old-primary" fakeMysqlDaemon.CurrentSourcePort = 3305 fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", "STOP REPLICA", "FAKE RESET REPLICA ALL", "FAKE SET SOURCE", From 1f8320ad1f78c0f24044e5f2c7f1921ae70e26df Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 12:28:00 -0400 Subject: [PATCH 26/27] sqlerror: fix `gofumpt` formatting The replication errno alias block in `go/mysql/sqlerror/constants.go` kept the old aligned spacing on `ERInnodbReadOnly`, which fails `gofumpt`. This reapplies the formatter output so CI accepts the file. 
Signed-off-by: Mohamed Hamza --- go/mysql/sqlerror/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/mysql/sqlerror/constants.go b/go/mysql/sqlerror/constants.go index 1f2d8c0d4d1..45a051a4939 100644 --- a/go/mysql/sqlerror/constants.go +++ b/go/mysql/sqlerror/constants.go @@ -139,7 +139,7 @@ const ( ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) ERReplicaApplierMetadataInitRepository = ErrorCode(1872) - ERInnodbReadOnly = ErrorCode(1874) + ERInnodbReadOnly = ErrorCode(1874) ERVectorConversion = ErrorCode(6138) From 37ce03c297f42c9819dd07322bf8094bb3675c11 Mon Sep 17 00:00:00 2001 From: Mohamed Hamza Date: Wed, 1 Apr 2026 13:52:56 -0400 Subject: [PATCH 27/27] wrangler: update PRS relay-log stop error expectations `TestPlannedReparentShardRelayLogError` still expected `PlannedReparentShard` to succeed when `STOP REPLICA` returned a recoverable metadata-init error, but the tabletmanager change intentionally removed stop recovery. That made the wrangler test fail in CI even though the supported `START REPLICA` recovery path still works. This changes the stop-error PRS test to expect `SetReplicationSource` to fail and keeps the start-error companion test as the success-path coverage for the recovery we still support. 
Signed-off-by: Mohamed Hamza --- .../testlib/planned_reparent_shard_test.go | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 8e844777712..7d581731341 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -659,11 +659,6 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { "STOP REPLICA", "FAKE SET SOURCE", "START REPLICA", - // simulate error that will trigger a call to RestartReplication - "STOP REPLICA", - "RESET REPLICA", - "START REPLICA", - "START REPLICA", } goodReplica1.StartActionLoop(t, wr) goodReplica1.FakeMysqlDaemon.StopReplicationError = relayError.err @@ -674,20 +669,13 @@ func TestPlannedReparentShardRelayLogError(t *testing.T) { "PlannedReparentShard", "--wait_replicas_timeout", "10s", "--keyspace_shard", primary.Tablet.Keyspace + "/" + primary.Tablet.Shard, "--new_primary", topoproto.TabletAliasString(primary.Tablet.Alias), }) - require.NoError(t, err) + require.ErrorContains(t, err, "failed to SetReplicationSource") + require.ErrorContains(t, err, relayError.err.Error()) // check what was run err = primary.FakeMysqlDaemon.CheckSuperQueryList() require.NoError(t, err) err = goodReplica1.FakeMysqlDaemon.CheckSuperQueryList() require.NoError(t, err) - - assert.False(t, primary.FakeMysqlDaemon.ReadOnly, "primary.FakeMysqlDaemon.ReadOnly set") - assert.True(t, goodReplica1.FakeMysqlDaemon.ReadOnly, "goodReplica1.FakeMysqlDaemon.ReadOnly not set") - assert.True(t, primary.TM.QueryServiceControl.IsServing(), "primary...QueryServiceControl not serving") - - // verify the old primary was told to start replicating (and not - // the replica that wasn't replicating in the first place) - assert.True(t, goodReplica1.FakeMysqlDaemon.Replicating, "goodReplica1.FakeMysqlDaemon.Replicating not set") }) } }