Skip to content

Commit 721e63e

Browse files
committed
vttablet: handle applier metadata init failures in relay-log recovery
`handleRelayLogError` currently restarts replication for the known recoverable metadata-initialization failures (relay log info and master info). MySQL can also return the error `Replica failed to initialize applier metadata structure from the repository`. This commit treats that error as part of the same recoverable class and handles it the same way, by triggering `RestartReplication` (STOP REPLICA, RESET REPLICA, START REPLICA). Signed-off-by: Mohamed Hamza <mhamza@fastmail.com>
1 parent 9daf71e commit 721e63e

4 files changed

Lines changed: 328 additions & 183 deletions

File tree

go/vt/vttablet/tabletmanager/rpc_replication.go

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"runtime"
23+
"slices"
2324
"strings"
2425
"time"
2526

@@ -1237,6 +1238,35 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy
12371238
return nil
12381239
}
12391240

1241+
// Message fragments of the MySQL replication metadata initialization failures
// that are repaired by restarting replication.
const (
	// relayLogInfoInitializationError is the fragment seen (errno 1872) when
	// the relay log info structure cannot be initialized, e.g. on START REPLICA.
	relayLogInfoInitializationError = "Replica failed to initialize relay log info structure from the repository"

	// masterInfoInitializationError is the fragment seen when the master info
	// structure was not initialized properly.
	masterInfoInitializationError = "Could not initialize master info structure"

	// applierMetadataInitializationError is the fragment seen when the applier
	// metadata structure cannot be initialized from the repository.
	applierMetadataInitializationError = "Replica failed to initialize applier metadata structure from the repository"
)

// recoverableReplicationInitializationErrors is the full list of error
// substrings that are considered recoverable by calling RestartReplication.
var recoverableReplicationInitializationErrors = []string{
	relayLogInfoInitializationError,
	masterInfoInitializationError,
	applierMetadataInitializationError,
}
1256+
1257+
// isRecoverableReplicationInitializationError returns true if err contains one
1258+
// of the known recoverable metadata initialization failures.
1259+
func isRecoverableReplicationInitializationError(err error) bool {
1260+
if err == nil {
1261+
return false
1262+
}
1263+
1264+
errMessage := err.Error()
1265+
return slices.ContainsFunc(recoverableReplicationInitializationErrors, func(s string) bool {
1266+
return strings.Contains(errMessage, s)
1267+
})
1268+
}
1269+
12401270
// handleRelayLogError resets replication of the instance.
12411271
// This is required because sometimes MySQL gets stuck due to improper initialization of
12421272
// master info structure or related failures and throws errors like
@@ -1247,8 +1277,7 @@ func (tm *TabletManager) handleRelayLogError(ctx context.Context, err error) err
12471277
// Replica failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START REPLICA
12481278
// see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067
12491279
// The same fix also works for https://github.com/vitessio/vitess/issues/10955.
1250-
if strings.Contains(err.Error(), "Replica failed to initialize relay log info structure from the repository") ||
1251-
strings.Contains(err.Error(), "Could not initialize master info structure") {
1280+
if isRecoverableReplicationInitializationError(err) {
12521281
// Stop, reset and start replication again to resolve this error
12531282
if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil {
12541283
return err

go/vt/vttablet/tabletmanager/rpc_replication_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package tabletmanager
1818

1919
import (
2020
"context"
21+
"errors"
2122
"sync/atomic"
2223
"testing"
2324
"time"
@@ -326,3 +327,65 @@ func TestUndoDemotePrimaryStateChange(t *testing.T) {
326327
require.NoError(t, err)
327328
require.False(t, isReadOnly)
328329
}
330+
331+
func TestHandleRelayLogError(t *testing.T) {
332+
testCases := []struct {
333+
name string
334+
inputErr error
335+
shouldRestart bool
336+
}{
337+
{
338+
name: "relay log info error",
339+
inputErr: errors.New(relayLogInfoInitializationError),
340+
shouldRestart: true,
341+
},
342+
{
343+
name: "master info error",
344+
inputErr: errors.New(masterInfoInitializationError),
345+
shouldRestart: true,
346+
},
347+
{
348+
name: "applier metadata error",
349+
inputErr: errors.New(applierMetadataInitializationError),
350+
shouldRestart: true,
351+
},
352+
{
353+
name: "unrelated error",
354+
inputErr: errors.New("unexpected replication failure"),
355+
shouldRestart: false,
356+
},
357+
}
358+
359+
for _, tc := range testCases {
360+
t.Run(tc.name, func(t *testing.T) {
361+
fakeMysqlDaemon := newTestMysqlDaemon(t, 1)
362+
if tc.shouldRestart {
363+
fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
364+
"STOP REPLICA",
365+
"RESET REPLICA",
366+
"START REPLICA",
367+
}
368+
}
369+
370+
tablet := newTestTablet(t, 100, "ks", "0", nil)
371+
tm := &TabletManager{
372+
MysqlDaemon: fakeMysqlDaemon,
373+
tabletAlias: tablet.Alias,
374+
tmState: &tmState{
375+
displayState: displayState{
376+
tablet: tablet,
377+
},
378+
},
379+
}
380+
381+
err := tm.handleRelayLogError(context.Background(), tc.inputErr)
382+
if tc.shouldRestart {
383+
require.NoError(t, err)
384+
} else {
385+
require.ErrorIs(t, err, tc.inputErr)
386+
}
387+
388+
require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList())
389+
})
390+
}
391+
}

0 commit comments

Comments
 (0)