-
Notifications
You must be signed in to change notification settings - Fork 2.4k
vttablet: handle applier metadata init failures in relay-log recovery #19560
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
721e63e
2357fb8
f5209b4
871bdfd
554e615
f213844
b151ec2
86eafef
2691e6e
ce99c0f
7f335e2
9ee9980
6e33a9a
875517d
7f5c352
9742cf1
2124145
d23a352
0a9d9fb
2350f61
dac71ba
22ff699
3ee8dc7
4762624
b925a4b
a34cbbd
1f8320a
37ce03c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -105,6 +105,7 @@ const ( | |
| ERDupUnique = ErrorCode(1169) | ||
| ERRequiresPrimaryKey = ErrorCode(1173) | ||
| ERCantDoThisDuringAnTransaction = ErrorCode(1179) | ||
| ERMasterInfo = ErrorCode(1201) | ||
| ERReadOnlyTransaction = ErrorCode(1207) | ||
| ERCannotAddForeign = ErrorCode(1215) | ||
| ERNoReferencedRow = ErrorCode(1216) | ||
|
|
@@ -128,7 +129,17 @@ const ( | |
| ERSourceHasPurgedRequiredGtids = ErrorCode(1789) | ||
| ERInnodbIndexCorrupt = ErrorCode(1817) | ||
| ERDupIndex = ErrorCode(1831) | ||
| ERInnodbReadOnly = ErrorCode(1874) | ||
|
|
||
| // MySQL used 1871/1872 for master-info and relay-log-info initialization | ||
| // errors through 8.0.32, and reassigned those numbers in 8.0.33 to | ||
| // connection-metadata and applier-metadata initialization errors. These | ||
| // errnos therefore map to different metadata types depending on version. | ||
| ERReplicaMasterInfoInitRepository = ErrorCode(1871) | ||
| ERReplicaRelayLogInfoInitRepository = ErrorCode(1872) | ||
| ERReplicaConnectionMetadataInitRepository = ErrorCode(1871) | ||
| ERReplicaApplierMetadataInitRepository = ErrorCode(1872) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not 1873, but maybe we should use 1871 and 1872? |
||
|
|
||
| ERInnodbReadOnly = ErrorCode(1874) | ||
|
|
||
| ERVectorConversion = ErrorCode(6138) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,8 @@ package tabletmanager | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "log/slog" | ||
| "runtime" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "vitess.io/vitess/go/mysql" | ||
|
|
@@ -306,7 +306,7 @@ func (tm *TabletManager) StartReplication(ctx context.Context, semiSync bool) er | |
| if err := tm.fixSemiSync(ctx, tm.Tablet().Type, semiSyncAction); err != nil { | ||
| return err | ||
| } | ||
| return tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) | ||
| return tm.startReplicationRecoverable(ctx) | ||
| } | ||
|
|
||
| // RestartReplication will stop replication and then start it again | ||
|
|
@@ -335,7 +335,7 @@ func (tm *TabletManager) RestartReplication(ctx context.Context, semiSync bool) | |
| } | ||
|
|
||
| // Start replication | ||
| return tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) | ||
| return tm.startReplicationRecoverable(ctx) | ||
| } | ||
|
|
||
| // StartReplicationUntilAfter will start the replication and let it catch up | ||
|
|
@@ -524,7 +524,8 @@ func (tm *TabletManager) InitReplica(ctx context.Context, parent *topodatapb.Tab | |
| if err := tm.MysqlDaemon.SetReplicationPosition(ctx, pos); err != nil { | ||
| return err | ||
| } | ||
| if err := tm.MysqlDaemon.SetReplicationSource(ctx, ti.MysqlHostname, ti.MysqlPort, 0, false, true); err != nil { | ||
|
|
||
| if err := tm.setReplicationSourceRecoverable(ctx, ti.MysqlHostname, ti.MysqlPort, 0, false, true); err != nil { | ||
| return err | ||
| } | ||
|
|
||
|
|
@@ -949,23 +950,19 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA | |
| } | ||
| if status.SourceHost != host || status.SourcePort != port || heartbeatInterval != 0 { | ||
| // This handles both changing the address and starting replication. | ||
| if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { | ||
| if err := tm.handleRelayLogError(ctx, err); err != nil { | ||
| return err | ||
| } | ||
| if err := tm.setReplicationSourceRecoverable(ctx, host, port, heartbeatInterval, wasReplicating, shouldbeReplicating); err != nil { | ||
| return err | ||
| } | ||
| } else if shouldbeReplicating { | ||
| // The address is correct. We need to restart replication so that any semi-sync changes if any | ||
| // are taken into account | ||
| // are taken into account. We don't attempt to recover from the known recoverable errors here | ||
| // because recovery requires running `STOP REPLICA` in order to reset the replication metadata. | ||
| // If we error the first time, we're likely to error the second time as well. | ||
| if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| if err := tm.handleRelayLogError(ctx, err); err != nil { | ||
| return err | ||
| } | ||
| return err | ||
| } | ||
|
Comment on lines
961
to
963
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In the Useful? React with 👍 / 👎.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is intentional, see #19560 (comment) |
||
| if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| if err := tm.handleRelayLogError(ctx, err); err != nil { | ||
| return err | ||
| } | ||
| if err := tm.startReplicationRecoverable(ctx); err != nil { | ||
| return err | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1231,25 +1228,118 @@ func (tm *TabletManager) fixSemiSyncAndReplication(ctx context.Context, tabletTy | |
| if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| return vterrors.Wrap(err, "failed to StopReplication") | ||
| } | ||
| if err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| if err := tm.startReplicationRecoverable(ctx); err != nil { | ||
| return vterrors.Wrap(err, "failed to StartReplication") | ||
| } | ||
| return nil | ||
|
mattlord marked this conversation as resolved.
|
||
| } | ||
|
|
||
| // handleRelayLogError resets replication of the instance. | ||
| // This is required because sometimes MySQL gets stuck due to improper initialization of | ||
| // master info structure or related failures and throws errors like | ||
| // ERROR 1201 (HY000): Could not initialize master info structure; more error messages can be found in the MySQL error log | ||
| // These errors can only be resolved by resetting the replication, otherwise START REPLICA fails. | ||
| func (tm *TabletManager) handleRelayLogError(ctx context.Context, err error) error { | ||
| // attempt to fix this error: | ||
| // Replica failed to initialize relay log info structure from the repository (errno 1872) (sqlstate HY000) during query: START REPLICA | ||
| // startReplicationRecoverable starts replication and handles recoverable errors by resetting replication. | ||
| func (tm *TabletManager) startReplicationRecoverable(ctx context.Context) error { | ||
| err := tm.MysqlDaemon.StartReplication(ctx, tm.hookExtraEnv()) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
|
|
||
| // Try to recover from the error. | ||
| if err := tm.handleRecoverableReplicationInitError(ctx, err); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // setReplicationSourceRecoverable configures the requested replication source and optionally starts | ||
| // replication afterward. When possible, certain errors are recovered by reinitializing replication | ||
| // metadata. | ||
| func (tm *TabletManager) setReplicationSourceRecoverable(ctx context.Context, host string, port int32, heartbeatInterval float64, wasReplicating bool, shouldStartReplication bool) error { | ||
| // Let's first try to apply the requested source without starting replication afterwards. If the | ||
| // replica was replicating before, we stop replication first. | ||
| err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, wasReplicating, false) | ||
| if err == nil { | ||
| // We succeeded, let's start replication but only if it was requested. | ||
| if !shouldStartReplication { | ||
| return nil | ||
| } | ||
|
|
||
| return tm.startReplicationRecoverable(ctx) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When Useful? React with 👍 / 👎.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is certainly a behavior change, but I'm not sure it's a wrong change. I'd expect the hook to run in this case, but it may cause some unexpected behavior. |
||
| } | ||
|
|
||
| // We hit an error. If the error is not one of the recoverable ones, we can't recover and should return it. | ||
| if !isRecoverableReplicationInitializationError(err) { | ||
| return err | ||
| } | ||
|
|
||
| log.Warn( | ||
| "Encountered recoverable replication initialization error while changing replication source, resetting "+ | ||
| "replication parameters and reapplying source", | ||
| slog.String("source_host", host), | ||
| slog.Int("source_port", int(port)), | ||
| slog.Any("error", err), | ||
| ) | ||
|
|
||
| // If the replica was running when the source-change attempt failed, stop it | ||
| // explicitly before resetting replication metadata. | ||
| if wasReplicating { | ||
| if err := tm.MysqlDaemon.StopReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| // Recover from the error by reinitializing replication metadata through | ||
| // `RESET REPLICA ALL`. | ||
| if err := tm.MysqlDaemon.ResetReplicationParameters(ctx); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // Now that we've reinitialized the replication metadata, try setting the source again. | ||
| if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, heartbeatInterval, false, false); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // The replication source has finally been set. Let's also start replication if it was requested. | ||
| if shouldStartReplication { | ||
| return tm.startReplicationRecoverable(ctx) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // recoverableReplicationInitializationErrorCodes is the set of replication initialization error | ||
| // codes that can be recovered from by reinitializing replication metadata. | ||
| // MySQL used 1871/1872 for master-info and relay-log-info initialization errors | ||
| // through 8.0.32, and reassigned those numbers in 8.0.33 to connection-metadata | ||
| // and applier-metadata initialization errors. | ||
| var recoverableReplicationInitializationErrorCodes = map[sqlerror.ErrorCode]struct{}{ | ||
| sqlerror.ERMasterInfo: {}, | ||
| sqlerror.ERReplicaConnectionMetadataInitRepository: {}, | ||
| sqlerror.ERReplicaApplierMetadataInitRepository: {}, | ||
|
mhamza15 marked this conversation as resolved.
|
||
| } | ||
|
|
||
| // isRecoverableReplicationInitializationError reports whether an error can be recovered from by | ||
| // reinitializing replication metadata. | ||
| func isRecoverableReplicationInitializationError(err error) bool { | ||
| sqlErr, ok := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) | ||
|
mattlord marked this conversation as resolved.
|
||
| if !ok || sqlErr == nil { | ||
| return false | ||
| } | ||
|
|
||
|
mhamza15 marked this conversation as resolved.
|
||
| _, ok = recoverableReplicationInitializationErrorCodes[sqlErr.Number()] | ||
| return ok | ||
| } | ||
|
|
||
| // handleRecoverableReplicationInitError repairs recoverable replication initialization | ||
| // failures by restarting replication. | ||
| func (tm *TabletManager) handleRecoverableReplicationInitError(ctx context.Context, err error) error { | ||
| // Attempt to self-heal by restarting replication when initialization fails. | ||
| // see https://bugs.mysql.com/bug.php?id=83713 or https://github.com/vitessio/vitess/issues/5067 | ||
| // The same fix also works for https://github.com/vitessio/vitess/issues/10955. | ||
| if strings.Contains(err.Error(), "Replica failed to initialize relay log info structure from the repository") || | ||
| strings.Contains(err.Error(), "Could not initialize master info structure") { | ||
| // Stop, reset and start replication again to resolve this error | ||
| if isRecoverableReplicationInitializationError(err) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess we don't have access to the MySQL error codes here? It feels quite brittle to check against the error message strings - but if that's the only thing we can do here it's fine.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I was thinking the same thing, but it gets flattened earlier: https://github.com/vitessio/vitess/blob/main/go/vt/mysqlctl/query.go?plain=1#L84. It's definitely doable and preferable, but I think it'd require a bit of a refactor that I'll leave as a follow-up.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 that I just wanted to point out many RPCs map
mattlord marked this conversation as resolved.
mhamza15 marked this conversation as resolved.
|
||
| log.Warn( | ||
| "Encountered recoverable replication initialization error, restarting replication", | ||
| slog.Any("error", err), | ||
| ) | ||
|
|
||
| if err := tm.MysqlDaemon.RestartReplication(ctx, tm.hookExtraEnv()); err != nil { | ||
| return err | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.