Skip to content

Commit bd45bd6

Browse files
[release-22.0] vtorc: detect errant GTIDs for replicas not connected to primary (#19224) (#19233)
Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com>
1 parent 04ada26 commit bd45bd6

2 files changed

Lines changed: 170 additions & 36 deletions

File tree

go/vt/vtorc/inst/instance_dao.go

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -405,27 +405,37 @@ Cleanup:
405405
func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error) {
406406
// If the tablet is not replicating from anyone, then it could be the previous primary.
407407
// We should check for errant GTIDs by finding the difference with the shard's current primary.
408-
if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" {
409-
var primaryInstance *Instance
410-
primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
411-
if primaryAlias != "" {
412-
// Check if the current tablet is the primary.
413-
// If it is, then we don't need to run errant gtid detection on it.
414-
if primaryAlias == instance.InstanceAlias {
415-
return nil
416-
}
417-
primaryInstance, _, _ = ReadInstance(primaryAlias)
408+
primaryAlias, _, err := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
409+
if err != nil {
410+
return fmt.Errorf("failed to read shard primary for %s/%s: %w", tablet.Keyspace, tablet.Shard, err)
411+
}
412+
413+
// Check if the current tablet is the primary. If it is, then we don't need to
414+
// run errant GTID detection on it.
415+
if primaryAlias == instance.InstanceAlias {
416+
return nil
417+
}
418+
419+
var primaryInstance *Instance
420+
if primaryAlias != "" {
421+
primaryInstance, _, err = ReadInstance(primaryAlias)
422+
if err != nil {
423+
return fmt.Errorf("failed to read primary instance %q: %w", primaryAlias, err)
418424
}
419-
// Only run errant GTID detection, if we are sure that the data read of the current primary
420-
// is up-to-date enough to reflect that it has been promoted. This is needed to prevent
421-
// flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs
422-
// accepted by the old primary (this tablet) that don't show in the new primary's set.
423-
if primaryInstance != nil {
424-
if primaryInstance.SourceHost == "" {
425-
instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet
426-
}
425+
}
426+
427+
// Only run errant GTID detection if we are sure that the data read of the current primary
428+
// is up-to-date enough to reflect that it has been promoted. This is needed to prevent
429+
// flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs
430+
// accepted by the old primary (this tablet) that don't show in the new primary's set.
431+
if primaryInstance != nil && primaryInstance.SourceHost == "" {
432+
// If the instance has no replication source and no primary GTID set yet, or if the instance's replication
433+
// source is not the primary, use the shard primary's executed GTID set for comparison.
434+
if (instance.SourceHost == "" && instance.primaryExecutedGtidSet == "") || !sourceIsPrimary(instance, primaryInstance) {
435+
instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet
427436
}
428437
}
438+
429439
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
430440
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
431441
// This is because vtorc may pool primary and replica at an inconvenient timing,
@@ -458,6 +468,19 @@ func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error
458468
return err
459469
}
460470

471+
// sourceIsPrimary returns true if the instance's replication source is the given primary.
472+
func sourceIsPrimary(instance *Instance, primaryInstance *Instance) bool {
473+
if instance.SourceHost == "" {
474+
return false
475+
}
476+
477+
if instance.SourceUUID != "" && primaryInstance.ServerUUID != "" {
478+
return instance.SourceUUID == primaryInstance.ServerUUID
479+
}
480+
481+
return instance.SourceHost == primaryInstance.Hostname && instance.SourcePort == primaryInstance.Port
482+
}
483+
461484
// getKeyspaceShardName returns a single string having both the keyspace and shard
462485
func getKeyspaceShardName(keyspace, shard string) string {
463486
return fmt.Sprintf("%v:%v", keyspace, shard)

go/vt/vtorc/inst/instance_dao_test.go

Lines changed: 129 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -898,6 +898,29 @@ func TestExpireTableData(t *testing.T) {
898898
}
899899

900900
func TestDetectErrantGTIDs(t *testing.T) {
901+
keyspaceName := "ks"
902+
shardName := "0"
903+
tablet := &topodatapb.Tablet{
904+
Alias: &topodatapb.TabletAlias{
905+
Cell: "zone-1",
906+
Uid: 100,
907+
},
908+
Keyspace: keyspaceName,
909+
Shard: shardName,
910+
}
911+
primaryTablet := &topodatapb.Tablet{
912+
Alias: &topodatapb.TabletAlias{
913+
Cell: "zone-1",
914+
Uid: 101,
915+
},
916+
Keyspace: keyspaceName,
917+
Shard: shardName,
918+
Type: topodatapb.TabletType_PRIMARY,
919+
920+
MysqlHostname: "primary-host",
921+
MysqlPort: 6714,
922+
}
923+
901924
tests := []struct {
902925
name string
903926
instance *Instance
@@ -947,6 +970,8 @@ func TestDetectErrantGTIDs(t *testing.T) {
947970
primaryInstance: &Instance{
948971
SourceHost: "",
949972
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
973+
Hostname: primaryTablet.MysqlHostname,
974+
Port: int(primaryTablet.MysqlPort),
950975
},
951976
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342",
952977
}, {
@@ -963,24 +988,6 @@ func TestDetectErrantGTIDs(t *testing.T) {
963988
},
964989
}
965990

966-
keyspaceName := "ks"
967-
shardName := "0"
968-
tablet := &topodatapb.Tablet{
969-
Alias: &topodatapb.TabletAlias{
970-
Cell: "zone-1",
971-
Uid: 100,
972-
},
973-
Keyspace: keyspaceName,
974-
Shard: shardName,
975-
}
976-
primaryTablet := &topodatapb.Tablet{
977-
Alias: &topodatapb.TabletAlias{
978-
Cell: "zone-1",
979-
Uid: 101,
980-
},
981-
Keyspace: keyspaceName,
982-
Shard: shardName,
983-
}
984991
for _, tt := range tests {
985992
t.Run(tt.name, func(t *testing.T) {
986993
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
@@ -997,6 +1004,7 @@ func TestDetectErrantGTIDs(t *testing.T) {
9971004

9981005
if tt.primaryInstance != nil {
9991006
tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias)
1007+
10001008
err = SaveTablet(primaryTablet)
10011009
require.NoError(t, err)
10021010
err = WriteInstance(tt.primaryInstance, true, nil)
@@ -1015,6 +1023,109 @@ func TestDetectErrantGTIDs(t *testing.T) {
10151023
}
10161024
}
10171025

1026+
// TestDetectErrantGTIDsWithWrongPrimarySource ensures that errant GTIDs are
1027+
// detected by comparing against the shard primary when a replica is pointed at
1028+
// a different source.
1029+
func TestDetectErrantGTIDsWithWrongPrimarySource(t *testing.T) {
1030+
defer func() {
1031+
db.ClearVTOrcDatabase()
1032+
}()
1033+
db.ClearVTOrcDatabase()
1034+
1035+
keyspaceName := "ks"
1036+
shardName := "0"
1037+
1038+
primaryUUID := "a5ce8e8a-4e56-11ef-9f7d-92339a7d9f6c"
1039+
wrongPrimaryUUID := "bb1db5f6-4e56-11ef-8bda-3ae65c7f7f5e"
1040+
replicaUUID := "cc946b58-4e56-11ef-9e6f-3e527c1a9e9d"
1041+
1042+
primaryTablet := &topodatapb.Tablet{
1043+
Alias: &topodatapb.TabletAlias{
1044+
Cell: "zone-1",
1045+
Uid: 101,
1046+
},
1047+
Keyspace: keyspaceName,
1048+
Shard: shardName,
1049+
Type: topodatapb.TabletType_PRIMARY,
1050+
1051+
MysqlHostname: "primary-host",
1052+
MysqlPort: 6714,
1053+
}
1054+
1055+
replicaTablet := &topodatapb.Tablet{
1056+
Alias: &topodatapb.TabletAlias{
1057+
Cell: "zone-1",
1058+
Uid: 100,
1059+
},
1060+
Keyspace: keyspaceName,
1061+
Shard: shardName,
1062+
}
1063+
1064+
err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{
1065+
PrimaryAlias: primaryTablet.Alias,
1066+
}, nil))
1067+
require.NoError(t, err)
1068+
1069+
err = SaveTablet(primaryTablet)
1070+
require.NoError(t, err)
1071+
1072+
// Create the real shard primary instance and give it a GTID set that does
1073+
// not include the replica UUID. This represents the authoritative primary.
1074+
primaryInstance := &Instance{
1075+
InstanceAlias: topoproto.TabletAliasString(primaryTablet.Alias),
1076+
Hostname: "primary-host",
1077+
Port: 6714,
1078+
SourceHost: "",
1079+
ExecutedGtidSet: primaryUUID + ":1-10",
1080+
ServerUUID: primaryUUID,
1081+
}
1082+
1083+
err = WriteInstance(primaryInstance, true, nil)
1084+
require.NoError(t, err)
1085+
1086+
// Create a wrong primary instance that contains the replica UUID in its
1087+
// executed GTID set. This masks errant GTIDs if the replica uses it for
1088+
// comparison
1089+
wrongPrimaryInstance := &Instance{
1090+
InstanceAlias: "zone-1-0000000102",
1091+
Hostname: "wrong-primary",
1092+
Port: 6720,
1093+
SourceHost: "",
1094+
ExecutedGtidSet: fmt.Sprintf("%s:1-10,%s:1-5", wrongPrimaryUUID, replicaUUID),
1095+
ServerUUID: wrongPrimaryUUID,
1096+
AncestryUUID: wrongPrimaryUUID,
1097+
}
1098+
1099+
err = WriteInstance(wrongPrimaryInstance, true, nil)
1100+
require.NoError(t, err)
1101+
1102+
// Create a replica instance that is pointed at the wrong primary. The replica
1103+
// contains errant GTIDs that should be caught.
1104+
replicaInstance := &Instance{
1105+
InstanceAlias: topoproto.TabletAliasString(replicaTablet.Alias),
1106+
Hostname: "replica-host",
1107+
Port: 6711,
1108+
SourceHost: wrongPrimaryInstance.Hostname,
1109+
SourcePort: wrongPrimaryInstance.Port,
1110+
SourceUUID: wrongPrimaryInstance.ServerUUID,
1111+
ExecutedGtidSet: fmt.Sprintf("%s:1-10,%s:1-5", wrongPrimaryUUID, replicaUUID),
1112+
ServerUUID: replicaUUID,
1113+
}
1114+
1115+
err = ReadInstanceClusterAttributes(replicaInstance)
1116+
require.NoError(t, err)
1117+
require.Equal(t, wrongPrimaryInstance.ExecutedGtidSet, replicaInstance.primaryExecutedGtidSet)
1118+
require.Equal(t, wrongPrimaryInstance.AncestryUUID, replicaInstance.AncestryUUID)
1119+
1120+
// Run errant GTID detection. We should find some.
1121+
err = detectErrantGTIDs(replicaInstance, replicaTablet)
1122+
require.NoError(t, err)
1123+
1124+
// The replica's own UUID entries should be reported as errant because the
1125+
// real shard primary does not include them.
1126+
require.Equal(t, replicaUUID+":1-5", replicaInstance.GtidErrant)
1127+
}
1128+
10181129
// TestPrimaryErrantGTIDs tests that we don't run Errant GTID detection on the primary tablet itself!
10191130
func TestPrimaryErrantGTIDs(t *testing.T) {
10201131
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.

0 commit comments

Comments
 (0)