Skip to content

Commit ba3cbbe

Browse files
[release-24.0] vtorc: reset CurrentErrantGTIDCount gauge when errant GTIDs are resolved (#20259) (#20292)
Signed-off-by: Matthias Crauwels <matthias.crauwels@planetscale.com> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent eb82dd0 commit ba3cbbe

2 files changed

Lines changed: 74 additions & 1 deletion

File tree

go/vt/vtorc/inst/instance_dao.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ Cleanup:
406406

407407
// detectErrantGTIDs detects the errant GTIDs on an instance.
408408
func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error) {
409+
tabletAliasString := topoproto.TabletAliasString(instance.InstanceAlias)
409410
// If the tablet is not replicating from anyone, then it could be the previous primary.
410411
// We should check for errant GTIDs by finding the difference with the shard's current primary.
411412
primaryAlias, _, err := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
@@ -416,6 +417,10 @@ func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error
416417
// Check if the current tablet is the primary. If it is, then we don't need to
417418
// run errant GTID detection on it.
418419
if topoproto.TabletAliasEqual(primaryAlias, instance.InstanceAlias) {
420+
// A primary cannot have errant GTIDs relative to itself; clear any
421+
// value left over from before this tablet was promoted.
422+
instance.GtidErrant = ""
423+
currentErrantGTIDCount.Reset(tabletAliasString)
419424
return nil
420425
}
421426

@@ -439,6 +444,7 @@ func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error
439444
}
440445
}
441446

447+
var errantGtidCount int64
442448
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
443449
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
444450
// This is because vtorc may pool primary and replica at an inconvenient timing,
@@ -469,10 +475,17 @@ func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error
469475
errantGtidSet := redactedExecutedGtidSet.Difference(redactedPrimaryExecutedGtidSet)
470476
if !errantGtidSet.Empty() {
471477
instance.GtidErrant = errantGtidSet.String()
472-
currentErrantGTIDCount.Set(topoproto.TabletAliasString(instance.InstanceAlias), errantGtidSet.Count())
478+
errantGtidCount = errantGtidSet.Count()
473479
}
474480
}
475481
}
482+
// Always publish the result. Writing 0 / "" here is what allows the
483+
// gauge and GtidErrant field to recover after errant GTIDs are
484+
// reconciled or a transient race clears.
485+
if errantGtidCount == 0 {
486+
instance.GtidErrant = ""
487+
}
488+
currentErrantGTIDCount.Set(tabletAliasString, errantGtidCount)
476489
return err
477490
}
478491

go/vt/vtorc/inst/instance_dao_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,3 +1156,63 @@ func TestPrimaryErrantGTIDs(t *testing.T) {
11561156
require.NoError(t, err)
11571157
require.EqualValues(t, "", instance.GtidErrant)
11581158
}
1159+
1160+
// TestErrantGTIDCountGaugeIsResetWhenResolved verifies that the per-tablet
1161+
// CurrentErrantGTIDCount gauge is reset to 0 when errant GTIDs are reconciled
1162+
// on a subsequent poll. Previously, the gauge was only ever written when
1163+
// errant GTIDs were detected, so it stuck at the last positive value forever
1164+
// (https://github.com/vitessio/vitess/issues/20258).
1165+
func TestErrantGTIDCountGaugeIsResetWhenResolved(t *testing.T) {
1166+
defer func() {
1167+
db.ClearVTOrcDatabase()
1168+
currentErrantGTIDCount.ResetAll()
1169+
}()
1170+
db.ClearVTOrcDatabase()
1171+
currentErrantGTIDCount.ResetAll()
1172+
1173+
keyspaceName := "ks"
1174+
shardName := "0"
1175+
primaryUUID := "230ea8ea-81e3-11e4-972a-e25ec4bd140a"
1176+
replicaUUID := "316d193c-70e5-11e5-adb2-ecf4bb2262ff"
1177+
1178+
primaryTablet := &topodatapb.Tablet{
1179+
Alias: &topodatapb.TabletAlias{Cell: "zone-1", Uid: 101},
1180+
Keyspace: keyspaceName,
1181+
Shard: shardName,
1182+
Type: topodatapb.TabletType_PRIMARY,
1183+
}
1184+
replicaTablet := &topodatapb.Tablet{
1185+
Alias: &topodatapb.TabletAlias{Cell: "zone-1", Uid: 100},
1186+
Keyspace: keyspaceName,
1187+
Shard: shardName,
1188+
}
1189+
replicaAlias := topoproto.TabletAliasString(replicaTablet.Alias)
1190+
1191+
require.NoError(t, SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{
1192+
PrimaryAlias: primaryTablet.Alias,
1193+
}, nil)))
1194+
1195+
// First poll: the replica has an errant GTID under its own UUID.
1196+
replicaInstance := &Instance{
1197+
InstanceAlias: replicaTablet.Alias,
1198+
ServerUUID: replicaUUID,
1199+
SourceUUID: primaryUUID,
1200+
AncestryUUID: replicaUUID + "," + primaryUUID,
1201+
ExecutedGtidSet: primaryUUID + ":1-100," + replicaUUID + ":1",
1202+
primaryExecutedGtidSet: primaryUUID + ":1-100",
1203+
}
1204+
require.NoError(t, detectErrantGTIDs(replicaInstance, replicaTablet))
1205+
require.Equal(t, replicaUUID+":1", replicaInstance.GtidErrant)
1206+
require.EqualValues(t, 1, currentErrantGTIDCount.Counts()[replicaAlias],
1207+
"gauge should be 1 after a single errant GTID is detected")
1208+
1209+
// Second poll: the errant GTID has been reconciled (e.g., via the
1210+
// empty-transaction injection technique on the primary), so the primary's
1211+
// executed GTID set now also includes the replica UUID range.
1212+
replicaInstance.primaryExecutedGtidSet = primaryUUID + ":1-100," + replicaUUID + ":1"
1213+
require.NoError(t, detectErrantGTIDs(replicaInstance, replicaTablet))
1214+
require.Empty(t, replicaInstance.GtidErrant,
1215+
"GtidErrant should be cleared on the no-errant path so callers reusing the Instance don't see stale state")
1216+
require.EqualValues(t, 0, currentErrantGTIDCount.Counts()[replicaAlias],
1217+
"gauge should be reset to 0 after errant GTIDs are resolved")
1218+
}

0 commit comments

Comments
 (0)