@@ -1156,3 +1156,63 @@ func TestPrimaryErrantGTIDs(t *testing.T) {
11561156 require .NoError (t , err )
11571157 require .EqualValues (t , "" , instance .GtidErrant )
11581158}
1159+
1160+ // TestErrantGTIDCountGaugeIsResetWhenResolved verifies that the per-tablet
1161+ // CurrentErrantGTIDCount gauge is reset to 0 when errant GTIDs are reconciled
1162+ // on a subsequent poll. Previously, the gauge was only ever written when
1163+ // errant GTIDs were detected, so it stuck at the last positive value forever
1164+ // (https://github.com/vitessio/vitess/issues/20258).
1165+ func TestErrantGTIDCountGaugeIsResetWhenResolved (t * testing.T ) {
1166+ defer func () {
1167+ db .ClearVTOrcDatabase ()
1168+ currentErrantGTIDCount .ResetAll ()
1169+ }()
1170+ db .ClearVTOrcDatabase ()
1171+ currentErrantGTIDCount .ResetAll ()
1172+
1173+ keyspaceName := "ks"
1174+ shardName := "0"
1175+ primaryUUID := "230ea8ea-81e3-11e4-972a-e25ec4bd140a"
1176+ replicaUUID := "316d193c-70e5-11e5-adb2-ecf4bb2262ff"
1177+
1178+ primaryTablet := & topodatapb.Tablet {
1179+ Alias : & topodatapb.TabletAlias {Cell : "zone-1" , Uid : 101 },
1180+ Keyspace : keyspaceName ,
1181+ Shard : shardName ,
1182+ Type : topodatapb .TabletType_PRIMARY ,
1183+ }
1184+ replicaTablet := & topodatapb.Tablet {
1185+ Alias : & topodatapb.TabletAlias {Cell : "zone-1" , Uid : 100 },
1186+ Keyspace : keyspaceName ,
1187+ Shard : shardName ,
1188+ }
1189+ replicaAlias := topoproto .TabletAliasString (replicaTablet .Alias )
1190+
1191+ require .NoError (t , SaveShard (topo .NewShardInfo (keyspaceName , shardName , & topodatapb.Shard {
1192+ PrimaryAlias : primaryTablet .Alias ,
1193+ }, nil )))
1194+
1195+ // First poll: the replica has an errant GTID under its own UUID.
1196+ replicaInstance := & Instance {
1197+ InstanceAlias : replicaTablet .Alias ,
1198+ ServerUUID : replicaUUID ,
1199+ SourceUUID : primaryUUID ,
1200+ AncestryUUID : replicaUUID + "," + primaryUUID ,
1201+ ExecutedGtidSet : primaryUUID + ":1-100," + replicaUUID + ":1" ,
1202+ primaryExecutedGtidSet : primaryUUID + ":1-100" ,
1203+ }
1204+ require .NoError (t , detectErrantGTIDs (replicaInstance , replicaTablet ))
1205+ require .Equal (t , replicaUUID + ":1" , replicaInstance .GtidErrant )
1206+ require .EqualValues (t , 1 , currentErrantGTIDCount .Counts ()[replicaAlias ],
1207+ "gauge should be 1 after a single errant GTID is detected" )
1208+
1209+ // Second poll: the errant GTID has been reconciled (e.g., via the
1210+ // empty-transaction injection technique on the primary), so the primary's
1211+ // executed GTID set now also includes the replica UUID range.
1212+ replicaInstance .primaryExecutedGtidSet = primaryUUID + ":1-100," + replicaUUID + ":1"
1213+ require .NoError (t , detectErrantGTIDs (replicaInstance , replicaTablet ))
1214+ require .Empty (t , replicaInstance .GtidErrant ,
1215+ "GtidErrant should be cleared on the no-errant path so callers reusing the Instance don't see stale state" )
1216+ require .EqualValues (t , 0 , currentErrantGTIDCount .Counts ()[replicaAlias ],
1217+ "gauge should be reset to 0 after errant GTIDs are resolved" )
1218+ }
0 commit comments