Skip to content

Commit a7262cb

Browse files
more pr suggestions
Signed-off-by: Tim Vaillancourt <tim@timvaillancourt.com>
1 parent 0f3ec1f commit a7262cb

3 files changed

Lines changed: 138 additions & 9 deletions

File tree

changelog/25.0/25.0.0/summary.md

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,6 @@ When the leading GTID-based candidates have incomparable `Combined` positions (s
137137

138138
A new `--allow-split-brain-promotion` flag is added to `vtctldclient EmergencyReparentShard` (and `--allow_split_brain_promotion` on the legacy `vtctl`). It is **off by default**. Operators who deliberately need to force ERS through a detected split-brain — typically because they already know which side to keep and plan to re-clone the losing side — can set it to convert the abort into a `WARN` log and proceed. The non-promoted side's unique GTIDs will become errant after promotion, so this is an explicit operator override, not a default-on safety knob.
139139

140-
Two limitations to be aware of when using the override:
141-
142-
1. **Specify `--new-primary` for deterministic side selection.** In a symmetric split-brain (incomparable Combined positions, neither side dominating the other), the sort comparator returns false in both directions and Go's sort cannot establish a total order — the winning side ends up depending on map iteration order. Pair `--allow-split-brain-promotion` with `--new-primary` if you have a preferred side; the flag teaches `findMostAdvanced` to honour `--new-primary` even when `AtLeast` would otherwise reject it.
143-
2. **A lagging tablet in the candidate pool can still be picked over the diverged leaders.** If the leading group is mutually-errant AND a non-leading (lagged) tablet exists in `validCandidates`, errant-GTID detection removes both leaders, leaves the lagger, and ERS promotes the lagger — losing both sides' unique writes. The flag's restoration logic only fires when *every* candidate is pruned; the mixed-survivor case is unchanged from pre-PR. Ensure the candidate pool is constrained to the diverged sides (via `--ignore-replicas`) when forcing through.
144-
145140
Three new stats are exported for observability:
146141

147142
- `EmergencyReparentFilteredCandidates` — counts replicas excluded from the relay-log wait because their `Combined` position is strictly behind the leading group.

go/vt/vtctl/reparentutil/emergency_reparenter.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -435,10 +435,6 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
435435
return vterrors.Wrap(err, lostTopologyLockMsg)
436436
}
437437

438-
// Relay logs have been successfully applied and we're ready to start repointing replicas,
439-
// so we no longer need to restart replication manually in the event of an error.
440-
replicasToRestart = nil
441-
442438
// initialize the newPrimary with the intermediate source, override this value if it is not the ideal candidate
443439
newPrimary := intermediateSource
444440
if !isIdeal {

go/vt/vtctl/reparentutil/emergency_reparenter_test.go

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,6 +2435,144 @@ func TestEmergencyReparenterRestartsStoppedIOThreadsOnFailure(t *testing.T) {
24352435
})
24362436
}
24372437

2438+
// TestEmergencyReparenterRestartsStoppedIOThreadsAfterRelayLogApply verifies
2439+
// that ERS restarts replication on replicas whose IO threads it stopped when
2440+
// promotion fails AFTER relay-log application has already succeeded. Prevents
2441+
// regression of a bug where replicasToRestart was cleared just before the
2442+
// promotion attempt, so a PromoteReplica/SetReplicationSource/waitForCatchUp
2443+
// failure would silently skip the deferred cleanup and leave replicas with
2444+
// IO_THREAD stopped.
2445+
func TestEmergencyReparenterRestartsStoppedIOThreadsAfterRelayLogApply(t *testing.T) {
2446+
synctest.Test(t, func(t *testing.T) {
2447+
ctx := t.Context()
2448+
2449+
const (
2450+
keyspace = "testkeyspace"
2451+
shard = "-"
2452+
primaryAlias = "zone1-0000000100"
2453+
stoppedIOAlias1 = "zone1-0000000101"
2454+
stoppedIOAlias2 = "zone1-0000000102"
2455+
relayLogPosition = "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21"
2456+
sourceUUID = "3E11FA47-71CA-11E1-9E33-C80AA9429562"
2457+
)
2458+
2459+
stoppedIOStatus := &replicationdatapb.StopReplicationStatus{
2460+
Before: &replicationdatapb.Status{
2461+
IoState: int32(replication.ReplicationStateRunning),
2462+
SqlState: int32(replication.ReplicationStateRunning),
2463+
},
2464+
After: &replicationdatapb.Status{
2465+
SourceUuid: sourceUUID,
2466+
RelayLogPosition: relayLogPosition,
2467+
},
2468+
}
2469+
2470+
mockController := gomock.NewController(t)
2471+
tmc := tmcmock.NewMockTabletManagerClient(mockController)
2472+
2473+
// Primary is dead, both replicas have their IO threads stopped by ERS.
2474+
tmc.EXPECT().
2475+
StopReplicationAndGetStatus(gomock.Any(), tabletAliasMatcher(primaryAlias), replicationdatapb.StopReplicationMode_IOTHREADONLY).
2476+
Return(nil, assert.AnError)
2477+
2478+
tmc.EXPECT().
2479+
StopReplicationAndGetStatus(gomock.Any(), tabletAliasMatcher(stoppedIOAlias1), replicationdatapb.StopReplicationMode_IOTHREADONLY).
2480+
Return(stoppedIOStatus, nil)
2481+
2482+
tmc.EXPECT().
2483+
StopReplicationAndGetStatus(gomock.Any(), tabletAliasMatcher(stoppedIOAlias2), replicationdatapb.StopReplicationMode_IOTHREADONLY).
2484+
Return(stoppedIOStatus, nil)
2485+
2486+
// Both replicas successfully apply their relay logs.
2487+
tmc.EXPECT().
2488+
WaitForPosition(gomock.Any(), tabletAliasMatcher(stoppedIOAlias1), relayLogPosition).
2489+
Return(nil).
2490+
Times(1)
2491+
2492+
tmc.EXPECT().
2493+
WaitForPosition(gomock.Any(), tabletAliasMatcher(stoppedIOAlias2), relayLogPosition).
2494+
Return(nil).
2495+
Times(1)
2496+
2497+
// Errant-GTID detection reads each candidate's reparent journal length.
2498+
// Same length on both keeps both in the leading set with no lagged
2499+
// handling.
2500+
tmc.EXPECT().
2501+
ReadReparentJournalInfo(gomock.Any(), gomock.Any()).
2502+
Return(int32(1), nil).
2503+
AnyTimes()
2504+
2505+
// PromoteReplica fails on the new primary, triggering the
2506+
// post-relay-log-apply abort path.
2507+
tmc.EXPECT().
2508+
PromoteReplica(gomock.Any(), gomock.Any(), gomock.Any()).
2509+
Return("", assert.AnError).
2510+
Times(1)
2511+
2512+
// handleReplica goroutines race in reparentReplicas. Allow any number
2513+
// of SetReplicationSource calls; the test asserts only on cleanup.
2514+
tmc.EXPECT().
2515+
SetReplicationSource(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
2516+
Return(nil).
2517+
AnyTimes()
2518+
2519+
// Both replicas whose IO threads ERS stopped must have replication
2520+
// restarted by the deferred cleanup.
2521+
tmc.EXPECT().
2522+
StartReplication(gomock.Any(), tabletAliasMatcher(stoppedIOAlias1), false).
2523+
Return(nil).
2524+
Times(1)
2525+
2526+
tmc.EXPECT().
2527+
StartReplication(gomock.Any(), tabletAliasMatcher(stoppedIOAlias2), false).
2528+
Return(nil).
2529+
Times(1)
2530+
2531+
ts := memorytopo.NewServer(ctx, "zone1")
2532+
defer ts.Close()
2533+
2534+
testutil.AddShards(ctx, t, ts, &vtctldatapb.Shard{
2535+
Keyspace: keyspace,
2536+
Name: shard,
2537+
Shard: &topodatapb.Shard{
2538+
PrimaryAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
2539+
},
2540+
})
2541+
2542+
testutil.AddTablets(
2543+
ctx, t, ts, nil,
2544+
&topodatapb.Tablet{
2545+
Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
2546+
Type: topodatapb.TabletType_PRIMARY,
2547+
Keyspace: keyspace,
2548+
Shard: shard,
2549+
},
2550+
&topodatapb.Tablet{
2551+
Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101},
2552+
Type: topodatapb.TabletType_REPLICA,
2553+
Keyspace: keyspace,
2554+
Shard: shard,
2555+
},
2556+
&topodatapb.Tablet{
2557+
Alias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 102},
2558+
Type: topodatapb.TabletType_REPLICA,
2559+
Keyspace: keyspace,
2560+
Shard: shard,
2561+
},
2562+
)
2563+
2564+
reparenttestutil.SetKeyspaceDurability(ctx, t, ts, keyspace, policy.DurabilityNone)
2565+
2566+
erp := NewEmergencyReparenter(ts, tmc, logutil.NewMemoryLogger())
2567+
2568+
_, err := erp.ReparentShard(ctx, keyspace, shard, EmergencyReparentOptions{
2569+
WaitReplicasTimeout: time.Second,
2570+
})
2571+
2572+
require.Error(t, err)
2573+
})
2574+
}
2575+
24382576
// tabletAliasMatcher matches tablets by alias string for gomock expectations.
24392577
func tabletAliasMatcher(alias string) gomock.Matcher {
24402578
return gomock.Cond(func(tablet *topodatapb.Tablet) bool {

0 commit comments

Comments
 (0)