Skip to content

Commit 265b2c6

Browse files
review tweaks
Signed-off-by: Tim Vaillancourt <tim@timvaillancourt.com>
1 parent 3797cdb commit 265b2c6

4 files changed

Lines changed: 391 additions & 153 deletions

File tree

go/test/endtoend/reparent/emergencyreparent/ers_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -721,13 +721,15 @@ func TestERSSplitBrainDetection(t *testing.T) {
721721
utils.StopTablet(t, tablets[0], true)
722722

723723
// ERS must abort with the upfront split-brain error, not silently promote
724-
// one of the diverged sides.
724+
// one of the diverged sides. The vtctldclient surfaces the RPC error message
725+
// in stdout/stderr — assert against `out` rather than err (which is just
726+
// "exit status 1" from the command process).
725727
out, err := utils.Ers(clusterInstance, nil, "60s", "30s")
726728
require.Error(t, err, out)
727-
require.ErrorContains(t, err, "suspected split-brain")
729+
require.Contains(t, out, "suspected split-brain", "ERS output: %s", out)
728730
// describeCombinedPositions names the offending tablets in the error.
729-
require.ErrorContains(t, err, tablets[2].Alias)
730-
require.ErrorContains(t, err, tablets[3].Alias)
731+
require.Contains(t, out, tablets[2].Alias)
732+
require.Contains(t, out, tablets[3].Alias)
731733
}
732734

733735
// TestReplicationStopped checks that ERS ignores the tablets that have sql thread stopped.

go/vt/vtctl/reparentutil/emergency_reparenter.go

Lines changed: 100 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -283,30 +283,17 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
283283
return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent")
284284
}
285285

286-
// Pick the relay-log-apply wait set and mode based on the replication flavor.
287-
//
288-
// For GTID-based replication, Combined includes the received relay-log position, so we
289-
// can filter out tablets strictly behind the leading group and wait only for that group.
290-
// When the filtered set is uniform (single Combined value) one success is sufficient.
291-
// When it is non-uniform we have incomparable maxima — a suspected split-brain — and ERS
292-
// must error rather than silently pick one side (see AGENTS.md / CLAUDE.md ERS section).
293-
//
294-
// For non-GTID flavors (FilePos, MariaDB), Combined is the executed position and does not
295-
// reflect received-but-not-applied relay logs. The filter is unsafe, so we wait for every
296-
// candidate (pre-PR behavior) and fail on any error.
286+
// For GTID-based flavors, filter to tablets at the leading Combined position and wait
287+
// only for that group — one success is sufficient. Abort upfront on incomparable maxima
288+
// (suspected split-brain — see AGENTS.md / CLAUDE.md ERS section). For non-GTID flavors
289+
// (FilePos, MariaDB) Combined is the executed position, so the filter is unsafe — keep
290+
// pre-PR behavior of waiting for every candidate and failing on any error.
297291
relayLogWaitCandidates := validCandidates
298292
requireAll := true
299293
if isGTIDBased {
300-
relayLogWaitCandidates = filterToMostAdvancedCombined(validCandidates, erp.logger)
301-
if !uniformCombined(relayLogWaitCandidates) {
302-
return vterrors.Errorf(
303-
vtrpc.Code_FAILED_PRECONDITION,
304-
"emergency reparent aborted: candidates have incomparable Combined GTID positions (suspected split-brain): %s",
305-
describeCombinedPositions(relayLogWaitCandidates),
306-
)
307-
}
308-
if filtered := int64(len(validCandidates) - len(relayLogWaitCandidates)); filtered > 0 {
309-
ersFilteredCandidates.Add([]string{keyspace, shard}, filtered)
294+
relayLogWaitCandidates, err = erp.filterAndCheckUniform(validCandidates, keyspace, shard, "candidates")
295+
if err != nil {
296+
return err
310297
}
311298
requireAll = false
312299
}
@@ -326,21 +313,15 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
326313
return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent: all candidates have errant GTIDs")
327314
}
328315

329-
// Errant filtering can leave promotion candidates that were never waited on:
330-
// tablets strictly behind the original leading group (excluded by the filter) and
331-
// tablets cancelled mid-apply (excluded from successMap by the optimization). If
332-
// every originally-applied tablet is removed by errant detection, one of these
333-
// unwaited survivors could be promoted with received-but-unapplied transactions.
334-
// Detect this case (no applied tablet survives) and run a second wait pass on the
335-
// survivors to bring them to post-apply state before promotion. Applies the same
336-
// filter/uniform/wait pipeline as the first pass — uniformity is re-checked
337-
// because the survivor set is a different shape than the original leading group.
316+
// If errant detection removed every originally-applied tablet, promotion candidates
317+
// only include "unwaited survivors" (filter-excluded laggers, or peers cancelled mid-apply
318+
// by our short-circuit) — promoting one risks received-but-unapplied transactions. Run a
319+
// second filter/uniform/wait pass over the survivors. Uniformity is re-checked because
320+
// the survivor set is a different shape than the original leading group.
338321
//
339-
// If at least one applied tablet survived, no re-wait is needed: findMostAdvanced
340-
// will pick the applied tablet over any unwaited peer (Executed tie-break), and
341-
// pos.AtLeast() also gates NewPrimaryAlias to candidates that are at least as
342-
// advanced as the winner, so an unwaited peer with lower Executed cannot be
343-
// promoted out from under the applied winner.
322+
// If at least one applied tablet survived no re-wait is needed: findMostAdvanced prefers
323+
// it via the Executed tie-break, and pos.AtLeast() gates NewPrimaryAlias against the
324+
// winner so an unwaited peer with lower Executed cannot be promoted.
344325
appliedSurvived := false
345326
for alias := range validCandidates {
346327
if applied, ok := relayLogSuccessMap[alias]; ok && applied {
@@ -350,16 +331,9 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
350331
}
351332
if !appliedSurvived {
352333
erp.logger.Warningf("all originally-applied candidates were removed by errant-GTID detection; running second relay-log-apply wait on surviving candidates before promotion")
353-
rewaitCandidates := filterToMostAdvancedCombined(validCandidates, erp.logger)
354-
if !uniformCombined(rewaitCandidates) {
355-
return vterrors.Errorf(
356-
vtrpc.Code_FAILED_PRECONDITION,
357-
"emergency reparent aborted: surviving unwaited candidates have incomparable Combined GTID positions (suspected split-brain): %s",
358-
describeCombinedPositions(rewaitCandidates),
359-
)
360-
}
361-
if filtered := int64(len(validCandidates) - len(rewaitCandidates)); filtered > 0 {
362-
ersFilteredCandidates.Add([]string{keyspace, shard}, filtered)
334+
rewaitCandidates, err := erp.filterAndCheckUniform(validCandidates, keyspace, shard, "surviving unwaited candidates")
335+
if err != nil {
336+
return err
363337
}
364338
if _, err := erp.applyRelayLogsAndReconcile(ctx, rewaitCandidates, validCandidates, tabletMap, stoppedReplicationSnapshot.statusMap, opts.WaitReplicasTimeout, false /* requireAll */, keyspace, shard); err != nil {
365339
return err
@@ -506,11 +480,34 @@ type relayLogResult struct {
506480
err error
507481
}
508482

509-
// applyRelayLogsAndReconcile runs waitForAllRelayLogsToApply on waitCandidates and
510-
// reconciles validCandidates with the outcome: applied tablets get Executed bumped to
511-
// Combined (so the intermediate-selection sorter prefers them), failed tablets are
512-
// removed (so they cannot be picked for promotion or catch-up). Updates the failure
513-
// metric. The wait function's successMap is returned for callers that need to track
483+
// filterAndCheckUniform applies filterToMostAdvancedCombined to validCandidates, increments
484+
// the filtered-count metric, and returns a FAILED_PRECONDITION error if the resulting set
485+
// has incomparable Combined positions (suspected split-brain). The descriptor parameter is
486+
// interpolated into the error message to identify which ERS pipeline stage detected the
487+
// split-brain (first wait vs. errant-GTID re-wait).
488+
func (erp *EmergencyReparenter) filterAndCheckUniform(
489+
validCandidates map[string]*RelayLogPositions,
490+
keyspace, shard, descriptor string,
491+
) (map[string]*RelayLogPositions, error) {
492+
filtered := filterToMostAdvancedCombined(validCandidates, erp.logger)
493+
if !uniformCombined(filtered) {
494+
return nil, vterrors.Errorf(
495+
vtrpc.Code_FAILED_PRECONDITION,
496+
"emergency reparent aborted: %s have incomparable Combined GTID positions (suspected split-brain): %s",
497+
descriptor,
498+
describeCombinedPositions(filtered),
499+
)
500+
}
501+
if excluded := int64(len(validCandidates) - len(filtered)); excluded > 0 {
502+
ersFilteredCandidates.Add([]string{keyspace, shard}, excluded)
503+
}
504+
return filtered, nil
505+
}
506+
507+
// applyRelayLogsAndReconcile waits on waitCandidates, then mutates validCandidates:
508+
// applied tablets get Executed bumped to Combined (so the sorter prefers them via the
509+
// existing Combined→Executed tie-break), failed tablets are removed (so they cannot
510+
// be picked for promotion or catch-up). The returned successMap lets callers track
514511
// which aliases were waited on across multiple passes.
515512
func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
516513
ctx context.Context,
@@ -523,9 +520,8 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
523520
keyspace, shard string,
524521
) (successMap map[string]bool, err error) {
525522
successMap, waitErr := erp.waitForAllRelayLogsToApply(ctx, waitCandidates, tabletMap, statusMap, waitReplicasTimeout, requireAll)
526-
// Update the failure metric before any error-path return so that aborts caused by
527-
// relay-log-apply failures are visible to operators (otherwise the metric would
528-
// undercount in exactly the worst case).
523+
// Increment the failure metric before any error return so operators still see failure
524+
// counts when ERS aborts.
529525
failedCount := 0
530526
for _, applied := range successMap {
531527
if !applied {
@@ -538,17 +534,11 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
538534
if waitErr != nil {
539535
return successMap, waitErr
540536
}
541-
// Reconcile validCandidates with the wait results:
542-
// - Applied tablets get Executed bumped to Combined so the intermediate-selection
543-
// sorter naturally prefers them via the existing Combined→Executed→promotion-rule
544-
// tie-break.
545-
// - Failed tablets are removed entirely so they cannot be picked as intermediate
546-
// source, primary candidate, or catch-up target. Safe to remove because the
547-
// upfront uniformCombined check guarantees every wait candidate shares the same
548-
// Combined position — failed tablets carry no unique GTIDs vs the applied tablet,
549-
// so no split-brain signal is lost.
550-
// - Cancelled tablets (not in successMap) are left untouched; the sort ranks them
551-
// below applied peers at the same Combined.
537+
// Reconcile validCandidates with the wait results. Removing failed tablets is safe
538+
// because the upfront uniformCombined check guarantees every wait candidate shared
539+
// the same Combined position — failed tablets carry no unique GTIDs vs the applied
540+
// tablet, so no split-brain signal is lost. Cancelled tablets (absent from successMap)
541+
// are left untouched; the sort ranks them below applied peers at the same Combined.
552542
for alias, applied := range successMap {
553543
if applied {
554544
if pos, ok := validCandidates[alias]; ok {
@@ -562,28 +552,17 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
562552
}
563553

564554
// waitForAllRelayLogsToApply waits for the given candidates to apply their relay logs
565-
// and returns a map of alias to outcome:
555+
// and returns a successMap with three possible per-alias outcomes:
566556
//
567-
// - successMap[alias] == true: the tablet fully applied its relay logs.
568-
// - successMap[alias] == false: the tablet genuinely failed (RPC error, MySQL error,
569-
// or timeout).
570-
// - alias absent: the tablet was not attempted (no statusMap entry) or
571-
// was cancelled by our own short-circuit after a peer succeeded.
557+
// - successMap[alias] == true: fully applied, OR no statusMap entry (no relay log
558+
// to apply — already at its position; the caller treats this the same as applied).
559+
// - successMap[alias] == false: genuinely failed (RPC error, MySQL error, or timeout).
560+
// - alias absent: cancelled by our own short-circuit after a peer
561+
// succeeded, or by the parent context.
572562
//
573-
// Behavior depends on requireAll:
574-
//
575-
// - requireAll=true (pre-PR semantics): wait for every candidate's result and return
576-
// an error on the first failure. Used for non-GTID flavors where Combined does not
577-
// reflect received-but-not-applied relay logs, so we cannot safely short-circuit
578-
// and must reach post-apply state before any candidate is selected.
579-
// - requireAll=false (optimization): cancel the remaining goroutines as soon as one
580-
// candidate succeeds. Used only when the caller has proven a single success is
581-
// sufficient (GTID-based + filtered candidates share the same Combined position).
582-
// Subsequent context.Canceled errors are treated as expected cancellations, not
583-
// failures — they are not logged and are excluded from successMap entirely.
584-
//
585-
// The function does not mutate validCandidates; the caller decides how to handle the
586-
// failed aliases (e.g., remove them from promotion consideration).
563+
// When requireAll is true the function aborts on the first failure (pre-PR semantics,
564+
// used for non-GTID flavors). When false it short-circuits on the first success and
565+
// treats subsequent cancellation errors as expected. Does not mutate validCandidates.
587566
func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
588567
ctx context.Context,
589568
validCandidates map[string]*RelayLogPositions,
@@ -597,7 +576,9 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
597576
groupCtx, groupCancel := context.WithTimeout(ctx, waitReplicasTimeout)
598577
defer groupCancel()
599578

579+
successMap = make(map[string]bool, len(validCandidates))
600580
waiterCount := 0
581+
preSatisfied := 0
601582

602583
for candidate := range validCandidates {
603584
// When we called stopReplicationAndBuildStatusMaps, we got back two
@@ -609,17 +590,20 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
609590
// If we have a tablet in the validCandidates map that does not appear
610591
// in the statusMap, then we have either (a) the current primary, which
611592
// is not replicating, so it is not applying relay logs; or (b) a tablet
612-
// that is stuck thinking it is PRIMARY but is not in actuality. In that
613-
// second case - (b) - we will most likely find that the stuck PRIMARY
614-
// does not have a winning position, and fail the ERS. If, on the other
615-
// hand, it does have a winning position, we are trusting the operator
616-
// to know what they are doing by emergency-reparenting onto that
617-
// tablet. In either case, it does not make sense to wait for relay logs
618-
// to apply on a tablet that was never applying relay logs in the first
619-
// place, so we skip it, and log that we did.
593+
// that is stuck thinking it is PRIMARY but is not in actuality. Such a
594+
// tablet has no relay logs to apply, so it is already at its position —
595+
// mark it applied in successMap so applyRelayLogsAndReconcile bumps
596+
// Executed=Combined (Combined is its executed position by construction
597+
// from FindPositionsOfAllCandidates). Without that bump, peers cancelled
598+
// at the same Combined could keep their pre-wait Executed and sort ahead
599+
// in findMostAdvanced, risking promotion of a tablet with received-but-
600+
// unapplied transactions. In case (b) the stuck PRIMARY will most likely
601+
// fail downstream split-brain / errant-GTID checks anyway.
620602
status, ok := statusMap[candidate]
621603
if !ok {
622604
erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly PRIMARY), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
605+
successMap[candidate] = true
606+
preSatisfied++
623607
continue
624608
}
625609

@@ -636,35 +620,29 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
636620
// If no candidates needed to apply relay logs (e.g., initialization with no replication),
637621
// there's nothing to wait for.
638622
if waiterCount == 0 {
639-
return map[string]bool{}, nil
623+
return successMap, nil
640624
}
641625

642-
// Collect results. In optimization mode (!requireAll), cancel remaining goroutines
643-
// as soon as one tablet succeeds — the others will continue applying on their own
644-
// after ERS completes. In requireAll mode, wait for every result and surface the
645-
// first failure so the caller can abort ERS.
646-
successMap = make(map[string]bool, waiterCount)
647-
successes := 0
626+
// In optimization mode, a pre-satisfied candidate is already a winner — short-circuit
627+
// the wait. The remaining waiters will return context.Canceled, which the cancellation
628+
// classification below treats as expected noise (successes > 0).
629+
if !requireAll && preSatisfied > 0 {
630+
groupCancel()
631+
}
632+
633+
successes := preSatisfied
648634
var firstFailure error
649635
for range waiterCount {
650636
result := <-resultCh
651637
if result.err != nil {
652-
// We call groupCancel() at two points: in optimization mode once a peer
653-
// succeeds, and in requireAll mode once any peer fails (since we will abort
654-
// anyway). Cancellation errors that arrive after either trigger are expected
655-
// noise — omit them from successMap entirely. We must check both
656-
// context.Canceled (in-process) and the gRPC-wrapped equivalent
657-
// (vterrors.Code(err) == vtrpcpb.Code_CANCELED), which is what real tmclient
658-
// RPCs return after vterrors.FromGRPC has converted the gRPC status — that
659-
// one does NOT satisfy errors.Is(err, context.Canceled).
660-
//
661-
// Non-cancellation errors that arrive after a trigger are conservatively
662-
// recorded as real failures even though they may have been caused indirectly
663-
// by our own cancellation (e.g., torn-down RPC connections, MySQL session
664-
// closure). Over-reporting is preferable to silently dropping a genuine
638+
// Cancellation errors arriving after our own groupCancel(), or after the parent
639+
// ctx was cancelled, are expected noise — omit them entirely. Must also check
640+
// vterrors.Code (gRPC-wrapped CANCELED), which does NOT satisfy errors.Is(err,
641+
// context.Canceled). Non-cancellation errors after a trigger are conservatively
642+
// counted as real failures — over-reporting is preferable to dropping a genuine
665643
// failure signal.
666644
isCancellation := errors.Is(result.err, context.Canceled) || vterrors.Code(result.err) == vtrpc.Code_CANCELED
667-
weTriggeredCancellation := (!requireAll && successes > 0) || (requireAll && firstFailure != nil)
645+
weTriggeredCancellation := (!requireAll && successes > 0) || (requireAll && firstFailure != nil) || ctx.Err() != nil
668646
if isCancellation && weTriggeredCancellation {
669647
continue
670648
}
@@ -673,21 +651,27 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
673651
firstFailure = result.err
674652
erp.logger.Warningf("EmergencyReparent candidate %s failed to apply relay logs: %v", result.alias, result.err)
675653
if requireAll {
676-
// requireAll means any failure must abort ERS — short-circuit the
677-
// remaining waiters so we don't block until waitReplicasTimeout.
678654
groupCancel()
679655
}
680656
}
681657
continue
682658
}
683659
if !requireAll && successes == 0 {
684-
// Optimization: cancel remaining goroutines — we have a winner.
685660
groupCancel()
686661
}
687662
successMap[result.alias] = true
688663
successes++
689664
}
690665

666+
// If the parent context was cancelled, surface that regardless of mode or how many
667+
// candidates were pre-satisfied. Pre-PR errgroup.Wait surfaced any cancellation; this
668+
// keeps behavior symmetric across preSatisfied=0 and preSatisfied>0 paths and avoids
669+
// misreporting a "all candidates failed" error when the real cause was operator abort.
670+
// A real tablet failure (firstFailure set) takes precedence — it happened first and is
671+
// more actionable.
672+
if ctxErr := ctx.Err(); ctxErr != nil && firstFailure == nil {
673+
return successMap, vterrors.Wrapf(ctxErr, "emergency reparent aborted while waiting for relay logs to apply")
674+
}
691675
if requireAll && firstFailure != nil {
692676
return successMap, vterrors.Wrapf(firstFailure, "could not apply all relay logs within the provided waitReplicasTimeout (%s)", waitReplicasTimeout)
693677
}

0 commit comments

Comments
 (0)