@@ -283,30 +283,17 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
283283 return vterrors .Errorf (vtrpc .Code_FAILED_PRECONDITION , "no valid candidates for emergency reparent" )
284284 }
285285
286- // Pick the relay-log-apply wait set and mode based on the replication flavor.
287- //
288- // For GTID-based replication, Combined includes the received relay-log position, so we
289- // can filter out tablets strictly behind the leading group and wait only for that group.
290- // When the filtered set is uniform (single Combined value) one success is sufficient.
291- // When it is non-uniform we have incomparable maxima — a suspected split-brain — and ERS
292- // must error rather than silently pick one side (see AGENTS.md / CLAUDE.md ERS section).
293- //
294- // For non-GTID flavors (FilePos, MariaDB), Combined is the executed position and does not
295- // reflect received-but-not-applied relay logs. The filter is unsafe, so we wait for every
296- // candidate (pre-PR behavior) and fail on any error.
286+ // For GTID-based flavors, filter to tablets at the leading Combined position and wait
287+ // only for that group — one success is sufficient. Abort upfront on incomparable maxima
288+ // (suspected split-brain — see AGENTS.md / CLAUDE.md ERS section). For non-GTID flavors
289+ // (FilePos, MariaDB) Combined is the executed position, so the filter is unsafe — keep
290+ // pre-PR behavior of waiting for every candidate and failing on any error.
297291 relayLogWaitCandidates := validCandidates
298292 requireAll := true
299293 if isGTIDBased {
300- relayLogWaitCandidates = filterToMostAdvancedCombined (validCandidates , erp .logger )
301- if ! uniformCombined (relayLogWaitCandidates ) {
302- return vterrors .Errorf (
303- vtrpc .Code_FAILED_PRECONDITION ,
304- "emergency reparent aborted: candidates have incomparable Combined GTID positions (suspected split-brain): %s" ,
305- describeCombinedPositions (relayLogWaitCandidates ),
306- )
307- }
308- if filtered := int64 (len (validCandidates ) - len (relayLogWaitCandidates )); filtered > 0 {
309- ersFilteredCandidates .Add ([]string {keyspace , shard }, filtered )
294+ relayLogWaitCandidates , err = erp .filterAndCheckUniform (validCandidates , keyspace , shard , "candidates" )
295+ if err != nil {
296+ return err
310297 }
311298 requireAll = false
312299 }
@@ -326,21 +313,15 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
326313 return vterrors .Errorf (vtrpc .Code_FAILED_PRECONDITION , "no valid candidates for emergency reparent: all candidates have errant GTIDs" )
327314 }
328315
329- // Errant filtering can leave promotion candidates that were never waited on:
330- // tablets strictly behind the original leading group (excluded by the filter) and
331- // tablets cancelled mid-apply (excluded from successMap by the optimization). If
332- // every originally-applied tablet is removed by errant detection, one of these
333- // unwaited survivors could be promoted with received-but-unapplied transactions.
334- // Detect this case (no applied tablet survives) and run a second wait pass on the
335- // survivors to bring them to post-apply state before promotion. Applies the same
336- // filter/uniform/wait pipeline as the first pass — uniformity is re-checked
337- // because the survivor set is a different shape than the original leading group.
316+ // If errant detection removed every originally-applied tablet, promotion candidates
317+ // only include "unwaited survivors" (filter-excluded laggers, or peers cancelled mid-apply
318+ // by our short-circuit) — promoting one risks received-but-unapplied transactions. Run a
319+ // second filter/uniform/wait pass over the survivors. Uniformity is re-checked because
320+ // the survivor set is a different shape than the original leading group.
338321 //
339- // If at least one applied tablet survived, no re-wait is needed: findMostAdvanced
340- // will pick the applied tablet over any unwaited peer (Executed tie-break), and
341- // pos.AtLeast() also gates NewPrimaryAlias to candidates that are at least as
342- // advanced as the winner, so an unwaited peer with lower Executed cannot be
343- // promoted out from under the applied winner.
322+ // If at least one applied tablet survived no re-wait is needed: findMostAdvanced prefers
323+ // it via the Executed tie-break, and pos.AtLeast() gates NewPrimaryAlias against the
324+ // winner so an unwaited peer with lower Executed cannot be promoted.
344325 appliedSurvived := false
345326 for alias := range validCandidates {
346327 if applied , ok := relayLogSuccessMap [alias ]; ok && applied {
@@ -350,16 +331,9 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
350331 }
351332 if ! appliedSurvived {
352333 erp .logger .Warningf ("all originally-applied candidates were removed by errant-GTID detection; running second relay-log-apply wait on surviving candidates before promotion" )
353- rewaitCandidates := filterToMostAdvancedCombined (validCandidates , erp .logger )
354- if ! uniformCombined (rewaitCandidates ) {
355- return vterrors .Errorf (
356- vtrpc .Code_FAILED_PRECONDITION ,
357- "emergency reparent aborted: surviving unwaited candidates have incomparable Combined GTID positions (suspected split-brain): %s" ,
358- describeCombinedPositions (rewaitCandidates ),
359- )
360- }
361- if filtered := int64 (len (validCandidates ) - len (rewaitCandidates )); filtered > 0 {
362- ersFilteredCandidates .Add ([]string {keyspace , shard }, filtered )
334+ rewaitCandidates , err := erp .filterAndCheckUniform (validCandidates , keyspace , shard , "surviving unwaited candidates" )
335+ if err != nil {
336+ return err
363337 }
364338 if _ , err := erp .applyRelayLogsAndReconcile (ctx , rewaitCandidates , validCandidates , tabletMap , stoppedReplicationSnapshot .statusMap , opts .WaitReplicasTimeout , false /* requireAll */ , keyspace , shard ); err != nil {
365339 return err
@@ -506,11 +480,34 @@ type relayLogResult struct {
506480 err error
507481}
508482
509- // applyRelayLogsAndReconcile runs waitForAllRelayLogsToApply on waitCandidates and
510- // reconciles validCandidates with the outcome: applied tablets get Executed bumped to
511- // Combined (so the intermediate-selection sorter prefers them), failed tablets are
512- // removed (so they cannot be picked for promotion or catch-up). Updates the failure
513- // metric. The wait function's successMap is returned for callers that need to track
483+ // filterAndCheckUniform applies filterToMostAdvancedCombined to validCandidates, increments
484+ // the filtered-count metric, and returns a FAILED_PRECONDITION error if the resulting set
485+ // has incomparable Combined positions (suspected split-brain). The descriptor parameter is
486+ // interpolated into the error message to identify which ERS pipeline stage detected the
487+ // split-brain (first wait vs. errant-GTID re-wait).
488+ func (erp * EmergencyReparenter ) filterAndCheckUniform (
489+ validCandidates map [string ]* RelayLogPositions ,
490+ keyspace , shard , descriptor string ,
491+ ) (map [string ]* RelayLogPositions , error ) {
492+ filtered := filterToMostAdvancedCombined (validCandidates , erp .logger )
493+ if ! uniformCombined (filtered ) {
494+ return nil , vterrors .Errorf (
495+ vtrpc .Code_FAILED_PRECONDITION ,
496+ "emergency reparent aborted: %s have incomparable Combined GTID positions (suspected split-brain): %s" ,
497+ descriptor ,
498+ describeCombinedPositions (filtered ),
499+ )
500+ }
501+ if excluded := int64 (len (validCandidates ) - len (filtered )); excluded > 0 {
502+ ersFilteredCandidates .Add ([]string {keyspace , shard }, excluded )
503+ }
504+ return filtered , nil
505+ }
506+
507+ // applyRelayLogsAndReconcile waits on waitCandidates, then mutates validCandidates:
508+ // applied tablets get Executed bumped to Combined (so the sorter prefers them via the
509+ // existing Combined→Executed tie-break), failed tablets are removed (so they cannot
510+ // be picked for promotion or catch-up). The returned successMap lets callers track
514511// which aliases were waited on across multiple passes.
515512func (erp * EmergencyReparenter ) applyRelayLogsAndReconcile (
516513 ctx context.Context ,
@@ -523,9 +520,8 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
523520 keyspace , shard string ,
524521) (successMap map [string ]bool , err error ) {
525522 successMap , waitErr := erp .waitForAllRelayLogsToApply (ctx , waitCandidates , tabletMap , statusMap , waitReplicasTimeout , requireAll )
526- // Update the failure metric before any error-path return so that aborts caused by
527- // relay-log-apply failures are visible to operators (otherwise the metric would
528- // undercount in exactly the worst case).
523+ // Increment the failure metric before any error return so operators still see failure
524+ // counts when ERS aborts.
529525 failedCount := 0
530526 for _ , applied := range successMap {
531527 if ! applied {
@@ -538,17 +534,11 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
538534 if waitErr != nil {
539535 return successMap , waitErr
540536 }
541- // Reconcile validCandidates with the wait results:
542- // - Applied tablets get Executed bumped to Combined so the intermediate-selection
543- // sorter naturally prefers them via the existing Combined→Executed→promotion-rule
544- // tie-break.
545- // - Failed tablets are removed entirely so they cannot be picked as intermediate
546- // source, primary candidate, or catch-up target. Safe to remove because the
547- // upfront uniformCombined check guarantees every wait candidate shares the same
548- // Combined position — failed tablets carry no unique GTIDs vs the applied tablet,
549- // so no split-brain signal is lost.
550- // - Cancelled tablets (not in successMap) are left untouched; the sort ranks them
551- // below applied peers at the same Combined.
537+ // Reconcile validCandidates with the wait results. Removing failed tablets is safe
538+ // because the upfront uniformCombined check guarantees every wait candidate shared
539+ // the same Combined position — failed tablets carry no unique GTIDs vs the applied
540+ // tablet, so no split-brain signal is lost. Cancelled tablets (absent from successMap)
541+ // are left untouched; the sort ranks them below applied peers at the same Combined.
552542 for alias , applied := range successMap {
553543 if applied {
554544 if pos , ok := validCandidates [alias ]; ok {
@@ -562,28 +552,17 @@ func (erp *EmergencyReparenter) applyRelayLogsAndReconcile(
562552}
563553
564554// waitForAllRelayLogsToApply waits for the given candidates to apply their relay logs
565- // and returns a map of alias to outcome :
555+ // and returns a successMap with three possible per-alias outcomes :
566556//
567- // - successMap[alias] == true: the tablet fully applied its relay logs.
568- // - successMap[alias] == false: the tablet genuinely failed (RPC error, MySQL error,
569- // or timeout).
570- // - alias absent: the tablet was not attempted (no statusMap entry) or
571- // was cancelled by our own short-circuit after a peer succeeded .
557+ // - successMap[alias] == true: fully applied, OR no statusMap entry (no relay log
558+ // to apply — already at its position; the caller treats this the same as applied).
559+ // - successMap[alias] == false: genuinely failed (RPC error, MySQL error, or timeout).
560+ // - alias absent: cancelled by our own short-circuit after a peer
561+ // succeeded, or by the parent context .
572562//
573- // Behavior depends on requireAll:
574- //
575- // - requireAll=true (pre-PR semantics): wait for every candidate's result and return
576- // an error on the first failure. Used for non-GTID flavors where Combined does not
577- // reflect received-but-not-applied relay logs, so we cannot safely short-circuit
578- // and must reach post-apply state before any candidate is selected.
579- // - requireAll=false (optimization): cancel the remaining goroutines as soon as one
580- // candidate succeeds. Used only when the caller has proven a single success is
581- // sufficient (GTID-based + filtered candidates share the same Combined position).
582- // Subsequent context.Canceled errors are treated as expected cancellations, not
583- // failures — they are not logged and are excluded from successMap entirely.
584- //
585- // The function does not mutate validCandidates; the caller decides how to handle the
586- // failed aliases (e.g., remove them from promotion consideration).
563+ // When requireAll is true the function aborts on the first failure (pre-PR semantics,
564+ // used for non-GTID flavors). When false it short-circuits on the first success and
565+ // treats subsequent cancellation errors as expected. Does not mutate validCandidates.
587566func (erp * EmergencyReparenter ) waitForAllRelayLogsToApply (
588567 ctx context.Context ,
589568 validCandidates map [string ]* RelayLogPositions ,
@@ -597,7 +576,9 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
597576 groupCtx , groupCancel := context .WithTimeout (ctx , waitReplicasTimeout )
598577 defer groupCancel ()
599578
579+ successMap = make (map [string ]bool , len (validCandidates ))
600580 waiterCount := 0
581+ preSatisfied := 0
601582
602583 for candidate := range validCandidates {
603584 // When we called stopReplicationAndBuildStatusMaps, we got back two
@@ -609,17 +590,20 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
609590 // If we have a tablet in the validCandidates map that does not appear
610591 // in the statusMap, then we have either (a) the current primary, which
611592 // is not replicating, so it is not applying relay logs; or (b) a tablet
612- // that is stuck thinking it is PRIMARY but is not in actuality. In that
613- // second case - (b) - we will most likely find that the stuck PRIMARY
614- // does not have a winning position, and fail the ERS. If, on the other
615- // hand, it does have a winning position, we are trusting the operator
616- // to know what they are doing by emergency-reparenting onto that
617- // tablet. In either case, it does not make sense to wait for relay logs
618- // to apply on a tablet that was never applying relay logs in the first
619- // place, so we skip it, and log that we did.
593+ // that is stuck thinking it is PRIMARY but is not in actuality. Such a
594+ // tablet has no relay logs to apply, so it is already at its position —
595+ // mark it applied in successMap so applyRelayLogsAndReconcile bumps
596+ // Executed=Combined (Combined is its executed position by construction
597+ // from FindPositionsOfAllCandidates). Without that bump, peers cancelled
598+ // at the same Combined could keep their pre-wait Executed and sort ahead
599+ // in findMostAdvanced, risking promotion of a tablet with received-but-
600+ // unapplied transactions. In case (b) the stuck PRIMARY will most likely
601+ // fail downstream split-brain / errant-GTID checks anyway.
620602 status , ok := statusMap [candidate ]
621603 if ! ok {
622604 erp .logger .Infof ("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly PRIMARY), so skipping WaitForRelayLogsToApply step for this candidate" , candidate )
605+ successMap [candidate ] = true
606+ preSatisfied ++
623607 continue
624608 }
625609
@@ -636,35 +620,29 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
636620 // If no candidates needed to apply relay logs (e.g., initialization with no replication),
637621 // there's nothing to wait for.
638622 if waiterCount == 0 {
639- return map [ string ] bool {} , nil
623+ return successMap , nil
640624 }
641625
642- // Collect results. In optimization mode (!requireAll), cancel remaining goroutines
643- // as soon as one tablet succeeds — the others will continue applying on their own
644- // after ERS completes. In requireAll mode, wait for every result and surface the
645- // first failure so the caller can abort ERS.
646- successMap = make (map [string ]bool , waiterCount )
647- successes := 0
626+ // In optimization mode, a pre-satisfied candidate is already a winner — short-circuit
627+ // the wait. The remaining waiters will return context.Canceled, which the cancellation
628+ // classification below treats as expected noise (successes > 0).
629+ if ! requireAll && preSatisfied > 0 {
630+ groupCancel ()
631+ }
632+
633+ successes := preSatisfied
648634 var firstFailure error
649635 for range waiterCount {
650636 result := <- resultCh
651637 if result .err != nil {
652- // We call groupCancel() at two points: in optimization mode once a peer
653- // succeeds, and in requireAll mode once any peer fails (since we will abort
654- // anyway). Cancellation errors that arrive after either trigger are expected
655- // noise — omit them from successMap entirely. We must check both
656- // context.Canceled (in-process) and the gRPC-wrapped equivalent
657- // (vterrors.Code(err) == vtrpcpb.Code_CANCELED), which is what real tmclient
658- // RPCs return after vterrors.FromGRPC has converted the gRPC status — that
659- // one does NOT satisfy errors.Is(err, context.Canceled).
660- //
661- // Non-cancellation errors that arrive after a trigger are conservatively
662- // recorded as real failures even though they may have been caused indirectly
663- // by our own cancellation (e.g., torn-down RPC connections, MySQL session
664- // closure). Over-reporting is preferable to silently dropping a genuine
638+ // Cancellation errors arriving after our own groupCancel(), or after the parent
639+ // ctx was cancelled, are expected noise — omit them entirely. Must also check
640+ // vterrors.Code (gRPC-wrapped CANCELED), which does NOT satisfy errors.Is(err,
641+ // context.Canceled). Non-cancellation errors after a trigger are conservatively
642+ // counted as real failures — over-reporting is preferable to dropping a genuine
665643 // failure signal.
666644 isCancellation := errors .Is (result .err , context .Canceled ) || vterrors .Code (result .err ) == vtrpc .Code_CANCELED
667- weTriggeredCancellation := (! requireAll && successes > 0 ) || (requireAll && firstFailure != nil )
645+ weTriggeredCancellation := (! requireAll && successes > 0 ) || (requireAll && firstFailure != nil ) || ctx . Err () != nil
668646 if isCancellation && weTriggeredCancellation {
669647 continue
670648 }
@@ -673,21 +651,27 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
673651 firstFailure = result .err
674652 erp .logger .Warningf ("EmergencyReparent candidate %s failed to apply relay logs: %v" , result .alias , result .err )
675653 if requireAll {
676- // requireAll means any failure must abort ERS — short-circuit the
677- // remaining waiters so we don't block until waitReplicasTimeout.
678654 groupCancel ()
679655 }
680656 }
681657 continue
682658 }
683659 if ! requireAll && successes == 0 {
684- // Optimization: cancel remaining goroutines — we have a winner.
685660 groupCancel ()
686661 }
687662 successMap [result .alias ] = true
688663 successes ++
689664 }
690665
666+ // If the parent context was cancelled, surface that regardless of mode or how many
667+ // candidates were pre-satisfied. Pre-PR errgroup.Wait surfaced any cancellation; this
668+ // keeps behavior symmetric across preSatisfied=0 and preSatisfied>0 paths and avoids
669+ // misreporting a "all candidates failed" error when the real cause was operator abort.
670+ // A real tablet failure (firstFailure set) takes precedence — it happened first and is
671+ // more actionable.
672+ if ctxErr := ctx .Err (); ctxErr != nil && firstFailure == nil {
673+ return successMap , vterrors .Wrapf (ctxErr , "emergency reparent aborted while waiting for relay logs to apply" )
674+ }
691675 if requireAll && firstFailure != nil {
692676 return successMap , vterrors .Wrapf (firstFailure , "could not apply all relay logs within the provided waitReplicasTimeout (%s)" , waitReplicasTimeout )
693677 }
0 commit comments