diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index 23a54102d72..6b0eddd43be 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -423,6 +423,14 @@ akka.cluster.split-brain-resolver { # 4 seconds. down-all-when-unstable = on + # In a malfunctioning network there can be situations where nodes are observed as unreachable + # via some network links, but they are still indirectly connected via other nodes, i.e. it's + # not a clean network partition. + # By default it will keep fully connected nodes and down all the indirectly connected nodes, + # but when this flag is enabled it will down all nodes as precaution for the possible + # uncertainty that indirectly connected nodes can cause. + down-all-when-indirectly-connected = off + } #//#split-brain-resolver diff --git a/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolver.scala b/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolver.scala index 6f8e833a97d..9d0457b8095 100644 --- a/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolver.scala +++ b/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolver.scala @@ -435,12 +435,16 @@ import akka.remote.artery.ThisActorSystemQuarantinedEvent */ def actOnDecision(decision: Decision): Set[UniqueAddress] = { val nodesToDown = - try { - strategy.nodesToDown(decision) - } catch { - case e: IllegalStateException => - log.warning(e.getMessage) - strategy.nodesToDown(DownAll) + if (settings.DownAllWhenIndirectlyConnected && decision.isIndirectlyConnected) { + strategy.nodesToDown(DownAll) + } else { + try { + strategy.nodesToDown(decision) + } catch { + case e: IllegalStateException => + log.warning(e.getMessage) + strategy.nodesToDown(DownAll) + } } observeDecision(decision, nodesToDown, unreachableDataCenters) diff --git a/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolverSettings.scala b/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolverSettings.scala index bfcb2107d37..ee7d766c361 100644 --- a/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolverSettings.scala +++ b/akka-cluster/src/main/scala/akka/cluster/sbr/SplitBrainResolverSettings.scala @@ -67,6 +67,9 @@ import akka.util.Helpers.Requiring } } + val DownAllWhenIndirectlyConnected: Boolean = + cc.getBoolean("down-all-when-indirectly-connected") + // the individual sub-configs below should only be called when the strategy has been selected def keepMajorityRole: Option[String] = role(strategyConfig(KeepMajorityName)) diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/sbr/DownAllIndirectlyConnected3NodeSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/sbr/DownAllIndirectlyConnected3NodeSpec.scala new file mode 100644 index 00000000000..7b04beb3542 --- /dev/null +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/sbr/DownAllIndirectlyConnected3NodeSpec.scala @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2020-2025 Lightbend Inc. + */ + +package akka.cluster.sbr + +import scala.concurrent.duration._ + +import com.typesafe.config.ConfigFactory + +import akka.cluster.Cluster +import akka.cluster.MemberStatus +import akka.cluster.MultiNodeClusterSpec +import akka.remote.testkit.Direction +import akka.remote.testkit.MultiNodeConfig + +object DownAllIndirectlyConnected3NodeSpec extends MultiNodeConfig { + val node1 = role("node1") + val node2 = role("node2") + val node3 = role("node3") + + commonConfig(ConfigFactory.parseString(""" + akka { + loglevel = INFO + cluster { + downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider" + split-brain-resolver.active-strategy = keep-majority + split-brain-resolver.stable-after = 6s + split-brain-resolver.down-all-when-indirectly-connected = on + + run-coordinated-shutdown-when-down = off + } + + actor.provider = cluster + + test.filter-leeway = 10s + } + """)) + + testTransport(on = true) +} + +class DownAllIndirectlyConnected3NodeSpecMultiJvmNode1 extends DownAllIndirectlyConnected3NodeSpec +class DownAllIndirectlyConnected3NodeSpecMultiJvmNode2 extends DownAllIndirectlyConnected3NodeSpec +class DownAllIndirectlyConnected3NodeSpecMultiJvmNode3 extends DownAllIndirectlyConnected3NodeSpec + +class DownAllIndirectlyConnected3NodeSpec extends MultiNodeClusterSpec(DownAllIndirectlyConnected3NodeSpec) { + import DownAllIndirectlyConnected3NodeSpec._ + + "A 3-node cluster with down-all-when-indirectly-connected=on" should { + "down all when two unreachable but can talk via third" in { + val cluster = Cluster(system) + + runOn(node1) { + cluster.join(cluster.selfAddress) + } + enterBarrier("node1 joined") + runOn(node2, node3) { + cluster.join(node(node1).address) + } + within(10.seconds) { + awaitAssert { + cluster.state.members.size should ===(3) + cluster.state.members.foreach { + _.status should ===(MemberStatus.Up) + } + } + } + enterBarrier("Cluster formed") + + runOn(node1) { + testConductor.blackhole(node2, node3, Direction.Both).await + } + enterBarrier("Blackholed") + + within(10.seconds) { + awaitAssert { + runOn(node3) { + cluster.state.unreachable.map(_.address) should ===(Set(node(node2).address)) + } + runOn(node2) { + cluster.state.unreachable.map(_.address) should ===(Set(node(node3).address)) + } + runOn(node1) { + cluster.state.unreachable.map(_.address) should ===(Set(node(node3).address, node(node2).address)) + } + } + } + enterBarrier("unreachable") + + // all downed + awaitCond(cluster.isTerminated, max = 15.seconds) + + enterBarrier("done") + } + } + +} diff --git a/akka-cluster/src/test/scala/akka/cluster/sbr/SplitBrainResolverSpec.scala b/akka-cluster/src/test/scala/akka/cluster/sbr/SplitBrainResolverSpec.scala index 943b7f6a0c9..ca1e6af0a55 100644 --- a/akka-cluster/src/test/scala/akka/cluster/sbr/SplitBrainResolverSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/sbr/SplitBrainResolverSpec.scala @@ -590,7 +590,6 @@ class SplitBrainResolverSpec // from side2 of the partition, majority assertDowningSide(side2, Set(memberA, memberB, memberC, memberD, memberE, memberF, memberG)) } - } "KeepOldest" must { diff --git a/akka-docs/src/main/paradox/split-brain-resolver.md b/akka-docs/src/main/paradox/split-brain-resolver.md index 517cf9de26b..8957f3e9b8e 100644 --- a/akka-docs/src/main/paradox/split-brain-resolver.md +++ b/akka-docs/src/main/paradox/split-brain-resolver.md @@ -392,6 +392,17 @@ If there is a combination of indirectly connected nodes and a clean network part above decision with the ordinary decision, e.g. keep majority, after excluding suspicious failure detection observations. +### Down all when indirectly connected + +Additional precaution can be enabled for the uncertainty that may follow from indirectly connected nodes +by enabling downing of all nodes if indirectly connected nodes are detected: + +``` +akka.cluster.split-brain-resolver { + down-all-when-indirectly-connected = on +} +``` + ## Down all when unstable When reachability observations by the failure detector are changed the SBR decisions