Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions akka-cluster/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,14 @@ akka.cluster.split-brain-resolver {
# 4 seconds.
down-all-when-unstable = on

# In a malfunctioning network there can be situations where nodes are observed as unreachable
# via some network links, but they are still indirectly connected via other nodes, i.e. it's
# not a clean network partition.
# By default it will keep fully connected nodes and down all the indirectly connected nodes,
# but when this flag is enabled it will down all nodes as precaution for the possible
# uncertainty that indirectly connected nodes can cause.
down-all-when-indirectly-connected = off

}
#//#split-brain-resolver

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,12 +435,16 @@ import akka.remote.artery.ThisActorSystemQuarantinedEvent
*/
def actOnDecision(decision: Decision): Set[UniqueAddress] = {
val nodesToDown =
try {
strategy.nodesToDown(decision)
} catch {
case e: IllegalStateException =>
log.warning(e.getMessage)
strategy.nodesToDown(DownAll)
if (settings.DownAllWhenIndirectlyConnected && decision.isIndirectlyConnected) {
strategy.nodesToDown(DownAll)
} else {
try {
strategy.nodesToDown(decision)
} catch {
case e: IllegalStateException =>
log.warning(e.getMessage)
strategy.nodesToDown(DownAll)
}
}

observeDecision(decision, nodesToDown, unreachableDataCenters)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ import akka.util.Helpers.Requiring
}
}

val DownAllWhenIndirectlyConnected: Boolean =
cc.getBoolean("down-all-when-indirectly-connected")

// the individual sub-configs below should only be called when the strategy has been selected

def keepMajorityRole: Option[String] = role(strategyConfig(KeepMajorityName))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright (C) 2020-2025 Lightbend Inc. <https://www.lightbend.com>
*/

package akka.cluster.sbr

import scala.concurrent.duration._

import com.typesafe.config.ConfigFactory

import akka.cluster.Cluster
import akka.cluster.MemberStatus
import akka.cluster.MultiNodeClusterSpec
import akka.remote.testkit.Direction
import akka.remote.testkit.MultiNodeConfig

object DownAllIndirectlyConnected3NodeSpec extends MultiNodeConfig {
val node1 = role("node1")
val node2 = role("node2")
val node3 = role("node3")

commonConfig(ConfigFactory.parseString("""
akka {
loglevel = INFO
cluster {
downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider"
split-brain-resolver.active-strategy = keep-majority
split-brain-resolver.stable-after = 6s
split-brain-resolver.down-all-when-indirectly-connected = on

run-coordinated-shutdown-when-down = off
}

actor.provider = cluster

test.filter-leeway = 10s
}
"""))

testTransport(on = true)
}

class DownAllIndirectlyConnected3NodeSpecMultiJvmNode1 extends DownAllIndirectlyConnected3NodeSpec
class DownAllIndirectlyConnected3NodeSpecMultiJvmNode2 extends DownAllIndirectlyConnected3NodeSpec
class DownAllIndirectlyConnected3NodeSpecMultiJvmNode3 extends DownAllIndirectlyConnected3NodeSpec

class DownAllIndirectlyConnected3NodeSpec extends MultiNodeClusterSpec(DownAllIndirectlyConnected3NodeSpec) {
import DownAllIndirectlyConnected3NodeSpec._

"A 3-node cluster with down-all-when-indirectly-connected=on" should {
"down all when two unreachable but can talk via third" in {
val cluster = Cluster(system)

runOn(node1) {
cluster.join(cluster.selfAddress)
}
enterBarrier("node1 joined")
runOn(node2, node3) {
cluster.join(node(node1).address)
}
within(10.seconds) {
awaitAssert {
cluster.state.members.size should ===(3)
cluster.state.members.foreach {
_.status should ===(MemberStatus.Up)
}
}
}
enterBarrier("Cluster formed")

runOn(node1) {
testConductor.blackhole(node2, node3, Direction.Both).await
}
enterBarrier("Blackholed")

within(10.seconds) {
awaitAssert {
runOn(node3) {
cluster.state.unreachable.map(_.address) should ===(Set(node(node2).address))
}
runOn(node2) {
cluster.state.unreachable.map(_.address) should ===(Set(node(node3).address))
}
runOn(node1) {
cluster.state.unreachable.map(_.address) should ===(Set(node(node3).address, node(node2).address))
}
}
}
enterBarrier("unreachable")

// all downed
awaitCond(cluster.isTerminated, max = 15.seconds)

enterBarrier("done")
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,6 @@ class SplitBrainResolverSpec
// from side2 of the partition, majority
assertDowningSide(side2, Set(memberA, memberB, memberC, memberD, memberE, memberF, memberG))
}

}

"KeepOldest" must {
Expand Down
11 changes: 11 additions & 0 deletions akka-docs/src/main/paradox/split-brain-resolver.md
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,17 @@ If there is a combination of indirectly connected nodes and a clean network part
above decision with the ordinary decision, e.g. keep majority, after excluding suspicious failure detection
observations.

### Down all when indirectly connected

Additional precaution can be enabled for the uncertainty that may follow from indirectly connected nodes
by enabling downing of all nodes if indirectly connected nodes are detected:

```
akka.cluster.split-brain-resolver {
down-all-when-indirectly-connected = on
}
```

## Down all when unstable

When reachability observations by the failure detector are changed the SBR decisions
Expand Down