diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 8f57b28eab..9095796887 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2146,12 +2146,22 @@ void clearNodeFailureIfNeeded(clusterNode *node) { clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); } + /* If none of the replicas can fail over, or this is a primary-only setup, + * there is nothing to wait for: mark the node as alive immediately. */ + int dont_wait = 1; + for (int j = 0; j < node->num_replicas; j++) { + if (!clusterNodeIsNoFailover(node->replicas[j])) { + dont_wait = 0; + break; + } + } + /* If it is a primary and... * 1) The FAIL state is old enough. * 2) It is yet serving slots from our point of view (not failed over). * Apparently no one is going to fix these slots, clear the FAIL flag. */ if (clusterNodeIsVotingPrimary(node) && - (now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) { + ((now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT) || dont_wait)) { serverLog( LL_NOTICE, "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.", @@ -4735,6 +4745,10 @@ void clusterLogCantFailover(int reason) { case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break; case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break; case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break; + case CLUSTER_CANT_FAILOVER_DISABLED: + msg = "Failover has been disabled. " + "Please check the 'cluster-replica-no-failover' configuration option."; + break; default: serverPanic("Unknown cant failover reason code."); } lastlog_time = time(NULL); @@ -4827,14 +4841,19 @@ void clusterHandleReplicaFailover(void) { * 3) We don't have the no failover configuration set, and this is * not a manual failover. 
*/ if (clusterNodeIsPrimary(myself) || myself->replicaof == NULL || - (!nodeFailed(myself->replicaof) && !manual_failover) || - (server.cluster_replica_no_failover && !manual_failover)) { + (!nodeFailed(myself->replicaof) && !manual_failover)) { /* There are no reasons to failover, so we set the reason why we * are returning without failing over to NONE. */ server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; return; } + if (server.cluster_replica_no_failover && !manual_failover) { + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_DISABLED; + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DISABLED); + return; + } + /* Set data_age to the number of milliseconds we are disconnected from * the primary. */ if (server.repl_state == REPL_STATE_CONNECTED) { @@ -6602,7 +6621,7 @@ int clusterNodeIsFailing(clusterNode *node) { } int clusterNodeIsNoFailover(clusterNode *node) { - return node->flags & CLUSTER_NODE_NOFAILOVER; + return nodeCantFailover(node); } const char **clusterDebugCommandExtendedHelp(void) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 9f981a797f..29bb734b53 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -16,6 +16,7 @@ #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 #define CLUSTER_CANT_FAILOVER_EXPIRED 3 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 +#define CLUSTER_CANT_FAILOVER_DISABLED 5 #define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 1 /* seconds. */ /* clusterState todo_before_sleep flags. */ diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 2272a150ee..fe2c896ca6 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -101,6 +101,66 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster +# Tests to verify scenarios where failover is not possible and verify faster availability +# of primary once the network partition heals. 
+foreach type {"primary-only" "primary-with-replicas"} { + set ::node_timeout 5000 + if {$type eq "primary-only"} { + set ::primary_count 6 + set ::replica_count 0 + } else { + set ::primary_count 3 + set ::replica_count 3 + } + + set options [list \ + tags {external:skip cluster} \ + overrides [list \ + cluster-ping-interval 1000 \ + cluster-node-timeout $::node_timeout \ + cluster-replica-no-failover yes \ + ]] + + start_cluster $::primary_count $::replica_count $options { + # Killing one primary node. + pause_process [srv 0 pid] + + if {$::replica_count > 0} { + test "no failover - verify replica is not promoted if failover has been disabled" { + # Observe no failover + wait_for_log_messages -3 {"*Currently unable to failover: Failover has been disabled*"} 0 200 50 + } + } else { + # wait for node failure detection + after $::node_timeout + } + + test "no failover - cluster is in failed state" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue + wait_for_condition 100 25 { + [CI $j cluster_state] eq "fail" + } else { + set ts [clock format [clock seconds] -format %H:%M:%S] + fail "Cluster node $j cluster_state:[r -1 CLUSTER NODES]" + } + } + } + + resume_process [srv 0 pid] + + test "no failover - cluster is in healthy state" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 100 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } + } + } ;# start_cluster +} + run_solo {cluster} { start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { test "Multiple primary nodes are down, rank them based on the failed primary" {