Improve failover mechanism for primary and secondary replica connections (#243)

Ganeshwara Hananda · web-flow · commit 41c18e4a2979 · 2021-01-26T13:55:58.000Z
## What is the goal of this PR?

We have increased resilience by improving the failover mechanism between replicas.

## What are the changes implemented in this PR?

- Remove the terminology leader / non-leader which are Raft specific. We now use the terminology "replica" to refer to a single copy of a database. The active replica that can receive data is now called "primary replica" whereas the passive ones "secondary replica"
- Split `GraknOptions` to `GraknOptions.core()` and `GraknOptions.cluster()`, the later of which contains an option to read from secondary replica
- Increase resilience:
  - Database and cluster discovery will now be re-attempted to all cluster members instead of just to one of them
  - When the cluster have not decided which replica is the primary replica, the client will wait and retry instead of simply failing
- Changed info-level log to debug
diff --git a/Grakn.java b/Grakn.java
@@ -114,7 +114,6 @@ interface Transaction extends AutoCloseable {
 
         enum Type {
             READ(0),
-            READ_REPLICA(2),
             WRITE(1);
 
             private final int id;
diff --git a/GraknClient.java b/GraknClient.java
@@ -36,21 +36,21 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 
-import static grakn.client.common.exception.ErrorMessage.Client.CLUSTER_NOT_AVAILABLE;
+import static grakn.client.common.exception.ErrorMessage.Client.CLUSTER_UNABLE_TO_CONNECT;
+import static grakn.client.common.exception.ErrorMessage.Client.ILLEGAL_ARGUMENT;
 import static grakn.common.collection.Collections.pair;
 
 public class GraknClient {
     public static final String DEFAULT_ADDRESS = "localhost:1729";
 
-    public static GraknClient.Core core() {
+    public static Core core() {
         return core(DEFAULT_ADDRESS);
     }
 
-    public static GraknClient.Core core(String address) {
-        return new GraknClient.Core(address);
+    public static Core core(String address) {
+        return new Core(address);
     }
 
     public static GraknClient.Cluster cluster() {
@@ -72,7 +72,7 @@ private Core(String address) {
 
         @Override
         public RPCSession.Core session(String database, Grakn.Session.Type type) {
-            return session(database, type, new GraknOptions());
+            return session(database, type, GraknOptions.core());
         }
 
         @Override
@@ -106,23 +106,20 @@ public Channel channel() {
 
     public static class Cluster implements Grakn.Client {
         private static final Logger LOG = LoggerFactory.getLogger(Cluster.class);
-        private final Map<Address.Cluster.Server, Core> coreClientMap;
-        private final Core[] coreClientArray;
-        private final AtomicInteger selectedCoreClientIndex;
-        private GraknClusterGrpc.GraknClusterBlockingStub clusterDiscoveryRPC;
+        private final Map<Address.Cluster.Server, Core> coreClients;
+        private final Map<Address.Cluster.Server, GraknClusterGrpc.GraknClusterBlockingStub> graknClusterRPCs;
         private final RPCDatabaseManager.Cluster databases;
         private boolean isOpen;
 
         private Cluster(String address) {
-            Pair<GraknClusterGrpc.GraknClusterBlockingStub, Set<Address.Cluster.Server>> discovery = discoverCluster(address);
-            clusterDiscoveryRPC = discovery.first();
-            coreClientMap = discovery.second().stream()
+            coreClients = discoverCluster(address).stream()
                     .map(addr -> pair(addr, new Core(addr.client())))
                     .collect(Collectors.toMap(Pair::first, Pair::second));
-            coreClientArray = coreClientMap.values().toArray(new Core[] {});
-            selectedCoreClientIndex = new AtomicInteger();
+            graknClusterRPCs = coreClients.entrySet().stream()
+                    .map(client -> pair(client.getKey(), GraknClusterGrpc.newBlockingStub(client.getValue().channel())))
+                    .collect(Collectors.toMap(Pair::first, Pair::second));
             databases = new RPCDatabaseManager.Cluster(
-                    coreClientMap.entrySet().stream()
+                    coreClients.entrySet().stream()
                             .map(client -> pair(client.getKey(), client.getValue().databases()))
                             .collect(Collectors.toMap(Pair::first, Pair::second))
             );
@@ -131,18 +128,13 @@ private Cluster(String address) {
 
         @Override
         public RPCSession.Cluster session(String database, Grakn.Session.Type type) {
-            return session(database, type, new GraknOptions());
+            return session(database, type, GraknOptions.cluster());
         }
 
         @Override
         public RPCSession.Cluster session(String database, Grakn.Session.Type type, GraknOptions options) {
-            return new RPCSession.Cluster(this, database, type, options, clusterDiscoveryRPC);
-        }
-
-        public GraknClusterGrpc.GraknClusterBlockingStub selectNextClusterDiscoveryRPC() {
-            Core selected = selectNextCoreClient();
-            clusterDiscoveryRPC = GraknClusterGrpc.newBlockingStub(selected.channel());
-            return clusterDiscoveryRPC;
+            if (!options.isCluster()) throw new GraknClientException(ILLEGAL_ARGUMENT, options);
+            return new RPCSession.Cluster(this, database, type, options.asCluster());
         }
 
         @Override
@@ -157,33 +149,42 @@ public boolean isOpen() {
 
         @Override
         public void close() {
-            coreClientMap.values().forEach(GraknClient.Core::close);
+            coreClients.values().forEach(GraknClient.Core::close);
             isOpen = false;
         }
 
-        public Map<Address.Cluster.Server, Core> coreClients() {
-            return coreClientMap;
+        public Set<Address.Cluster.Server> servers() {
+            return coreClients.keySet();
+        }
+
+        public Core coreClient(Address.Cluster.Server address) {
+            return coreClients.get(address);
+        }
+
+        public GraknClusterGrpc.GraknClusterBlockingStub graknClusterRPC(Address.Cluster.Server address) {
+            return graknClusterRPCs.get(address);
         }
 
-        private Pair<GraknClusterGrpc.GraknClusterBlockingStub, Set<Address.Cluster.Server>> discoverCluster(String... addresses) {
+        private Set<Address.Cluster.Server> discoverCluster(String... addresses) {
             for (String address: addresses) {
-                try (GraknClient.Core client = new Core(address)) {
-                    LOG.info("Performing server discovery to {}...", address);
-                    GraknClusterGrpc.GraknClusterBlockingStub clusterDiscoveryRPC = GraknClusterGrpc.newBlockingStub(client.channel());
+                try (Core client = new Core(address)) {
+                    LOG.debug("Performing server discovery to {}...", address);
+                    GraknClusterGrpc.GraknClusterBlockingStub graknClusterRPC = GraknClusterGrpc.newBlockingStub(client.channel());
                     ClusterProto.Cluster.Discover.Res res =
-                            clusterDiscoveryRPC.clusterDiscover(ClusterProto.Cluster.Discover.Req.newBuilder().build());
+                            graknClusterRPC.clusterDiscover(ClusterProto.Cluster.Discover.Req.newBuilder().build());
                     Set<Address.Cluster.Server> servers = res.getServersList().stream().map(Address.Cluster.Server::parse).collect(Collectors.toSet());
-                    LOG.info("Discovered {}", servers);
-                    return pair(clusterDiscoveryRPC, servers);
+                    LOG.debug("Discovered {}", servers);
+                    return servers;
                 } catch (StatusRuntimeException e) {
                     LOG.error("Server discovery to {} failed.", address);
                 }
             }
-            throw new GraknClientException(CLUSTER_NOT_AVAILABLE.message((Object) addresses)); // remove ambiguity by casting to Object
+            throw clusterNotAvailableException();
         }
 
-        private Core selectNextCoreClient() {
-            return coreClientArray[selectedCoreClientIndex.getAndIncrement() % coreClientMap.size()];
+        private GraknClientException clusterNotAvailableException() {
+            String addresses = servers().stream().map(Address.Cluster.Server::toString).collect(Collectors.joining(","));
+            return new GraknClientException(CLUSTER_UNABLE_TO_CONNECT, addresses); // remove ambiguity by casting to Object
         }
     }
 }
diff --git a/GraknOptions.java b/GraknOptions.java
@@ -24,9 +24,9 @@
 import java.util.Optional;
 
 import static grakn.client.common.exception.ErrorMessage.Client.NEGATIVE_BATCH_SIZE;
+import static grakn.client.common.exception.ErrorMessage.Internal.ILLEGAL_CAST;
 
 public class GraknOptions {
-
     private Boolean infer = null;
     private Boolean explain = null;
     private Integer batchSize = null;
@@ -60,4 +60,45 @@ public GraknOptions batchSize(int batchSize) {
         this.batchSize = batchSize;
         return this;
     }
+
+    public static GraknOptions core() {
+        return new GraknOptions();
+    }
+
+    public static GraknOptions.Cluster cluster() {
+        return new Cluster();
+    }
+
+    public boolean isCluster() {
+        return false;
+    }
+
+    public Cluster asCluster() {
+        throw new GraknClientException(ILLEGAL_CAST, Cluster.class);
+    }
+
+    public static class Cluster extends GraknOptions {
+        private Boolean allowSecondaryReplica = null;
+
+        Cluster() {}
+
+        public Optional<Boolean> allowSecondaryReplica() {
+            return Optional.ofNullable(allowSecondaryReplica);
+        }
+
+        public Cluster allowSecondaryReplica(boolean primaryReplica) {
+            this.allowSecondaryReplica = primaryReplica;
+            return this;
+        }
+
+        @Override
+        public boolean isCluster() {
+            return true;
+        }
+
+        @Override
+        public Cluster asCluster() {
+            return this;
+        }
+    }
 }
diff --git a/GraknProtoBuilder.java b/GraknProtoBuilder.java
@@ -28,6 +28,11 @@ public static OptionsProto.Options options(GraknOptions options) {
         options.infer().ifPresent(builder::setInfer);
         options.explain().ifPresent(builder::setExplain);
         options.batchSize().ifPresent(builder::setBatchSize);
+
+        if (options.isCluster()) {
+            options.asCluster().allowSecondaryReplica().ifPresent(builder::setAllowSecondaryReplica);
+        }
+
         return builder.build();
     }
 }
diff --git a/common/exception/ErrorMessage.java b/common/exception/ErrorMessage.java
@@ -35,16 +35,15 @@ public static class Client extends ErrorMessage {
         public static final Client MISSING_RESPONSE =
                 new Client(4, "The required field 'res' of type '%s' was not set.");
         public static final Client UNKNOWN_REQUEST_ID =
-                new Client(5, "Received a response with unknown request id '%s'.");
+                new Client(5, "Received a response with unknown request id '%s'.")  ;
         public static final Client ILLEGAL_ARGUMENT = new Client(6, "Illegal argument passed into the method: '%s'.");
-
-        public static final Client CLUSTER_LEADER_NOT_YET_ELECTED =
-                new Client(7, "No leader has been elected for latest known term '%s'.");
-
-        public static final Client CLUSTER_NOT_AVAILABLE =
-                new Client(8, "Attempted connecting to these servers, but none are available: '%s'.");
-        public static final Client UNABLE_TO_CONNECT =
-                new Client(9, "Unable to connect to Grakn Core Server.");
+        public static final Client UNABLE_TO_CONNECT = new Client(7, "Unable to connect to Grakn Core Server.");
+        public static final Client CLUSTER_NO_PRIMARY_REPLICA_YET =
+                new Client(8, "No replica has been marked as the primary replica for latest known term '%s'.");
+        public static final Client CLUSTER_UNABLE_TO_CONNECT =
+                new Client(9, "Unable to connect to Grakn Cluster. Attempted connecting to these servers, but none are available: '%s'.");
+        public static final Client CLUSTER_REPLICA_NOT_PRIMARY =
+                new Client(10, "The replica is not the primary replica");
 
         private static final String codePrefix = "CLI";
         private static final String messagePrefix = "Illegal Client State";
@@ -87,8 +86,6 @@ public static class Query extends ErrorMessage {
                 new Query(3, "The answer type '%s' was not recognised.");
         public static final Query MISSING_ANSWER =
                 new Query(4, "The required field 'answer' of type '%s' was not set.");
-        public static final Query ILLEGAL_CAST =
-                new Query(5, "Illegal casting operation to '%s'.");
 
         private static final String codePrefix = "QRY";
         private static final String messagePrefix = "Query Error";
@@ -97,4 +94,18 @@ public static class Query extends ErrorMessage {
             super(codePrefix, number, messagePrefix, message);
         }
     }
+
+    public static class Internal extends ErrorMessage {
+        public static final Internal UNEXPECTED_INTERRUPTION =
+                new Internal(1, "Unexpected thread interruption!");
+        public static final Internal ILLEGAL_CAST =
+                new Internal(2, "Illegal casting operation to '%s'.");
+
+        private static final String codePrefix = "INT";
+        private static final String messagePrefix = "Internal Error";
+
+        Internal(int number, String message) {
+            super(codePrefix, number, messagePrefix, message);
+        }
+    }
 }
diff --git a/common/exception/GraknClientException.java b/common/exception/GraknClientException.java
@@ -33,15 +33,17 @@ public GraknClientException(String error) {
         this.errorMessage = null;
     }
 
-    public GraknClientException(ErrorMessage error) {
-        super(error.toString());
+    public GraknClientException(ErrorMessage error, Object... parameters) {
+        super(error.message(parameters));
         assert !getMessage().contains("%s");
         this.errorMessage = error;
     }
 
     public static GraknClientException of(StatusRuntimeException statusRuntimeException) {
         if (statusRuntimeException.getStatus().getCode() == Status.Code.UNAVAILABLE) {
             return new GraknClientException(ErrorMessage.Client.UNABLE_TO_CONNECT);
+        } else if (isReplicaNotPrimaryException(statusRuntimeException)) {
+            return new GraknClientException(ErrorMessage.Client.CLUSTER_REPLICA_NOT_PRIMARY);
         }
         return new GraknClientException(statusRuntimeException.getStatus().getDescription());
     }
@@ -59,4 +61,9 @@ public String getName() {
     public ErrorMessage getErrorMessage() {
         return errorMessage;
     }
+
+    // TODO: propagate exception from the server side in a less-brittle way
+    private static boolean isReplicaNotPrimaryException(StatusRuntimeException statusRuntimeException) {
+        return statusRuntimeException.getStatus().getCode() == Status.Code.INTERNAL && statusRuntimeException.getStatus().getDescription().contains("[RPL01]");
+    }
 }
diff --git a/common/tracing/TracingProtoBuilder.java b/common/tracing/TracingProtoBuilder.java
@@ -20,8 +20,6 @@
 package grakn.client.common.tracing;
 
 import grabl.tracing.client.GrablTracingThreadStatic;
-import grakn.client.GraknOptions;
-import grakn.protocol.OptionsProto;
 
 import java.util.Collections;
 import java.util.HashMap;
diff --git a/concept/answer/Numeric.java b/concept/answer/Numeric.java
@@ -25,7 +25,7 @@
 import javax.annotation.Nullable;
 
 import static grakn.client.common.exception.ErrorMessage.Query.BAD_ANSWER_TYPE;
-import static grakn.client.common.exception.ErrorMessage.Query.ILLEGAL_CAST;
+import static grakn.client.common.exception.ErrorMessage.Internal.ILLEGAL_CAST;
 
 public class Numeric {
     @Nullable
diff --git a/dependencies/graknlabs/repositories.bzl b/dependencies/graknlabs/repositories.bzl
@@ -50,8 +50,8 @@ def graknlabs_dependencies():
 def graknlabs_protocol():
     git_repository(
         name = "graknlabs_protocol",
-        remote = "https://github.com/graknlabs/protocol",
-        tag = "2.0.0-alpha-6", # sync-marker: do not remove this comment, this is used for sync-dependencies by @graknlabs_protocol
+        remote = "https://github.com/lolski/protocol",
+        commit = "6d5a5e1b58d91fd001e06b2820363e7194f7fd3f", # sync-marker: do not remove this comment, this is used for sync-dependencies by @graknlabs_protocol
     )
 
 def graknlabs_behaviour():
diff --git a/query/QueryManager.java b/query/QueryManager.java
diff --git a/rpc/RPCDatabaseManager.java b/rpc/RPCDatabaseManager.java
diff --git a/rpc/RPCSession.java b/rpc/RPCSession.java
diff --git a/rpc/RPCTransaction.java b/rpc/RPCTransaction.java

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,11 @@ public static OptionsProto.Options options(GraknOptions options) {`
`28`	`28`	`options.infer().ifPresent(builder::setInfer);`
`29`	`29`	`options.explain().ifPresent(builder::setExplain);`
`30`	`30`	`options.batchSize().ifPresent(builder::setBatchSize);`
	`31`	`+`
	`32`	`+ if (options.isCluster()) {`
	`33`	`+ options.asCluster().allowSecondaryReplica().ifPresent(builder::setAllowSecondaryReplica);`
	`34`	`+ }`
	`35`	`+`
`31`	`36`	`return builder.build();`
`32`	`37`	`}`
`33`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -33,15 +33,17 @@ public GraknClientException(String error) {`
`33`	`33`	`this.errorMessage = null;`
`34`	`34`	`}`
`35`	`35`
`36`		`- public GraknClientException(ErrorMessage error) {`
`37`		`- super(error.toString());`
	`36`	`+ public GraknClientException(ErrorMessage error, Object... parameters) {`
	`37`	`+ super(error.message(parameters));`
`38`	`38`	`assert !getMessage().contains("%s");`
`39`	`39`	`this.errorMessage = error;`
`40`	`40`	`}`
`41`	`41`
`42`	`42`	`public static GraknClientException of(StatusRuntimeException statusRuntimeException) {`
`43`	`43`	`if (statusRuntimeException.getStatus().getCode() == Status.Code.UNAVAILABLE) {`
`44`	`44`	`return new GraknClientException(ErrorMessage.Client.UNABLE_TO_CONNECT);`
	`45`	`+ } else if (isReplicaNotPrimaryException(statusRuntimeException)) {`
	`46`	`+ return new GraknClientException(ErrorMessage.Client.CLUSTER_REPLICA_NOT_PRIMARY);`
`45`	`47`	`}`
`46`	`48`	`return new GraknClientException(statusRuntimeException.getStatus().getDescription());`
`47`	`49`	`}`
`@@ -59,4 +61,9 @@ public String getName() {`
`59`	`61`	`public ErrorMessage getErrorMessage() {`
`60`	`62`	`return errorMessage;`
`61`	`63`	`}`
	`64`	`+`
	`65`	`+ // TODO: propagate exception from the server side in a less-brittle way`
	`66`	`+ private static boolean isReplicaNotPrimaryException(StatusRuntimeException statusRuntimeException) {`
	`67`	`+ return statusRuntimeException.getStatus().getCode() == Status.Code.INTERNAL && statusRuntimeException.getStatus().getDescription().contains("[RPL01]");`
	`68`	`+ }`
`62`	`69`	`}`