Commit 6a91390

feat: Make client gRPC retry more configurable (#879)
1 parent 2dad3d4 commit 6a91390

File tree

- packages/client/src/grpc-retry.ts
- packages/test/src/test-client-connection.ts

2 files changed: +194 −76 lines

packages/client/src/grpc-retry.ts

Lines changed: 111 additions & 57 deletions
```diff
@@ -1,37 +1,85 @@
-import {
-  InterceptingCall,
-  Interceptor,
-  ListenerBuilder,
-  Metadata,
-  RequesterBuilder,
-  StatusObject,
-} from '@grpc/grpc-js';
+import { InterceptingCall, Interceptor, ListenerBuilder, RequesterBuilder, StatusObject } from '@grpc/grpc-js';
 import * as grpc from '@grpc/grpc-js';
 
 export interface GrpcRetryOptions {
-  /** Maximum number of allowed retries. Defaults to 10. */
-  maxRetries: number;
-
   /**
-   * A function which accepts the current retry attempt (starts at 0) and returns the millisecond
+   * A function which accepts the current retry attempt (starts at 1) and returns the millisecond
    * delay that should be applied before the next retry.
    */
-  delayFunction: (attempt: number) => number;
+  delayFunction: (attempt: number, status: StatusObject) => number;
 
   /**
    * A function which accepts a failed status object and returns true if the call should be retried
    */
-  retryableDecider: (status: StatusObject) => boolean;
+  retryableDecider: (attempt: number, status: StatusObject) => boolean;
+}
+
+/**
+ * Options for the backoff formula: `factor ^ attempt * initialIntervalMs(status) * jitter(maxJitter)`
+ */
+export interface BackoffOptions {
+  /**
+   * Exponential backoff factor
+   *
+   * @default 2
+   */
+  factor: number;
+
+  /**
+   * Maximum number of attempts
+   *
+   * @default 10
+   */
+  maxAttempts: number;
+  /**
+   * Maximum amount of jitter to apply
+   *
+   * @default 0.1
+   */
+  maxJitter: number;
+  /**
+   * Function that returns the "initial" backoff interval based on the returned status.
+   *
+   * The default is 1 second for RESOURCE_EXHAUSTED errors and 20 millis for other retryable errors.
+   */
+  initialIntervalMs(status: StatusObject): number;
 }
 
-export function defaultGrpcRetryOptions(): GrpcRetryOptions {
+/**
+ * Add defaults as documented in {@link BackoffOptions}
+ */
+function withDefaultBackoffOptions({
+  maxAttempts,
+  factor,
+  maxJitter,
+  initialIntervalMs,
+}: Partial<BackoffOptions>): BackoffOptions {
   return {
-    maxRetries: 10,
-    delayFunction: backOffAmount,
-    retryableDecider: isRetryableError,
+    maxAttempts: maxAttempts ?? 10,
+    factor: factor ?? 2,
+    maxJitter: maxJitter ?? 0.1,
+    initialIntervalMs: initialIntervalMs ?? defaultInitialIntervalMs,
   };
 }
 
+/**
+ * Generates the default retry behavior based on given backoff options
+ */
+export function defaultGrpcRetryOptions(options: Partial<BackoffOptions> = {}): GrpcRetryOptions {
+  const { maxAttempts, factor, maxJitter, initialIntervalMs } = withDefaultBackoffOptions(options);
+  return {
+    delayFunction(attempt, status) {
+      return factor ** attempt * initialIntervalMs(status) * jitter(maxJitter);
+    },
+    retryableDecider(attempt, status) {
+      return attempt < maxAttempts && isRetryableError(status);
+    },
+  };
+}
+
+/**
+ * Set of retryable gRPC status codes
+ */
 const retryableCodes = new Set([
   grpc.status.UNKNOWN,
   grpc.status.RESOURCE_EXHAUSTED,
@@ -45,69 +93,75 @@ export function isRetryableError(status: StatusObject): boolean {
   return retryableCodes.has(status.code);
 }
 
-/** Return backoff amount in ms */
-export function backOffAmount(attempt: number): number {
-  return 2 ** attempt * 20;
+/**
+ * Calculates random amount of jitter between 0 and `max`
+ */
+function jitter(max: number) {
+  return 1 - max + Math.random() * max * 2;
+}
+
+/**
+ * Default implementation - backs off more on RESOURCE_EXHAUSTED errors
+ */
+function defaultInitialIntervalMs({ code }: StatusObject) {
+  // Backoff more on RESOURCE_EXHAUSTED
+  if (code === grpc.status.RESOURCE_EXHAUSTED) {
+    return 1000;
+  }
+  return 20;
 }
 
 /**
  * Returns a GRPC interceptor that will perform automatic retries for some types of failed calls
  *
  * @param retryOptions Options for the retry interceptor
  */
-export function makeGrpcRetryInterceptor(retryOptions: GrpcRetryOptions): Interceptor {
+export function makeGrpcRetryInterceptor({ retryableDecider, delayFunction }: GrpcRetryOptions): Interceptor {
   return (options, nextCall) => {
-    let savedMetadata: Metadata;
     let savedSendMessage: any;
     let savedReceiveMessage: any;
-    let savedMessageNext: any;
+    let savedMessageNext: (message: any) => void;
+
     const requester = new RequesterBuilder()
       .withStart(function (metadata, _listener, next) {
-        savedMetadata = metadata;
-        const newListener = new ListenerBuilder()
+        // First attempt
+        let attempt = 1;
+
+        const listener = new ListenerBuilder()
           .withOnReceiveMessage((message, next) => {
            savedReceiveMessage = message;
            savedMessageNext = next;
          })
          .withOnReceiveStatus((status, next) => {
-            let retries = 0;
-            const retry = (message: any, metadata: Metadata) => {
-              retries++;
-              const newCall = nextCall(options);
-              newCall.start(metadata, {
-                onReceiveMessage: (message) => {
+            const retry = () => {
+              attempt++;
+              const call = nextCall(options);
+              call.start(metadata, {
+                onReceiveMessage(message) {
                  savedReceiveMessage = message;
                },
-                onReceiveStatus: (status) => {
-                  if (retryOptions.retryableDecider(status)) {
-                    if (retries <= retryOptions.maxRetries) {
-                      setTimeout(() => retry(message, metadata), retryOptions.delayFunction(retries));
-                    } else {
-                      savedMessageNext(savedReceiveMessage);
-                      next(status);
-                    }
-                  } else {
-                    savedMessageNext(savedReceiveMessage);
-                    // TODO: For reasons that are completely unclear to me, if you pass a handcrafted
-                    // status object here, node will magically just exit at the end of this line.
-                    // No warning, no nothing. Here be dragons.
-                    next(status);
-                  }
-                },
+                onReceiveStatus,
              });
-              newCall.sendMessage(message);
-              newCall.halfClose();
+              call.sendMessage(savedSendMessage);
+              call.halfClose();
+            };
+
+            const onReceiveStatus = (status: StatusObject) => {
+              if (retryableDecider(attempt, status)) {
+                setTimeout(retry, delayFunction(attempt, status));
+              } else {
+                savedMessageNext(savedReceiveMessage);
+                // TODO: For reasons that are completely unclear to me, if you pass a handcrafted
+                // status object here, node will magically just exit at the end of this line.
+                // No warning, no nothing. Here be dragons.
+                next(status);
+              }
            };
 
-            if (retryOptions.retryableDecider(status)) {
-              setTimeout(() => retry(savedSendMessage, savedMetadata), backOffAmount(retries));
-            } else {
-              savedMessageNext(savedReceiveMessage);
-              next(status);
-            }
+            onReceiveStatus(status);
          })
          .build();
-        next(metadata, newListener);
+        next(metadata, listener);
      })
      .withSendMessage((message, next) => {
        savedSendMessage = message;
```
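
As a usage sketch (not part of this commit's diff): the new options surface can either tune the default backoff or replace the retry callbacks entirely. The `Connection.connect` / `interceptors` wiring mirrors the test file below; the custom decider and delay values here are hypothetical illustrations of the new `(attempt, status)` signatures.

```ts
import * as grpc from '@grpc/grpc-js';
import { Connection, defaultGrpcRetryOptions, makeGrpcRetryInterceptor } from '@temporalio/client';

// Tune the default exponential backoff: factor ** attempt * initialIntervalMs(status) * jitter(maxJitter)
export const tunedRetry = makeGrpcRetryInterceptor(
  defaultGrpcRetryOptions({ maxAttempts: 5, factor: 1.5, maxJitter: 0.2 })
);

// Or supply fully custom behavior through the new two-argument callbacks
export const customRetry = makeGrpcRetryInterceptor({
  // Retry at most 3 times, and only on RESOURCE_EXHAUSTED
  retryableDecider: (attempt, status) => attempt < 3 && status.code === grpc.status.RESOURCE_EXHAUSTED,
  // Constant 500 ms delay between attempts, regardless of status
  delayFunction: (_attempt, _status) => 500,
});

export async function connectWithRetries(): Promise<Connection> {
  // Address is a placeholder; interceptors are passed exactly as in the test below
  return await Connection.connect({
    address: 'localhost:7233',
    interceptors: [tunedRetry],
  });
}
```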

packages/test/src/test-client-connection.ts

Lines changed: 83 additions & 19 deletions
```diff
@@ -3,26 +3,30 @@ import util from 'util';
 import path from 'path';
 import * as grpc from '@grpc/grpc-js';
 import * as protoLoader from '@grpc/proto-loader';
-import { Connection } from '@temporalio/client';
+import { Connection, defaultGrpcRetryOptions, makeGrpcRetryInterceptor } from '@temporalio/client';
 import pkg from '@temporalio/client/lib/pkg';
 import { temporal, grpc as grpcProto } from '@temporalio/proto';
 
-test('withMetadata / withDeadline set the CallContext for RPC call', async (t) => {
-  const packageDefinition = protoLoader.loadSync(
-    path.resolve(
-      __dirname,
-      '../../core-bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto'
-    ),
-    { includeDirs: [path.resolve(__dirname, '../../core-bridge/sdk-core/protos/api_upstream')] }
-  );
-  const protoDescriptor = grpc.loadPackageDefinition(packageDefinition) as any;
+const workflowServicePackageDefinition = protoLoader.loadSync(
+  path.resolve(
+    __dirname,
+    '../../core-bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto'
+  ),
+  { includeDirs: [path.resolve(__dirname, '../../core-bridge/sdk-core/protos/api_upstream')] }
+);
+const workflowServiceProtoDescriptor = grpc.loadPackageDefinition(workflowServicePackageDefinition) as any;
 
+async function bindLocalhost(server: grpc.Server): Promise<number> {
+  return await util.promisify(server.bindAsync.bind(server))('127.0.0.1:0', grpc.ServerCredentials.createInsecure());
+}
+
+test('withMetadata / withDeadline set the CallContext for RPC call', async (t) => {
   const server = new grpc.Server();
   let gotTestHeaders = false;
   let gotDeadline = false;
   const deadline = Date.now() + 10000;
 
-  server.addService(protoDescriptor.temporal.api.workflowservice.v1.WorkflowService.service, {
+  server.addService(workflowServiceProtoDescriptor.temporal.api.workflowservice.v1.WorkflowService.service, {
     registerNamespace(
       call: grpc.ServerUnaryCall<
         temporal.api.workflowservice.v1.IRegisterNamespaceRequest,
@@ -52,10 +56,7 @@ test('withMetadata / withDeadline set the CallContext for RPC call', async (t) =
       callback(null, {});
     },
   });
-  const port = await util.promisify(server.bindAsync.bind(server))(
-    '127.0.0.1:0',
-    grpc.ServerCredentials.createInsecure()
-  );
+  const port = await bindLocalhost(server);
   server.start();
   const conn = await Connection.connect({ address: `127.0.0.1:${port}`, metadata: { staticKey: 'set' } });
   await conn.withMetadata({ test: 'true' }, () =>
@@ -88,12 +89,75 @@ test('healthService works', async (t) => {
       );
     },
   });
-  const port = await util.promisify(server.bindAsync.bind(server))(
-    '127.0.0.1:0',
-    grpc.ServerCredentials.createInsecure()
-  );
+  const port = await bindLocalhost(server);
   server.start();
   const conn = await Connection.connect({ address: `127.0.0.1:${port}` });
   const response = await conn.healthService.check({});
   t.is(response.status, grpcProto.health.v1.HealthCheckResponse.ServingStatus.SERVING);
 });
+
+test('grpc retry passes request and headers on retry, propagates responses', async (t) => {
+  let attempt = 0;
+  let successAttempt = 3;
+
+  const meta = Array<string>();
+  const namespaces = Array<string>();
+
+  const server = new grpc.Server();
+
+  server.addService(workflowServiceProtoDescriptor.temporal.api.workflowservice.v1.WorkflowService.service, {
+    describeWorkflowExecution(
+      call: grpc.ServerUnaryCall<
+        temporal.api.workflowservice.v1.IDescribeWorkflowExecutionRequest,
+        temporal.api.workflowservice.v1.IDescribeWorkflowExecutionResponse
+      >,
+      callback: grpc.sendUnaryData<temporal.api.workflowservice.v1.IRegisterNamespaceResponse>
+    ) {
+      const { namespace } = call.request;
+      if (typeof namespace === 'string') {
+        namespaces.push(namespace);
+      }
+      const [aValue] = call.metadata.get('a');
+      if (typeof aValue === 'string') {
+        meta.push(aValue);
+      }
+
+      attempt++;
+      if (attempt < successAttempt) {
+        callback({ code: grpc.status.UNKNOWN });
+        return;
+      }
+      const response: temporal.api.workflowservice.v1.IDescribeWorkflowExecutionResponse = {
+        workflowExecutionInfo: { execution: { workflowId: 'test' } },
+      };
+      callback(null, response);
+    },
+  });
+  const port = await bindLocalhost(server);
+  server.start();
+
+  // Default interceptor config with backoff factor of 1 to speed things up
+  const interceptor = makeGrpcRetryInterceptor(defaultGrpcRetryOptions({ factor: 1 }));
+  const conn = await Connection.connect({
+    address: `127.0.0.1:${port}`,
+    metadata: { a: 'bc' },
+    interceptors: [interceptor],
+  });
+  const response = await conn.workflowService.describeWorkflowExecution({ namespace: 'a' });
+  // Check that response is sent correctly
+  t.is(response.workflowExecutionInfo?.execution?.workflowId, 'test');
+  t.is(attempt, 3);
+  // Check that request is sent correctly in each attempt
+  t.deepEqual(namespaces, ['a', 'a', 'a']);
+  // Check that metadata is sent correctly in each attempt
+  t.deepEqual(meta, ['bc', 'bc', 'bc']);
+
+  // Reset and rerun expecting error in the response
+  attempt = 0;
+  successAttempt = 11; // never
+
+  await t.throwsAsync(() => conn.workflowService.describeWorkflowExecution({ namespace: 'a' }), {
+    message: '2 UNKNOWN: Unknown Error',
+  });
+  t.is(attempt, 10);
+});
```
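
The test above sets `factor: 1` so retries fire back-to-back; for reference, a jitter-free sketch of the delays the stock defaults (factor 2, 20 ms initial interval, 1000 ms for RESOURCE_EXHAUSTED) would produce:

```ts
// Illustrative arithmetic only - the real delayFunction also multiplies by jitter(0.1), i.e. a factor between 0.9 and 1.1.
const nominalDelayMs = (attempt: number, initialIntervalMs: number, factor = 2): number =>
  factor ** attempt * initialIntervalMs;

console.log([1, 2, 3].map((attempt) => nominalDelayMs(attempt, 20))); // [40, 80, 160] ms for most retryable codes
console.log([1, 2, 3].map((attempt) => nominalDelayMs(attempt, 1000))); // [2000, 4000, 8000] ms for RESOURCE_EXHAUSTED
```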
