Commit b9d2e24

webgpu: Reduce binary ops shader variants
PERF This PR reduces warmup time by reducing the number of binary-op shader variants. It may slightly hurt inference time, but it greatly improves warmup: bodypix-mobilenet's first pass is ~300 ms faster on CFL.
1 parent 69858e1 commit b9d2e24
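
The win comes from shader-key cardinality: before this change, BinaryOpSharedProgram baked the 1-D operand's length into its shader key (and into the workgroup buffer size), so every distinct length compiled a separate WGSL pipeline during warmup; afterwards the key varies only with the op and two booleans. A minimal sketch of the difference, using the key formats from the diff below (the helper functions here are illustrative, not part of tfjs):

// Illustrative only: key formats taken from binary_op_shared_webgpu.ts below.
type BinaryOpType = string;  // stand-in for the real enum

// Old key: one compiled pipeline per (op, operand length, side).
const oldKey = (op: BinaryOpType, lastDimensionSize: number,
                useSharedMemoryWithB: boolean) =>
    `binaryShared_${op}_${lastDimensionSize}_${useSharedMemoryWithB}`;

// New key: at most four pipelines per op, regardless of operand length.
const newKey = (op: BinaryOpType, useSharedMemoryWithB: boolean,
                isScater: boolean) =>
    `binaryShared_${op}_${useSharedMemoryWithB}_${isScater}`;

// Operand lengths 256 and 257 used to miss the pipeline cache...
console.log(oldKey('PRELU', 256, true) !== oldKey('PRELU', 257, true));  // true
// ...but now map onto the same cached variant.
console.log(newKey('PRELU', true, false) === newKey('PRELU', true, false));  // true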

File tree

5 files changed: 35 additions, 45 deletions

tfjs-backend-webgpu/src/kernels/Prelu.ts

Lines changed: 2 additions & 2 deletions
@@ -20,14 +20,14 @@ import {KernelConfig, KernelFunc, Prelu, PreluInputs, TensorInfo} from '@tensorf
 import {WebGPUBackend} from '../backend_webgpu';
 
 import {BinaryOpType} from './binary_op_util';
-import {BinaryOpProgram} from './binary_op_webgpu';
+import {getBinaryProgram} from './binary_ops';
 
 export function prelu(args: {inputs: PreluInputs, backend: WebGPUBackend}):
     TensorInfo {
   const {inputs, backend} = args;
   const {x, alpha} = inputs;
 
-  const program = new BinaryOpProgram(BinaryOpType.PRELU, x.shape, alpha.shape);
+  const program = getBinaryProgram(BinaryOpType.PRELU, x.shape, alpha.shape);
   return backend.runWebGPUProgram(program, [x, alpha], 'float32');
 }

tfjs-backend-webgpu/src/kernels/binary_op_shared_webgpu.ts

Lines changed: 18 additions & 29 deletions
@@ -15,8 +15,6 @@
  * =============================================================================
  */
 
-import {backend_util} from '@tensorflow/tfjs-core';
-
 import {getMainHeaderAndGlobalIndexString} from '../shader_preprocessor';
 import {computeDispatch, flatDispatchLayout} from '../webgpu_util';
 import {BinaryOpType, getBinaryOpString} from './binary_op_util';
@@ -29,66 +27,57 @@ export class BinaryOpSharedProgram implements WebGPUProgram {
   dispatchLayout: {x: number[]};
   dispatch: [number, number, number];
   variableNames = ['A', 'B'];
-  workPerThread: number;
+  workPerThread = 4;
   workGroupSize: [number, number, number];
   useSharedMemoryWithB: boolean;
-  lastDimensionSize: number;
+  isScater: boolean;
   op: BinaryOpType;
   size = true;
 
   constructor(
-      op: BinaryOpType, aShape: number[], bShape: number[],
-      useSharedMemoryWithB: boolean) {
+      op: BinaryOpType, outputShape: number[], useSharedMemoryWithB: boolean,
+      isScater: boolean) {
     // This is an experimental value when using shared memory.
     // Note that the maximum of workgroup X dimension is 256.
     const workGroupSizeX = 256;
     this.workGroupSize = [workGroupSizeX, 1, 1];
-    this.outputShape = backend_util.assertAndGetBroadcastShape(aShape, bShape);
+    this.outputShape = outputShape;
     this.dispatchLayout = flatDispatchLayout(this.outputShape);
-    this.lastDimensionSize = useSharedMemoryWithB ? bShape[0] : aShape[0];
-    if (this.lastDimensionSize < 256) {
-      this.workPerThread = 1;
-    } else if (this.lastDimensionSize < 512) {
-      this.workPerThread = 2;
-    } else {
-      this.workPerThread = 4;
-    }
+    this.isScater = isScater;
     this.dispatch = computeDispatch(
         this.dispatchLayout, this.outputShape, this.workGroupSize,
         [this.workPerThread, 1, 1]);
 
     this.useSharedMemoryWithB = useSharedMemoryWithB;
     this.op = op;
-    // this.lastDimensionSize is used as sharedBuf array size, so can not be
-    // used as uniform.
-    this.shaderKey = `binaryShared_${op}_${this.lastDimensionSize}_${
-        this.useSharedMemoryWithB}`;
+    this.shaderKey =
+        `binaryShared_${op}_${this.useSharedMemoryWithB}_${isScater}`;
   }
 
   getUserCode(): string {
-    const sharedIndexSnippet = this.lastDimensionSize > 1 ?
-        `coords[${this.outputShape.length - 1}]` :
-        '0';
+    const sharedIndexSnippet =
+        this.isScater ? '0' : `coords[${this.outputShape.length - 1}]`;
     const accessDataSnippet = this.useSharedMemoryWithB ?
-        `let a = getAAtOutCoordsByCoords(coords);
+        `let a = getAAtOutCoordsByGlobalIndex(flatIndex);
         let b = sharedBuf[${sharedIndexSnippet}];` :
         `let a = sharedBuf[${sharedIndexSnippet}];
-        let b = getBAtOutCoordsByCoords(coords);`;
+        let b = getBAtOutCoordsByGlobalIndex(flatIndex);`;
 
-    const opStr = getBinaryOpString(this.op, false);
     const userCode = `
       fn binaryOperation(a : f32, b : f32) -> f32 {
-        ${opStr}
+        ${getBinaryOpString(this.op, false)}
      }
-      var<workgroup> sharedBuf : array<f32, ${this.lastDimensionSize}>;
+
+      var<workgroup> sharedBuf : array<f32, ${
+        this.workGroupSize[0] * this.workPerThread}>;
      ${getMainHeaderAndGlobalIndexString()}

        // Fill in the shared memory buffer. Here we need a loop to make sure
        // that all data in A|B are uploaded when |sharedMemorySize| is larger
        // than work group size.
        for(var localIndex = i32(localId.x); localIndex < ${
-          this.lastDimensionSize}; localIndex = localIndex + ${
-          this.workGroupSize[0]}) {
+          this.useSharedMemoryWithB ? 'uniforms.bShape' : 'uniforms.aShape'};
+          localIndex = localIndex + ${this.workGroupSize[0]}) {
          sharedBuf[localIndex] = f32(${
            this.useSharedMemoryWithB ? 'B' : 'A'}.numbers[localIndex]);
        }
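
A note on the new sizing, with the arithmetic spelled out from the values above: sharedBuf is now a compile-time constant of workGroupSize[0] * workPerThread = 256 * 4 = 1024 f32 elements instead of lastDimensionSize, and the fill loop's upper bound moved into a uniform, so the shader text no longer depends on the operand's length. The dispatcher's new < 512 eligibility threshold (see binary_ops.ts below) appears to be what keeps the shared operand inside that fixed buffer. A minimal sketch of the bound, mirroring (not replacing) the constructor above:

// Values copied from the constructor above; illustrative only.
const workGroupSizeX = 256;                            // workGroupSize[0]
const workPerThread = 4;                               // now a constant
const sharedBufSize = workGroupSizeX * workPerThread;  // 1024 f32 slots

// binary_ops.ts only routes 1-D operands with length < 512 to this program,
// so any eligible operand fits; the element count actually read at runtime
// comes from uniforms.aShape/uniforms.bShape, not from the shader source.
console.assert(511 < sharedBufSize);  // holds for every eligible shape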

tfjs-backend-webgpu/src/kernels/binary_op_vec4_webgpu.ts

Lines changed: 2 additions & 4 deletions
@@ -15,7 +15,6 @@
  * =============================================================================
  */
 
-import {backend_util} from '@tensorflow/tfjs-core';
 import {getMainHeaderAndGlobalIndexString} from '../shader_preprocessor';
 import {computeDispatch, flatDispatchLayout} from '../webgpu_util';
 import {BinaryOpType, getBinaryOpString} from './binary_op_util';
@@ -33,13 +32,12 @@ export class BinaryOpVec4Program implements WebGPUProgram {
   isVec4 = true;
   op: BinaryOpType;
   size = true;
-  fitShape: boolean;
 
-  constructor(op: BinaryOpType, aShape: number[], bShape: number[]) {
+  constructor(op: BinaryOpType, outputShape: number[]) {
     // TODO(jiajia.qin@intel.com): Heuristically select a good work group size.
     const workGroupSizeX = 128;
     this.workGroupSize = [workGroupSizeX, 1, 1];
-    this.outputShape = backend_util.assertAndGetBroadcastShape(aShape, bShape);
+    this.outputShape = outputShape;
     this.dispatchLayout = flatDispatchLayout(this.outputShape);
     this.dispatch = computeDispatch(
         this.dispatchLayout, this.outputShape, this.workGroupSize,

tfjs-backend-webgpu/src/kernels/binary_op_webgpu.ts

Lines changed: 2 additions & 3 deletions
@@ -15,7 +15,6 @@
  * =============================================================================
  */
 
-import {backend_util} from '@tensorflow/tfjs-core';
 import {getMainHeaderAndGlobalIndexString} from '../shader_preprocessor';
 import {computeDispatch, flatDispatchLayout} from '../webgpu_util';
 import {BinaryOpType, getBinaryOpString} from './binary_op_util';
@@ -32,11 +31,11 @@ export class BinaryOpProgram implements WebGPUProgram {
   op: BinaryOpType;
   size = true;
 
-  constructor(op: BinaryOpType, aShape: number[], bShape: number[]) {
+  constructor(op: BinaryOpType, outputShape: number[]) {
     // TODO(jiajia.qin@intel.com): Heuristically select a good work group size.
     const workGroupSizeX = 128;
     this.workGroupSize = [workGroupSizeX, 1, 1];
-    this.outputShape = backend_util.assertAndGetBroadcastShape(aShape, bShape);
+    this.outputShape = outputShape;
     this.dispatchLayout = flatDispatchLayout(this.outputShape);
 
     this.dispatch = computeDispatch(

tfjs-backend-webgpu/src/kernels/binary_ops.ts

Lines changed: 11 additions & 7 deletions
@@ -15,26 +15,30 @@
  * =============================================================================
  */
 
-import {util} from '@tensorflow/tfjs-core';
+import {backend_util, util} from '@tensorflow/tfjs-core';
+
 import {BinaryOpSharedProgram} from './binary_op_shared_webgpu';
+import {BinaryOpType} from './binary_op_util';
 import {BinaryOpVec4Program} from './binary_op_vec4_webgpu';
 import {BinaryOpProgram} from './binary_op_webgpu';
-import {BinaryOpType} from './binary_op_util';
 
 export function getBinaryProgram(
     op: BinaryOpType, aShape: number[], bShape: number[]) {
+  const outputShape = backend_util.assertAndGetBroadcastShape(aShape, bShape);
   const useVec4 =
       util.arraysEqual(aShape, bShape) && util.sizeFromShape(aShape) % 4 === 0;
   if (useVec4) {
-    return new BinaryOpVec4Program(op, aShape, bShape);
+    return new BinaryOpVec4Program(op, outputShape);
   }
   const useSharedMemoryWithA =
-      aShape.length === 1 && bShape.length > 1 && aShape[0] < 1024;
+      aShape.length === 1 && bShape.length > 1 && aShape[0] < 512;
   const useSharedMemoryWithB =
-      bShape.length === 1 && aShape.length > 1 && bShape[0] < 1024;
+      bShape.length === 1 && aShape.length > 1 && bShape[0] < 512;
   if (useSharedMemoryWithA || useSharedMemoryWithB) {
-    return new BinaryOpSharedProgram(op, aShape, bShape, useSharedMemoryWithB);
+    const isScater = useSharedMemoryWithB ? bShape[0] === 1 : aShape[0] === 1;
+    return new BinaryOpSharedProgram(
+        op, outputShape, useSharedMemoryWithB, isScater);
   } else {
-    return new BinaryOpProgram(op, aShape, bShape);
+    return new BinaryOpProgram(op, outputShape);
   }
 }
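
To summarize the routing, a hedged sketch of which program each shape pair now selects (shapes are illustrative; BinaryOpType.PRELU is the member exercised by Prelu.ts above):

import {BinaryOpType} from './binary_op_util';
import {getBinaryProgram} from './binary_ops';

// Equal shapes whose size is a multiple of 4 -> BinaryOpVec4Program.
getBinaryProgram(BinaryOpType.PRELU, [2, 4], [2, 4]);

// 1-D operand shorter than 512 against a higher-rank tensor
// -> BinaryOpSharedProgram with isScater = false (bShape[0] > 1).
getBinaryProgram(BinaryOpType.PRELU, [8, 256], [256]);

// Length-1 operand -> the same shared program with isScater = true,
// so every scalar-broadcast case reuses one compiled pipeline.
getBinaryProgram(BinaryOpType.PRELU, [8, 256], [1]);

// Anything else (here: equal shapes, but size 6 is not divisible by 4)
// falls through to the plain BinaryOpProgram.
getBinaryProgram(BinaryOpType.PRELU, [2, 3], [2, 3]);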

0 commit comments
