Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit b346d01

Browse files
Add wavenet benchmark
This commit adds a single-layer WaveNet benchmark following the description in the paper https://arxiv.org/pdf/1609.03499.pdf. Its correctness has been tested against a PyTorch baseline written by @Artix18.
1 parent 5266f73 commit b346d01

File tree

3 files changed

+341
-0
lines changed

3 files changed

+341
-0
lines changed

tc/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ set(BENCHMARKS
2222
moments
2323
tmm
2424
MLP_model
25+
wavenet
2526
)
2627
foreach(i ${BENCHMARKS})
2728
add_executable(benchmark_${i} ${i}.cc)

tc/benchmarks/wavenet.cc

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "wavenet.h"
17+
18+
#include <iostream>
19+
#include <string>
20+
#include <vector>
21+
22+
#include <gflags/gflags.h>
23+
#include <glog/logging.h>
24+
#include <gtest/gtest.h>
25+
26+
#include "tc/aten/aten.h"
27+
28+
#include "tc/aten/aten_compiler.h"
29+
#include "tc/core/cuda/cuda_mapping_options.h"
30+
31+
#include "../test/caffe2/cuda/test_harness.h"
32+
#include "../test/caffe2/test_harness.h"
33+
#include "../test/test_harness_aten_cuda.h"
34+
#include "benchmark_fixture.h"
35+
36+
#include "tc/c2/context.h"
37+
#include "tc/core/cuda/cuda.h"
38+
#include "tc/core/flags.h"
39+
40+
using namespace caffe2;
41+
42+
DEFINE_uint32(B, 1, "Batch size");
43+
DEFINE_uint32(
44+
RESIDUAL_C,
45+
32,
46+
"Residual channels (i.e. WaveNet block input channels)");
47+
DEFINE_uint32(
48+
DILATION_C,
49+
32,
50+
"Dilation channels (i.e. WaveNet block channels after dilated convolution)");
51+
DEFINE_uint32(
52+
SKIP_C,
53+
32,
54+
"Skip channels (i.e. WaveNet block channels in the skip tensor)");
55+
DEFINE_uint32(
56+
RECEPTIVE_FIELD,
57+
4000,
58+
"https://arxiv.org/pdf/1609.03499.pdf paper mentions 16K samples per second"
59+
"and a receptive field of 240ms so we approx. set the default to 4000)");
60+
DEFINE_uint32(DILATION_FACTOR, 1, "Powers of 2 from 1 to 512 in the paper");
61+
62+
// https://arxiv.org/pdf/1609.03499.pdf paper mentions 16K samples per second
63+
// and a receptive field of 240ms so about 4K RECEPTIVE_FIELD
64+
class WaveNet : public Benchmark {
65+
protected:
66+
uint32_t B;
67+
uint32_t RESIDUAL_C;
68+
uint32_t DILATION_C;
69+
uint32_t SKIP_C;
70+
uint32_t RECEPTIVE_FIELD;
71+
uint32_t DILATION_FACTOR; // 2^layer where layer in 0:10
72+
73+
public:
74+
void Init(
75+
uint32_t b,
76+
uint32_t residual_c,
77+
uint32_t dilation_c,
78+
uint32_t skip_c,
79+
uint32_t receptive_field,
80+
uint32_t dilation_factor) {
81+
B = b;
82+
RESIDUAL_C = residual_c;
83+
DILATION_C = dilation_c;
84+
SKIP_C = skip_c;
85+
RECEPTIVE_FIELD = receptive_field;
86+
DILATION_FACTOR = dilation_factor;
87+
}
88+
void runWaveNet1(const tc::CudaMappingOptions& options);
89+
};
90+
91+
void WaveNet::runWaveNet1(const tc::CudaMappingOptions& options) {
  // Random CUDA inputs matching the wavenet1 TC signature. The order of the
  // rand() calls is significant (it consumes the global ATen RNG stream) and
  // mirrors the tensor order expected by the TC definition.
  at::Tensor data = at::CUDA(at::kFloat).rand({B, RESIDUAL_C, RECEPTIVE_FIELD});
  at::Tensor filterWeight =
      at::CUDA(at::kFloat).rand({DILATION_C, RESIDUAL_C, 2});
  at::Tensor filterBias = at::CUDA(at::kFloat).rand({DILATION_C});
  at::Tensor gateWeight =
      at::CUDA(at::kFloat).rand({DILATION_C, RESIDUAL_C, 2});
  at::Tensor gateBias = at::CUDA(at::kFloat).rand({DILATION_C});
  at::Tensor resWeight = at::CUDA(at::kFloat).rand({RESIDUAL_C, DILATION_C});
  at::Tensor resBias = at::CUDA(at::kFloat).rand({RESIDUAL_C});
  at::Tensor skipWeight = at::CUDA(at::kFloat).rand({SKIP_C, DILATION_C});
  at::Tensor skipBias = at::CUDA(at::kFloat).rand({SKIP_C});
  at::Tensor dilation = at::CUDA(at::kFloat).rand({DILATION_FACTOR});

  std::vector<at::Tensor> inputs;
  inputs.push_back(data);
  inputs.push_back(filterWeight);
  inputs.push_back(filterBias);
  inputs.push_back(gateWeight);
  inputs.push_back(gateBias);
  inputs.push_back(resWeight);
  inputs.push_back(resBias);
  inputs.push_back(skipWeight);
  inputs.push_back(skipBias);
  inputs.push_back(dilation);

  // Start from the caller-provided options; when autotuning is requested,
  // replace them with the best configuration found by the tuner.
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix + std::string("/wavenet_1_cache"),
        FLAGS_save_tuner_proto_prefix + std::string("/wavenet_1_best"),
        tc::TC_WAVENET,
        tc::TC_WAVENET1_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }
  Check(tc::TC_WAVENET, tc::TC_WAVENET1_NAME, bestOptions[0], inputs);
}
129+
130+
/// WaveNet 1 block
131+
// Generic
132+
TEST_F(WaveNet, WaveNet1) {
133+
Init(
134+
FLAGS_B,
135+
FLAGS_RESIDUAL_C,
136+
FLAGS_DILATION_C,
137+
FLAGS_SKIP_C,
138+
FLAGS_RECEPTIVE_FIELD,
139+
FLAGS_DILATION_FACTOR);
140+
runWaveNet1(tc::CudaMappingOptions::makeNaiveMappingOptions());
141+
}
142+
143+
// P100
144+
TEST_F(
145+
WaveNet,
146+
WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1) {
147+
Init(1, 32, 32, 256, 4000, 1);
148+
runWaveNet1(
149+
tc::options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1);
150+
}
151+
152+
// Same P100 configuration with dilation factor 32.
TEST_F(
    WaveNet,
    WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32) {
  Init(1, 32, 32, 256, 4000, 32);
  runWaveNet1(
      tc::options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32);
}
159+
160+
// V100
161+
TEST_F(
162+
WaveNet,
163+
WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1) {
164+
Init(1, 32, 32, 256, 4000, 1);
165+
runWaveNet1(
166+
tc::options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1);
167+
}
168+
169+
// V100 configuration with dilation factor 32.
TEST_F(
    WaveNet,
    WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32) {
  Init(1, 32, 32, 256, 4000, 32);
  runWaveNet1(
      tc::options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32);
}
176+
177+
// Benchmark entry point. Initialization order matters: gtest must consume
// its own flags first, then gflags parses the benchmark flags, then glog is
// initialized before any logging happens.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  // Seed ATen's CUDA RNG so input tensors are reproducible for a given seed.
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}

tc/benchmarks/wavenet.h

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include "tc/aten/aten.h"
19+
#include "tc/core/cuda/cuda_mapping_options.h"
20+
21+
namespace tc {
22+
constexpr static auto TC_WAVENET1_NAME = "wavenet1";
23+
constexpr static auto TC_WAVENET = R"TC(
24+
# Original data is float(B, C, RECEPTIVE_FIELD) and undergoes a \
25+
# Conv1d to become float(B, RESIDUAL_C, RECEPTIVE_FIELD)
26+
27+
def wavenet1(
28+
float(B, RESIDUAL_C, RECEPTIVE_FIELD) Data,
29+
float(DILATION_C, RESIDUAL_C, 2) FilterWeight,
30+
float(DILATION_C) FilterBias,
31+
float(DILATION_C, RESIDUAL_C, 2) GateWeight,
32+
float(DILATION_C) GateBias,
33+
float(RESIDUAL_C, DILATION_C) ResWeight,
34+
float(RESIDUAL_C) ResBias,
35+
float(SKIP_C, DILATION_C) SkipWeight,
36+
float(SKIP_C) SkipBias,
37+
float(DILATION_FACTOR) Dilation)
38+
-> (FilterOut, GateOut, NonLin, Res, Skip)
39+
{
40+
FilterOut(b, dilation_c, rf) = FilterBias(dilation_c)
41+
where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD
42+
FilterOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * FilterWeight(dilation_c, r_residual_c, 1) +
43+
(
44+
(rf - DILATION_FACTOR >= 0) ?
45+
Data(b, r_residual_c, rf - DILATION_FACTOR) * FilterWeight(dilation_c, r_residual_c, 0) :
46+
float(0)
47+
)
48+
where rf in 0:RECEPTIVE_FIELD
49+
50+
GateOut(b, dilation_c, rf) = GateBias(dilation_c)
51+
where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD
52+
GateOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * GateWeight(dilation_c, r_residual_c, 1) +
53+
(
54+
(rf - DILATION_FACTOR >= 0) ?
55+
Data(b, r_residual_c, rf - DILATION_FACTOR) * GateWeight(dilation_c, r_residual_c, 0) :
56+
float(0)
57+
)
58+
where rf in 0:RECEPTIVE_FIELD
59+
60+
NonLin(b, dilation_c, rf) = tanh(FilterOut(b, dilation_c, rf))
61+
where rf in 0:RECEPTIVE_FIELD
62+
NonLin(b, dilation_c, rf) *= 1 / (1 + exp( -GateOut(b, dilation_c, rf)))
63+
where rf in 0:RECEPTIVE_FIELD
64+
65+
Res(b, residual_c, rf) = Data(b, residual_c, rf) + ResBias(residual_c)
66+
Res(b, residual_c, rf) += NonLin(b, r_dilation_c, rf) * ResWeight(residual_c, r_dilation_c)
67+
68+
Skip(b, skip, rf) +=! NonLin(b, r_dilation_c, rf) * SkipWeight(skip, r_dilation_c)
69+
where rf in 0:RECEPTIVE_FIELD
70+
Skip(b, skip, rf) = Skip(b, skip, rf) + SkipBias(skip)
71+
where rf in 0:RECEPTIVE_FIELD
72+
}
73+
)TC";
74+
75+
auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 =
76+
tc::CudaMappingOptions::makeNaiveMappingOptions()
77+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
78+
.outerScheduleAllowSkewing(false)
79+
.outerSchedulePositiveOrthant(true)
80+
.intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
81+
.intraTileScheduleAllowSkewing(false)
82+
.intraTileSchedulePositiveOrthant(true)
83+
.fixParametersBeforeScheduling(true)
84+
.tile(63)
85+
.unroll(32)
86+
.tileImperfectlyNested(false)
87+
.matchLibraryCalls(false)
88+
.mapToThreads(32, 4, 1)
89+
.mapToBlocks(256, 4, 63)
90+
.useSharedMemory(true)
91+
.usePrivateMemory(true)
92+
.unrollCopyShared(false)
93+
.useReadOnlyCache(false);
94+
95+
auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 =
96+
tc::CudaMappingOptions::makeNaiveMappingOptions()
97+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
98+
.outerScheduleAllowSkewing(false)
99+
.outerSchedulePositiveOrthant(true)
100+
.intraTileScheduleFusionStrategy(
101+
tc::FusionStrategy::Preserve3Coincident)
102+
.intraTileScheduleAllowSkewing(false)
103+
.intraTileSchedulePositiveOrthant(true)
104+
.fixParametersBeforeScheduling(true)
105+
.tile(128, 4096, 1000, 64)
106+
.unroll(1)
107+
.tileImperfectlyNested(false)
108+
.matchLibraryCalls(true)
109+
.mapToThreads(128)
110+
.mapToBlocks(63)
111+
.useSharedMemory(true)
112+
.usePrivateMemory(true)
113+
.unrollCopyShared(false)
114+
.useReadOnlyCache(false);
115+
116+
auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 =
117+
tc::CudaMappingOptions::makeNaiveMappingOptions()
118+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
119+
.outerScheduleAllowSkewing(false)
120+
.outerSchedulePositiveOrthant(true)
121+
.intraTileScheduleFusionStrategy(
122+
tc::FusionStrategy::Preserve3Coincident)
123+
.intraTileScheduleAllowSkewing(false)
124+
.intraTileSchedulePositiveOrthant(true)
125+
.fixParametersBeforeScheduling(false)
126+
.tile(1000, 128, 500)
127+
.unroll(2)
128+
.tileImperfectlyNested(false)
129+
.matchLibraryCalls(false)
130+
.mapToThreads(256)
131+
.mapToBlocks(4000, 128)
132+
.useSharedMemory(true)
133+
.usePrivateMemory(true)
134+
.unrollCopyShared(true)
135+
.useReadOnlyCache(false);
136+
137+
auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 =
138+
tc::CudaMappingOptions::makeNaiveMappingOptions()
139+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
140+
.outerScheduleAllowSkewing(false)
141+
.outerSchedulePositiveOrthant(true)
142+
.intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
143+
.intraTileScheduleAllowSkewing(false)
144+
.intraTileSchedulePositiveOrthant(true)
145+
.fixParametersBeforeScheduling(true)
146+
.tile(8, 125, 512, 500)
147+
.unroll(32)
148+
.tileImperfectlyNested(false)
149+
.matchLibraryCalls(false)
150+
.mapToThreads(16, 16)
151+
.mapToBlocks(4000, 2048, 4096)
152+
.useSharedMemory(true)
153+
.usePrivateMemory(true)
154+
.unrollCopyShared(true)
155+
.useReadOnlyCache(false);
156+
157+
} // namespace tc

0 commit comments

Comments
 (0)