This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 0bf63df

Cleanup batchmatmul benchmark
This commit refactors the batchmatmul benchmark, saves the best options found by the tuner, and sets up the Caffe2/ATen/CUBLAS/CUDNN baselines for future reproducibility.
1 parent 93f7cb4 commit 0bf63df
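As a reading aid for the diff below: the refactoring moves the tuned tc::CudaMappingOptions values into tc/benchmarks/batchmatmul.h, and each TEST_F body now just sets the problem sizes with Init(B, N, M, K) and dispatches to the TC, ATen, or Caffe2 path. A minimal sketch of how a further configuration would be added under this scheme follows; the GPU name, sizes, and options constant are hypothetical and not part of this commit.

// Hypothetical extension following the pattern this commit introduces
// (illustrative names and sizes only; this snippet would live in
// batchmatmul.cc, with the options constant saved in batchmatmul.h, e.g.
//   auto options_TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64 =
//       tc::CudaMappingOptions::makeNaiveMappingOptions()
//           .tile(1)
//           .mapToThreads(128)
//           .mapToBlocks(32)
//           .useSharedMemory(true);
// ).
TEST_F(BatchMatMul, TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64) {
  Init(32, 64, 128, 64); // B, N, M, K
  runBatchMatMul(
      tc::options_TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64);
}

// A baseline on the same sizes reuses the new helpers instead of tuned options.
TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_MyGPU_B_32_K_64_M_128_N_64) {
  Init(32, 64, 128, 64);
  runATenBatchMatMul();
}

Assuming the usual gflags mapping of FLAGS_x to --x, running the generic TransposedBatchMatMul test with --autotune re-tunes and writes the cache/best protos under --save_tuner_proto_prefix, which is where the saved options in batchmatmul.h come from.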

File tree

2 files changed (+133, −85 lines)


tc/benchmarks/batchmatmul.cc

Lines changed: 73 additions & 85 deletions
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "batchmatmul.h"
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -43,23 +45,22 @@ DEFINE_uint32(M, 72, "M dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 DEFINE_uint32(K, 26, "K dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 
 class BatchMatMul : public Benchmark {
+ protected:
+  uint32_t B, N, M, K;
+
  public:
-  void runBatchMatMul(
-      uint32_t B,
-      uint32_t N,
-      uint32_t M,
-      uint32_t K,
-      const tc::CudaMappingOptions& options,
-      bool use_flags = false);
+  void Init(uint32_t b, uint32_t n, uint32_t m, uint32_t k) {
+    B = b;
+    N = n;
+    M = m;
+    K = k;
+  }
+  void runBatchMatMul(const tc::CudaMappingOptions& options);
+  void runCaffe2BatchMatMul();
+  void runATenBatchMatMul();
 };
 
-void BatchMatMul::runBatchMatMul(
-    uint32_t B,
-    uint32_t N,
-    uint32_t M,
-    uint32_t K,
-    const tc::CudaMappingOptions& options,
-    bool use_flags) {
+void BatchMatMul::runBatchMatMul(const tc::CudaMappingOptions& options) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
 
@@ -85,96 +86,83 @@ def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
   std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
       std::string("_K_") + std::to_string(FLAGS_K) + std::string("_M_") +
       std::to_string(FLAGS_M) + std::string("_N_") + std::to_string(FLAGS_N);
-  if (use_flags && FLAGS_validate_proto) {
-    validateProto(
+  std::vector<tc::CudaMappingOptions> bestOptions{options};
+  if (FLAGS_autotune) {
+    bestOptions = autotune(
         FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
             suffix,
+        FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
+            suffix,
         tc,
         "batch_matmul",
         inputs,
+        options,
         check_fun);
-  } else {
-    Check(tc, "batch_matmul", options, inputs, check_fun);
-    if (use_flags) {
-      autotune(
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
-              suffix,
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
-              suffix,
-          tc,
-          "batch_matmul",
-          inputs,
-          options,
-          check_fun);
-    }
   }
+  Check(tc, "batch_matmul", bestOptions[0], inputs, check_fun);
 }
 
-TEST_F(BatchMatMul, TransposedBatchMatMul) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .tile(1)
-                     .mapToThreads({128})
-                     .mapToBlocks({B})
-                     .useSharedMemory(true)
-                     .usePrivateMemory(true)
-                     .unroll(256);
-  runBatchMatMul(B, N, M, K, options, true);
-}
-
-TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
-  uint32_t B = 500;
-  uint32_t K = 26;
-  uint32_t M = 72;
-  uint32_t N = 26;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
-                     .outerScheduleAllowSkewing(false)
-                     .outerSchedulePositiveOrthant(true)
-                     .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
-                     .intraTileScheduleAllowSkewing(false)
-                     .intraTileSchedulePositiveOrthant(true)
-                     .tile(3)
-                     .mapToThreads(4, 36, 3)
-                     .mapToBlocks(512)
-                     .unroll(64)
-                     .tileImperfectlyNested(false)
-                     .useSharedMemory(true)
-                     .usePrivateMemory(false)
-                     .unrollCopyShared(true)
-                     .matchLibraryCalls(true);
-  runBatchMatMul(B, N, M, K, options);
+void BatchMatMul::runCaffe2BatchMatMul() {
+  Workspace w_ref;
+  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
+  AddInput(w_ref, {B, N, M}, "X");
+  AddInput(w_ref, {B, M, K}, "Y");
+  OperatorDef ref_def =
+      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
+  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
+  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
 }
 
-TEST_F(BatchMatMul, ATenTransposedBatchMatMulReference) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
+void BatchMatMul::runATenBatchMatMul() {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
   Reference(
       [&]() { return bmm(X, Y); },
       [&](at::Tensor& res) { bmm_out(res, X, Y); });
 }
 
-TEST_F(BatchMatMul, C2TransposedBatchMatMulReference) {
-  int B = FLAGS_B;
-  int N = FLAGS_N;
-  int M = FLAGS_M;
-  int K = FLAGS_K;
+// Generic
+TEST_F(BatchMatMul, TransposedBatchMatMul) {
+  Init(FLAGS_B, FLAGS_N, FLAGS_M, FLAGS_K);
+  runBatchMatMul(tc::CudaMappingOptions::makeNaiveMappingOptions());
+}
 
-  Workspace w_ref;
-  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
-  AddInput(w_ref, {B, N, M}, "X");
-  AddInput(w_ref, {B, M, K}, "Y");
-  OperatorDef ref_def =
-      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
-  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
-  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
+// P100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// P100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// P100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
+}
+
+// V100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// V100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// V100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
 }
 
 int main(int argc, char** argv) {

tc/benchmarks/batchmatmul.h

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "tc/aten/aten.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
+
+namespace tc {
+auto options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26 =
+    tc::CudaMappingOptions::makeNaiveMappingOptions()
+        .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
+        .outerScheduleAllowSkewing(false)
+        .outerSchedulePositiveOrthant(true)
+        .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
+        .intraTileScheduleAllowSkewing(false)
+        .intraTileSchedulePositiveOrthant(true)
+        .tile(3)
+        .mapToThreads(4, 36, 3)
+        .mapToBlocks(512)
+        .unroll(64)
+        .tileImperfectlyNested(false)
+        .useSharedMemory(true)
+        .usePrivateMemory(false)
+        .unrollCopyShared(true)
+        .matchLibraryCalls(true);
+
+auto options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26 =
+    tc::CudaMappingOptions::makeNaiveMappingOptions()
+        .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
+        .outerScheduleAllowSkewing(false)
+        .outerSchedulePositiveOrthant(true)
+        .intraTileScheduleFusionStrategy(
+            tc::FusionStrategy::Preserve3Coincident)
+        .intraTileScheduleAllowSkewing(false)
+        .intraTileSchedulePositiveOrthant(true)
+        .fixParametersBeforeScheduling(true)
+        .tile(1, 1, 32, 32)
+        .unroll(1)
+        .tileImperfectlyNested(false)
+        .matchLibraryCalls(false)
+        .mapToThreads(72)
+        .mapToBlocks(500)
+        .useSharedMemory(true)
+        .usePrivateMemory(true)
+        .unrollCopyShared(false)
+        .useReadOnlyCache(false);
+} // namespace tc
