This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit e7cd425

Merge pull request #228 from nicolasvasilache/pr/example_blockdiagperm
Add a blockdiagperm example for @pierrestock
2 parents 1687f6f + 5194ea4 commit e7cd425

6 files changed: +161 -34 lines changed


examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ set(GTEST_LIBS gtest gtest_main)
 ################################################################################
 set(EXAMPLES_FILES
   tensordot
+  blockdiagperm
 )
 foreach(i ${EXAMPLES_FILES})
   add_executable(${i} ${i}.cc)

examples/blockdiagperm.cc

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"
#include "tc/core/mapping_options.h"

DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");

TEST(BlockDiagPerm, SimpleAutotune) {
  // 1. Define and setup the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
  std::string tc = R"TC(
# The following TCs (blockdiagperm2d and blockdiagperm2dinlined) illustrate
# how we would likely want to write blockdiagperm to synthesize a single
# kernel. However, both versions currently fail to emit a good single CUDA
# kernel:
# 1. blockdiagperm2d requires additional information to relax dependencies
#    and allow fusion;
# 2. blockdiagperm2dinlined requires general LHS indexing.
# A third version, blockdiagperm2dfissioned_1/2, works around this by using
# 2 independent TCs.

# This TC probably requires extra information to perform fusion, which we do
# not know how to propagate at this point:
# def blockdiagperm2d(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1, O2) {
#   O1(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
#   O2(b, k, nbyk) = O1(b, IdxR(k, nbyk), IdxC(k, nbyk))
# }

# This TC requires LHS indexing, which is a WIP, plus extra information that
# all accesses are parallel (i.e. (IdxR, IdxC) form a permutation):
# def blockdiagperm2dinlined(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1) {
#   O1(b, IdxR(k, nbyk0), IdxC(k, nbyk0)) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
# }

# This is the poor man's way of making things work today, with a reshape
# operation in between (in framework land).
def blockdiagperm2dfissioned_1(float(B, K, NBYK) I, float(K, NBYK, NBYK) W) -> (O)
{
  O(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
}
def blockdiagperm2dfissioned_2(float(B, N) I, int32(N) Idx) -> (O) {
  O(b, n) = I(b, Idx(n)) where n in 0:N
}
)TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors and autotune the first TC.
  at::Tensor I = at::CUDA(at::kFloat).rand({128, 10, 50});
  at::Tensor W = at::CUDA(at::kFloat).rand({10, 50, 50});
  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  auto bestOption = geneticAutotuneATen.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_1", {I, W}, options);
  auto handle = atCompl.compile(
      "blockdiagperm2dfissioned_1", {I, W}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration =
      atCompl.run("blockdiagperm2dfissioned_1", {I, W}, outputs, handle, true);

  // 3. Reshape the first output, then allocate and autotune the second TC.
  at::Tensor O = outputs[0].clone().resize_({128, 500});
  at::Tensor Idx = at::CPU(at::kInt).randperm({500}).toBackend(at::kCUDA);
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen2(tc);
  auto bestOption2 = geneticAutotuneATen2.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_2", {O, Idx}, options);
  auto handle2 = atCompl.compile(
      "blockdiagperm2dfissioned_2", {O, Idx}, bestOption2.getValue());
  std::vector<at::Tensor> outputs2;
  auto duration2 = atCompl.run(
      "blockdiagperm2dfissioned_2", {O, Idx}, outputs2, handle2, true);

  // 4. Report best standalone times.
  std::cout
      << "blockdiagperm2dfissioned_1 size I: " << I.sizes() << ", "
      << "size W: " << W.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";
  std::cout << "blockdiagperm2dfissioned_2 size O: " << O.sizes() << ", "
            << "size Idx: " << Idx.sizes() << " ran in: "
            << std::chrono::duration_cast<std::chrono::microseconds>(duration2)
                   .count()
            << "us\n";

  // 5. Run unchecked one last time; use with:
  //   nvprof --profile-from-start off executable --use_nvprof=1
  {
    tc::CudaProfiler cp;
    atCompl.uncheckedRun({I, W}, outputs, handle);
    atCompl.uncheckedRun({O, Idx}, outputs2, handle2);
  }
}

// From root, run with:
// ./build/examples/blockdiagperm --tuner_threads=10 --tuner_gen_pop_size=10
// --tuner_gen_generations=3 --tuner_gen_number_elites=4
// --tuner_proto="/tmp/blockdiagperm"
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  return RUN_ALL_TESTS();
}
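
To make the semantics of the two fissioned TCs concrete, the following plain C++ loops compute the same result on row-major buffers. This is a minimal reference sketch only: the function names, the std::vector layout, and the flattening to N = K * NBYK mirror the shapes used in the test and are not part of any TC or ATen API.

#include <cstddef>
#include <vector>

// Step 1 (blockdiagperm2dfissioned_1): per-block matmul,
// O(b, k, n1) = sum_r I(b, k, r) * W(k, r, n1).
std::vector<float> blockdiagMatmulRef(
    const std::vector<float>& I, // B x K x NBYK, row-major
    const std::vector<float>& W, // K x NBYK x NBYK, row-major
    std::size_t B, std::size_t K, std::size_t NBYK) {
  std::vector<float> O(B * K * NBYK, 0.0f);
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t k = 0; k < K; ++k)
      for (std::size_t n1 = 0; n1 < NBYK; ++n1)
        for (std::size_t r = 0; r < NBYK; ++r)
          O[(b * K + k) * NBYK + n1] +=
              I[(b * K + k) * NBYK + r] * W[(k * NBYK + r) * NBYK + n1];
  return O;
}

// Step 2 (blockdiagperm2dfissioned_2): after flattening O to B x N with
// N = K * NBYK (the "reshape in framework land"), permute columns,
// O2(b, n) = O(b, Idx(n)).
std::vector<float> permuteColumnsRef(
    const std::vector<float>& O, // B x N, row-major
    const std::vector<int>& Idx, // permutation of 0..N-1
    std::size_t B, std::size_t N) {
  std::vector<float> O2(B * N);
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t n = 0; n < N; ++n)
      O2[b * N + n] = O[b * N + Idx[n]];
  return O2;
}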

examples/tensordot.cc

Lines changed: 6 additions & 11 deletions
@@ -30,11 +30,7 @@
 
 #include "../test/test_harness_aten_cuda.h"
 
-DEFINE_uint32(number_elites, 2, "Number of elites per generation");
-DEFINE_uint32(generations, 3, "Number of generations to tune for");
-DEFINE_uint32(pop_size, 10, "Population size to tune for");
-DEFINE_uint32(threads, 16, "Number of threads to tune with");
-DEFINE_string(gpus, "0", "List of gpus to evaluate on");
+DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");
 
 TEST(TensorDot, SimpleAutotune) {
   // 1. Define and setup the TC compilation unit with CUDA memory

@@ -57,7 +53,7 @@ def tensordot(float(N, C1, C2, H, W) I0,
   auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
   tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
   auto bestOption = geneticAutotuneATen.tune(
-      "/tmp/save_results", "tensordot", {I0, I1}, naiveOptions);
+      FLAGS_tuner_proto, "tensordot", {I0, I1}, naiveOptions);
 
   // 4. Compile and run the TC with the best option.
   // Outputs get allocated; could also be pre-allocated and passed.

@@ -91,15 +87,14 @@ def tensordot(float(N, C1, C2, H, W) I0,
   }
 }
 
+// From root, run with:
+// ./build/examples/tensordot --tuner_threads=10 --tuner_gen_pop_size=10
+// --tuner_gen_generations=3 --tuner_gen_number_elites=4
+// --tuner_proto="/tmp/tensordot"
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);
   ::google::InitGoogleLogging(argv[0]);
   setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
-  tc::FLAGS_tuner_gen_number_elites = FLAGS_number_elites;
-  tc::FLAGS_tuner_gen_generations = FLAGS_generations;
-  tc::FLAGS_tuner_gen_pop_size = FLAGS_pop_size;
-  tc::FLAGS_tuner_threads = FLAGS_threads;
-  tc::FLAGS_tuner_gpus = FLAGS_gpus;
   return RUN_ALL_TESTS();
 }

include/tc/core/cuda/cuda.h

Lines changed: 16 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include <stdexcept>
 
 #include <cuda.h>
+#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
 #include <glog/logging.h>

@@ -70,6 +71,8 @@
 
 namespace tc {
 
+DECLARE_bool(use_nvprof);
+
 struct WithDevice {
   WithDevice(size_t g) : newGpu(g) {
     int dev;

@@ -111,4 +114,17 @@ class CudaGPUInfo {
   std::vector<size_t> sharedMemSizes_;
 };
 
+struct CudaProfiler {
+  CudaProfiler() {
+    if (FLAGS_use_nvprof) {
+      cudaProfilerStart();
+    }
+  }
+  ~CudaProfiler() {
+    if (FLAGS_use_nvprof) {
+      cudaProfilerStop();
+    }
+  }
+};
+
 } // namespace tc
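
The new tc::CudaProfiler is an RAII guard: its constructor calls cudaProfilerStart() and its destructor calls cudaProfilerStop(), both only when the new --use_nvprof flag is set. A minimal usage sketch follows; runKernels and profiledSection are hypothetical placeholders, and only tc::CudaProfiler and the flag come from this change.

#include "tc/core/cuda/cuda.h"

// Hypothetical workload: any CUDA launches issued here are what nvprof
// captures when the binary runs under `nvprof --profile-from-start off`
// with --use_nvprof=1.
void runKernels() {
  // ... launch kernels or run a compiled TC handle ...
}

void profiledSection() {
  tc::CudaProfiler cp; // cudaProfilerStart() only if FLAGS_use_nvprof is set
  runKernels();
} // cp destroyed here: cudaProfilerStop() only if FLAGS_use_nvprof is set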

src/core/cuda/cuda.cc

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@
 #include "tc/core/flags.h"
 
 namespace tc {
+DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
+
 namespace {
 
 std::tuple<std::vector<std::string>, std::vector<size_t>> init() {

test/test_harness.h

Lines changed: 7 additions & 23 deletions
@@ -15,7 +15,6 @@
 */
 #pragma once
 
-#include <cuda_profiler_api.h>
 #include <gtest/gtest.h>
 #include <mutex>
 #include <string>

@@ -32,23 +31,8 @@
 #include "tc/c2/tc_op.h"
 #include "tc/core/cuda/cuda.h"
 
-DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
-
 namespace caffe2 {
 
-struct CudaProfiler {
-  CudaProfiler() {
-    if (FLAGS_use_nvprof) {
-      cudaProfilerStart();
-    }
-  }
-  ~CudaProfiler() {
-    if (FLAGS_use_nvprof) {
-      cudaProfilerStop();
-    }
-  }
-};
-
 caffe2::TensorCPU context2tensor(caffe2::CPUContext& ctx) {
   return caffe2::TensorCPU();
 }

@@ -315,7 +299,7 @@ struct TestHarness {
 
   void RunReference() {
     ASSERT_TRUE(net_ref.get());
-    CudaProfiler p;
+    tc::CudaProfiler p;
     ASSERT_TRUE(net_ref->Run());
   }
 

@@ -326,7 +310,7 @@
 
   void Run() {
     ASSERT_TRUE(op_test.get());
-    CudaProfiler p;
+    tc::CudaProfiler p;
     ASSERT_TRUE(op_test->Run());
   }
 

@@ -406,7 +390,7 @@ struct TestHarness {
     unique_ptr<OperatorBase> op_g(CreateOperator(g_op, &w));
     ASSERT_TRUE(op_g.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(op_g->Run());
     }
   }

@@ -424,7 +408,7 @@ struct TestHarness {
     unique_ptr<NetBase> ref_net(CreateNet(ref_net_def, &w1));
     ASSERT_TRUE(ref_net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(ref_net->Run());
     }
 

@@ -433,7 +417,7 @@ struct TestHarness {
     unique_ptr<NetBase> net(CreateNet(net_def, &w2));
     ASSERT_TRUE(net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(net->Run());
     }
 

@@ -467,7 +451,7 @@ struct TestHarness {
     unique_ptr<NetBase> net(CreateNet(net_def, &w1));
     ASSERT_TRUE(net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(net->Run());
     }
     RunGradient(w1, *net_def.mutable_op()->Mutable(0));

@@ -477,7 +461,7 @@ struct TestHarness {
     unique_ptr<OperatorBase> op(CreateOperator(op_def, &w2));
     ASSERT_TRUE(op.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(op->Run());
     }
     OperatorDef def = op_def;
