After a few generations of `autotuning` on a 2-GPU P100 system, we see results resembling:

![Autotuning Sample](docs/source/_static/img/autotuning.png)
In C++ a minimal autotuning example resembles the [following](example/example_tensordot.cc):
```cpp
TEST(TensorDot, SimpleAutotune) {
  // 1. Define and setup the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
  std::string tc = R"TC(
def tensordot(float(N, C1, C2, H, W) I0,
              float(N, C2, C3, H, W) I1) -> (O)
{
  O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
}
  )TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors with random data.
  at::Tensor I0 = at::CUDA(at::kFloat).rand({32, 8, 16, 17, 25});
  at::Tensor I1 = at::CUDA(at::kFloat).rand({32, 16, 2, 17, 25});

  // 3. Run autotuning with evolutionary search starting from a naive option.
  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  auto bestOption = geneticAutotuneATen.tune(
      "/tmp/save_results", "tensordot", {I0, I1}, options);

  // 4. Compile and run the TC with the best option.
  // Outputs get allocated; could also be pre-allocated and passed.
  auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
  std::cout
      << "tensordot size I0: " << I0.sizes() << ", "
      << "size I1: " << I1.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";
}
```
Note that we only need to **autotune a TC once** to obtain reasonable mapping options
that can translate to other problem sizes for a given TC, as the following snippet
illustrates:
```cpp
// 5. Reuse bestOption from autotuning at other problem sizes.
for (auto sizes : std::vector<std::pair<at::IntList, at::IntList>>{
         {{4, 9, 7, 16, 14}, {4, 7, 3, 16, 14}},
         {{8, 5, 11, 10, 10}, {8, 11, 16, 10, 10}},
     }) {
  at::Tensor I0 = at::CUDA(at::kFloat).rand(sizes.first);
  at::Tensor I1 = at::CUDA(at::kFloat).rand(sizes.second);
  auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
  std::cout << "tensordot size I0: " << I0.sizes() << ", "
            << "size I1: " << I1.sizes() << " ran in: "
            << std::chrono::duration_cast<std::chrono::microseconds>(duration)
                   .count()
            << "us\n";
}
```
Putting it all together, one may see:
```shell
> build$ ./examples/example_simple
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from TensorDot
[ RUN      ] TensorDot.SimpleAutotune
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 226/4238/7345
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/233
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/234
Dumping cache to /tmp/save_results.cuda/options
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 239us
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 56us
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210us
[       OK ] TensorDot.SimpleAutotune (27812 ms)
[----------] 1 test from TensorDot (27812 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (27812 ms total)
[  PASSED  ] 1 test.
```
Tuning results are then available and reusable in `/tmp/save_results.cuda` and `/tmp/save_results.proto`.
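
Programmatically, this reuse is driven by the cache prefix passed to the tuner. The sketch below restates the relevant calls from the example above under that assumption; `tc`, `atCompl`, `I0`, and `I1` are taken to be in scope, and passing the same `/tmp/save_results` prefix is what lets a later run start from the stored options rather than from scratch:
```cpp
// Sketch: resume tuning from the cache written by a previous run, assuming
// the cache-prefix convention above. The stored options seed the search
// instead of a purely naive starting point.
tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
auto bestOption = geneticAutotuneATen.tune(
    "/tmp/save_results", "tensordot", {I0, I1}, options);

// Compile and run with the (possibly improved) best option, as before.
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
std::vector<at::Tensor> outputs;
atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
```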
Interestingly, note that running the same example again will start from the best saved results and improve upon them.
Of course, this has diminishing returns:
```shell
> build$ ./examples/example_simple
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from TensorDot
[ RUN      ] TensorDot.SimpleAutotune
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 256/258/270
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 158/255/616
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 157/252/720
Dumping cache to /tmp/save_results.cuda/options
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 172us
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 44us
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 88us
[       OK ] TensorDot.SimpleAutotune (28232 ms)
[----------] 1 test from TensorDot (28232 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (28232 ms total)
[  PASSED  ] 1 test.
```
We have not yet characterized the precise fraction of peak performance we obtain, but it is not uncommon to reach 80%+ of peak shared memory bandwidth after autotuning. Solid register-level optimizations are still in the works, but TC in its current form already addresses the productivity gap between the needs of research and the needs of production, which is why we are excited to share it with the entire community and bring this collaborative effort into the open.
# Documentation