Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit addd4d2

Browse files
Merge pull request #225 from nicolasvasilache/pr/simple_example_with_options_reuse
Simple example with options reuse
2 parents 70127a6 + bdb3740 commit addd4d2

18 files changed

+303
-46
lines changed

.jenkins/run_test.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ source /etc/lsb-release
1212

1313
if [[ "$DISTRIB_RELEASE" == 14.04 ]]; then
1414
echo "Running TC backend tests"
15-
FILTER_OUT=example_MLP_model ./test.sh
16-
./build/examples/example_MLP_model --gtest_filter=-*2LUT*
15+
FILTER_OUT=MLP_model ./test.sh
16+
./build/benchmarks/MLP_model --gtest_filter=-*2LUT*
1717
if [[ $(conda --version | wc -c) -ne 0 ]]; then
1818
source activate tc-env
1919
echo "Running TC PyTorch tests"
@@ -28,7 +28,7 @@ if [[ "$DISTRIB_RELEASE" == 16.04 ]]; then
2828
./test_python/run_test.sh
2929
else
3030
echo "Running TC backend tests"
31-
FILTER_OUT=example_MLP_model ./test.sh
32-
./build/examples/example_MLP_model --gtest_filter=-*2LUT*
31+
FILTER_OUT=MLP_model ./test.sh
32+
./build/benchmarks/MLP_model --gtest_filter=-*2LUT*
3333
fi
3434
fi

CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,14 @@ endif()
258258
enable_testing()
259259
add_subdirectory(test)
260260

261-
if (WITH_CAFFE2 AND WITH_CUDA)
261+
if (WITH_CUDA)
262262
add_subdirectory(examples)
263263
else()
264-
message(STATUS "Not building examples, caffe2 or CUDA not available")
264+
message(STATUS "Not building examples, CUDA not available")
265+
endif()
266+
267+
if (WITH_CAFFE2 AND WITH_CUDA)
268+
add_subdirectory(benchmarks)
269+
else()
270+
message(STATUS "Not building benchmarks, caffe2 or CUDA not available")
265271
endif()

README.md

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,115 @@ After a few generations of `autotuning` on a 2-GPU P100 system, we see results r
2929

3030
![Autotuning Sample](docs/source/_static/img/autotuning.png)
3131

32+
In C++ a minimal autotuning example resembles the [following](examples/example_tensordot.cc):
33+
```cpp
34+
TEST(TensorDot, SimpleAutotune) {
35+
// 1. Define and setup the TC compilation unit with CUDA memory
36+
// management backed by ATen tensors.
37+
std::string tc = R"TC(
38+
def tensordot(float(N, C1, C2, H, W) I0,
39+
float(N, C2, C3, H, W) I1) -> (O)
40+
{
41+
O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
42+
}
43+
)TC";
44+
tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
45+
atCompl.define(tc);
46+
47+
// 2. Allocate tensors with random data.
48+
at::Tensor I0 = at::CUDA(at::kFloat).rand({32, 8, 16, 17, 25});
49+
at::Tensor I1 = at::CUDA(at::kFloat).rand({32, 16, 2, 17, 25});
50+
51+
// 3. Run autotuning with evolutionary search starting from a naive option.
52+
auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
53+
tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
54+
auto bestOption = geneticAutotuneATen.tune(
55+
"/tmp/save_results", "tensordot", {I0, I1}, options);
56+
57+
// 4. Compile and run the TC with the best option.
58+
// Outputs get allocated; could also be pre-allocated and passed.
59+
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
60+
std::vector<at::Tensor> outputs;
61+
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
62+
std::cout
63+
<< "tensordot size I0: " << I0.sizes() << ", "
64+
<< "size I1: " << I1.sizes() << " ran in: "
65+
<< std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
66+
<< "us\n";
67+
}
68+
```
69+
70+
Note that we only need to **autotune a TC once** to obtain reasonable mapping options
71+
that can translate to other problem sizes for a given TC as the following snippet
72+
illustrates:
73+
```cpp
74+
// 5. Reuse bestOptions from autotuning on another kernel
75+
for (auto sizes : std::vector<std::pair<at::IntList, at::IntList>>{
76+
{{4, 9, 7, 16, 14}, {4, 7, 3, 16, 14}},
77+
{{8, 5, 11, 10, 10}, {8, 11, 16, 10, 10}},
78+
}) {
79+
at::Tensor I0 = at::CUDA(at::kFloat).rand(sizes.first);
80+
at::Tensor I1 = at::CUDA(at::kFloat).rand(sizes.second);
81+
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
82+
std::vector<at::Tensor> outputs;
83+
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
84+
std::cout << "tensordot size I0: " << I0.sizes() << ", "
85+
<< "size I1: " << I1.sizes() << " ran in: "
86+
<< std::chrono::duration_cast<std::chrono::microseconds>(duration)
87+
.count()
88+
<< "us\n";
89+
}
90+
```
91+
92+
Putting it all together, one may see:
93+
```shell
94+
> build$ ./examples/example_simple
95+
[==========] Running 1 test from 1 test case.
96+
[----------] Global test environment set-up.
97+
[----------] 1 test from TensorDot
98+
[ RUN ] TensorDot.SimpleAutotune
99+
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
100+
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 226/4238/7345
101+
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/233
102+
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/234
103+
Dumping cache to /tmp/save_results.cuda/options
104+
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 239us
105+
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 56us
106+
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210us
107+
[ OK ] TensorDot.SimpleAutotune (27812 ms)
108+
[----------] 1 test from TensorDot (27812 ms total)
109+
110+
[----------] Global test environment tear-down
111+
[==========] 1 test from 1 test case ran. (27812 ms total)
112+
[ PASSED ] 1 test.
113+
```
114+
115+
Tuning results are then available and reusable in ```/tmp/save_results.cuda``` and ```/tmp/save_results.proto```.
116+
117+
Interestingly, note that running the same example again will start from the best saved results and improve upon them.
118+
Of course this has diminishing returns:
119+
```shell
120+
> build$ ./examples/example_simple
121+
[==========] Running 1 test from 1 test case.
122+
[----------] Global test environment set-up.
123+
[----------] 1 test from TensorDot
124+
[ RUN ] TensorDot.SimpleAutotune
125+
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
126+
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 256/258/270
127+
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 158/255/616
128+
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 157/252/720
129+
Dumping cache to /tmp/save_results.cuda/options
130+
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 172us
131+
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 44us
132+
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 88us
133+
[ OK ] TensorDot.SimpleAutotune (28232 ms)
134+
[----------] 1 test from TensorDot (28232 ms total)
135+
136+
[----------] Global test environment tear-down
137+
[==========] 1 test from 1 test case ran. (28232 ms total)
138+
[ PASSED ] 1 test.
139+
```
140+
32141
We have not yet characterized the precise fraction of peak performance we obtain but it is not uncommon to obtain 80%+ of peak shared memory bandwidth after autotuning. Solid register-level optimizations are still in the work but TC in its current form already addresses the productivity gap between the needs of research and the needs of production. Which is why we are excited to share it with the entire community and bring this collaborative effort in the open.
33142
34143
# Documentation

benchmarks/CMakeLists.txt

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
include_directories(.)
2+
include_directories(..)
3+
include_directories(../../include)
4+
5+
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
6+
7+
# Needed to ensure reproducibility (proper cublas version) via call to
8+
# cublasGetVersion_v2.
9+
find_library(CUDA_CUBLAS_LIBRARIES cublas
10+
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
11+
PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
12+
find_library(CUDA_CUDNN_LIBRARIES cudnn
13+
PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
14+
PATH_SUFFIXES lib lib64)
15+
16+
################################################################################
17+
# Don't ask ... TODO: cleanup gtest
18+
################################################################################
19+
include_directories(${PROJECT_SOURCE_DIR}/third-party/googletest/googletest/include)
20+
set(GTEST_LIBS gtest gtest_main)
21+
22+
################################################################################
23+
# Examples
24+
################################################################################
25+
set(EXAMPLES_FILES
26+
batchmatmul
27+
group_convolution
28+
tmm
29+
MLP_model
30+
)
31+
foreach(i ${EXAMPLES_FILES})
32+
add_executable(${i} ${i}.cc)
33+
add_test(${i} ${i})
34+
target_link_libraries(
35+
${i}
36+
tc_autotuner
37+
tc_core
38+
tc_c2
39+
40+
${CUDA_CUBLAS_LIBRARIES}
41+
${CUDA_CUDNN_LIBRARIES}
42+
${GTEST_LIBS}
43+
${GFLAGS_LIBRARIES}
44+
${GLOG_LIBRARIES}
45+
46+
${ATEN_LIBRARIES}
47+
)
48+
endforeach()

examples/example_MLP_model.cc renamed to benchmarks/MLP_model.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
#include "../test/test_harness.h"
3030
#include "../test/test_harness_aten_cuda.h"
31-
#include "example_fixture.h"
31+
#include "benchmark_fixture.h"
3232

3333
#include "tc/c2/context.h"
3434
#include "tc/core/cuda/cuda.h"

examples/example_batchmatmul.cc renamed to benchmarks/batchmatmul.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
#include "../test/test_harness.h"
3030
#include "../test/test_harness_aten_cuda.h"
31-
#include "example_fixture.h"
31+
#include "benchmark_fixture.h"
3232

3333
#include "tc/c2/context.h"
3434
#include "tc/core/cuda/cuda.h"
File renamed without changes.

examples/example_group_convolution.cc renamed to benchmarks/group_convolution.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
#include "../test/test_harness.h"
3030
#include "../test/test_harness_aten_cuda.h"
31-
#include "example_fixture.h"
31+
#include "benchmark_fixture.h"
3232

3333
#include "tc/c2/context.h"
3434
#include "tc/core/cuda/cuda.h"
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)