This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit da7a8f1

Add an example for 2 layers of WaveNet
Prints:

```
wavenet2layers size weight0: [128, 64, 2] ran in: 88us
```
1 parent 2334d73 commit da7a8f1

File tree: 2 files changed (177 additions, 1 deletion)


examples/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -14,8 +14,9 @@ set(GTEST_LIBS gtest gtest_main)
 # Examples
 ################################################################################
 set(EXAMPLES_FILES
-  tensordot
   blockdiagperm
+  tensordot
+  wavenet
 )
 foreach(i ${EXAMPLES_FILES})
   add_executable(${i} ${i}.cc)
```

examples/wavenet.cc

Lines changed: 175 additions & 0 deletions
```cpp
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda_mapping_options.h"
#include "tc/core/flags.h"

#include "../test/test_harness_aten_cuda.h"

DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");

TEST(WaveNet2Layers, SimpleAutotune) {
  // 1. Define and setup the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
  std::string tc = R"TC(
def wavenet2layers(
    float(OUT, IN, KERN) Weight0,
    float(OUT) Bias0,
    float(BATCH, IN, KERN) Data0,
    float(IN, IN) ResWeight0,
    float(IN) ResBias0,
    float(SKIP, IN) SkipWeight0,
    float(SKIP) SkipBias0,
    float(OUT, IN, KERN) Weight1,
    float(OUT) Bias1,
    float(BATCH, IN, KERN) Data1,
    float(IN, IN) ResWeight1,
    float(IN) ResBias1,
    float(SKIP, IN) SkipWeight1,
    float(SKIP) SkipBias1)
-> (Res0, Dilate0, NonLin0, Skip0, Res1, Dilate1, NonLin1, Skip1)
{
  Dilate0(batch, out) = Bias0(out) where batch in 0:BATCH
  Dilate0(batch, out) += Weight0(out, r_in, r_kern) * Data0(batch, r_in, r_kern)
  NonLin0(batch, out) = 1 / (1 + exp(-1*(Dilate0(batch, out))))
  NonLin0(batch, out) *= tanh(Dilate0(batch, out + 64))

  Skip0(batch, skip) = SkipBias0(skip) where batch in 0:BATCH
  Skip0(batch, skip) += SkipWeight0(skip, r_in) * NonLin0(batch, r_in)
    where r_in in 0:IN # necessary because r_in gets into unresolved min/max
  Res0(batch, out) = ResBias0(out) where batch in 0:BATCH
  Res0(batch, out) += ResWeight0(out, r_in) * NonLin0(batch, r_in)
    where r_in in 0:IN # necessary because r_in gets into unresolved min/max
  Res0(batch, out) = Res0(batch, out) + NonLin0(batch, out)
    where out in 0:IN # necessary because out gets into unresolved min/max

  Dilate1(batch, out) = Bias1(out)
    where batch in 0:BATCH
  Dilate1(batch, out) += Weight1(out, r_in, r_kern) * Data1(batch, r_in, r_kern)
  NonLin1(batch, out) = 1 / (1 + exp(-1*(Dilate1(batch, out))))
  NonLin1(batch, out) *= tanh(Dilate1(batch, out + 64))

  Skip1(batch, skip) = SkipBias1(skip) where batch in 0:BATCH
  Skip1(batch, skip) += SkipWeight1(skip, r_in) * NonLin1(batch, r_in)
    where r_in in 0:IN # necessary because r_in gets into unresolved min/max
  Res1(batch, out) = ResBias1(out) where batch in 0:BATCH
  Res1(batch, out) += ResWeight1(out, r_in) * NonLin1(batch, r_in)
    where r_in in 0:IN # necessary because r_in gets into unresolved min/max
  Res1(batch, out) = Res1(batch, out) + NonLin1(batch, out)
    where out in 0:IN # necessary because out gets into unresolved min/max
}
  )TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors with random data.
  at::Tensor weight0 = at::CUDA(at::kFloat).rand({128, 64, 2});
  at::Tensor bias0 = at::CUDA(at::kFloat).rand({128});
  at::Tensor data0 = at::CUDA(at::kFloat).rand({1, 64, 2});
  at::Tensor res_weight0 = at::CUDA(at::kFloat).rand({64, 64});
  at::Tensor res_bias0 = at::CUDA(at::kFloat).rand({64});
  at::Tensor skip_weight0 = at::CUDA(at::kFloat).rand({256, 64});
  at::Tensor skip_bias0 = at::CUDA(at::kFloat).rand({256});

  at::Tensor weight1 = at::CUDA(at::kFloat).rand({128, 64, 2});
  at::Tensor bias1 = at::CUDA(at::kFloat).rand({128});
  at::Tensor data1 = at::CUDA(at::kFloat).rand({1, 64, 2});
  at::Tensor res_weight1 = at::CUDA(at::kFloat).rand({64, 64});
  at::Tensor res_bias1 = at::CUDA(at::kFloat).rand({64});
  at::Tensor skip_weight1 = at::CUDA(at::kFloat).rand({256, 64});
  at::Tensor skip_bias1 = at::CUDA(at::kFloat).rand({256});

  // 3. Run autotuning with evolutionary search starting from a naive option.
  auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  std::vector<at::Tensor> tensors = {weight0,
                                     bias0,
                                     data0,
                                     res_weight0,
                                     res_bias0,
                                     skip_weight0,
                                     skip_bias0,
                                     weight1,
                                     bias1,
                                     data1,
                                     res_weight1,
                                     res_bias1,
                                     skip_weight1,
                                     skip_bias1};
  auto bestOption = geneticAutotuneATen.tune(
      FLAGS_tuner_proto, "wavenet2layers", tensors, naiveOptions);

  // 4. Compile and run the TC with the best option.
  // Outputs get allocated; could also be pre-allocated and passed.
  auto handle =
      atCompl.compile("wavenet2layers", tensors, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration = atCompl.run("wavenet2layers", tensors, outputs, handle, true);
  std::cout
      << "wavenet2layers size weight0: " << weight0.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";

  // 5. The following represent reasonable initialization operations,
  // ported from PyTorch.
  weight0 = 5 * (at::CUDA(at::kFloat).rand({128, 64, 2}) - 0.5f);
  bias0 = 2 * (at::CUDA(at::kFloat).rand({128}) - 0.5f);
  data0 = 2 * (at::CUDA(at::kFloat).rand({1, 64, 2}) - 0.5f);
  res_weight0 = 2 * (at::CUDA(at::kFloat).rand({64, 64}) - 0.5f);
  res_bias0 = 2 * (at::CUDA(at::kFloat).rand({64}) - 0.5f);
  skip_weight0 = 2 * (at::CUDA(at::kFloat).rand({256, 64}) - 0.5f);
  skip_bias0 = 2 * (at::CUDA(at::kFloat).rand({256}) - 0.5f);

  weight1 = 5 * (at::CUDA(at::kFloat).rand({128, 64, 2}) - 1.5f);
  bias1 = 2 * (at::CUDA(at::kFloat).rand({128}) - 1.5f);
  data1 = 2 * (at::CUDA(at::kFloat).rand({1, 64, 2}) - 1.5f);
  res_weight1 = 2 * (at::CUDA(at::kFloat).rand({64, 64}) - 1.5f);
  res_bias1 = 2 * (at::CUDA(at::kFloat).rand({64}) - 1.5f);
  skip_weight1 = 2 * (at::CUDA(at::kFloat).rand({256, 64}) - 1.5f);
  skip_bias1 = 2 * (at::CUDA(at::kFloat).rand({256}) - 1.5f);

  // 6. Run unchecked multiple times, to put GPU in high usage mode, use with:
  //    nvprof --profile-from-start off executable --use_nvprof=1
  {
    tc::CudaProfiler cp;
    for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
      atCompl.uncheckedRun(tensors, outputs, handle);
    }
  }
}

// From root, run with:
//   ./build/examples/wavenet --tuner_threads=10 --tuner_gen_pop_size=10
//   --tuner_gen_generations=3 --tuner_gen_number_elites=4
//   --benchmark_iterations=1000 --tuner_proto="/tmp/wavenet"
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}
```
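For readers comparing the TC against the usual WaveNet building block: `Dilate` is the dilated convolution output, `NonLin` is the gated activation (sigmoid over the first 64 channels times tanh over the next 64), and `Skip`/`Res` are the skip and residual projections, with the residual adding the gated activation back in. The sketch below re-expresses one layer with plain ATen ops under the shapes used in the test (OUT=128, IN=64, KERN=2, SKIP=256, BATCH=1). It is only an illustrative cross-check, not part of the commit, and the helper name `wavenetLayerReference` is hypothetical.

```cpp
#include <ATen/ATen.h>

// Hypothetical reference (not in the commit): one WaveNet layer written with
// plain ATen ops, mirroring the first layer of the TC above.
// Assumed shapes: weight0 [128, 64, 2], bias0 [128], data0 [1, 64, 2],
// res_weight0 [64, 64], res_bias0 [64], skip_weight0 [256, 64], skip_bias0 [256].
void wavenetLayerReference(
    const at::Tensor& weight0,
    const at::Tensor& bias0,
    const at::Tensor& data0,
    const at::Tensor& res_weight0,
    const at::Tensor& res_bias0,
    const at::Tensor& skip_weight0,
    const at::Tensor& skip_bias0,
    at::Tensor& res0,
    at::Tensor& skip0) {
  // Dilate0(b, out) = Bias0(out) + sum_{in, kern} Weight0(out, in, kern) * Data0(b, in, kern),
  // computed here by flattening the (in, kern) dimensions into one.
  at::Tensor dilate0 =
      data0.view({1, 64 * 2}).mm(weight0.view({128, 64 * 2}).t()) + bias0;
  // Gated activation: sigmoid over channels [0, 64), tanh over channels [64, 128).
  at::Tensor nonlin0 =
      dilate0.narrow(1, 0, 64).sigmoid() * dilate0.narrow(1, 64, 64).tanh();
  // Skip and residual projections; the residual adds the gated activation back.
  skip0 = nonlin0.mm(skip_weight0.t()) + skip_bias0;
  res0 = nonlin0.mm(res_weight0.t()) + res_bias0 + nonlin0;
}
```

A cross-check would compare `res0` and `skip0` against the `Res0` and `Skip0` entries of the `outputs` vector returned by `atCompl.run` when both are fed the same tensors.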
