Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit b346d01

Browse files
Add wavenet benchmark
This commit adds a single-layer WaveNet benchmark following the description in the paper https://arxiv.org/pdf/1609.03499.pdf. Its correctness has been tested against a PyTorch baseline written by @Artix18.
1 parent 5266f73 commit b346d01

File tree

3 files changed

+341
-0
lines changed

3 files changed

+341
-0
lines changed

tc/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ set(BENCHMARKS
2222
moments
2323
tmm
2424
MLP_model
25+
wavenet
2526
)
2627
foreach(i ${BENCHMARKS})
2728
add_executable(benchmark_${i} ${i}.cc)

tc/benchmarks/wavenet.cc

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "wavenet.h"
17+
18+
#include <iostream>
19+
#include <string>
20+
#include <vector>
21+
22+
#include <gflags/gflags.h>
23+
#include <glog/logging.h>
24+
#include <gtest/gtest.h>
25+
26+
#include "tc/aten/aten.h"
27+
28+
#include "tc/aten/aten_compiler.h"
29+
#include "tc/core/cuda/cuda_mapping_options.h"
30+
31+
#include "../test/caffe2/cuda/test_harness.h"
32+
#include "../test/caffe2/test_harness.h"
33+
#include "../test/test_harness_aten_cuda.h"
34+
#include "benchmark_fixture.h"
35+
36+
#include "tc/c2/context.h"
37+
#include "tc/core/cuda/cuda.h"
38+
#include "tc/core/flags.h"
39+
40+
using namespace caffe2;
41+
42+
DEFINE_uint32(B, 1, "Batch size");
43+
DEFINE_uint32(
44+
RESIDUAL_C,
45+
32,
46+
"Residual channels (i.e. WaveNet block input channels)");
47+
DEFINE_uint32(
48+
DILATION_C,
49+
32,
50+
"Dilation channels (i.e. WaveNet block channels after dilated convolution)");
51+
DEFINE_uint32(
52+
SKIP_C,
53+
32,
54+
"Skip channels (i.e. WaveNet block channels in the skip tensor)");
55+
DEFINE_uint32(
56+
RECEPTIVE_FIELD,
57+
4000,
58+
"https://arxiv.org/pdf/1609.03499.pdf paper mentions 16K samples per second"
59+
"and a receptive field of 240ms so we approx. set the default to 4000)");
60+
DEFINE_uint32(DILATION_FACTOR, 1, "Powers of 2 from 1 to 512 in the paper");
61+
62+
// https://arxiv.org/pdf/1609.03499.pdf paper mentions 16K samples per second
63+
// and a receptive field of 240ms so about 4K RECEPTIVE_FIELD
64+
class WaveNet : public Benchmark {
65+
protected:
66+
uint32_t B;
67+
uint32_t RESIDUAL_C;
68+
uint32_t DILATION_C;
69+
uint32_t SKIP_C;
70+
uint32_t RECEPTIVE_FIELD;
71+
uint32_t DILATION_FACTOR; // 2^layer where layer in 0:10
72+
73+
public:
74+
void Init(
75+
uint32_t b,
76+
uint32_t residual_c,
77+
uint32_t dilation_c,
78+
uint32_t skip_c,
79+
uint32_t receptive_field,
80+
uint32_t dilation_factor) {
81+
B = b;
82+
RESIDUAL_C = residual_c;
83+
DILATION_C = dilation_c;
84+
SKIP_C = skip_c;
85+
RECEPTIVE_FIELD = receptive_field;
86+
DILATION_FACTOR = dilation_factor;
87+
}
88+
void runWaveNet1(const tc::CudaMappingOptions& options);
89+
};
90+
91+
void WaveNet::runWaveNet1(const tc::CudaMappingOptions& options) {
  // Random CUDA inputs matching the wavenet1 TC signature. The order of the
  // rand() calls is significant (it consumes the global ATen RNG stream) and
  // mirrors the tensor order expected by the TC definition.
  at::Tensor data = at::CUDA(at::kFloat).rand({B, RESIDUAL_C, RECEPTIVE_FIELD});
  at::Tensor filterWeight =
      at::CUDA(at::kFloat).rand({DILATION_C, RESIDUAL_C, 2});
  at::Tensor filterBias = at::CUDA(at::kFloat).rand({DILATION_C});
  at::Tensor gateWeight =
      at::CUDA(at::kFloat).rand({DILATION_C, RESIDUAL_C, 2});
  at::Tensor gateBias = at::CUDA(at::kFloat).rand({DILATION_C});
  at::Tensor resWeight = at::CUDA(at::kFloat).rand({RESIDUAL_C, DILATION_C});
  at::Tensor resBias = at::CUDA(at::kFloat).rand({RESIDUAL_C});
  at::Tensor skipWeight = at::CUDA(at::kFloat).rand({SKIP_C, DILATION_C});
  at::Tensor skipBias = at::CUDA(at::kFloat).rand({SKIP_C});
  at::Tensor dilation = at::CUDA(at::kFloat).rand({DILATION_FACTOR});

  std::vector<at::Tensor> inputs;
  inputs.push_back(data);
  inputs.push_back(filterWeight);
  inputs.push_back(filterBias);
  inputs.push_back(gateWeight);
  inputs.push_back(gateBias);
  inputs.push_back(resWeight);
  inputs.push_back(resBias);
  inputs.push_back(skipWeight);
  inputs.push_back(skipBias);
  inputs.push_back(dilation);

  // Start from the caller-provided options; when autotuning is requested,
  // replace them with the best configuration found by the tuner.
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix + std::string("/wavenet_1_cache"),
        FLAGS_save_tuner_proto_prefix + std::string("/wavenet_1_best"),
        tc::TC_WAVENET,
        tc::TC_WAVENET1_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }
  Check(tc::TC_WAVENET, tc::TC_WAVENET1_NAME, bestOptions[0], inputs);
}
129+
130+
/// WaveNet 1 block
131+
// Generic
132+
TEST_F(WaveNet, WaveNet1) {
133+
Init(
134+
FLAGS_B,
135+
FLAGS_RESIDUAL_C,
136+
FLAGS_DILATION_C,
137+
FLAGS_SKIP_C,
138+
FLAGS_RECEPTIVE_FIELD,
139+
FLAGS_DILATION_FACTOR);
140+
runWaveNet1(tc::CudaMappingOptions::makeNaiveMappingOptions());
141+
}
142+
143+
// P100
144+
TEST_F(
145+
WaveNet,
146+
WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1) {
147+
Init(1, 32, 32, 256, 4000, 1);
148+
runWaveNet1(
149+
tc::options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1);
150+
}
151+
152+
// Same P100 configuration with dilation factor 32.
TEST_F(
    WaveNet,
    WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32) {
  Init(1, 32, 32, 256, 4000, 32);
  runWaveNet1(
      tc::options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32);
}
159+
160+
// V100
161+
TEST_F(
162+
WaveNet,
163+
WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1) {
164+
Init(1, 32, 32, 256, 4000, 1);
165+
runWaveNet1(
166+
tc::options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1);
167+
}
168+
169+
// V100 configuration with dilation factor 32.
TEST_F(
    WaveNet,
    WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32) {
  Init(1, 32, 32, 256, 4000, 32);
  runWaveNet1(
      tc::options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32);
}
176+
177+
// Benchmark entry point. Initialization order matters: gtest must consume
// its own flags first, then gflags parses the benchmark flags, then glog is
// initialized before any logging happens.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  // Seed ATen's CUDA RNG so input tensors are reproducible for a given seed.
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}

tc/benchmarks/wavenet.h

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include "tc/aten/aten.h"
19+
#include "tc/core/cuda/cuda_mapping_options.h"
20+
21+
namespace tc {
22+
constexpr static auto TC_WAVENET1_NAME = "wavenet1";
23+
constexpr static auto TC_WAVENET = R"TC(
24+
# Original data is float(B, C, RECEPTIVE_FIELD) and undergoes a \
25+
# Conv1d to become float(B, RESIDUAL_C, RECEPTIVE_FIELD)
26+
27+
def wavenet1(
28+
float(B, RESIDUAL_C, RECEPTIVE_FIELD) Data,
29+
float(DILATION_C, RESIDUAL_C, 2) FilterWeight,
30+
float(DILATION_C) FilterBias,
31+
float(DILATION_C, RESIDUAL_C, 2) GateWeight,
32+
float(DILATION_C) GateBias,
33+
float(RESIDUAL_C, DILATION_C) ResWeight,
34+
float(RESIDUAL_C) ResBias,
35+
float(SKIP_C, DILATION_C) SkipWeight,
36+
float(SKIP_C) SkipBias,
37+
float(DILATION_FACTOR) Dilation)
38+
-> (FilterOut, GateOut, NonLin, Res, Skip)
39+
{
40+
FilterOut(b, dilation_c, rf) = FilterBias(dilation_c)
41+
where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD
42+
FilterOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * FilterWeight(dilation_c, r_residual_c, 1) +
43+
(
44+
(rf - DILATION_FACTOR >= 0) ?
45+
Data(b, r_residual_c, rf - DILATION_FACTOR) * FilterWeight(dilation_c, r_residual_c, 0) :
46+
float(0)
47+
)
48+
where rf in 0:RECEPTIVE_FIELD
49+
50+
GateOut(b, dilation_c, rf) = GateBias(dilation_c)
51+
where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD
52+
GateOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * GateWeight(dilation_c, r_residual_c, 1) +
53+
(
54+
(rf - DILATION_FACTOR >= 0) ?
55+
Data(b, r_residual_c, rf - DILATION_FACTOR) * GateWeight(dilation_c, r_residual_c, 0) :
56+
float(0)
57+
)
58+
where rf in 0:RECEPTIVE_FIELD
59+
60+
NonLin(b, dilation_c, rf) = tanh(FilterOut(b, dilation_c, rf))
61+
where rf in 0:RECEPTIVE_FIELD
62+
NonLin(b, dilation_c, rf) *= 1 / (1 + exp( -GateOut(b, dilation_c, rf)))
63+
where rf in 0:RECEPTIVE_FIELD
64+
65+
Res(b, residual_c, rf) = Data(b, residual_c, rf) + ResBias(residual_c)
66+
Res(b, residual_c, rf) += NonLin(b, r_dilation_c, rf) * ResWeight(residual_c, r_dilation_c)
67+
68+
Skip(b, skip, rf) +=! NonLin(b, r_dilation_c, rf) * SkipWeight(skip, r_dilation_c)
69+
where rf in 0:RECEPTIVE_FIELD
70+
Skip(b, skip, rf) = Skip(b, skip, rf) + SkipBias(skip)
71+
where rf in 0:RECEPTIVE_FIELD
72+
}
73+
)TC";
74+
75+
auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 =
76+
tc::CudaMappingOptions::makeNaiveMappingOptions()
77+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
78+
.outerScheduleAllowSkewing(false)
79+
.outerSchedulePositiveOrthant(true)
80+
.intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
81+
.intraTileScheduleAllowSkewing(false)
82+
.intraTileSchedulePositiveOrthant(true)
83+
.fixParametersBeforeScheduling(true)
84+
.tile(63)
85+
.unroll(32)
86+
.tileImperfectlyNested(false)
87+
.matchLibraryCalls(false)
88+
.mapToThreads(32, 4, 1)
89+
.mapToBlocks(256, 4, 63)
90+
.useSharedMemory(true)
91+
.usePrivateMemory(true)
92+
.unrollCopyShared(false)
93+
.useReadOnlyCache(false);
94+
95+
auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 =
96+
tc::CudaMappingOptions::makeNaiveMappingOptions()
97+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
98+
.outerScheduleAllowSkewing(false)
99+
.outerSchedulePositiveOrthant(true)
100+
.intraTileScheduleFusionStrategy(
101+
tc::FusionStrategy::Preserve3Coincident)
102+
.intraTileScheduleAllowSkewing(false)
103+
.intraTileSchedulePositiveOrthant(true)
104+
.fixParametersBeforeScheduling(true)
105+
.tile(128, 4096, 1000, 64)
106+
.unroll(1)
107+
.tileImperfectlyNested(false)
108+
.matchLibraryCalls(true)
109+
.mapToThreads(128)
110+
.mapToBlocks(63)
111+
.useSharedMemory(true)
112+
.usePrivateMemory(true)
113+
.unrollCopyShared(false)
114+
.useReadOnlyCache(false);
115+
116+
auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 =
117+
tc::CudaMappingOptions::makeNaiveMappingOptions()
118+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
119+
.outerScheduleAllowSkewing(false)
120+
.outerSchedulePositiveOrthant(true)
121+
.intraTileScheduleFusionStrategy(
122+
tc::FusionStrategy::Preserve3Coincident)
123+
.intraTileScheduleAllowSkewing(false)
124+
.intraTileSchedulePositiveOrthant(true)
125+
.fixParametersBeforeScheduling(false)
126+
.tile(1000, 128, 500)
127+
.unroll(2)
128+
.tileImperfectlyNested(false)
129+
.matchLibraryCalls(false)
130+
.mapToThreads(256)
131+
.mapToBlocks(4000, 128)
132+
.useSharedMemory(true)
133+
.usePrivateMemory(true)
134+
.unrollCopyShared(true)
135+
.useReadOnlyCache(false);
136+
137+
auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 =
138+
tc::CudaMappingOptions::makeNaiveMappingOptions()
139+
.outerScheduleFusionStrategy(tc::FusionStrategy::Max)
140+
.outerScheduleAllowSkewing(false)
141+
.outerSchedulePositiveOrthant(true)
142+
.intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
143+
.intraTileScheduleAllowSkewing(false)
144+
.intraTileSchedulePositiveOrthant(true)
145+
.fixParametersBeforeScheduling(true)
146+
.tile(8, 125, 512, 500)
147+
.unroll(32)
148+
.tileImperfectlyNested(false)
149+
.matchLibraryCalls(false)
150+
.mapToThreads(16, 16)
151+
.mapToBlocks(4000, 2048, 4096)
152+
.useSharedMemory(true)
153+
.usePrivateMemory(true)
154+
.unrollCopyShared(true)
155+
.useReadOnlyCache(false);
156+
157+
} // namespace tc

0 commit comments

Comments
 (0)