
Commit fb1439d

Add group normalization benchmark
This commit adds a benchmark and reference implementations for group normalization (https://arxiv.org/abs/1803.08494). In particular, we found it generally beneficial to split the computation into two kernels on GPUs: one kernel that computes the moments and one kernel that applies group normalization from those moments. We also provide a single fused TC that generates one kernel.
1 parent: 7f3c7fb
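
For reference, the computation being benchmarked is the standard group normalization: per-(sample, group) moments followed by a scale and shift. A minimal ATen sketch of that reference, mirroring the check_fun used in the benchmark below (the free-function wrapper and its name are illustrative):

// Group-norm reference: normalize each (n, g) group by its own mean and
// variance, then scale by gamma and shift by beta.
// I: (N, G, D, H, W); gamma, beta: (G, D); D = C / G channels per group.
at::Tensor groupNormReference(
    const at::Tensor& I,
    const at::Tensor& gamma,
    const at::Tensor& beta,
    int64_t N, int64_t G, int64_t D, int64_t H, int64_t W) {
  auto view = I.view({N, G, -1});
  auto mean = view.mean(-1, true);
  auto var = view.var(-1, true).view({N, G, 1});
  auto x = (view - mean) / (var + 1e-5f).sqrt();
  return gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
      beta.view({1, G, D, 1, 1});
}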

File tree

3 files changed: 672 additions & 0 deletions

tc/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ find_library(CUDA_CUDNN_LIBRARIES cudnn
 set(BENCHMARKS
   batchmatmul
   group_convolution
+  group_normalization
   kronecker
   moments
   tmm

tc/benchmarks/group_normalization.cc

Lines changed: 361 additions & 0 deletions
@@ -0,0 +1,361 @@
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "group_normalization.h"

#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "tc/aten/aten.h"

#include "tc/aten/aten_compiler.h"
#include "tc/core/cuda/cuda_mapping_options.h"

#include "../test/caffe2/cuda/test_harness.h"
#include "../test/caffe2/test_harness.h"
#include "../test/test_harness_aten_cuda.h"
#include "benchmark_fixture.h"

#include "tc/c2/context.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/flags.h"

using namespace caffe2;

DEFINE_uint32(N, 32, "N batch size");
DEFINE_uint32(C, 512, "Number of channels (that will get divided into groups)");
DEFINE_uint32(G, 32, "Number of groups");
DEFINE_uint32(H, 48, "Height");
DEFINE_uint32(W, 48, "Width");

class GroupNormalization : public Benchmark {
 protected:
  uint32_t N, C, G, D, H, W;

 public:
  void Init(uint32_t n, uint32_t c, uint32_t g, uint32_t h, uint32_t w) {
    N = n;
    C = c;
    G = g;
    D = C / G; // channels per group
    H = h;
    W = w;
  }
  std::vector<at::Tensor> runGroupNormalization(
      const tc::CudaMappingOptions& options,
      const tc::CudaMappingOptions& optionsMoments =
          tc::CudaMappingOptions::makeNaiveMappingOptions());
  std::vector<at::Tensor> runGroupNormalizationSingleKernel(
      const tc::CudaMappingOptions& options);
  void runCaffe2GroupNormalization();
  void runATenGroupNormalization();
};
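
// Two-kernel variant: compile and run the moments TC on the flattened
// (N * G, D * H * W) view of the input to produce mean and variance, then
// benchmark the group-normalization TC that consumes those moments. The
// computeMoments lambda profiles the moments executor and is passed to Check
// so the harness can account for the extra kernel.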
std::vector<at::Tensor> GroupNormalization::runGroupNormalization(
    const tc::CudaMappingOptions& options,
    const tc::CudaMappingOptions& optionsMoments) {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});
  auto view = I.view({N, G, -1});
  auto mean = view.mean(-1, true);
  auto var = view.var(-1, true).view({N, G, 1});

  auto check_fun = [&](const std::vector<at::Tensor>& inputs,
                       const std::vector<at::Tensor>& outputs) {
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    auto x = ((view - mean) / (var + 1e-5f).sqrt());
    auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
        beta.view({1, G, D, 1, 1});
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
    return true;
  };

  auto inputs = std::vector<at::Tensor>{
      I, gamma, beta, mean.view({N, G}), var.view({N, G})};
  std::string suffix = std::string("_N_") + std::to_string(N) +
      std::string("_C_") + std::to_string(C) + std::string("_G_") +
      std::to_string(G) + std::string("_H_") + std::to_string(H) +
      std::string("_W_") + std::to_string(W);
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_cache") + suffix,
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_best") + suffix,
        tc::TC_GroupNormalization,
        tc::TC_GroupNormalization_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }

  auto pExecutorMoments = tc::aten::compile<tc::CudaBackend>(
      tc::TC_GroupNormalization,
      tc::TC_Moments2_2D_1D_NAME,
      {I.view({N * G, D * H * W})},
      optionsMoments);
  std::vector<at::Tensor> outputs = tc::aten::prepareOutputs(
      tc::TC_GroupNormalization,
      tc::TC_Moments2_2D_1D_NAME,
      {I.view({N * G, D * H * W})});
  tc::aten::run(*pExecutorMoments, {I.view({N * G, D * H * W})}, outputs);
  auto computeMoments = [&I, &outputs, &pExecutorMoments, this]() {
    return tc::aten::profile(
        *pExecutorMoments, {I.view({N * G, D * H * W})}, outputs);
  };
  return Check(
      tc::TC_GroupNormalization,
      tc::TC_GroupNormalization_NAME,
      bestOptions[0],
      inputs,
      check_fun,
      computeMoments);
}
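
// Fused variant: a single TC computes the moments and applies the
// normalization, so only one executor is compiled and timed.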
std::vector<at::Tensor> GroupNormalization::runGroupNormalizationSingleKernel(
    const tc::CudaMappingOptions& options) {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});

  auto check_fun = [&](const std::vector<at::Tensor>& inputs,
                       const std::vector<at::Tensor>& outputs) {
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    auto view = I.view({N, G, -1});
    auto mean = view.mean(-1, true);
    auto var = view.var(-1, true).view({N, G, 1});
    auto x = ((view - mean) / (var + 1e-5f).sqrt());
    auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
        beta.view({1, G, D, 1, 1});
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
    return true;
  };

  auto inputs = std::vector<at::Tensor>{I, gamma, beta};
  std::string suffix = std::string("_N_") + std::to_string(N) +
      std::string("_C_") + std::to_string(C) + std::string("_G_") +
      std::to_string(G) + std::string("_H_") + std::to_string(H) +
      std::string("_W_") + std::to_string(W);
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_cache") + suffix,
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_best") + suffix,
        tc::TC_GroupNormalization,
        tc::TC_GroupNormalizationSingleKernel_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }
  return Check(
      tc::TC_GroupNormalization,
      tc::TC_GroupNormalizationSingleKernel_NAME,
      bestOptions[0],
      inputs,
      check_fun);
}
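
// Baselines: Caffe2's GroupNorm operator and an eager ATen composition of the
// same computation, both run through the fixture's Reference timing helper.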
void GroupNormalization::runCaffe2GroupNormalization() {
  Workspace w;
  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
  AddInput(w, {N, C, H, W}, "I");
  AddInput(w, {G, D}, "gamma");
  AddInput(w, {G, D}, "beta");
  OperatorDef def = MakeOperatorDef<caffe2::CUDABackend>(
      "GroupNorm", {"I", "gamma", "beta"}, {"O", "mean", "var"});
  unique_ptr<OperatorBase> op(CreateOperator(def, &w));
  Reference([&]() { return true; }, [&op](bool flag) { op->Run(); });
}

void GroupNormalization::runATenGroupNormalization() {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});
  Reference(
      [&]() { return true; },
      [&I, &gamma, &beta, this](bool flag) {
        auto v = I.view({N, G, -1});
        auto mean = v.mean(-1, true);
        auto var = v.var(-1, true).view({N, G, 1});
        auto x = ((v - mean) / (var + 1e-5f).sqrt());
        // y is only computed for timing; its value is not used.
        auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
            beta.view({1, G, D, 1, 1});
      });
}
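
// The TEST_F cases below cover a generic size driven by the command-line flags
// plus fixed sizes paired with pre-tuned mapping options for P100 and V100
// GPUs; the Caffe2 and ATen cases time the baselines at the same sizes. The
// moments options are keyed by the flattened shape (N * G, D * H * W): e.g.
// for N=4, C=512, G=32, H=W=12 we get N * G = 128 and
// D * H * W = 16 * 12 * 12 = 2304.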
/// Generic
TEST_F(GroupNormalization, GroupNormalization) {
  Init(FLAGS_N, FLAGS_C, FLAGS_G, FLAGS_H, FLAGS_W);
  runGroupNormalization(tc::CudaMappingOptions::makeNaiveMappingOptions());
}

// P100 TC
TEST_F(
    GroupNormalization,
    GroupNormalization_P100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalization(
      tc::options_GroupNormalization_P100_autotuned_N_4_C_512_G_32_H_12_W_12,
      tc::options_Moments2_2D_1D_P100_autotuned_N_128_K_2304);
}

TEST_F(
    GroupNormalization,
    GroupNormalization_P100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalization(
      tc::options_GroupNormalization_P100_autotuned_N_32_C_512_G_32_H_48_W_48,
      tc::options_Moments2_2D_1D_P100_autotuned_N_1024_K_36864);
}

// P100 Caffe2
TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_P100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runCaffe2GroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_P100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runCaffe2GroupNormalization();
}

// P100 ATen
TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_P100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runATenGroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_P100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runATenGroupNormalization();
}

// V100 TC
TEST_F(
    GroupNormalization,
    GroupNormalization_V100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalization(
      tc::options_GroupNormalization_V100_autotuned_N_4_C_512_G_32_H_12_W_12,
      tc::options_Moments2_2D_1D_V100_autotuned_N_128_K_2304);
}

TEST_F(
    GroupNormalization,
    GroupNormalization_V100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalization(
      tc::options_GroupNormalization_V100_autotuned_N_32_C_512_G_32_H_48_W_48,
      tc::options_Moments2_2D_1D_V100_autotuned_N_1024_K_36864);
}

// V100 Caffe2
TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_V100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runCaffe2GroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_V100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runCaffe2GroupNormalization();
}

// V100 ATen
TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_V100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runATenGroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_V100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runATenGroupNormalization();
}

// Generic
TEST_F(GroupNormalization, GroupNormalizationSingleKernel) {
  Init(FLAGS_N, FLAGS_C, FLAGS_G, FLAGS_H, FLAGS_W);
  runGroupNormalizationSingleKernel(
      tc::CudaMappingOptions::makeNaiveMappingOptions());
}

// P100 TC
TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_P100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_P100_autotuned_N_4_C_512_G_32_H_12_W_12);
}

TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_P100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_P100_autotuned_N_32_C_512_G_32_H_48_W_48);
}

// V100 TC
TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_V100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_V100_autotuned_N_4_C_512_G_32_H_12_W_12);
}

TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_V100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_V100_autotuned_N_32_C_512_G_32_H_48_W_48);
}

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}
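
Since the benchmark builds into a Google Test binary, a single configuration can be selected with --gtest_filter together with the size flags defined above. A hypothetical invocation (the binary path depends on the build tree):

./group_normalization --gtest_filter='*GroupNormalizationSingleKernel' --N=32 --C=512 --G=32 --H=48 --W=48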
