/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
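
// Benchmarks Group Normalization (GN) on CUDA through Tensor Comprehensions
// (TC) and compares it against Caffe2 and ATen baselines. GN splits the C
// channels into G groups of D = C / G channels and normalizes every (n, g)
// slice of the input independently:
//   y = gamma * (x - mean) / sqrt(var + eps) + beta,  with eps = 1e-5 below.
// Two TC variants are timed: a two-kernel version (separate moments kernel)
// and a single fused kernel.
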
#include "group_normalization.h"

#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "tc/aten/aten.h"

#include "tc/aten/aten_compiler.h"
#include "tc/core/cuda/cuda_mapping_options.h"

#include "../test/caffe2/cuda/test_harness.h"
#include "../test/caffe2/test_harness.h"
#include "../test/test_harness_aten_cuda.h"
#include "benchmark_fixture.h"

#include "tc/c2/context.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/flags.h"

using namespace caffe2;

DEFINE_uint32(N, 32, "N batch size");
DEFINE_uint32(C, 512, "Number of channels (that will get divided into groups)");
DEFINE_uint32(G, 32, "Number of groups");
DEFINE_uint32(H, 48, "Height");
DEFINE_uint32(W, 48, "Width");

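// Benchmark fixture. Init() records the problem sizes; D = C / G is the
// number of channels per group (C is assumed to be divisible by G).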
class GroupNormalization : public Benchmark {
 protected:
  uint32_t N, C, G, D, H, W;

 public:
  void Init(uint32_t n, uint32_t c, uint32_t g, uint32_t h, uint32_t w) {
    N = n;
    C = c;
    G = g;
    D = C / G;
    H = h;
    W = w;
  }
  std::vector<at::Tensor> runGroupNormalization(
      const tc::CudaMappingOptions& options,
      const tc::CudaMappingOptions& optionsMoments =
          tc::CudaMappingOptions::makeNaiveMappingOptions());
  std::vector<at::Tensor> runGroupNormalizationSingleKernel(
      const tc::CudaMappingOptions& options);
  void runCaffe2GroupNormalization();
  void runATenGroupNormalization();
};

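// Two-kernel variant: per-(n, g) mean and variance are produced by a separate
// TC moments kernel and passed to the normalization kernel as extra inputs.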
std::vector<at::Tensor> GroupNormalization::runGroupNormalization(
    const tc::CudaMappingOptions& options,
    const tc::CudaMappingOptions& optionsMoments) {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});
  auto view = I.view({N, G, -1});
  auto mean = view.mean(-1, true);
  auto var = view.var(-1, true).view({N, G, 1});

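  // Correctness check: recompute GN with plain ATen ops and require the TC
  // output to match within a relative tolerance scaled by the reduction size
  // D * H * W (each output element accumulates that many input values).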
  auto check_fun = [&](const std::vector<at::Tensor>& inputs,
                       const std::vector<at::Tensor>& outputs) {
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    auto x = ((view - mean) / (var + 1e-5f).sqrt());
    auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
        beta.view({1, G, D, 1, 1});
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
    return true;
  };

  auto inputs = std::vector<at::Tensor>{
      I, gamma, beta, mean.view({N, G}), var.view({N, G})};
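  // The suffix encodes the problem sizes so that tuner cache/result files are
  // keyed per configuration; with --autotune set, the tuner refines the seed
  // options passed in.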
  std::string suffix = std::string("_N_") + std::to_string(N) +
      std::string("_C_") + std::to_string(C) + std::string("_G_") +
      std::to_string(G) + std::string("_H_") + std::to_string(H) +
      std::string("_W_") + std::to_string(W);
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_cache") + suffix,
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_best") + suffix,
        tc::TC_GroupNormalization,
        tc::TC_GroupNormalization_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }

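  // Compile and run the moments kernel on the flattened (N*G, D*H*W) view.
  // computeMoments re-profiles it and is handed to Check() so the benchmark
  // can also account for the cost of this kernel, not just the normalization.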
  auto pExecutorMoments = tc::aten::compile<tc::CudaBackend>(
      tc::TC_GroupNormalization,
      tc::TC_Moments2_2D_1D_NAME,
      {I.view({N * G, D * H * W})},
      optionsMoments);
  std::vector<at::Tensor> outputs = tc::aten::prepareOutputs(
      tc::TC_GroupNormalization,
      tc::TC_Moments2_2D_1D_NAME,
      {I.view({N * G, D * H * W})});
  tc::aten::run(*pExecutorMoments, {I.view({N * G, D * H * W})}, outputs);
  auto computeMoments = [&I, &outputs, &pExecutorMoments, this]() {
    return tc::aten::profile(
        *pExecutorMoments, {I.view({N * G, D * H * W})}, outputs);
  };
  return Check(
      tc::TC_GroupNormalization,
      tc::TC_GroupNormalization_NAME,
      bestOptions[0],
      inputs,
      check_fun,
      computeMoments);
}

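// Fused variant: a single TC kernel computes the moments and applies the
// normalization, taking only I, gamma, and beta as inputs.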
std::vector<at::Tensor> GroupNormalization::runGroupNormalizationSingleKernel(
    const tc::CudaMappingOptions& options) {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});

  auto check_fun = [&](const std::vector<at::Tensor>& inputs,
                       const std::vector<at::Tensor>& outputs) {
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    auto view = I.view({N, G, -1});
    auto mean = view.mean(-1, true);
    auto var = view.var(-1, true).view({N, G, 1});
    auto x = ((view - mean) / (var + 1e-5f).sqrt());
    auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
        beta.view({1, G, D, 1, 1});
    TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
    checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
    return true;
  };

  auto inputs = std::vector<at::Tensor>{I, gamma, beta};
  std::string suffix = std::string("_N_") + std::to_string(N) +
      std::string("_C_") + std::to_string(C) + std::string("_G_") +
      std::to_string(G) + std::string("_H_") + std::to_string(H) +
      std::string("_W_") + std::to_string(W);
  std::vector<tc::CudaMappingOptions> bestOptions{options};
  if (FLAGS_autotune) {
    bestOptions = autotune(
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_cache") + suffix,
        FLAGS_save_tuner_proto_prefix +
            std::string("/group_normalization_best") + suffix,
        tc::TC_GroupNormalization,
        tc::TC_GroupNormalizationSingleKernel_NAME,
        inputs,
        options);
    CHECK_GE(bestOptions.size(), 1u);
  }
  return Check(
      tc::TC_GroupNormalization,
      tc::TC_GroupNormalizationSingleKernel_NAME,
      bestOptions[0],
      inputs,
      check_fun);
}

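// Caffe2 baseline: times the GroupNorm CUDA operator on deterministically
// random inputs of the same sizes.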
void GroupNormalization::runCaffe2GroupNormalization() {
  Workspace w;
  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
  AddInput(w, {N, C, H, W}, "I");
  AddInput(w, {G, D}, "gamma");
  AddInput(w, {G, D}, "beta");
  OperatorDef def = MakeOperatorDef<caffe2::CUDABackend>(
      "GroupNorm", {"I", "gamma", "beta"}, {"O", "mean", "var"});
  unique_ptr<OperatorBase> op(CreateOperator(def, &w));
  Reference([&]() { return true; }, [&op](bool flag) { op->Run(); });
}

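// ATen baseline: times the same sequence of ATen tensor ops that the
// correctness checks above use as a reference.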
void GroupNormalization::runATenGroupNormalization() {
  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D});
  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D});
  Reference(
      [&]() { return true; },
      [&I, &gamma, &beta, this](bool flag) {
        auto v = I.view({N, G, -1});
        auto mean = v.mean(-1, true);
        auto var = v.var(-1, true).view({N, G, 1});
        auto x = ((v - mean) / (var + 1e-5f).sqrt());
        auto y = gamma.view({1, G, D, 1, 1}) * x.view({N, G, D, H, W}) +
            beta.view({1, G, D, 1, 1});
      });
}

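// The test cases below bake the problem sizes into their names; the autotuned
// ones use pre-tuned mapping options (tc::options_*) for P100 and V100 GPUs,
// expected to come from group_normalization.h.
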
// Generic
TEST_F(GroupNormalization, GroupNormalization) {
  Init(FLAGS_N, FLAGS_C, FLAGS_G, FLAGS_H, FLAGS_W);
  runGroupNormalization(tc::CudaMappingOptions::makeNaiveMappingOptions());
}

// P100 TC
TEST_F(
    GroupNormalization,
    GroupNormalization_P100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalization(
      tc::options_GroupNormalization_P100_autotuned_N_4_C_512_G_32_H_12_W_12,
      tc::options_Moments2_2D_1D_P100_autotuned_N_128_K_2304);
}

TEST_F(
    GroupNormalization,
    GroupNormalization_P100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalization(
      tc::options_GroupNormalization_P100_autotuned_N_32_C_512_G_32_H_48_W_48,
      tc::options_Moments2_2D_1D_P100_autotuned_N_1024_K_36864);
}

// P100 Caffe2
TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_P100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runCaffe2GroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_P100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runCaffe2GroupNormalization();
}

// P100 ATen
TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_P100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runATenGroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_P100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runATenGroupNormalization();
}

// V100 TC
TEST_F(
    GroupNormalization,
    GroupNormalization_V100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalization(
      tc::options_GroupNormalization_V100_autotuned_N_4_C_512_G_32_H_12_W_12,
      tc::options_Moments2_2D_1D_V100_autotuned_N_128_K_2304);
}

TEST_F(
    GroupNormalization,
    GroupNormalization_V100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalization(
      tc::options_GroupNormalization_V100_autotuned_N_32_C_512_G_32_H_48_W_48,
      tc::options_Moments2_2D_1D_V100_autotuned_N_1024_K_36864);
}

// V100 Caffe2
TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_V100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runCaffe2GroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_Caffe2_V100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runCaffe2GroupNormalization();
}

// V100 ATen
TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_V100_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runATenGroupNormalization();
}

TEST_F(
    GroupNormalization,
    GroupNormalization_ATen_V100_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runATenGroupNormalization();
}

// Generic
TEST_F(GroupNormalization, GroupNormalizationSingleKernel) {
  Init(FLAGS_N, FLAGS_C, FLAGS_G, FLAGS_H, FLAGS_W);
  runGroupNormalizationSingleKernel(
      tc::CudaMappingOptions::makeNaiveMappingOptions());
}

// P100 TC
TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_P100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_P100_autotuned_N_4_C_512_G_32_H_12_W_12);
}

TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_P100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_P100_autotuned_N_32_C_512_G_32_H_48_W_48);
}

// V100 TC
TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_V100_autotuned_N_4_C_512_G_32_H_12_W_12) {
  Init(4, 512, 32, 12, 12);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_V100_autotuned_N_4_C_512_G_32_H_12_W_12);
}

TEST_F(
    GroupNormalization,
    GroupNormalizationSingleKernel_V100_autotuned_N_32_C_512_G_32_H_48_W_48) {
  Init(32, 512, 32, 48, 48);
  runGroupNormalizationSingleKernel(
      tc::options_GroupNormalizationSingleKernel_V100_autotuned_N_32_C_512_G_32_H_48_W_48);
}

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}