/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda_mapping_options.h"
#include "tc/core/flags.h"

#include "../test/test_harness_aten_cuda.h"

DEFINE_string(tuner_proto, "", "Filename to load and store the proto cache");

TEST(WaveNet2Layers, SimpleAutotune) {
  // 1. Define and set up the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
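  // In the TC below, `+=` statements reduce over the `r_`-prefixed
  // variables, and `where` clauses pin iteration ranges that range
  // inference cannot resolve on its own.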
  std::string tc = R"TC(
def wavenet2layers(
    float(OUT, IN, KERN) Weight0,
    float(OUT) Bias0,
    float(BATCH, IN, KERN) Data0,
    float(IN, IN) ResWeight0,
    float(IN) ResBias0,
    float(SKIP, IN) SkipWeight0,
    float(SKIP) SkipBias0,
    float(OUT, IN, KERN) Weight1,
    float(OUT) Bias1,
    float(BATCH, IN, KERN) Data1,
    float(IN, IN) ResWeight1,
    float(IN) ResBias1,
    float(SKIP, IN) SkipWeight1,
    float(SKIP) SkipBias1)
    -> (Res0, Dilate0, NonLin0, Skip0, Res1, Dilate1, NonLin1, Skip1)
{
    Dilate0(batch, out) = Bias0(out) where batch in 0:BATCH
    Dilate0(batch, out) += Weight0(out, r_in, r_kern) * Data0(batch, r_in, r_kern)
    NonLin0(batch, out) = 1 / (1 + exp(-1*(Dilate0(batch, out))))
    NonLin0(batch, out) *= tanh(Dilate0(batch, out + 64))
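    # Gated activation: sigmoid over the first 64 of the 128 dilated-conv
    # channels gates tanh over the second 64 (hence the `out + 64` offset).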

    Skip0(batch, skip) = SkipBias0(skip) where batch in 0:BATCH
    Skip0(batch, skip) += SkipWeight0(skip, r_in) * NonLin0(batch, r_in)
        where r_in in 0:IN # necessary because r_in gets into unresolved min/max
    Res0(batch, out) = ResBias0(out) where batch in 0:BATCH
    Res0(batch, out) += ResWeight0(out, r_in) * NonLin0(batch, r_in)
        where r_in in 0:IN # necessary because r_in gets into unresolved min/max
    Res0(batch, out) = Res0(batch, out) + NonLin0(batch, out)
        where out in 0:IN # necessary because out gets into unresolved min/max

    Dilate1(batch, out) = Bias1(out) where batch in 0:BATCH
    Dilate1(batch, out) += Weight1(out, r_in, r_kern) * Data1(batch, r_in, r_kern)
    NonLin1(batch, out) = 1 / (1 + exp(-1*(Dilate1(batch, out))))
    NonLin1(batch, out) *= tanh(Dilate1(batch, out + 64))

    Skip1(batch, skip) = SkipBias1(skip) where batch in 0:BATCH
    Skip1(batch, skip) += SkipWeight1(skip, r_in) * NonLin1(batch, r_in)
        where r_in in 0:IN # necessary because r_in gets into unresolved min/max
    Res1(batch, out) = ResBias1(out) where batch in 0:BATCH
    Res1(batch, out) += ResWeight1(out, r_in) * NonLin1(batch, r_in)
        where r_in in 0:IN # necessary because r_in gets into unresolved min/max
    Res1(batch, out) = Res1(batch, out) + NonLin1(batch, out)
        where out in 0:IN # necessary because out gets into unresolved min/max
}
  )TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors with random data.
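  // The concrete sizes fix the TC size parameters: BATCH = 1, IN = 64,
  // OUT = 128, KERN = 2, SKIP = 256.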
  at::Tensor weight0 = at::CUDA(at::kFloat).rand({128, 64, 2});
  at::Tensor bias0 = at::CUDA(at::kFloat).rand({128});
  at::Tensor data0 = at::CUDA(at::kFloat).rand({1, 64, 2});
  at::Tensor res_weight0 = at::CUDA(at::kFloat).rand({64, 64});
  at::Tensor res_bias0 = at::CUDA(at::kFloat).rand({64});
  at::Tensor skip_weight0 = at::CUDA(at::kFloat).rand({256, 64});
  at::Tensor skip_bias0 = at::CUDA(at::kFloat).rand({256});

  at::Tensor weight1 = at::CUDA(at::kFloat).rand({128, 64, 2});
  at::Tensor bias1 = at::CUDA(at::kFloat).rand({128});
  at::Tensor data1 = at::CUDA(at::kFloat).rand({1, 64, 2});
  at::Tensor res_weight1 = at::CUDA(at::kFloat).rand({64, 64});
  at::Tensor res_bias1 = at::CUDA(at::kFloat).rand({64});
  at::Tensor skip_weight1 = at::CUDA(at::kFloat).rand({256, 64});
  at::Tensor skip_bias1 = at::CUDA(at::kFloat).rand({256});

  // 3. Run autotuning with evolutionary search starting from a naive option.
  auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  std::vector<at::Tensor> tensors = {weight0,
                                     bias0,
                                     data0,
                                     res_weight0,
                                     res_bias0,
                                     skip_weight0,
                                     skip_bias0,
                                     weight1,
                                     bias1,
                                     data1,
                                     res_weight1,
                                     res_bias1,
                                     skip_weight1,
                                     skip_bias1};
  auto bestOption = geneticAutotuneATen.tune(
      FLAGS_tuner_proto, "wavenet2layers", tensors, naiveOptions);
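  // The best mapping options found are returned and, per the flag above,
  // cached in the --tuner_proto file so later runs can warm-start from them.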

  // 4. Compile and run the TC with the best option.
  //    Outputs are allocated on the fly here; they could also be
  //    pre-allocated and passed in.
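  //    The trailing `true` passed to run() appears to enable profiling, so
  //    the returned duration reflects a timed kernel run.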
  auto handle =
      atCompl.compile("wavenet2layers", tensors, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration = atCompl.run("wavenet2layers", tensors, outputs, handle, true);
  std::cout
      << "wavenet2layers size weight0: " << weight0.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";

  // 5. Re-initialize the inputs with reasonable value ranges,
  //    ported from PyTorch.
  weight0 = 5 * (at::CUDA(at::kFloat).rand({128, 64, 2}) - 0.5f);
  bias0 = 2 * (at::CUDA(at::kFloat).rand({128}) - 0.5f);
  data0 = 2 * (at::CUDA(at::kFloat).rand({1, 64, 2}) - 0.5f);
  res_weight0 = 2 * (at::CUDA(at::kFloat).rand({64, 64}) - 0.5f);
  res_bias0 = 2 * (at::CUDA(at::kFloat).rand({64}) - 0.5f);
  skip_weight0 = 2 * (at::CUDA(at::kFloat).rand({256, 64}) - 0.5f);
  skip_bias0 = 2 * (at::CUDA(at::kFloat).rand({256}) - 0.5f);

  weight1 = 5 * (at::CUDA(at::kFloat).rand({128, 64, 2}) - 1.5f);
  bias1 = 2 * (at::CUDA(at::kFloat).rand({128}) - 1.5f);
  data1 = 2 * (at::CUDA(at::kFloat).rand({1, 64, 2}) - 1.5f);
  res_weight1 = 2 * (at::CUDA(at::kFloat).rand({64, 64}) - 1.5f);
  res_bias1 = 2 * (at::CUDA(at::kFloat).rand({64}) - 1.5f);
  skip_weight1 = 2 * (at::CUDA(at::kFloat).rand({256, 64}) - 1.5f);
  skip_bias1 = 2 * (at::CUDA(at::kFloat).rand({256}) - 1.5f);

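  // Note: reassigning the locals above rebinds them to new tensors; the
  // handles previously copied into `tensors` still reference the old data.
  // Rebuild the vector so the benchmark below sees the re-initialized values.
  tensors = {weight0, bias0, data0, res_weight0, res_bias0, skip_weight0,
             skip_bias0, weight1, bias1, data1, res_weight1, res_bias1,
             skip_weight1, skip_bias1};
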
  // 6. Run the kernel unchecked multiple times to keep the GPU busy.
  //    Profile with:
  //      nvprof --profile-from-start off executable --use_nvprof=1
  {
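    // CudaProfiler scopes the profiled region: with --profile-from-start off,
    // nvprof should capture only the iterations inside this block.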
    tc::CudaProfiler cp;
    for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
      atCompl.uncheckedRun(tensors, outputs, handle);
    }
  }
}

// From the repository root, run with:
//   ./build/examples/wavenet --tuner_threads=10 --tuner_gen_pop_size=10
//     --tuner_gen_generations=3 --tuner_gen_number_elites=4
//     --benchmark_iterations=1000 --tuner_proto="/tmp/wavenet"
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}