|
| 1 | +/** |
| 2 | + * Copyright (c) 2017-present, Facebook, Inc. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"
#include "tc/core/mapping_options.h"

// Command-line flag --tuner_proto (empty by default). Its value is forwarded
// as the first argument of GeneticAutotunerATen::tune() in the test below;
// the run instructions above main() pass e.g. --tuner_proto="/tmp/blockdiagperm".
DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");
| 34 | + |
| 35 | +TEST(BlockDiagPerm, SimpleAutotune) { |
| 36 | + // 1. Define and setup the TC compilation unit with CUDA memory |
| 37 | + // management backed by ATen tensors. |
| 38 | + std::string tc = R"TC( |
| 39 | +# The following TCs (blockdiagperm2d and blockdiagperm2dinlined) illustrate |
| 40 | +# how we would likely want to write blockdiagperm to synthesize a single |
| 41 | +# kernel. However both versions currently fail to emit a good single cuda kernel. |
| 42 | +# 1. blockdiagperm2d requires additional information to relax dependencies and |
| 43 | +# allow fusion |
| 44 | +# 2. blockdiagperm2dinlined requires general LHS indexing |
| 45 | +# A third version blockdiagperm2dfissioned_1/2 is a workaround by using 2 |
| 46 | +# independent TCs. |
| 47 | +# This TC probably requires extra information to perform fusion which we do |
| 48 | +# not know how to propagate at this point |
| 49 | +# def blockdiagperm2d(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC) |
| 50 | +# -> (O1, O2) { |
| 51 | +# O1(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1) |
| 52 | +# O2(b, k, nbyk) = O1(b, Idxr(k, nbyk), Idxc(k, nbyk)) |
| 53 | +# } |
| 54 | +# This TC requires LHS indexing which is a WIP + extra information that all |
| 55 | +# accesses are parallel (i.e. (IdxR, IdxC) form a permutation) |
| 56 | +# def blockdiagperm2dinlined(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC) |
| 57 | +# -> (O1) { |
| 58 | +# O1(b, IdxR(k, nbyk0), IdxC(k, nbyk0)) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1) |
| 59 | +# } |
| 60 | +
|
| 61 | +# This is the poor man's way of making things work today with a reshape |
| 62 | +# operation in between (in framework land). |
| 63 | +def blockdiagperm2dfissioned_1(float(B, K, NBYK) I, float(K, NBYK, NBYK) W) -> (O) |
| 64 | +{ |
| 65 | + O(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1) |
| 66 | +} |
| 67 | +def blockdiagperm2dfissioned_2(float(B, N) I, int32(N) Idx) -> (O) { |
| 68 | + O(b, n) = I(b, Idx(n)) where n in 0:N |
| 69 | +} |
| 70 | + )TC"; |
| 71 | + tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl; |
| 72 | + atCompl.define(tc); |
| 73 | + |
| 74 | + // 1. Allocate and autotune |
| 75 | + at::Tensor I = at::CUDA(at::kFloat).rand({128, 10, 50}); |
| 76 | + at::Tensor W = at::CUDA(at::kFloat).rand({10, 50, 50}); |
| 77 | + auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions(); |
| 78 | + tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc); |
| 79 | + auto bestOption = geneticAutotuneATen.tune( |
| 80 | + FLAGS_tuner_proto, "blockdiagperm2dfissioned_1", {I, W}, options); |
| 81 | + auto handle = atCompl.compile( |
| 82 | + "blockdiagperm2dfissioned_1", {I, W}, bestOption.getValue()); |
| 83 | + std::vector<at::Tensor> outputs; |
| 84 | + auto duration = |
| 85 | + atCompl.run("blockdiagperm2dfissioned_1", {I, W}, outputs, handle, true); |
| 86 | + |
| 87 | + // 2. Allocate and autotune |
| 88 | + at::Tensor O = outputs[0].clone().resize_({128, 500}); |
| 89 | + at::Tensor Idx = at::CPU(at::kInt).randperm({500}).toBackend(at::kCUDA); |
| 90 | + tc::autotune::GeneticAutotunerATen geneticAutotuneATen2(tc); |
| 91 | + auto bestOption2 = geneticAutotuneATen.tune( |
| 92 | + FLAGS_tuner_proto, "blockdiagperm2dfissioned_2", {O, Idx}, options); |
| 93 | + auto handle2 = atCompl.compile( |
| 94 | + "blockdiagperm2dfissioned_2", {O, Idx}, bestOption2.getValue()); |
| 95 | + std::vector<at::Tensor> outputs2; |
| 96 | + auto duration2 = atCompl.run( |
| 97 | + "blockdiagperm2dfissioned_2", {O, Idx}, outputs2, handle2, true); |
| 98 | + |
| 99 | + // 3. Report best standalone times |
| 100 | + std::cout |
| 101 | + << "blockdiagperm2dfissioned_1 size I: " << I.sizes() << ", " |
| 102 | + << "size W: " << W.sizes() << " ran in: " |
| 103 | + << std::chrono::duration_cast<std::chrono::microseconds>(duration).count() |
| 104 | + << "us\n"; |
| 105 | + std::cout << "blockdiagperm2dfissioned_2 size O: " << O.sizes() << ", " |
| 106 | + << "size Idx: " << Idx.sizes() << " ran in: " |
| 107 | + << std::chrono::duration_cast<std::chrono::microseconds>(duration2) |
| 108 | + .count() |
| 109 | + << "us\n"; |
| 110 | + |
| 111 | + // 4. Run unchecked one last time, use with: |
| 112 | + // nvprof --profile-from-start off executable --use_nvprof=1 |
| 113 | + { |
| 114 | + tc::CudaProfiler cp; |
| 115 | + atCompl.uncheckedRun({I, W}, outputs, handle); |
| 116 | + atCompl.uncheckedRun({O, Idx}, outputs2, handle2); |
| 117 | + } |
| 118 | +} |
| 119 | + |
| 120 | +// From root, run with: |
| 121 | +// ./build/examples/blockdiagperm --tuner_threads=10 --tuner_gen_pop_size=10 |
| 122 | +// --tuner_gen_generations=3 --tuner_gen_number_elites=4 |
| 123 | +// --tuner_proto="/tmp/blockdiagperm" |
| 124 | +int main(int argc, char** argv) { |
| 125 | + ::testing::InitGoogleTest(&argc, argv); |
| 126 | + ::gflags::ParseCommandLineFlags(&argc, &argv, true); |
| 127 | + ::google::InitGoogleLogging(argv[0]); |
| 128 | + return RUN_ALL_TESTS(); |
| 129 | +} |
0 commit comments