/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda_mapping_options.h"
#include "tc/core/flags.h"

#include "../test/test_harness_aten_cuda.h"

DEFINE_uint32(number_elites, 2, "Number of elites per generation");
DEFINE_uint32(generations, 3, "Number of generations to tune for");
DEFINE_uint32(pop_size, 10, "Population size to tune for");
DEFINE_uint32(threads, 16, "Number of threads to tune with");
DEFINE_string(gpus, "0", "List of gpus to evaluate on");

TEST(TensorDot, SimpleAutotune) {
  // 1. Define and setup the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
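  // In the TC below, r_c2 only appears on the right-hand side, so it is a
  // reduction index; "+=!" initializes O to zero and then sums over r_c2.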
  std::string tc = R"TC(
def tensordot(float(N, C1, C2, H, W) I0,
              float(N, C2, C3, H, W) I1) -> (O)
{
    O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
}
  )TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors with random data.
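  // Sizes follow the TC signature, I0 = N x C1 x C2 x H x W and
  // I1 = N x C2 x C3 x H x W, i.e. N=16, C1=8, C2=16, C3=2, H=17, W=25.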
  at::Tensor I0 = at::CUDA(at::kFloat).rand({16, 8, 16, 17, 25});
  at::Tensor I1 = at::CUDA(at::kFloat).rand({16, 16, 2, 17, 25});

  // 3. Run autotuning with evolutionary search starting from a naive option.
  auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  auto bestOption = geneticAutotuneATen.tune(
      "/tmp/save_results", "tensordot", {I0, I1}, naiveOptions);
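  // tune() returns the best mapping options it found as an optional value;
  // getValue() below assumes the search succeeded. "/tmp/save_results" is
  // where the tuner is asked to save its results.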

  // 4. Compile and run the TC with the best option.
  // Outputs get allocated; they could also be pre-allocated and passed in.
  auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
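  // The trailing "true" presumably asks run() to profile the generated kernel
  // so that the duration returned below is meaningful.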
  auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
  std::cout
      << "tensordot size I0: " << I0.sizes() << ", "
      << "size I1: " << I1.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";

  // 5. Optionally, perform precision checks against a ref. implementation.
  // TODO.
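  // One possible check, sketched under two assumptions: that this ATen build's
  // at::matmul batches over the leading N, H, W dimensions, and that the test
  // harness included above provides checkRtol(diff, inputs, reductions, prec).
  {
    auto refO = at::matmul(
                    I0.permute({0, 3, 4, 1, 2}), I1.permute({0, 3, 4, 1, 2}))
                    .permute({0, 3, 4, 1, 2});
    // Each output element accumulates C2 = 16 products.
    checkRtol(outputs[0].sub(refO), {I0, I1}, 16, 1e-6);
  }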

  // 6. Reuse bestOption from the autotuning to compile and run the same TC
  // on other input sizes.
  for (auto sizes : std::vector<std::pair<at::IntList, at::IntList>>{
           {{4, 9, 7, 16, 14}, {4, 7, 3, 16, 14}},
           {{8, 5, 11, 10, 10}, {8, 11, 16, 10, 10}},
       }) {
    at::Tensor I0 = at::CUDA(at::kFloat).rand(sizes.first);
    at::Tensor I1 = at::CUDA(at::kFloat).rand(sizes.second);
    auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
    std::vector<at::Tensor> outputs;
    auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
    std::cout << "tensordot size I0: " << I0.sizes() << ", "
              << "size I1: " << I1.sizes() << " ran in: "
              << std::chrono::duration_cast<std::chrono::microseconds>(duration)
                     .count()
              << "us\n";
  }
}

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
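  // Forward this example's command-line flags to the corresponding autotuner
  // flags before the tests run.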
  tc::FLAGS_tuner_gen_number_elites = FLAGS_number_elites;
  tc::FLAGS_tuner_gen_generations = FLAGS_generations;
  tc::FLAGS_tuner_gen_pop_size = FLAGS_pop_size;
  tc::FLAGS_tuner_threads = FLAGS_threads;
  tc::FLAGS_tuner_gpus = FLAGS_gpus;
  return RUN_ALL_TESTS();
}