Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 7939778

Browse files
Add a blockdiagperm example
This changeset introduces a simple example of multiplication by a block-diagonal matrix followed by a vector permutation. This is most easily expressed as TC expressions on 3-D tensors. In commented code, we sketch how we would likely want to write blockdiagperm to synthesize a single kernel; however, both sketched versions currently fail to emit a good single CUDA kernel: 1. blockdiagperm2d requires additional information to relax dependencies and allow fusion; 2. blockdiagperm2dinlined requires general LHS indexing. A third version, blockdiagperm2dfissioned_1/2, works around this by using two independent TCs; this works fine for now.
1 parent fc9bb4f commit 7939778

File tree

2 files changed

+130
-0
lines changed

2 files changed

+130
-0
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ set(GTEST_LIBS gtest gtest_main)
1515
################################################################################
1616
set(EXAMPLES_FILES
1717
tensordot
18+
blockdiagperm
1819
)
1920
foreach(i ${EXAMPLES_FILES})
2021
add_executable(${i} ${i}.cc)

examples/blockdiagperm.cc

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"
#include "tc/core/mapping_options.h"
32+
// --tuner_proto=<path>: file used to load and store the autotuner's
// protobuf cache of previously tuned mapping options (empty = no cache).
DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");
34+
35+
/**
 * Multiplication by a block-diagonal matrix followed by a vector
 * permutation, expressed as two fissioned TCs (see the TC comments below
 * for why a single fused kernel is not currently emitted). Each TC is
 * autotuned, compiled and run independently, timings are reported, and an
 * unchecked run is done last for nvprof-based profiling.
 */
TEST(BlockDiagPerm, SimpleAutotune) {
  // 1. Define and set up the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
  std::string tc = R"TC(
# The following TCs (blockdiagperm2d and blockdiagperm2dinlined) illustrate
# how we would likely want to write blockdiagperm to synthesize a single
# kernel. However both versions currently fail to emit a good single cuda kernel.
# 1. blockdiagperm2d requires additional information to relax dependencies and
# allow fusion
# 2. blockdiagperm2dinlined requires general LHS indexing
# A third version blockdiagperm2dfissioned_1/2 is a workaround by using 2
# independent TCs.

# This TC probably requires extra information to perform fusion which we do
# not know how to propagate at this point
# def blockdiagperm2d(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1, O2) {
#   O1(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
#   O2(b, k, nbyk) = O1(b, Idxr(k, nbyk), Idxc(k, nbyk))
# }

# This TC requires LHS indexing which is a WIP + extra information that all
# accesses are parallel (i.e. (IdxR, IdxC) form a permutation)
# def blockdiagperm2dinlined(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1) {
#   O1(b, IdxR(k, nbyk0), IdxC(k, nbyk0)) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
# }

# This is the poor man's way of making things work today with a reshape
# operation in between (in framework land).
def blockdiagperm2dfissioned_1(float(B, K, NBYK) I, float(K, NBYK, NBYK) W) -> (O)
{
  O(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
}
def blockdiagperm2dfissioned_2(float(B, N) I, int32(N) Idx) -> (O) {
  O(b, n) = I(b, Idx(n)) where n in 0:N
}
)TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate inputs, then autotune, compile and run the block-diagonal
  // multiplication TC (batch 128, 10 blocks of 50x50).
  at::Tensor I = at::CUDA(at::kFloat).rand({128, 10, 50});
  at::Tensor W = at::CUDA(at::kFloat).rand({10, 50, 50});
  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  auto bestOption = geneticAutotuneATen.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_1", {I, W}, options);
  auto handle = atCompl.compile(
      "blockdiagperm2dfissioned_1", {I, W}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration =
      atCompl.run("blockdiagperm2dfissioned_1", {I, W}, outputs, handle, true);

  // 3. Reshape the intermediate result (the "framework land" fission step),
  // then autotune, compile and run the permutation TC.
  at::Tensor O = outputs[0].clone().resize_({128, 500});
  at::Tensor Idx = at::CPU(at::kInt).randperm({500}).toBackend(at::kCUDA);
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen2(tc);
  // Bug fix: tune with geneticAutotuneATen2 — the original mistakenly called
  // geneticAutotuneATen here, leaving the second autotuner instance unused.
  auto bestOption2 = geneticAutotuneATen2.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_2", {O, Idx}, options);
  auto handle2 = atCompl.compile(
      "blockdiagperm2dfissioned_2", {O, Idx}, bestOption2.getValue());
  std::vector<at::Tensor> outputs2;
  auto duration2 = atCompl.run(
      "blockdiagperm2dfissioned_2", {O, Idx}, outputs2, handle2, true);

  // 4. Report best standalone kernel times.
  std::cout
      << "blockdiagperm2dfissioned_1 size I: " << I.sizes() << ", "
      << "size W: " << W.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";
  std::cout << "blockdiagperm2dfissioned_2 size O: " << O.sizes() << ", "
            << "size Idx: " << Idx.sizes() << " ran in: "
            << std::chrono::duration_cast<std::chrono::microseconds>(duration2)
                   .count()
            << "us\n";

  // 5. Run unchecked one last time, use with:
  //   nvprof --profile-from-start off executable --use_nvprof=1
  {
    tc::CudaProfiler cp;  // RAII scope delimiting the profiled region
    atCompl.uncheckedRun({I, W}, outputs, handle);
    atCompl.uncheckedRun({O, Idx}, outputs2, handle2);
  }
}
119+
120+
// From root, run with:
121+
// ./build/examples/blockdiagperm --tuner_threads=10 --tuner_gen_pop_size=10
122+
// --tuner_gen_generations=3 --tuner_gen_number_elites=4
123+
// --tuner_proto="/tmp/blockdiagperm"
124+
// Entry point. Initialization order matters: gtest first (consumes its own
// flags from argv), then gflags (parses the remaining flags such as
// --tuner_proto and the tuner flags shown above), then glog.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  return RUN_ALL_TESTS();
}

0 commit comments

Comments
 (0)