This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit e7cd425

Merge pull request #228 from nicolasvasilache/pr/example_blockdiagperm
Add a blockdiagperm example for @pierrestock
2 parents 1687f6f + 5194ea4 commit e7cd425

6 files changed: +161 -34 lines changed


examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ set(GTEST_LIBS gtest gtest_main)
 ################################################################################
 set(EXAMPLES_FILES
   tensordot
+  blockdiagperm
 )
 foreach(i ${EXAMPLES_FILES})
   add_executable(${i} ${i}.cc)

examples/blockdiagperm.cc

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include <ATen/ATen.h>

#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_autotuner_aten.h"
#include "tc/core/cuda/cuda.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"
#include "tc/core/mapping_options.h"

DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");

TEST(BlockDiagPerm, SimpleAutotune) {
  // 1. Define and setup the TC compilation unit with CUDA memory
  // management backed by ATen tensors.
  std::string tc = R"TC(
# The following TCs (blockdiagperm2d and blockdiagperm2dinlined) illustrate
# how we would likely want to write blockdiagperm to synthesize a single
# kernel. However, both versions currently fail to emit a good single CUDA
# kernel:
# 1. blockdiagperm2d requires additional information to relax dependencies
#    and allow fusion;
# 2. blockdiagperm2dinlined requires general LHS indexing.
# A third version, blockdiagperm2dfissioned_1/2, works around this by using
# 2 independent TCs.

# This TC probably requires extra information to perform fusion, which we do
# not know how to propagate at this point:
# def blockdiagperm2d(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1, O2) {
#   O1(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
#   O2(b, k, nbyk) = O1(b, IdxR(k, nbyk), IdxC(k, nbyk))
# }

# This TC requires LHS indexing, which is a WIP, plus extra information that
# all accesses are parallel (i.e. (IdxR, IdxC) form a permutation):
# def blockdiagperm2dinlined(float(B, K, NBYK) I, float(K, NBYK, NBYK) W, float(K, NBYK) IdxR, float(K, NBYK) IdxC)
# -> (O1) {
#   O1(b, IdxR(k, nbyk0), IdxC(k, nbyk0)) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
# }

# This is the poor man's way of making things work today, with a reshape
# operation in between (in framework land).
def blockdiagperm2dfissioned_1(float(B, K, NBYK) I, float(K, NBYK, NBYK) W) -> (O)
{
  O(b, k, nbyk1) +=! I(b, k, r_nbyk0) * W(k, r_nbyk0, nbyk1)
}
def blockdiagperm2dfissioned_2(float(B, N) I, int32(N) Idx) -> (O) {
  O(b, n) = I(b, Idx(n)) where n in 0:N
}
)TC";
  tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
  atCompl.define(tc);

  // 2. Allocate tensors and autotune the first TC.
  at::Tensor I = at::CUDA(at::kFloat).rand({128, 10, 50});
  at::Tensor W = at::CUDA(at::kFloat).rand({10, 50, 50});
  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
  auto bestOption = geneticAutotuneATen.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_1", {I, W}, options);
  auto handle = atCompl.compile(
      "blockdiagperm2dfissioned_1", {I, W}, bestOption.getValue());
  std::vector<at::Tensor> outputs;
  auto duration =
      atCompl.run("blockdiagperm2dfissioned_1", {I, W}, outputs, handle, true);

  // 3. Reshape the first output, then allocate and autotune the second TC.
  at::Tensor O = outputs[0].clone().resize_({128, 500});
  at::Tensor Idx = at::CPU(at::kInt).randperm({500}).toBackend(at::kCUDA);
  tc::autotune::GeneticAutotunerATen geneticAutotuneATen2(tc);
  auto bestOption2 = geneticAutotuneATen2.tune(
      FLAGS_tuner_proto, "blockdiagperm2dfissioned_2", {O, Idx}, options);
  auto handle2 = atCompl.compile(
      "blockdiagperm2dfissioned_2", {O, Idx}, bestOption2.getValue());
  std::vector<at::Tensor> outputs2;
  auto duration2 = atCompl.run(
      "blockdiagperm2dfissioned_2", {O, Idx}, outputs2, handle2, true);

  // 4. Report best standalone times.
  std::cout
      << "blockdiagperm2dfissioned_1 size I: " << I.sizes() << ", "
      << "size W: " << W.sizes() << " ran in: "
      << std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
      << "us\n";
  std::cout << "blockdiagperm2dfissioned_2 size O: " << O.sizes() << ", "
            << "size Idx: " << Idx.sizes() << " ran in: "
            << std::chrono::duration_cast<std::chrono::microseconds>(duration2)
                   .count()
            << "us\n";

  // 5. Run unchecked one last time; use with:
  //   nvprof --profile-from-start off executable --use_nvprof=1
  {
    tc::CudaProfiler cp;
    atCompl.uncheckedRun({I, W}, outputs, handle);
    atCompl.uncheckedRun({O, Idx}, outputs2, handle2);
  }
}

// From root, run with:
// ./build/examples/blockdiagperm --tuner_threads=10 --tuner_gen_pop_size=10
// --tuner_gen_generations=3 --tuner_gen_number_elites=4
// --tuner_proto="/tmp/blockdiagperm"
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  return RUN_ALL_TESTS();
}
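
To make the semantics of the two fissioned TCs concrete, the following plain C++ loops compute the same result on row-major buffers. This is a minimal reference sketch only: the function names, the std::vector layout, and the flattening to N = K * NBYK mirror the shapes used in the test and are not part of any TC or ATen API.

#include <cstddef>
#include <vector>

// Step 1 (blockdiagperm2dfissioned_1): per-block matmul,
// O(b, k, n1) = sum_r I(b, k, r) * W(k, r, n1).
std::vector<float> blockdiagMatmulRef(
    const std::vector<float>& I, // B x K x NBYK, row-major
    const std::vector<float>& W, // K x NBYK x NBYK, row-major
    std::size_t B, std::size_t K, std::size_t NBYK) {
  std::vector<float> O(B * K * NBYK, 0.0f);
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t k = 0; k < K; ++k)
      for (std::size_t n1 = 0; n1 < NBYK; ++n1)
        for (std::size_t r = 0; r < NBYK; ++r)
          O[(b * K + k) * NBYK + n1] +=
              I[(b * K + k) * NBYK + r] * W[(k * NBYK + r) * NBYK + n1];
  return O;
}

// Step 2 (blockdiagperm2dfissioned_2): after flattening O to B x N with
// N = K * NBYK (the "reshape in framework land"), permute columns,
// O2(b, n) = O(b, Idx(n)).
std::vector<float> permuteColumnsRef(
    const std::vector<float>& O, // B x N, row-major
    const std::vector<int>& Idx, // permutation of 0..N-1
    std::size_t B, std::size_t N) {
  std::vector<float> O2(B * N);
  for (std::size_t b = 0; b < B; ++b)
    for (std::size_t n = 0; n < N; ++n)
      O2[b * N + n] = O[b * N + Idx[n]];
  return O2;
}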

examples/tensordot.cc

Lines changed: 6 additions & 11 deletions
@@ -30,11 +30,7 @@
 
 #include "../test/test_harness_aten_cuda.h"
 
-DEFINE_uint32(number_elites, 2, "Number of elites per generation");
-DEFINE_uint32(generations, 3, "Number of generations to tune for");
-DEFINE_uint32(pop_size, 10, "Population size to tune for");
-DEFINE_uint32(threads, 16, "Number of threads to tune with");
-DEFINE_string(gpus, "0", "List of gpus to evaluate on");
+DEFINE_string(tuner_proto, "", "Filename to load and store proto cache ");
 
 TEST(TensorDot, SimpleAutotune) {
   // 1. Define and setup the TC compilation unit with CUDA memory

@@ -57,7 +53,7 @@ def tensordot(float(N, C1, C2, H, W) I0,
   auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
   tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
   auto bestOption = geneticAutotuneATen.tune(
-      "/tmp/save_results", "tensordot", {I0, I1}, naiveOptions);
+      FLAGS_tuner_proto, "tensordot", {I0, I1}, naiveOptions);
 
   // 4. Compile and run the TC with the best option.
   // Outputs get allocated; could also be pre-allocated and passed.

@@ -91,15 +87,14 @@ def tensordot(float(N, C1, C2, H, W) I0,
   }
 }
 
+// From root, run with:
+// ./build/examples/tensordot --tuner_threads=10 --tuner_gen_pop_size=10
+// --tuner_gen_generations=3 --tuner_gen_number_elites=4
+// --tuner_proto="/tmp/tensordot"
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);
   ::google::InitGoogleLogging(argv[0]);
   setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
-  tc::FLAGS_tuner_gen_number_elites = FLAGS_number_elites;
-  tc::FLAGS_tuner_gen_generations = FLAGS_generations;
-  tc::FLAGS_tuner_gen_pop_size = FLAGS_pop_size;
-  tc::FLAGS_tuner_threads = FLAGS_threads;
-  tc::FLAGS_tuner_gpus = FLAGS_gpus;
   return RUN_ALL_TESTS();
 }

include/tc/core/cuda/cuda.h

Lines changed: 16 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include <stdexcept>
 
 #include <cuda.h>
+#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
 #include <glog/logging.h>

@@ -70,6 +71,8 @@
 
 namespace tc {
 
+DECLARE_bool(use_nvprof);
+
 struct WithDevice {
   WithDevice(size_t g) : newGpu(g) {
     int dev;

@@ -111,4 +114,17 @@ class CudaGPUInfo {
   std::vector<size_t> sharedMemSizes_;
 };
 
+struct CudaProfiler {
+  CudaProfiler() {
+    if (FLAGS_use_nvprof) {
+      cudaProfilerStart();
+    }
+  }
+  ~CudaProfiler() {
+    if (FLAGS_use_nvprof) {
+      cudaProfilerStop();
+    }
+  }
+};
+
 } // namespace tc
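
The new tc::CudaProfiler is an RAII guard: its constructor calls cudaProfilerStart() and its destructor calls cudaProfilerStop(), both only when the new --use_nvprof flag is set. A minimal usage sketch follows; runKernels and profiledSection are hypothetical placeholders, and only tc::CudaProfiler and the flag come from this change.

#include "tc/core/cuda/cuda.h"

// Hypothetical workload: any CUDA launches issued here are what nvprof
// captures when the binary runs under `nvprof --profile-from-start off`
// with --use_nvprof=1.
void runKernels() {
  // ... launch kernels or run a compiled TC handle ...
}

void profiledSection() {
  tc::CudaProfiler cp; // cudaProfilerStart() only if FLAGS_use_nvprof is set
  runKernels();
} // cp destroyed here: cudaProfilerStop() only if FLAGS_use_nvprof is set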

src/core/cuda/cuda.cc

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@
 #include "tc/core/flags.h"
 
 namespace tc {
+DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
+
 namespace {
 
 std::tuple<std::vector<std::string>, std::vector<size_t>> init() {

test/test_harness.h

Lines changed: 7 additions & 23 deletions
@@ -15,7 +15,6 @@
 */
 #pragma once
 
-#include <cuda_profiler_api.h>
 #include <gtest/gtest.h>
 #include <mutex>
 #include <string>

@@ -32,23 +31,8 @@
 #include "tc/c2/tc_op.h"
 #include "tc/core/cuda/cuda.h"
 
-DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
-
 namespace caffe2 {
 
-struct CudaProfiler {
-  CudaProfiler() {
-    if (FLAGS_use_nvprof) {
-      cudaProfilerStart();
-    }
-  }
-  ~CudaProfiler() {
-    if (FLAGS_use_nvprof) {
-      cudaProfilerStop();
-    }
-  }
-};
-
 caffe2::TensorCPU context2tensor(caffe2::CPUContext& ctx) {
   return caffe2::TensorCPU();
 }

@@ -315,7 +299,7 @@ struct TestHarness {
 
   void RunReference() {
     ASSERT_TRUE(net_ref.get());
-    CudaProfiler p;
+    tc::CudaProfiler p;
     ASSERT_TRUE(net_ref->Run());
   }
 

@@ -326,7 +310,7 @@
 
   void Run() {
     ASSERT_TRUE(op_test.get());
-    CudaProfiler p;
+    tc::CudaProfiler p;
     ASSERT_TRUE(op_test->Run());
   }
 

@@ -406,7 +390,7 @@ struct TestHarness {
     unique_ptr<OperatorBase> op_g(CreateOperator(g_op, &w));
     ASSERT_TRUE(op_g.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(op_g->Run());
     }
   }

@@ -424,7 +408,7 @@ struct TestHarness {
     unique_ptr<NetBase> ref_net(CreateNet(ref_net_def, &w1));
     ASSERT_TRUE(ref_net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(ref_net->Run());
     }
 

@@ -433,7 +417,7 @@ struct TestHarness {
     unique_ptr<NetBase> net(CreateNet(net_def, &w2));
     ASSERT_TRUE(net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(net->Run());
     }
 

@@ -467,7 +451,7 @@ struct TestHarness {
     unique_ptr<NetBase> net(CreateNet(net_def, &w1));
     ASSERT_TRUE(net.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(net->Run());
     }
     RunGradient(w1, *net_def.mutable_op()->Mutable(0));

@@ -477,7 +461,7 @@ struct TestHarness {
     unique_ptr<OperatorBase> op(CreateOperator(op_def, &w2));
     ASSERT_TRUE(op.get());
     {
-      CudaProfiler p;
+      tc::CudaProfiler p;
       ASSERT_TRUE(op->Run());
     }
     OperatorDef def = op_def;
