This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit d1f7c2b

Split MappingOptions into a base MappingOptions and a CudaMappingOptions, then follow the rabbit trail.
The main modification in this changeset is to split mapping_options.proto into a MappingOptionsProto, which contains the generic options, and a CudaMappingOptionsProto, which contains the CUDA-specific options. This unfortunately turned out to be significantly more complex than originally hoped. After some digging I realized that proto-owning classes which derive from view classes pointing into those protos make it extremely easy to shoot oneself in the foot and read uninitialized memory once a hierarchy of options is involved. As a consequence, the biggest change is really about making an owning proto class **have a** view rather than **be a** view. While that change may make sense independently, I don't think it is worth investing the time to split it out just for the sake of a smaller incremental changeset.

The rest of the commit is mostly renaming and separating the CUDA-specific options from the generic ones. The model used here is that proto-owning classes expose a view, which one must go through to call the transformation functions, while factory functions belong to the owning proto class. This is not yet implemented for the leaf proto-owning classes (CudaMappingOptions and CpuMappingOptions) because there is no immediate need; we may still want it for consistency.

An additional change people may be interested in is an implicit conversion from an options class to its view class, so that we can avoid writing `options.genericMappingOptionsView` and `options.view` to access the underlying functions.
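As an illustration, here is a minimal, self-contained sketch of the **has-a** view layout and the implicit conversion described above. All names and fields are simplified stand-ins (the protos, `tileSize`, and `blockSize` are placeholders); the real classes carry many more options:

```cpp
#include <cstdint>

// Hypothetical, stripped-down stand-ins for the generated proto messages.
struct MappingOptionsProto {
  uint64_t tileSize = 1; // placeholder for the generic options
};
struct CudaMappingOptionsProto {
  MappingOptionsProto genericMappingOptions; // embedded generic part
  uint64_t blockSize = 32; // placeholder for the CUDA-specific options
};

// A view does not own a proto: it points into one owned elsewhere and
// exposes the generic transformation functions.
class MappingOptionsView {
 public:
  explicit MappingOptionsView(MappingOptionsProto& p) : proto(p) {}
  MappingOptionsView& tile(uint64_t t) {
    proto.tileSize = t;
    return *this;
  }
  MappingOptionsProto& proto;
};

// The owning class *has* a view into the proto it owns. Members are
// initialized in declaration order, so ownedProto is fully constructed
// before view points into it. Under an "is-a view" design, the view base
// class would be constructed before the derived class's proto member and
// could observe uninitialized memory.
class CudaMappingOptions {
 public:
  CudaMappingOptions() : view(ownedProto.genericMappingOptions) {}
  // Copies must re-point the view at the copy's own proto.
  CudaMappingOptions(const CudaMappingOptions& other)
      : ownedProto(other.ownedProto),
        view(ownedProto.genericMappingOptions) {}
  // Factory functions belong to the owning class.
  static CudaMappingOptions makeNaiveCudaMappingOptions() {
    return CudaMappingOptions();
  }
  // Implicit conversion to the view, so callers need not spell out
  // options.view to reach the generic transformation functions.
  operator MappingOptionsView&() {
    return view;
  }

  CudaMappingOptionsProto ownedProto;
  MappingOptionsView view;
};

int main() {
  auto options = CudaMappingOptions::makeNaiveCudaMappingOptions();
  MappingOptionsView& generic = options; // implicit conversion, no .view
  generic.tile(32);
  return 0;
}
```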
1 parent: 3d55faa · commit: d1f7c2b

80 files changed: +1938 additions, −1187 deletions


docs/source/mapping_options.rst

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ C++
 
 .. code-block:: c++
 
-    #include <tc/core/mapping_options.h>
+    #include <tc/core/cuda/cuda_mapping_options.h>
 
     auto options = MappingOptions::makeNaiveMappingOptions()
         .mapToBlocks(100, 20)
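For reference, the CUDA counterpart of the documented snippet uses the same builder pattern through the renamed factory function; the sizes below are illustrative, taken from the updated examples:

```cpp
#include "tc/core/cuda/cuda_mapping_options.h"

// Illustrative sizes; any tile/thread/block configuration chains the same way.
auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                   .tile({1, 32})
                   .mapToThreads({1, 32})
                   .mapToBlocks({128, 128});
```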

examples/example_MLP_model.cc

Lines changed: 26 additions & 26 deletions
@@ -24,7 +24,7 @@
 #include <ATen/ATen.h>
 
 #include "tc/aten/aten_compiler.h"
-#include "tc/core/mapping_options.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
@@ -109,7 +109,7 @@ class ProductionModel : public Benchmark {
       uint32_t D,
       uint32_t L1,
       uint32_t E1,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void run2LUT(
       uint32_t B,
@@ -118,27 +118,27 @@ class ProductionModel : public Benchmark {
       uint32_t L2,
       uint32_t E1,
       uint32_t E2,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runC3(
       uint32_t B,
       uint32_t WX,
       uint32_t WY,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runMLP1(
       uint32_t B,
       uint32_t N,
       uint32_t M,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runMLP3(
       uint32_t B,
       uint32_t N,
       uint32_t O,
       uint32_t P,
       uint32_t Q,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
 };
 
@@ -147,7 +147,7 @@ void ProductionModel::run1LUT(
     uint32_t D,
     uint32_t L1,
     uint32_t E1,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   CHECK_LT(0, E1);
 
@@ -232,7 +232,7 @@ void ProductionModel::run2LUT(
    uint32_t L2,
    uint32_t E1,
    uint32_t E2,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   CHECK_LT(0, E1);
   CHECK_LT(0, E2);
@@ -335,7 +335,7 @@ void ProductionModel::runC3(
    uint32_t B,
    uint32_t WX,
    uint32_t WY,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, WX});
   at::Tensor W = at::CUDA(at::kFloat).rand({WY, WX});
@@ -389,7 +389,7 @@ void ProductionModel::runMLP1(
    uint32_t B,
    uint32_t N,
    uint32_t M,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, M});
   at::Tensor W1 = at::CUDA(at::kFloat).rand({M, N});
@@ -448,7 +448,7 @@ void ProductionModel::runMLP3(
    uint32_t O,
    uint32_t P,
    uint32_t Q,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, N});
   at::Tensor W2 = at::CUDA(at::kFloat).rand({O, N});
@@ -520,7 +520,7 @@ TEST_F(ProductionModel, 1LUT) {
   auto D = FLAGS_D;
   auto L1 = FLAGS_L1;
   auto E1 = FLAGS_E1;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .tile({1, 32})
                      .mapToThreads({1, 32})
                      .mapToBlocks({128, 128})
@@ -534,7 +534,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_128_D_64_L1_50_E1_10000000) {
   uint32_t L1 = 50;
   uint32_t E1 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(true)
          .tile(1)
@@ -551,7 +551,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_16_D_64_L1_50_E1_10000000) {
   uint32_t L1 = 50;
   uint32_t E1 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 32)
@@ -597,7 +597,7 @@ TEST_F(ProductionModel, 2LUT) {
   auto L2 = FLAGS_L2;
   auto E1 = FLAGS_E1;
   auto E2 = FLAGS_E2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .tile({1, 32})
                      .mapToThreads({1, 32})
                      .mapToBlocks({128, 128})
@@ -615,7 +615,7 @@ TEST_F(
   uint32_t L2 = 50;
   uint32_t E2 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 256, 1250000)
@@ -636,7 +636,7 @@ TEST_F(
   uint32_t L2 = 50;
   uint32_t E2 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 64)
@@ -686,7 +686,7 @@ TEST_F(ProductionModel, C3) {
   auto B = FLAGS_B;
   auto WX = FLAGS_WX;
   auto WY = FLAGS_WY;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
                      .tile({32, 32, 32})
                      .mapToThreads({4, 32})
@@ -702,7 +702,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_128_WX_1000_WY_1024) {
   uint32_t B = 128;
   uint32_t WX = 1000;
   uint32_t WY = 1024;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -725,7 +725,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_16_WX_1000_WY_1024) {
   uint32_t B = 16;
   uint32_t WX = 1000;
   uint32_t WY = 1024;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -781,7 +781,7 @@ TEST_F(ProductionModel, MLP1) {
   auto B = FLAGS_B;
   auto N = FLAGS_N;
   auto M = FLAGS_M;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
                      .tile({16, 16, 128})
                      .mapToThreads({16, 16})
@@ -797,7 +797,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_128_M_2000_N_128) {
   uint32_t M = 2000;
   uint32_t N = 128;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -821,7 +821,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_16_M_2000_N_128) {
   uint32_t M = 2000;
   uint32_t N = 128;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -880,7 +880,7 @@ TEST_F(ProductionModel, MLP3) {
   auto O = FLAGS_O;
   auto P = FLAGS_P;
   auto Q = FLAGS_Q;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
                      .tile({16, 16, 128})
                      .mapToThreads({16, 16})
@@ -897,7 +897,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_128_N_128_O_64_P_32_Q_2) {
   auto O = 64;
   auto P = 32;
   auto Q = 2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -923,7 +923,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_16_M_2000_N_128_Q_2) {
   auto O = 64;
   auto P = 32;
   auto Q = 2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)

examples/example_batchmatmul.cc

Lines changed: 5 additions & 5 deletions
@@ -24,7 +24,7 @@
 #include <ATen/ATen.h>
 
 #include "tc/aten/aten_compiler.h"
-#include "tc/core/mapping_options.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
@@ -48,7 +48,7 @@ class BatchMatMul : public Benchmark {
       uint32_t N,
       uint32_t M,
       uint32_t K,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
 };
 
@@ -57,7 +57,7 @@ void BatchMatMul::runBatchMatMul(
     uint32_t N,
     uint32_t M,
     uint32_t K,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
@@ -116,7 +116,7 @@ TEST_F(BatchMatMul, TransposedBatchMatMul) {
   auto N = FLAGS_N;
   auto M = FLAGS_M;
   auto K = FLAGS_K;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .tile({1})
                      .mapToThreads({128})
                      .mapToBlocks({B})
@@ -131,7 +131,7 @@ TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
   uint32_t K = 26;
   uint32_t M = 72;
   uint32_t N = 26;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)

examples/example_fixture.h

Lines changed: 5 additions & 5 deletions
@@ -31,10 +31,10 @@
 #include "tc/autotuner/utils/utils.h"
 #include "tc/core/cuda/cuda.h"
 #include "tc/core/cuda/cuda_compilation_cache.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 #include "tc/core/cuda/cuda_rtc.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/flags.h"
-#include "tc/core/mapping_options.h"
 #include "tc/core/scope_guard.h"
 
 #include <cublas_v2.h> // Must be the same as Caffe2
@@ -69,7 +69,7 @@ std::vector<const DLTensor*> inferOutputTensorInfo(
   return atCompl.inferOutputTensorInfo(name, inputs);
 }
 
-tc::MappingOptions loadOptionsFromProto(
+tc::CudaMappingOptions loadOptionsFromProto(
     const std::string cacheFilename,
     const std::string& name,
     const std::vector<at::Tensor>& inputs,
@@ -127,7 +127,7 @@ struct Benchmark : public ::testing::Test {
   void Check(
       const std::string& tc,
       const std::string& name,
-      const tc::MappingOptions& mappingOptions,
+      const tc::CudaMappingOptions& mappingOptions,
       const std::vector<at::Tensor>& inputs,
       std::vector<at::Tensor>& outputs,
       CheckFunction checkFun = [](const std::vector<at::Tensor>& inputs,
@@ -379,8 +379,8 @@ struct Benchmark : public ::testing::Test {
       std::string TC,
       std::string kernelName,
       std::vector<at::Tensor> inputs,
-      tc::MappingOptions baseMapping,
-      std::vector<tc::MappingOptions> startingPoints,
+      tc::CudaMappingOptions baseMapping,
+      std::vector<tc::CudaMappingOptions> startingPoints,
       CheckFunction checkFun =
           [](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
             return true;
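Call sites only change in the type of the options argument. A hedged sketch of invoking the updated harness, where the TC string, kernel name, and tensor shape are placeholders and Check's defaulted arguments are omitted:

```cpp
// Inside a Benchmark fixture; tcString and "kernel_name" stand in for a
// real TC definition and its entry point, as in the existing benchmarks.
TEST_F(BatchMatMul, Example) {
  std::vector<at::Tensor> outputs;
  Check(
      tcString, // placeholder TC definition string
      "kernel_name", // placeholder entry point name
      tc::CudaMappingOptions::makeNaiveCudaMappingOptions(),
      {at::CUDA(at::kFloat).rand({128, 256})}, // placeholder input shape
      outputs);
}
```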

examples/example_group_convolution.cc

Lines changed: 8 additions & 8 deletions
@@ -24,7 +24,7 @@
 #include <ATen/ATen.h>
 
 #include "tc/aten/aten_compiler.h"
-#include "tc/core/mapping_options.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
@@ -56,7 +56,7 @@ class GroupConvolution : public Benchmark {
       uint32_t W,
       uint32_t KH,
       uint32_t KW,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
 };
 
@@ -69,7 +69,7 @@ void GroupConvolution::runGroupConvolution(
     uint32_t W,
     uint32_t KH,
     uint32_t KW,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
    bool useFlags) {
   Workspace w;
   auto AddInput =
@@ -176,7 +176,7 @@ TEST_F(GroupConvolution, GroupConvolution) {
   // If num threads is too small just get some better default
   auto threads = (W >= 10) ? std::vector<size_t>{W / 4, H / 2}
                            : std::vector<size_t>{4, 8, 4};
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .tile({1, 1, 1})
                      .mapToThreads(threads)
                      .mapToBlocks({32, 32})
@@ -199,7 +199,7 @@ TEST_F(
   uint32_t KW = 3;
   uint32_t KH = 3;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .useSharedMemory(true)
          .usePrivateMemory(true)
          .unrollCopyShared(true)
@@ -225,7 +225,7 @@ TEST_F(
   uint32_t KW = 3;
   uint32_t KH = 3;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -257,7 +257,7 @@ TEST_F(
   uint32_t KW = 3;
   uint32_t KH = 3;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -288,7 +288,7 @@ TEST_F(
   uint32_t H = 28;
   uint32_t KW = 3;
   uint32_t KH = 3;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
