This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit f2d7743
Merge pull request #197 from nicolasvasilache/pr/decouple-cuda: Decouple cuda
2 parents fa5746e + 9d48f0a

94 files changed: +2108 −1395 lines


docs/source/mapping_options.rst

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ C++
 
 .. code-block:: c++
 
-    #include <tc/core/mapping_options.h>
+    #include <tc/core/cuda/cuda_mapping_options.h>
 
     auto options = MappingOptions::makeNaiveMappingOptions()
         .mapToBlocks(100, 20)
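This one-line docs change captures the user-facing half of the decoupling: include the CUDA-specific header instead of the generic one. (Note that the factory call on the context line still carries the old name in the docs; only the include was touched here.) For orientation, a minimal sketch of the decoupled API assembled only from calls that appear elsewhere in this commit, with the surrounding compiler/executor setup assumed:

    #include <tc/core/cuda/cuda_mapping_options.h>

    // Start from naive options, then make the CUDA mapping explicit.
    // The second API change visible throughout this diff: tile() now
    // takes its sizes variadically, tile(1, 32), where the old API
    // took an initializer list, tile({1, 32}).
    auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                       .tile(1, 32)
                       .mapToThreads({1, 32})
                       .mapToBlocks({128, 128})
                       .unroll(256);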

examples/example_MLP_model.cc

Lines changed: 31 additions & 31 deletions

@@ -24,7 +24,7 @@
 #include <ATen/ATen.h>
 
 #include "tc/aten/aten_compiler.h"
-#include "tc/core/mapping_options.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
@@ -109,7 +109,7 @@ class ProductionModel : public Benchmark {
       uint32_t D,
       uint32_t L1,
       uint32_t E1,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void run2LUT(
       uint32_t B,
@@ -118,27 +118,27 @@ class ProductionModel : public Benchmark {
       uint32_t L2,
       uint32_t E1,
       uint32_t E2,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runC3(
       uint32_t B,
       uint32_t WX,
       uint32_t WY,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runMLP1(
       uint32_t B,
       uint32_t N,
       uint32_t M,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
   void runMLP3(
       uint32_t B,
       uint32_t N,
       uint32_t O,
       uint32_t P,
       uint32_t Q,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
 };
 
@@ -147,7 +147,7 @@ void ProductionModel::run1LUT(
     uint32_t D,
     uint32_t L1,
     uint32_t E1,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   CHECK_LT(0, E1);
 
@@ -232,7 +232,7 @@ void ProductionModel::run2LUT(
     uint32_t L2,
     uint32_t E1,
     uint32_t E2,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   CHECK_LT(0, E1);
   CHECK_LT(0, E2);
@@ -335,7 +335,7 @@ void ProductionModel::runC3(
     uint32_t B,
     uint32_t WX,
     uint32_t WY,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, WX});
   at::Tensor W = at::CUDA(at::kFloat).rand({WY, WX});
@@ -389,7 +389,7 @@ void ProductionModel::runMLP1(
     uint32_t B,
     uint32_t N,
     uint32_t M,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, M});
   at::Tensor W1 = at::CUDA(at::kFloat).rand({M, N});
@@ -448,7 +448,7 @@ void ProductionModel::runMLP3(
     uint32_t O,
     uint32_t P,
     uint32_t Q,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   at::Tensor I = at::CUDA(at::kFloat).rand({B, N});
   at::Tensor W2 = at::CUDA(at::kFloat).rand({O, N});
@@ -520,8 +520,8 @@ TEST_F(ProductionModel, 1LUT) {
   auto D = FLAGS_D;
   auto L1 = FLAGS_L1;
   auto E1 = FLAGS_E1;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
-                     .tile({1, 32})
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
+                     .tile(1, 32)
                      .mapToThreads({1, 32})
                      .mapToBlocks({128, 128})
                      .unroll(256);
@@ -534,7 +534,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_128_D_64_L1_50_E1_10000000) {
   uint32_t L1 = 50;
   uint32_t E1 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
           .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
           .fixParametersBeforeScheduling(true)
          .tile(1)
@@ -551,7 +551,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_16_D_64_L1_50_E1_10000000) {
   uint32_t L1 = 50;
   uint32_t E1 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 32)
@@ -597,8 +597,8 @@ TEST_F(ProductionModel, 2LUT) {
   auto L2 = FLAGS_L2;
   auto E1 = FLAGS_E1;
   auto E2 = FLAGS_E2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
-                     .tile({1, 32})
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
+                     .tile(1, 32)
                      .mapToThreads({1, 32})
                      .mapToBlocks({128, 128})
                      .unroll(256);
@@ -615,7 +615,7 @@ TEST_F(
   uint32_t L2 = 50;
   uint32_t E2 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 256, 1250000)
@@ -636,7 +636,7 @@ TEST_F(
   uint32_t L2 = 50;
   uint32_t E2 = 10000000;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .fixParametersBeforeScheduling(false)
          .tile(1, 64)
@@ -686,9 +686,9 @@ TEST_F(ProductionModel, C3) {
   auto B = FLAGS_B;
   auto WX = FLAGS_WX;
   auto WY = FLAGS_WY;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
-                     .tile({32, 32, 32})
+                     .tile(32, 32, 32)
                      .mapToThreads({4, 32})
                      .mapToBlocks({128, 128})
                      .useSharedMemory(true)
@@ -702,7 +702,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_128_WX_1000_WY_1024) {
   uint32_t B = 128;
   uint32_t WX = 1000;
   uint32_t WY = 1024;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -725,7 +725,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_16_WX_1000_WY_1024) {
   uint32_t B = 16;
   uint32_t WX = 1000;
   uint32_t WY = 1024;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -781,9 +781,9 @@ TEST_F(ProductionModel, MLP1) {
   auto B = FLAGS_B;
   auto N = FLAGS_N;
   auto M = FLAGS_M;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
-                     .tile({16, 16, 128})
+                     .tile(16, 16, 128)
                      .mapToThreads({16, 16})
                      .mapToBlocks({32, 32})
                      .useSharedMemory(true)
@@ -797,7 +797,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_128_M_2000_N_128) {
   uint32_t M = 2000;
   uint32_t N = 128;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -821,7 +821,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_16_M_2000_N_128) {
   uint32_t M = 2000;
   uint32_t N = 128;
   auto options =
-      tc::MappingOptions::makeNaiveMappingOptions()
+      tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
          .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
          .outerScheduleAllowSkewing(false)
          .outerSchedulePositiveOrthant(true)
@@ -880,9 +880,9 @@ TEST_F(ProductionModel, MLP3) {
   auto O = FLAGS_O;
   auto P = FLAGS_P;
   auto Q = FLAGS_Q;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .fixParametersBeforeScheduling(true)
-                     .tile({16, 16, 128})
+                     .tile(16, 16, 128)
                      .mapToThreads({16, 16})
                      .mapToBlocks({32, 32})
                      .useSharedMemory(true)
@@ -897,7 +897,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_128_N_128_O_64_P_32_Q_2) {
   auto O = 64;
   auto P = 32;
   auto Q = 2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
@@ -923,7 +923,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_16_M_2000_N_128_Q_2) {
   auto O = 64;
   auto P = 32;
   auto Q = 2;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
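Every hunk in this file is the same mechanical substitution: `tc::MappingOptions` becomes `tc::CudaMappingOptions`, the factory gains the `Cuda` prefix, and `tile()` drops its braces. As a compact reference, here is the renamed factory chained with the outer-scheduler knobs the autotuned 1LUT P100 test above exercises (a sketch lifted from that hunk's context lines, not a tuned configuration; its trailing options are elided):

    #include "tc/core/cuda/cuda_mapping_options.h"

    // Renamed factory plus the scheduler knobs from the 1LUT P100 test.
    auto options =
        tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
            .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
            .fixParametersBeforeScheduling(true)
            .tile(1);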

examples/example_batchmatmul.cc

Lines changed: 6 additions & 6 deletions

@@ -24,7 +24,7 @@
 #include <ATen/ATen.h>
 
 #include "tc/aten/aten_compiler.h"
-#include "tc/core/mapping_options.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 
 #include "../test/test_harness.h"
 #include "../test/test_harness_aten_cuda.h"
@@ -48,7 +48,7 @@ class BatchMatMul : public Benchmark {
       uint32_t N,
       uint32_t M,
       uint32_t K,
-      const tc::MappingOptions& options,
+      const tc::CudaMappingOptions& options,
       bool useFlags = false);
 };
 
@@ -57,7 +57,7 @@ void BatchMatMul::runBatchMatMul(
     uint32_t N,
     uint32_t M,
     uint32_t K,
-    const tc::MappingOptions& options,
+    const tc::CudaMappingOptions& options,
     bool useFlags) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
@@ -116,8 +116,8 @@ TEST_F(BatchMatMul, TransposedBatchMatMul) {
   auto N = FLAGS_N;
   auto M = FLAGS_M;
   auto K = FLAGS_K;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
-                     .tile({1})
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
+                     .tile(1)
                      .mapToThreads({128})
                      .mapToBlocks({B})
                      .useSharedMemory(true)
@@ -131,7 +131,7 @@ TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
   uint32_t K = 26;
   uint32_t M = 72;
   uint32_t N = 26;
-  auto options = tc::MappingOptions::makeNaiveMappingOptions()
+  auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                      .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
                      .outerScheduleAllowSkewing(false)
                      .outerSchedulePositiveOrthant(true)
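For the batched case, the post-rename baseline maps one CUDA block per batch entry. A sketch using only calls from the TransposedBatchMatMul hunk above; the concrete value of B is illustrative (the benchmark itself reads FLAGS_B):

    #include "tc/core/cuda/cuda_mapping_options.h"

    uint32_t B = 500; // illustrative; the benchmark reads FLAGS_B
    auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions()
                       .tile(1)               // one-dimensional tiling
                       .mapToThreads({128})   // 128 threads per block
                       .mapToBlocks({B})      // one block per batch entry
                       .useSharedMemory(true);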

examples/example_fixture.h

Lines changed: 5 additions & 5 deletions

@@ -31,10 +31,10 @@
 #include "tc/autotuner/utils/utils.h"
 #include "tc/core/cuda/cuda.h"
 #include "tc/core/cuda/cuda_compilation_cache.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
 #include "tc/core/cuda/cuda_rtc.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/flags.h"
-#include "tc/core/mapping_options.h"
 #include "tc/core/scope_guard.h"
 
 #include <cublas_v2.h> // Must be the same as Caffe2
@@ -69,7 +69,7 @@ std::vector<const DLTensor*> inferOutputTensorInfo(
   return atCompl.inferOutputTensorInfo(name, inputs);
 }
 
-tc::MappingOptions loadOptionsFromProto(
+tc::CudaMappingOptions loadOptionsFromProto(
     const std::string cacheFilename,
     const std::string& name,
     const std::vector<at::Tensor>& inputs,
@@ -127,7 +127,7 @@ struct Benchmark : public ::testing::Test {
   void Check(
       const std::string& tc,
       const std::string& name,
-      const tc::MappingOptions& mappingOptions,
+      const tc::CudaMappingOptions& mappingOptions,
       const std::vector<at::Tensor>& inputs,
       std::vector<at::Tensor>& outputs,
       CheckFunction checkFun = [](const std::vector<at::Tensor>& inputs,
@@ -379,8 +379,8 @@ struct Benchmark : public ::testing::Test {
       std::string TC,
       std::string kernelName,
      std::vector<at::Tensor> inputs,
-      tc::MappingOptions baseMapping,
-      std::vector<tc::MappingOptions> startingPoints,
+      tc::CudaMappingOptions baseMapping,
+      std::vector<tc::CudaMappingOptions> startingPoints,
       CheckFunction checkFun =
          [](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
            return true;
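The fixture change shows the rename propagating through value types and containers, not just references: the return type of loadOptionsFromProto and the autotuning entry point's baseMapping/startingPoints parameters are all CUDA-typed now. A minimal sketch of constructing those inputs (names match this hunk; the values are illustrative):

    #include <vector>
    #include "tc/core/cuda/cuda_mapping_options.h"

    // baseMapping and startingPoints are CUDA-specific types end to end.
    tc::CudaMappingOptions baseMapping =
        tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
    std::vector<tc::CudaMappingOptions> startingPoints{baseMapping};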
