24
24
#include < ATen/ATen.h>
25
25
26
26
#include " tc/aten/aten_compiler.h"
27
- #include " tc/core/mapping_options .h"
27
+ #include " tc/core/cuda/cuda_mapping_options .h"
28
28
29
29
#include " ../test/test_harness.h"
30
30
#include " ../test/test_harness_aten_cuda.h"
@@ -109,7 +109,7 @@ class ProductionModel : public Benchmark {
109
109
uint32_t D,
110
110
uint32_t L1,
111
111
uint32_t E1 ,
112
- const tc::MappingOptions & options,
112
+ const tc::CudaMappingOptions & options,
113
113
bool useFlags = false );
114
114
void run2LUT (
115
115
uint32_t B,
@@ -118,27 +118,27 @@ class ProductionModel : public Benchmark {
118
118
uint32_t L2,
119
119
uint32_t E1 ,
120
120
uint32_t E2 ,
121
- const tc::MappingOptions & options,
121
+ const tc::CudaMappingOptions & options,
122
122
bool useFlags = false );
123
123
void runC3 (
124
124
uint32_t B,
125
125
uint32_t WX,
126
126
uint32_t WY,
127
- const tc::MappingOptions & options,
127
+ const tc::CudaMappingOptions & options,
128
128
bool useFlags = false );
129
129
void runMLP1 (
130
130
uint32_t B,
131
131
uint32_t N,
132
132
uint32_t M,
133
- const tc::MappingOptions & options,
133
+ const tc::CudaMappingOptions & options,
134
134
bool useFlags = false );
135
135
void runMLP3 (
136
136
uint32_t B,
137
137
uint32_t N,
138
138
uint32_t O,
139
139
uint32_t P,
140
140
uint32_t Q,
141
- const tc::MappingOptions & options,
141
+ const tc::CudaMappingOptions & options,
142
142
bool useFlags = false );
143
143
};
144
144
@@ -147,7 +147,7 @@ void ProductionModel::run1LUT(
147
147
uint32_t D,
148
148
uint32_t L1,
149
149
uint32_t E1 ,
150
- const tc::MappingOptions & options,
150
+ const tc::CudaMappingOptions & options,
151
151
bool useFlags) {
152
152
CHECK_LT (0 , E1 );
153
153
@@ -232,7 +232,7 @@ void ProductionModel::run2LUT(
232
232
uint32_t L2,
233
233
uint32_t E1 ,
234
234
uint32_t E2 ,
235
- const tc::MappingOptions & options,
235
+ const tc::CudaMappingOptions & options,
236
236
bool useFlags) {
237
237
CHECK_LT (0 , E1 );
238
238
CHECK_LT (0 , E2 );
@@ -335,7 +335,7 @@ void ProductionModel::runC3(
335
335
uint32_t B,
336
336
uint32_t WX,
337
337
uint32_t WY,
338
- const tc::MappingOptions & options,
338
+ const tc::CudaMappingOptions & options,
339
339
bool useFlags) {
340
340
at::Tensor I = at::CUDA (at::kFloat ).rand ({B, WX});
341
341
at::Tensor W = at::CUDA (at::kFloat ).rand ({WY, WX});
@@ -389,7 +389,7 @@ void ProductionModel::runMLP1(
389
389
uint32_t B,
390
390
uint32_t N,
391
391
uint32_t M,
392
- const tc::MappingOptions & options,
392
+ const tc::CudaMappingOptions & options,
393
393
bool useFlags) {
394
394
at::Tensor I = at::CUDA (at::kFloat ).rand ({B, M});
395
395
at::Tensor W1 = at::CUDA (at::kFloat ).rand ({M, N});
@@ -448,7 +448,7 @@ void ProductionModel::runMLP3(
448
448
uint32_t O,
449
449
uint32_t P,
450
450
uint32_t Q,
451
- const tc::MappingOptions & options,
451
+ const tc::CudaMappingOptions & options,
452
452
bool useFlags) {
453
453
at::Tensor I = at::CUDA (at::kFloat ).rand ({B, N});
454
454
at::Tensor W2 = at::CUDA (at::kFloat ).rand ({O, N});
@@ -520,8 +520,8 @@ TEST_F(ProductionModel, 1LUT) {
520
520
auto D = FLAGS_D;
521
521
auto L1 = FLAGS_L1;
522
522
auto E1 = FLAGS_E1;
523
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
524
- .tile ({ 1 , 32 } )
523
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
524
+ .tile (1 , 32 )
525
525
.mapToThreads ({1 , 32 })
526
526
.mapToBlocks ({128 , 128 })
527
527
.unroll (256 );
@@ -534,7 +534,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_128_D_64_L1_50_E1_10000000) {
534
534
uint32_t L1 = 50 ;
535
535
uint32_t E1 = 10000000 ;
536
536
auto options =
537
- tc::MappingOptions::makeNaiveMappingOptions ()
537
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
538
538
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
539
539
.fixParametersBeforeScheduling (true )
540
540
.tile (1 )
@@ -551,7 +551,7 @@ TEST_F(ProductionModel, 1LUT_P100_autotuned_B_16_D_64_L1_50_E1_10000000) {
551
551
uint32_t L1 = 50 ;
552
552
uint32_t E1 = 10000000 ;
553
553
auto options =
554
- tc::MappingOptions::makeNaiveMappingOptions ()
554
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
555
555
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
556
556
.fixParametersBeforeScheduling (false )
557
557
.tile (1 , 32 )
@@ -597,8 +597,8 @@ TEST_F(ProductionModel, 2LUT) {
597
597
auto L2 = FLAGS_L2;
598
598
auto E1 = FLAGS_E1;
599
599
auto E2 = FLAGS_E2;
600
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
601
- .tile ({ 1 , 32 } )
600
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
601
+ .tile (1 , 32 )
602
602
.mapToThreads ({1 , 32 })
603
603
.mapToBlocks ({128 , 128 })
604
604
.unroll (256 );
@@ -615,7 +615,7 @@ TEST_F(
615
615
uint32_t L2 = 50 ;
616
616
uint32_t E2 = 10000000 ;
617
617
auto options =
618
- tc::MappingOptions::makeNaiveMappingOptions ()
618
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
619
619
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
620
620
.fixParametersBeforeScheduling (false )
621
621
.tile (1 , 256 , 1250000 )
@@ -636,7 +636,7 @@ TEST_F(
636
636
uint32_t L2 = 50 ;
637
637
uint32_t E2 = 10000000 ;
638
638
auto options =
639
- tc::MappingOptions::makeNaiveMappingOptions ()
639
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
640
640
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
641
641
.fixParametersBeforeScheduling (false )
642
642
.tile (1 , 64 )
@@ -686,9 +686,9 @@ TEST_F(ProductionModel, C3) {
686
686
auto B = FLAGS_B;
687
687
auto WX = FLAGS_WX;
688
688
auto WY = FLAGS_WY;
689
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
689
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
690
690
.fixParametersBeforeScheduling (true )
691
- .tile ({ 32 , 32 , 32 } )
691
+ .tile (32 , 32 , 32 )
692
692
.mapToThreads ({4 , 32 })
693
693
.mapToBlocks ({128 , 128 })
694
694
.useSharedMemory (true )
@@ -702,7 +702,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_128_WX_1000_WY_1024) {
702
702
uint32_t B = 128 ;
703
703
uint32_t WX = 1000 ;
704
704
uint32_t WY = 1024 ;
705
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
705
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
706
706
.outerScheduleFusionStrategy (tc::FusionStrategy::Max)
707
707
.outerScheduleAllowSkewing (false )
708
708
.outerSchedulePositiveOrthant (true )
@@ -725,7 +725,7 @@ TEST_F(ProductionModel, C3_P100_autotuned_B_16_WX_1000_WY_1024) {
725
725
uint32_t B = 16 ;
726
726
uint32_t WX = 1000 ;
727
727
uint32_t WY = 1024 ;
728
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
728
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
729
729
.outerScheduleFusionStrategy (tc::FusionStrategy::Max)
730
730
.outerScheduleAllowSkewing (false )
731
731
.outerSchedulePositiveOrthant (true )
@@ -781,9 +781,9 @@ TEST_F(ProductionModel, MLP1) {
781
781
auto B = FLAGS_B;
782
782
auto N = FLAGS_N;
783
783
auto M = FLAGS_M;
784
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
784
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
785
785
.fixParametersBeforeScheduling (true )
786
- .tile ({ 16 , 16 , 128 } )
786
+ .tile (16 , 16 , 128 )
787
787
.mapToThreads ({16 , 16 })
788
788
.mapToBlocks ({32 , 32 })
789
789
.useSharedMemory (true )
@@ -797,7 +797,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_128_M_2000_N_128) {
797
797
uint32_t M = 2000 ;
798
798
uint32_t N = 128 ;
799
799
auto options =
800
- tc::MappingOptions::makeNaiveMappingOptions ()
800
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
801
801
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
802
802
.outerScheduleAllowSkewing (false )
803
803
.outerSchedulePositiveOrthant (true )
@@ -821,7 +821,7 @@ TEST_F(ProductionModel, MLP1_P100_autotuned_B_16_M_2000_N_128) {
821
821
uint32_t M = 2000 ;
822
822
uint32_t N = 128 ;
823
823
auto options =
824
- tc::MappingOptions::makeNaiveMappingOptions ()
824
+ tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
825
825
.outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
826
826
.outerScheduleAllowSkewing (false )
827
827
.outerSchedulePositiveOrthant (true )
@@ -880,9 +880,9 @@ TEST_F(ProductionModel, MLP3) {
880
880
auto O = FLAGS_O;
881
881
auto P = FLAGS_P;
882
882
auto Q = FLAGS_Q;
883
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
883
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
884
884
.fixParametersBeforeScheduling (true )
885
- .tile ({ 16 , 16 , 128 } )
885
+ .tile (16 , 16 , 128 )
886
886
.mapToThreads ({16 , 16 })
887
887
.mapToBlocks ({32 , 32 })
888
888
.useSharedMemory (true )
@@ -897,7 +897,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_128_N_128_O_64_P_32_Q_2) {
897
897
auto O = 64 ;
898
898
auto P = 32 ;
899
899
auto Q = 2 ;
900
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
900
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
901
901
.outerScheduleFusionStrategy (tc::FusionStrategy::Max)
902
902
.outerScheduleAllowSkewing (false )
903
903
.outerSchedulePositiveOrthant (true )
@@ -923,7 +923,7 @@ TEST_F(ProductionModel, MLP3_P100_autotuned_B_16_M_2000_N_128_Q_2) {
923
923
auto O = 64 ;
924
924
auto P = 32 ;
925
925
auto Q = 2 ;
926
- auto options = tc::MappingOptions::makeNaiveMappingOptions ()
926
+ auto options = tc::CudaMappingOptions::makeNaiveCudaMappingOptions ()
927
927
.outerScheduleFusionStrategy (tc::FusionStrategy::Max)
928
928
.outerScheduleAllowSkewing (false )
929
929
.outerSchedulePositiveOrthant (true )
0 commit comments