 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#include "batchmatmul.h"
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -43,23 +45,22 @@ DEFINE_uint32(M, 72, "M dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 DEFINE_uint32(K, 26, "K dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 
 class BatchMatMul : public Benchmark {
+ protected:
+  uint32_t B, N, M, K;
+
  public:
-  void runBatchMatMul(
-      uint32_t B,
-      uint32_t N,
-      uint32_t M,
-      uint32_t K,
-      const tc::CudaMappingOptions& options,
-      bool use_flags = false);
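+  // Problem sizes shared by all runners; each TEST_F calls Init() before
+  // invoking a runner.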
+  void Init(uint32_t b, uint32_t n, uint32_t m, uint32_t k) {
+    B = b;
+    N = n;
+    M = m;
+    K = k;
+  }
+  void runBatchMatMul(const tc::CudaMappingOptions& options);
+  void runCaffe2BatchMatMul();
+  void runATenBatchMatMul();
 };
 
-void BatchMatMul::runBatchMatMul(
-    uint32_t B,
-    uint32_t N,
-    uint32_t M,
-    uint32_t K,
-    const tc::CudaMappingOptions& options,
-    bool use_flags) {
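+// Builds the batch_matmul TC and checks it with the given mapping options,
+// autotuning first when --autotune is set; sizes come from Init().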
+void BatchMatMul::runBatchMatMul(const tc::CudaMappingOptions& options) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
 
@@ -85,96 +86,83 @@ def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
   std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
       std::string("_K_") + std::to_string(FLAGS_K) + std::string("_M_") +
       std::to_string(FLAGS_M) + std::string("_N_") + std::to_string(FLAGS_N);
-  if (use_flags && FLAGS_validate_proto) {
-    validateProto(
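+  // Start from the given options; when --autotune is set, replace them with
+  // the autotuner's best configuration before the final check.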
+  std::vector<tc::CudaMappingOptions> bestOptions{options};
+  if (FLAGS_autotune) {
+    bestOptions = autotune(
         FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
             suffix,
+        FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
+            suffix,
         tc,
         "batch_matmul",
         inputs,
+        options,
         check_fun);
-  } else {
-    Check(tc, "batch_matmul", options, inputs, check_fun);
-    if (use_flags) {
-      autotune(
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
-              suffix,
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
-              suffix,
-          tc,
-          "batch_matmul",
-          inputs,
-          options,
-          check_fun);
-    }
   }
+  Check(tc, "batch_matmul", bestOptions[0], inputs, check_fun);
 }
 
-TEST_F(BatchMatMul, TransposedBatchMatMul) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .tile(1)
-                     .mapToThreads({128})
-                     .mapToBlocks({B})
-                     .useSharedMemory(true)
-                     .usePrivateMemory(true)
-                     .unroll(256);
-  runBatchMatMul(B, N, M, K, options, true);
-}
-
-TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
-  uint32_t B = 500;
-  uint32_t K = 26;
-  uint32_t M = 72;
-  uint32_t N = 26;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
-                     .outerScheduleAllowSkewing(false)
-                     .outerSchedulePositiveOrthant(true)
-                     .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
-                     .intraTileScheduleAllowSkewing(false)
-                     .intraTileSchedulePositiveOrthant(true)
-                     .tile(3)
-                     .mapToThreads(4, 36, 3)
-                     .mapToBlocks(512)
-                     .unroll(64)
-                     .tileImperfectlyNested(false)
-                     .useSharedMemory(true)
-                     .usePrivateMemory(false)
-                     .unrollCopyShared(true)
-                     .matchLibraryCalls(true);
-  runBatchMatMul(B, N, M, K, options);
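+// Caffe2 reference: builds a CUDA BatchMatMul operator over X and Y and
+// times repeated Run() calls.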
+void BatchMatMul::runCaffe2BatchMatMul() {
+  Workspace w_ref;
+  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
+  AddInput(w_ref, {B, N, M}, "X");
+  AddInput(w_ref, {B, M, K}, "Y");
+  OperatorDef ref_def =
+      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
+  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
+  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
 }
 
-TEST_F(BatchMatMul, ATenTransposedBatchMatMulReference) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
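+// ATen reference: times at::bmm on tensors of the stored sizes.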
+void BatchMatMul::runATenBatchMatMul() {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
   Reference(
       [&]() { return bmm(X, Y); },
       [&](at::Tensor& res) { bmm_out(res, X, Y); });
 }
 
-TEST_F(BatchMatMul, C2TransposedBatchMatMulReference) {
-  int B = FLAGS_B;
-  int N = FLAGS_N;
-  int M = FLAGS_M;
-  int K = FLAGS_K;
+// Generic
+TEST_F(BatchMatMul, TransposedBatchMatMul) {
+  Init(FLAGS_B, FLAGS_N, FLAGS_M, FLAGS_K);
+  runBatchMatMul(tc::CudaMappingOptions::makeNaiveMappingOptions());
+}
 
-  Workspace w_ref;
-  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
-  AddInput(w_ref, {B, N, M}, "X");
-  AddInput(w_ref, {B, M, K}, "Y");
-  OperatorDef ref_def =
-      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
-  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
-  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
+// P100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// P100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// P100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
+}
+
+// V100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// V100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// V100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
 }
 
 int main(int argc, char** argv) {